提交 0eda4b95 编写于 作者: 幻灰龙's avatar 幻灰龙

Merge branch 'master' into 'master'

添加OSChina的标签集

See merge request csdn/csdn-tags!11
此差异已折叠。
......@@ -4,21 +4,27 @@ import tag_source.stackoverflow
import tag_source.segmentfault
import tag_source.infoq
import tag_source.cnblogs
import tag_source.oschina
@click.command()
@click.option('--source')
def fetch(source):
    """Fetch the tag set of the source selected with --source."""
    click.echo('will fetch tags from %s!' % source)

    # Single dispatch table: each key is an accepted --source value and maps
    # to the fetch function of the matching tag_source module. The former
    # if/elif chain duplicated this table and would have run a fetch twice.
    sources = {
        'vscode': tag_source.vscode.fetch,
        'so': tag_source.stackoverflow.fetch,
        'sf': tag_source.segmentfault.fetch,
        'infoq': tag_source.infoq.fetch,
        'cnblogs': tag_source.cnblogs.fetch,
        'oschina': tag_source.oschina.fetch,
    }

    action = sources.get(source)
    if action is not None:
        action()
    else:
        # Also reached when --source is omitted (source is None).
        click.echo('source {} is not support now.'.format(source))
# Script entry point: hand control to the click command only when this file
# is executed directly, not when it is imported.
if __name__ == '__main__':
    fetch()
\ No newline at end of file
import os
import json
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
class OSChinaTagSpider(scrapy.Spider):
    """Crawl the OSChina question-tag listing and yield one dict per tag.

    Parsing starts at the first listing page and follows the ``.next-item``
    link until there is no such link or the page limit has been reached.
    Every yielded item has the keys ``name``, ``star`` and ``desc``.
    """

    name = "oschina_tags"
    allowed_domains = ["oschina.net"]
    start_urls = ['https://www.oschina.net/question/tags?p=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.oschina.TagPipeline': 301},
        'LOG_LEVEL': 'INFO',
    }

    # Text that follows the number inside a card's ``.meta`` element.
    _STAR_SUFFIX = ' 个问答'

    def __init__(self, *args, total_pages=606, **kwargs):
        """Initialise the page counters.

        Args:
            total_pages: Maximum number of listing pages to parse. Converted
                with ``int`` so a string value is accepted as well.
            *args, **kwargs: Forwarded unchanged to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        self.page_count = 0
        # The attribute keeps its historical (misspelled) name so that code
        # reading ``spider.totgal_pages`` continues to work.
        self.totgal_pages = int(total_pages)

    def _parse_star(self, meta_text):
        """Return the count found in a ``.meta`` text, or 0 if unreadable.

        A missing node (``None``) or a non-numeric remainder must not abort
        the whole page callback, so both cases are logged and mapped to 0.
        """
        if meta_text is None:
            self.logger.warning('tag card without meta text; star set to 0')
            return 0
        try:
            return int(meta_text.replace(self._STAR_SUFFIX, ''))
        except ValueError:
            self.logger.warning('unexpected meta text %r; star set to 0', meta_text)
            return 0

    def parse(self, response):
        """Yield the tags on one listing page, then a request for the next.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``{'name', 'star', 'desc'}`` dict per ``.tag-card`` that has a
            header text, followed by at most one follow-up request.
        """
        self.page_count += 1

        for card in response.css('.tag-card'):
            content = card.css('.content')
            tag_name = content.css('.header::text').get()
            if tag_name is None:
                # Without a header there is no tag name to record.
                continue
            yield {
                'name': tag_name,
                'star': self._parse_star(content.css('.meta::text').get()),
                'desc': content.css('.description::text').get(),
            }

        if self.page_count < self.totgal_pages:
            next_page = response.css('.next-item::attr(href)').get()
            if next_page is not None:
                self.logger.info('next_page: %s', next_page)
                # dont_filter=True skips the duplicate-request filter; the
                # page_count check above is what bounds the crawl.
                yield response.follow(next_page, callback=self.parse, dont_filter=True)
class TagPipeline(object):
    """Item pipeline that streams unique tags into a JSON array file.

    The array is written incrementally: ``open_spider`` writes the opening
    bracket, ``process_item`` appends one serialised item per line, and
    ``close_spider`` writes the closing bracket and closes the file.
    """

    # Output location, relative to the current working directory.
    OUTPUT_PATH = 'dataset/oschina.tag.json'

    def open_spider(self, spider):
        """Open the output file and reset the per-run state."""
        out_dir = os.path.dirname(self.OUTPUT_PATH)
        if out_dir:
            # open() does not create missing parent directories.
            os.makedirs(out_dir, exist_ok=True)
        # Items are dumped with ensure_ascii=False, so the encoding is fixed
        # to UTF-8 instead of depending on the platform default.
        self.file = open(self.OUTPUT_PATH, 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0      # number of items written so far
        self.tags = set()   # tag names already written

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output unless its name was already written.

        Returns:
            The item, unchanged.

        Raises:
            scrapy.exceptions.DropItem: If an item with the same ``name`` has
                already been written during this run.
        """
        tag_name = item['name']
        if tag_name in self.tags:
            # Imported here so the module-level import block stays untouched.
            from scrapy.exceptions import DropItem
            raise DropItem('duplicate tag: {}'.format(tag_name))
        self.tags.add(tag_name)

        # Every entry after the first is preceded by the array separator.
        separator = ',\n' if self.count > 0 else ''
        serialized = json.dumps(item, ensure_ascii=False).strip()
        self.file.write(separator + ' ' + serialized)
        self.count += 1
        return item
def fetch():
    """Crawl the OSChina tags with OSChinaTagSpider in a new CrawlerProcess."""
    # The previous `settings = Settings()` local was never handed to the
    # process, so it has been removed; the process is built exactly as before.
    process = CrawlerProcess()
    process.crawl(OSChinaTagSpider)
    process.start()
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册