Commit 8547118b authored by 幻灰龙

Merge branch 'master' into 'master'

Add segmentfault tag dataset

See merge request csdn/csdn-tags!8
import click

import tag_source.vscode
import tag_source.stackoverflow
import tag_source.segmentfault

@click.command()
@click.option('--source')
def fetch(source):
    # Dispatch to the crawler for the requested tag source.
    if source == 'vsc':
        tag_source.vscode.fetch()
    elif source == 'so':
        tag_source.stackoverflow.fetch()
    elif source == 'sf':
        tag_source.segmentfault.fetch()

if __name__ == '__main__':
    fetch()
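A minimal sketch of exercising this CLI in-process with click's test runner (CliRunner and invoke are standard click.testing APIs; importing from main assumes the file above is saved as main.py). Note that invoking with --source sf really starts the SegmentFault crawl:

from click.testing import CliRunner
from main import fetch  # assumes the CLI above lives in main.py

runner = CliRunner()
result = runner.invoke(fetch, ['--source', 'sf'])  # runs the real crawl
print(result.exit_code)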
import json

import scrapy
from scrapy.crawler import CrawlerProcess
class SegmentFaultTagSpider(scrapy.Spider):
    name = "segmentfault_tags"
    allowed_domains = ["segmentfault.com"]
    start_urls = ['https://segmentfault.com/tags/all?page=1']
    custom_settings = {
        # Route scraped items through the JSON-writing pipeline below.
        'ITEM_PIPELINES': {'tag_source.segmentfault.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self):
        super().__init__()
        self.page_count = 0
        self.total_pages = 654  # size of the tag index when the crawl was written
    def parse(self, response):
        self.page_count += 1
        # Each tag card on the index page carries a name, a description
        # and a star (follower) count.
        tags = response.css('.widget-tag')
        for tag in tags:
            name = tag.xpath('h2/a/text()').get()
            desc = tag.xpath('p/text()').get()
            star = tag.xpath('div/strong/text()').get()
            yield {
                'name': name,
                'desc': desc,
                'star': star
            }
        # Follow the last ".next" link to paginate through the tag index.
        next_page_list = response.css('.next')
        if len(next_page_list) > 0:
            next_page = next_page_list[-1].css('a::attr(href)').get()
            self.logger.info('next_page: %s', next_page)
            yield response.follow(next_page, callback=self.parse, dont_filter=True)
class TagPipeline(object):
    def open_spider(self, spider):
        # ensure_ascii=False below writes raw UTF-8, so open the file
        # with an explicit encoding.
        self.file = open('dataset/segmentfault.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Deduplicate by tag name; the index can repeat entries across pages.
        if self.tags.get(item['name']) is not None:
            return item
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        line = ''.join(words)
        self.file.write(line)
        self.count += 1
        return item
def fetch():
    process = CrawlerProcess()
    process.crawl(SegmentFaultTagSpider)
    process.start()  # blocks until the crawl finishes
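A quick sketch of consuming the dataset the pipeline emits, assuming fetch() has completed and dataset/segmentfault.tag.json exists; the file is a plain JSON array with one object per tag:

import json

with open('dataset/segmentfault.tag.json', encoding='utf-8') as f:
    tags = json.load(f)

print(len(tags))                         # number of unique tags collected
print(tags[0]['name'], tags[0]['star'])  # first tag's name and star count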