Commit adddc2e5, authored by 梦想橡皮擦

Shuban.net (书伴网) data crawler

Parent d7345539
import requests
from lxml import etree
# import the coroutine / async HTTP modules
import asyncio
import aiohttp

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Host": "www.shuban.net",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
}
async def get_content(url):
    print("Processing: {}".format(url))
    # create a session to fetch the page
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as res:
            if res.status == 200:
                source = await res.text()  # await the response body
                tree = etree.HTML(source)
                await async_content(tree)
async def async_content(tree):
    title = tree.xpath("//h1[@class='title']/a/text()")[0]
    print(title)
    # if the page carries no information, just return
    # if title == '':
    #     return
    # else:
    #     try:
    #         description = tree.xpath("//div[@class='hanghang-shu-content-font']")
    #         author = description[0].xpath("p[1]/text()")[0].replace("作者:", "") if description[0].xpath("p[1]/text()")[0] is not None else None
    #         cate = description[0].xpath("p[2]/text()")[0].replace("分类:", "") if description[0].xpath("p[2]/text()")[0] is not None else None
    #         douban = description[0].xpath("p[3]/text()")[0].replace("豆瓣评分:", "") if description[0].xpath("p[3]/text()")[0] is not None else None
    #         # this field is unclear, so it is not recorded
    #         # des = description[0].xpath("p[5]/text()")[0] if description[0].xpath("p[5]/text()")[0] is not None else None
    #         download = tree.xpath("//a[@class='downloads']")
    #     except Exception as e:
    #         print(title)
    #         return
    #     ls = [
    #         title, author, cate, douban, download[0].get('href')
    #     ]
    #     return ls
if __name__ == '__main__':
    url_format = "https://www.shuban.net/read-{}.html"
    # only a small batch of IDs is crawled here; fetch more data yourself
    full_urllist = [url_format.format(i) for i in range(50773, 50783)]
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(get_content(url)) for url in full_urllist]
    results = loop.run_until_complete(asyncio.wait(tasks))
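
A minimal alternative entry point (a sketch, not part of this commit): on Python 3.7 and later the explicit event-loop handling above can be replaced with asyncio.run and asyncio.gather, reusing the same get_content coroutine and URL range.

import asyncio

async def main():
    url_format = "https://www.shuban.net/read-{}.html"
    urls = [url_format.format(i) for i in range(50773, 50783)]
    # gather schedules all crawl coroutines concurrently and waits for them to finish
    await asyncio.gather(*(get_content(url) for url in urls))

if __name__ == '__main__':
    # asyncio.run creates, runs, and closes the event loop itself
    asyncio.run(main())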