提交 56e2765e 编写于 作者: 梦想橡皮擦's avatar 梦想橡皮擦 💬

阳光问政网爬虫

上级 a74b2598
import requests
import random
from lxml import etree # 从lxml中导入etree
# Pool of desktop-browser User-Agent strings; one is picked at random when the
# script starts and sent with every request instead of the library default.
ua = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
]
# random.choice draws from the whole pool; the earlier random.randint(0, 2)
# index could never reach the last entry.
headers = {
    'user-agent': random.choice(ua),
}
# Crawl list pages 1-9 and print one dict per complaint row found on each page.
for i in range(1, 10):
    try:
        # The timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(
            f"http://yglz.tousu.hebnews.cn/shss-{i}.html",
            headers=headers,
            timeout=10,
        )
    except requests.RequestException as exc:
        # Skip a page that cannot be fetched instead of aborting the whole run.
        print(f"page {i} request failed: {exc}")
        continue

    html = response.content.decode("utf-8")
    print("*" * 200)
    tree = etree.HTML(html)  # parse the HTML

    # Each div with class "listcon" is one row of the list area.
    for div in tree.xpath('//div[@class="listcon"]'):
        try:
            # The XPaths are relative to this row's div. Indexing [0] raises
            # IndexError when a cell is missing, which is handled below.
            shouli = div.xpath('span[1]/p/a/text()')[0]  # handling unit
            content = div.xpath('span[2]/p/a/text()')[0]  # complaint content
            posted_at = div.xpath('span[3]/p/text()')[0].replace("\n", "")  # time
            status = div.xpath('span[5]/p/text()')[0].replace("\n", "")  # status
        except IndexError:
            print("内部数据报错")
            print(div)
            continue

        # NOTE(review): the dict used to carry "type": type, which stored
        # Python's builtin `type` class because no such value is extracted.
        # It is left out here. span[4] is never read -- presumably that is the
        # category column; confirm against the page before adding it back.
        one_data = {
            "shouli": shouli,
            "content": content,
            "datetime": posted_at,
            "status": status,
        }
        print(one_data)  # print the record; shaped for later storage in MongoDB
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册