阳光问政网爬虫

56e2765e · 梦想橡皮擦 · a74b2598 · 56e2765e
隐藏空白更改
内联并排

Showing 1 changed file with 43 additions and 0 deletions

案例9/河北阳光问政平台实话实说.py 案例9/河北阳光问政平台实话实说.py +43 -0

未找到文件。
--- a/案例9/河北阳光问政平台实话实说.py
+++ b/案例9/河北阳光问政平台实话实说.py
+import requests
+import random
+from lxml import etree  # 从lxml中导入etree
+# Pool of desktop browser User-Agent strings. One is picked at random when the
+# script starts and is sent with every request made through `headers`.
+ua = [
+    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
+]
+headers = {
+    # random.choice considers every entry in the pool; the previous
+    # randint(0, 2) index could never select the fourth User-Agent.
+    'user-agent': random.choice(ua),
+}
+# Fetch list pages 1-9 of the board and print one dict per complaint entry.
+for i in range(1, 10):
+    # The timeout keeps a single stalled connection from hanging the crawl;
+    # requests waits indefinitely when no timeout is given.
+    response = requests.get(
+        f"http://yglz.tousu.hebnews.cn/shss-{i}.html",
+        headers=headers,
+        timeout=10,
+    )
+    html = response.content.decode("utf-8")
+    print("*"*200)  # visual separator between pages in the console output
+
+    tree = etree.HTML(html)  # parse the page HTML
+    divs = tree.xpath('//div[@class="listcon"]')  # the list-entry divs
+    for div in divs:  # one iteration per list entry
+        try:
+            # Every XPath below is relative to the current div. Indexing [0]
+            # raises IndexError when a field is absent; the except clause
+            # reports the entry and moves on to the next one.
+            shouli = div.xpath('span[1]/p/a/text()')[0]  # handling unit
+            content = div.xpath('span[2]/p/a/text()')[0]  # complaint content
+            posted_at = div.xpath('span[3]/p/text()')[0].replace("\n", "")  # time
+            status = div.xpath('span[5]/p/text()')[0].replace("\n", "")  # status
+
+            # The record used to carry "type": type, but no `type` variable is
+            # ever assigned, so the value was Python's builtin `type` class.
+            # The key is therefore omitted.
+            # NOTE(review): span[4] is the only column not read here; if it
+            # holds the complaint type, extract it and restore the key - confirm
+            # against the live page.
+            one_data = {
+                "shouli": shouli,
+                "content": content,
+                "datetime": posted_at,
+                "status": status,
+            }
+            print(one_data)  # printed so it can later be stored in MongoDB
+        except Exception as e:
+            # Best effort: report the failing entry and carry on with the rest.
+            print("内部数据报错")
+            print(e)
+            print(div)
+    
+
+ 
\ No newline at end of file