Commit 22fe92d3 authored by 梦想橡皮擦 💬

Upload crawler examples for huaroo.net (花容网), sspai 一派, and hot topics

Parent a1dbe14f
import requests
import threading
from queue import Queue, Empty
from lxml import etree
import time
import random
# Initialize a task queue
q = Queue(maxsize=0)
# Enqueue the listing pages in bulk
for page in range(1, 4):
    q.put('https://www.huaroo.net/d/pg_{}/'.format(page))
# Build request headers with a randomly chosen crawler User-Agent
def get_headers():
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {
        "user-agent": ua,
        "referer": "https://www.baidu.com"
    }
    return headers
# Parse a listing page and build one CSV row per article
def format(text):  # note: shadows the built-in format()
    element = etree.HTML(text)
    # print(element)
    article_list = element.xpath('//div[contains(@class,"article_list")]')
    # print(article_list)
    wait_save_str = ""
    for article in article_list:
        title = article.xpath(
            "./a/div/div[@class='article_title']/text()")[0].strip()
        hospital = article.xpath(
            "./a/div/div[@class='hospital_list_content mt10 oh']/div[1]/text()")[0].strip()
        duties = article.xpath(
            "./a/div/div[@class='hospital_list_content mt10 oh']/div[2]/text()")[0].strip()
        practice = article.xpath(
            "./a/div/div[@class='hospital_list_content mt10 oh']/div[3]/text()")[0].strip()
        project = article.xpath(
            "./a/div/div[@class='hospital_list_content mt10 oh']/div[4]/text()")[0].strip()
        wait_save_str += f"{title},{hospital},{duties},{practice},{project}\n"
    save(wait_save_str)
# Persist the collected rows
def save(wait_save_str):
    with open('./医美2.csv', 'a+', encoding='utf-8') as f:
        f.write(wait_save_str)
        print(wait_save_str, "--- saved")
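# Added sketch (not in the original commit): the comma-joined rows above break
# as CSV whenever a title itself contains a comma. The standard csv module
# quotes such fields automatically; save_rows is a hypothetical alternative.
import csv

def save_rows(rows):
    # rows: an iterable of (title, hospital, duties, practice, project) tuples
    with open('./医美2.csv', 'a+', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(rows)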
# Crawler entry point: fetch and parse pages until the queue is drained
def run():
    while q.qsize() > 0:
        try:
            # Non-blocking get: with two workers qsize() can be stale, and a
            # plain q.get() could then block forever on an empty queue
            url = q.get_nowait()
        except Empty:
            break
        q.task_done()
        # print(url)
        res = requests.get(url=url, headers=get_headers(), timeout=10)
        format(res.text)

l = []
for i in range(2):
    t = threading.Thread(target=run)
    l.append(t)
    t.start()
for p in l:
    p.join()
print("all worker threads finished")
q.join()
print("all queue tasks done")
import requests
import threading
from queue import LifoQueue
import time
import random
# Initialize a LIFO queue (pages are fetched newest offset first)
q = LifoQueue(maxsize=0)
# Enqueue the API URLs in bulk
for page in range(1, 7):
    # https://sspai.com/api/v1/bullet/search/page/get?type=0&limit=10&offset=0&created_at=0
    q.put('https://sspai.com/api/v1/bullet/search/page/get?type=0&limit=10&offset={}&created_at=0'.format((page - 1) * 10))
def get_headers():
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {
        "user-agent": ua
    }
    return headers
# Persist the raw API response to a timestamped file
def save(text):
    with open(f'{time.time()}.json', 'a+', encoding='utf-8') as f:
        f.write(text)
        print(text, "--- saved")
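# Added sketch (not in the original commit): the endpoint returns JSON, so the
# raw text can be validated and pretty-printed before writing. The top-level
# schema is not documented here, so this only re-serialises what it receives.
import json

def save_pretty(text):
    parsed = json.loads(text)  # raises ValueError on a non-JSON error page
    with open(f'{time.time()}.json', 'a+', encoding='utf-8') as f:
        f.write(json.dumps(parsed, ensure_ascii=False, indent=2))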
if __name__ == "__main__":
    # Single-threaded drain, so the qsize() check cannot race with get()
    while q.qsize() > 0:
        url = q.get()
        q.task_done()
        res = requests.get(url=url, headers=get_headers(), timeout=10)
        save(res.text)
    q.join()
    print("all tasks completed")
from queue import Queue, Empty
import time
import threading
import requests
from lxml import etree
import random
def get_headers():
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {
        "user-agent": ua
    }
    return headers
# Hot-topic listing pages waiting to be crawled
hot_subjects = Queue(maxsize=0)
for i in range(1, 11):
    url = f'https://www.jisilu.cn/topic/square/id-hot__feature_id-__page-{i}'
    hot_subjects.put(url)

# Queue of topic ids handed from producers to consumers
q_data_ids = Queue(maxsize=0)
# Producer: fetch a listing page and enqueue every topic id found on it
def producer():
    while hot_subjects.qsize() > 0:
        try:
            # Non-blocking get: qsize() can be stale with two producers
            list_url = hot_subjects.get_nowait()
        except Empty:
            break
        hot_subjects.task_done()
        print("parsing:", list_url)
        # Extract the topic ids from the listing page
        res = requests.get(list_url, headers=get_headers(), timeout=3)
        element = etree.HTML(res.text)
        data_ids = element.xpath('//a[@class="aw-topic-name"]/@data-id')
        for data_id in data_ids:
            q_data_ids.put(data_id)
# Consumer: page through each topic's discussion list until an empty response
def consumer():
    while True:
        # Take one topic id; a None sentinel tells the worker to exit
        data_id = q_data_ids.get()
        q_data_ids.task_done()
        if data_id is None:
            break
        start_page = 1
        while True:
            url = f'https://www.jisilu.cn/question/ajax/discuss/sort_type-new__topic_id-{data_id}__page-{start_page}'
            res = requests.get(url=url, headers=get_headers(), timeout=5)
            print(res.url)
            text = res.text
            start_page += 1
            if len(text) == 0:
                break
            element = etree.HTML(text)
            titles = element.xpath('//h4/a/text()')
            urls = element.xpath('//h4/a/@href')
            names = element.xpath('//a[@class="aw-user-name"]/text()')
            data = zip(titles, names, urls)
            save_list = [f"{item[0]},{item[1]},{item[2]}\n" for item in data]
            long_str = "".join(save_list)
            with open("./data.csv", "a+", encoding="utf-8") as f:
                f.write(long_str)
# Start 2 producer threads and 2 consumer threads
producers = [threading.Thread(target=producer) for _ in range(2)]
consumers = [threading.Thread(target=consumer) for _ in range(2)]
for t in producers + consumers:
    t.start()
# When the producers finish, push one None sentinel per consumer so each exits
for t in producers:
    t.join()
for _ in consumers:
    q_data_ids.put(None)
for t in consumers:
    t.join()
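# Added sketch (not in the original commit): time is imported above but never
# used; a short randomised pause between requests, e.g. called inside the
# consumer loop, is a common way to reduce load on the target site.
def polite_sleep(low=0.5, high=1.5):
    time.sleep(random.uniform(low, high))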