提交 348de4b0 编写于 作者: 梦想橡皮擦's avatar 梦想橡皮擦 💬

博客粉丝采集

上级 b0f61b78
import threading
from threading import Lock, Thread
import time
import os
import requests
import random
class MyThread(threading.Thread):
def __init__(self, name):
super(MyThread, self).__init__()
self.name = name
def run(self):
global urls
lock.acquire()
one_url = urls.pop()
print("正在爬取:", one_url)
lock.release()
print("任意线程等待随机时间")
time.sleep(random.randint(1,3))
res = requests.get(one_url, headers=self.get_headers(), timeout=5)
if res.json()["code"] != 400:
data = res.json()["data"]["list"]
for user in data:
name = user['username']
nickname = self.remove_character(user['nickname'])
userAvatar = user['userAvatar']
blogUrl = user['blogUrl']
blogExpert = user['blogExpert']
briefIntroduction = self.remove_character(
user['briefIntroduction'])
with open('./qing_gee_data.csv', 'a+', encoding='utf-8') as f:
print(
f'{name},{nickname},{userAvatar},{blogUrl},{blogExpert},{briefIntroduction}')
f.write(
f"{name},{nickname},{userAvatar},{blogUrl},{blogExpert},{briefIntroduction}\n")
else:
print(res.json())
print("异常数据", one_url)
with open('./error.txt', 'a+', encoding='utf-8') as f:
f.write(one_url+"\n")
# 去除特殊字符
def remove_character(self, origin_str):
if origin_str is None:
return
origin_str = origin_str.replace('\n', '')
origin_str = origin_str.replace(',', ',')
return origin_str
def get_headers(self):
uas = [
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
"Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
"Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
"Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
"Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
"Sosospider+(+http://help.soso.com/webspider.htm)",
"Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
]
ua = random.choice(uas)
headers = {
"user-agent": ua,
'cookie': 'UserName=你的ID; UserInfo=你的UserInfo; UserToken=你的UserToken;',
"referer": "https://blog.csdn.net/qing_gee?type=sub&subType=fans"
}
return headers
if __name__ == '__main__':
    # ``lock`` and ``urls`` are module-level names that MyThread.run() reads.
    lock = Lock()
    url_format = 'https://blog.csdn.net/community/home-api/v1/get-fans-list?page={}&size=20&noMore=false&blogUsername=qing_gee'
    urls = [url_format.format(i) for i in range(1, 13300)]
    batch_size = 5  # number of pages fetched concurrently per round
    while urls:
        print(len(urls))
        # Never start more workers than there are URLs left; each worker pops
        # exactly one URL, so an extra worker would pop from an empty list.
        workers = [
            MyThread("t" + str(i))
            for i in range(min(batch_size, len(urls)))
        ]
        for worker in workers:
            worker.start()
        # Join only this round's workers (the original kept appending to one
        # list and re-joined every thread ever created on each round).
        for worker in workers:
            worker.join()
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册