import requests
from fake_useragent import UserAgent
import re
import threading
def replace_mark(my_str):
    """Make a text value safe to embed in one hand-written CSV field.

    ASCII commas are swapped for full-width commas (U+FF0C) and ASCII
    double quotes for left curly quotes (U+201C), so the value cannot be
    read as a field separator or a quote character.

    Args:
        my_str: Raw field text.

    Returns:
        The text with every ``,`` and ``"`` replaced.
    """
    # A bare comma would split the value into two columns, because the
    # caller joins fields with plain commas. The replacement is written
    # as an escape so it cannot be silently normalised back to ",".
    return my_str.replace(",", "\uff0c").replace('"', "“")
# Parse `html` with the regexes below and append one comma-separated row per
# matched item to ./qq.csv (title, url, sign flag, exclusive flag, author,
# "#"-joined tags, quoted score).
# NOTE(review): as it appears here the body has no indentation and several
# pattern literals look truncated: two are split across lines, and
# title_url_pattern has a single capture group although group(1) and group(2)
# are read from it. Presumably HTML tags inside the patterns were stripped
# when this file was extracted — restore them from the original before running.
def format_html(html):
# Patterns, by variable name: one list item, title + url, the "签约" marker,
# the "独家" marker, author, tags, and the text following "人气".
li_pattern = re.compile(
'
[\s\S]+?')
title_url_pattern = re.compile(
'(.*?)')
sign_pattern = re.compile('签约')
exclusive_pattern = re.compile('独家')
author_pattern = re.compile(
'(.*?)
')
tags_pattern = re.compile('(.*?)')
score_pattern = re.compile('人气:(.*?)')
# Every non-overlapping match of li_pattern is processed as one record.
items = li_pattern.findall(html)
for item in items:
# NOTE(review): the .search(...) results for title/url, author and score are
# used without a None check, so an item missing any of them raises
# AttributeError.
title_url = title_url_pattern.search(item)
title = title_url.group(2)
url = title_url.group(1)
# sign / exclusive are 0/1 flags: 1 when the corresponding marker occurs in the item.
sign = 0
exclusive = 0
if sign_pattern.search(item) is not None:
sign = 1
if exclusive_pattern.search(item) is not None:
exclusive = 1
author = author_pattern.search(item).group(1)
tags = tags_pattern.findall(item)
score = score_pattern.search(item).group(1)
# `lock` is the module-level threading.Lock; it is held around the append so
# rows written by concurrent callers do not interleave.
# NOTE(review): release() is not in a finally block (nor is `with lock:` used),
# so an exception while opening or writing the file leaves the lock held.
# title, author and score pass through replace_mark; url and tags are written as-is.
lock.acquire()
with open("./qq.csv", "a+", encoding="utf-8") as f:
f.write(
f'{replace_mark(title)},{url},{sign},{exclusive},{replace_mark(author)},{"#".join(tags)},"{replace_mark(score)}" \n')
lock.release()
def run(index):
    """Download one listing page and pass its HTML to ``format_html``.

    Intended as a thread target: the starter acquires the module-level
    ``semaphore`` before launching the thread, and this function gives
    the slot back when it finishes.

    Args:
        index: Page number substituted into the listing URL.

    Raises:
        Whatever ``requests.get`` or ``format_html`` raises; the semaphore
        slot is released first.
    """
    try:
        ua = UserAgent(use_cache_server=False)
        response = requests.get(
            f"https://ac.qq.com/Comic/index/page/{index}",
            headers={'User-Agent': ua.random},
            # Without a timeout a stalled connection would hold its
            # semaphore slot indefinitely.
            timeout=30,
        )
        format_html(response.text)
    finally:
        # Always return the slot: a failed page must not shrink the pool,
        # otherwise the starter eventually blocks forever in acquire().
        semaphore.release()
# Shared by all worker threads; format_html holds it while appending to the CSV.
lock = threading.Lock()

if __name__ == "__main__":
    # Not read anywhere in the visible code; kept unchanged.
    num = 0
    # Caps the number of pages being fetched at once to five; each worker
    # releases its slot from run().
    semaphore = threading.BoundedSemaphore(5)
    workers = []
    for index in range(1, 462):
        print(f"正在抓取{index}")
        # Blocks until one of the five slots is free.
        semaphore.acquire()
        worker = threading.Thread(target=run, args=(index,))
        worker.start()
        workers.append(worker)
    # Wait for every page to finish before reporting completion.
    for worker in workers:
        worker.join()
    print("数据爬取完毕")