提交 458eb93d 编写于 作者: 梦想橡皮擦's avatar 梦想橡皮擦 💬

10000+动漫抓取

上级 d9a47235
此差异已折叠。
import os
import re
import requests
def reade_html(path=r"E:\pythonProject\test\html", export_csv=False):
    """Parse every HTML file in *path* and download the cover images it lists.

    Each regular file in the directory is read as UTF-8 text. Cover-image
    URLs and ``(url, title, author)`` link details are extracted with
    regular expressions, then every image URL is passed to ``save_img``
    together with the title found at the same position.

    Args:
        path: Directory containing the HTML files. Defaults to the location
            that used to be hard-coded.
        export_csv: When true, the parsed details and scores of each file
            are also passed to ``save``. Defaults to False, matching the
            previous behaviour where that call was commented out.
    """
    # Compiled once instead of once per file. Raw strings keep the regex
    # escapes (\s, \(, \[) from being treated as (invalid) string escapes.
    img_pattern = re.compile(r'<div class="img_book"[.\s]*style="background:url\((.*?)\)')
    title_pattern = re.compile(r"<a href='(?P<url>.*?)'>(?P<title>.*?)</a> <br /> \[(?P<author>.*?)\] <br />")
    score_pattern = re.compile(r'<p style=".*?"><b>(.*?)</b></p>')

    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        # os.listdir also returns sub-directories, which open() cannot read.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            html = f.read()

        img_urls = img_pattern.findall(html)
        # With several groups, findall yields (url, title, author) tuples.
        details = title_pattern.findall(html)

        if export_csv:
            save(details, score_pattern.findall(html))

        if len(details) != len(img_urls):
            print(f"{file}: 图片数({len(img_urls)})与标题数({len(details)})不一致,按较少者配对")
        # zip stops at the shorter list, so a page where one of the patterns
        # missed an entry no longer raises IndexError.
        for detail, url in zip(details, img_urls):
            save_img(detail[1], url)
def save(details, scores):
    """Append one row per comic to ``./comic.csv``.

    Args:
        details: Sequence of ``(url, title, author)`` tuples.
        scores: Sequence of scores, paired with *details* by position.

    Each row is written as ``title,url,author,score``. Fields are quoted by
    the ``csv`` module when needed, so a comma inside a title or author no
    longer shifts the columns. Pairing stops at the shorter of the two
    sequences.
    """
    import csv  # local import: the file's import block does not provide csv

    if not details:
        # Nothing to write; do not create an empty file.
        return

    # Open once for the whole batch instead of once per row.
    with open("./comic.csv", "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerows(
            (detail[1], detail[0], detail[2], score)
            for detail, score in zip(details, scores)
        )
def save_img(title, url):
    """Download the image at *url* and store it as ``imgs/<title>.jpg``.

    Args:
        title: Comic title, used as the file name after characters that are
            not allowed in file names have been replaced with ``_``.
        url: Address of the image to download.

    Failures are best-effort: a request or file error is printed and the
    function returns without raising. Responses whose status is not 200 are
    skipped rather than written to disk.
    """
    print(f"正在抓取{title}--{url}")
    headers = {
        "User-Agent": "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
    }
    # Path separators and the characters Windows forbids in file names would
    # make open() fail or write outside imgs/.
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", title).strip() or "untitled"
    try:
        res = requests.get(url, headers=headers, allow_redirects=False, timeout=10)
        # Redirects are not followed, so a 302 (or any error status) carries
        # no image data; do not save it as a .jpg.
        if res.status_code != 200:
            print(f"跳过{title}: HTTP {res.status_code}")
            return
        os.makedirs("imgs", exist_ok=True)
        with open(f"imgs/{safe_title}.jpg", "wb") as f:
            f.write(res.content)
    except (requests.RequestException, OSError) as e:
        print(e)
# Script entry point: run the parser only when this file is executed directly.
if __name__ == "__main__":
    reade_html()
import requests
import re
import threading
import time
import random
# Pool of browser User-Agent header values; get_image picks one at random
# (random.choice) for the headers of each fetch.
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# Fetch one listing-page URL, retrying until the page has been saved
def get_image(base_url, index, max_retries=None):
    """Fetch *base_url* and save the response body as ``html/<index>.html``.

    Intended to run as a worker thread: the module-level ``semaphore`` slot
    taken by the caller is released exactly once when this function exits,
    whether it succeeded, gave up, or raised.

    Args:
        base_url: Address of the page to download.
        index: Page number, used for the progress message and file name.
        max_retries: Maximum number of retries after a failed attempt.
            ``None`` (the default) retries without limit, as before.

    While the server answers 302, the request is repeated through a proxy
    obtained from the proxy-pool endpoint. Any exception during an attempt
    is printed, followed by a 10-second pause and a fresh attempt.
    """
    import os  # local import: only needed to create the output directory

    failures = 0
    try:
        # Iterative retry instead of recursion, so a long outage cannot
        # exhaust the call stack.
        while True:
            headers = {
                "User-Agent": random.choice(USER_AGENTS)
            }
            print(f"正在抓取{index}")
            try:
                res = requests.get(url=base_url, headers=headers, allow_redirects=False, timeout=10)
                print(res.status_code)
                while res.status_code == 302:
                    # A timeout keeps a dead proxy pool from blocking this
                    # worker (and its semaphore slot) forever.
                    ip_json = requests.get("http://118.24.52.95:5010/get/", headers=headers, timeout=10).json()
                    ip = ip_json["proxy"]
                    proxies = {
                        "http": ip,
                        "https": ip
                    }
                    print(proxies)
                    res = requests.get(url=base_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=10)
                    time.sleep(5)
                    print(res.status_code)
                # Without this, a missing html/ directory fails every
                # attempt and the retry never ends.
                os.makedirs("html", exist_ok=True)
                with open(f"html/{index}.html", "w", encoding="utf-8") as f:
                    f.write(res.text)
                return
            except Exception as e:
                # Deliberate catch-all at the worker boundary: any failure
                # of an attempt is reported and retried.
                print(e)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    print(f"放弃抓取{index}")
                    return
                print("睡眠10s,再去抓取")
                time.sleep(10)
    finally:
        semaphore.release()
if __name__ == '__main__':
    max_workers = 5
    first_page, last_page = 1, 524
    url_template = "https://vol.moe/l/all,all,all,sortpoint,all,all,BL/{}.htm"

    # Run at most max_workers threads at once: the main thread takes a
    # slot before starting each worker and get_image releases it when done.
    semaphore = threading.BoundedSemaphore(max_workers)
    lst_record_threads = []
    for index in range(first_page, last_page + 1):
        semaphore.acquire()
        t = threading.Thread(target=get_image, args=(url_template.format(index), index))
        t.start()
        lst_record_threads.append(t)

    # Wait for every worker before the script exits.
    for rt in lst_record_threads:
        rt.join()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册