10000+动漫抓取

458eb93d · 梦想橡皮擦 · d9a47235 · 458eb93d · 458eb93d · 458eb93d
4 changed files
--- a/NO9/html/测试用.html
+++ b/NO9/html/测试用.html
--- a/NO9/imgs/测试用.jpg
+++ b/NO9/imgs/测试用.jpg
--- a/NO9/数据提取代码.py
+++ b/NO9/数据提取代码.py
+import os
+import re
+import requests
+
+
+def reade_html():
+    path = r"E:\pythonProject\test\html"
+    files = os.listdir(path)
+
+    for file in files:
+        file_path = os.path.join(path, file)
+        with open(file_path, "r", encoding="utf-8") as f:
+            html = f.read()
+            img_pattern = re.compile('<div class="img_book"[.\s]*style="background:url\((.*?)\)')
+            title_pattern = re.compile("<a href='(?P<url>.*?)'>(?P<title>.*?)</a> <br /> \[(?P<author>.*?)\] <br />")
+            score_pattern = re.compile('<p style=".*?"><b>(.*?)</b></p>')
+            img_urls = img_pattern.findall(html)
+            details = title_pattern.findall(html)
+            scores = score_pattern.findall(html)
+
+            # save(details, scores)
+            for index, url in enumerate(img_urls):
+                save_img(details[index][1], url)
+
+
+def save(details, scores):
+    for index, detail in enumerate(details):
+        my_str = "%s,%s,%s,%s\n" % (detail[1].replace(",", "，"), detail[0], detail[2].replace(",", "，"), scores[index])
+        with open("./comic.csv", "a+", encoding="utf-8") as f:
+            f.write(my_str)
+
+
+def save_img(title, url):
+    print(f"正在抓取{title}--{url}")
+    headers = {
+        "User-Agent": "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
+    }
+    try:
+        res = requests.get(url, headers=headers, allow_redirects=False, timeout=10)
+
+        data = res.content
+        with open(f"imgs/{title}.jpg", "wb+") as f:
+            f.write(data)
+
+    except Exception as e:
+        print(e)
+
+
+# Run the extraction only when this file is executed as a script.
+if __name__ == '__main__':
+    reade_html()
--- a/NO9/静态页爬取代码.py
+++ b/NO9/静态页爬取代码.py
+import requests
+import re
+import threading
+import time
+import random
+
+# Pool of browser User-Agent strings; one is chosen at random for each
+# fetch so that requests do not all carry the same identifier.
+USER_AGENTS = [
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
+    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
+    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
+    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
+    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
+    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
+    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
+    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
+]
+
+
+# 循环获取 URL
+def get_image(base_url, index):
+    headers = {
+        "User-Agent": random.choice(USER_AGENTS)
+
+    }
+
+    print(f"正在抓取{index}")
+    try:
+        res = requests.get(url=base_url, headers=headers, allow_redirects=False, timeout=10)
+        print(res.status_code)
+        while res.status_code == 302:
+            ip_json = requests.get("http://118.24.52.95:5010/get/", headers=headers).json()
+            ip = ip_json["proxy"]
+            proxies = {
+                "http": ip,
+                "https": ip
+            }
+            print(proxies)
+            res = requests.get(url=base_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=10)
+            time.sleep(5)
+            print(res.status_code)
+
+        else:
+            html = res.text
+            with open(f"html/{index}.html", "w+", encoding="utf-8") as f:
+                f.write(html)
+
+        semaphore.release()
+    except Exception as e:
+        print(e)
+        print("睡眠10s，再去抓取")
+        time.sleep(10)
+        get_image(base_url, index)
+
+
+if __name__ == '__main__':
+    num = 0
+    # 最多开启5个线程
+    semaphore = threading.BoundedSemaphore(5)
+    lst_record_threads = []
+    for index in range(1, 525):
+        semaphore.acquire()
+        t = threading.Thread(target=get_image, args=(
+            f"https://vol.moe/l/all,all,all,sortpoint,all,all,BL/{index}.htm", index))
+        t.start()
+        lst_record_threads.append(t)
+
+    for rt in lst_record_threads:
+        rt.join()