"""Crawl an article chain on www.keaitupian.net and save one image per page.

Recovered from the ``NO7/index.py`` file created by git patch
24c753c52986d401d875644cc299b79f31818a10 (the patch text had been collapsed
onto two lines, so the layout below is a reconstruction).

Two threads cooperate through the module-level ``urls`` list, guarded by
``mutex``:

* ``get_image`` follows "next page" links starting from a given article URL
  and appends every page URL to ``urls``.
* ``save_image`` pops page URLs from ``urls``, looks up the image address in
  each page and writes the image bytes under ``images/``.
"""
import os
import re
import threading
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}

# Global list of page URLs waiting to be processed by save_image().
urls = []

# Guards every access to ``urls``.
mutex = threading.Lock()

# NOTE(review): both patterns are empty strings in this copy of the source;
# their contents were presumably lost (they look like stripped HTML markup).
# Each must be restored with exactly one capture group: the "next page" href
# for NEXT_PAGE_PATTERN and the image address for IMAGE_PATTERN.  While they
# stay empty, the crawler stops after the start URL and no image is saved,
# instead of crashing with IndexError on ``group(1)``.
NEXT_PAGE_PATTERN = re.compile('')
IMAGE_PATTERN = re.compile('')

# Seconds to wait for any single HTTP request before giving up on it.
REQUEST_TIMEOUT = 10

# Seconds save_image() sleeps when the queue is empty, so it does not spin.
IDLE_SLEEP = 1

# Set by get_image() when it stops, so save_image() knows no more URLs will
# arrive.  Assumes a single get_image() thread, as started in __main__.
_crawl_finished = threading.Event()


def _first_group(pattern, text):
    """Return capture group 1 of the first match of *pattern* in *text*.

    Returns None when the pattern has no capture group or does not match.
    """
    if pattern.groups < 1:
        return None
    found = pattern.search(text)
    return found.group(1) if found else None


# Collect page URLs by following "next page" links.
def get_image(start_url):
    """Queue *start_url* and every page reachable through "next" links.

    Each URL is appended to the global ``urls`` list while holding ``mutex``.
    The walk stops when the link is the ``"#"`` sentinel, when a page has no
    next link, or when a request fails (the error is printed).

    Args:
        start_url: Address of the first article page.
    """
    try:
        with mutex:
            urls.append(start_url)
        next_url = start_url
        while next_url != "#":
            try:
                res = requests.get(
                    url=next_url, headers=headers, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(e)
                break

            link = _first_group(NEXT_PAGE_PATTERN, res.text)
            # Test the sentinel BEFORE prefixing the site root: once "#" has
            # been turned into an absolute URL it can never equal "#" again.
            if not link or link == "#":
                break
            if 'www.keaitupian' not in link:
                link = f"https://www.keaitupian.net{link}"
            next_url = link
            print(next_url)
            with mutex:
                urls.append(next_url)
    finally:
        # Always signal completion, even if the crawl ended on an error.
        _crawl_finished.set()


# Image-saving worker.
def save_image():
    """Download the image referenced by each queued page into ``images/``.

    Pops page URLs from the global ``urls`` list under ``mutex``.  When the
    list is empty it sleeps and retries, and returns once ``get_image`` has
    finished and nothing is left.  Files are named after the current
    timestamp with a ``.png`` suffix.  Request and file errors are printed
    and the worker moves on to the next URL.
    """
    print(urls)
    os.makedirs("images", exist_ok=True)

    while True:
        # The lock is held only while touching the list, and is released on
        # every path; holding it across the empty case would deadlock.
        with mutex:
            img_url = urls.pop(0) if urls else None
            # Read inside the lock: appends need the lock and happen before
            # the flag is set, so "empty and finished" here is final.
            finished = _crawl_finished.is_set()

        if img_url is None:
            if finished:
                return
            print("等待中,长时间等待,可以直接关闭")
            time.sleep(IDLE_SLEEP)
            continue

        try:
            res = requests.get(
                url=img_url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            continue

        img_data_url = _first_group(IMAGE_PATTERN, res.text)
        if not img_data_url:
            continue

        print("抓取图片中:", img_data_url)
        try:
            res = requests.get(img_data_url, timeout=REQUEST_TIMEOUT)
            with open(f"images/{time.time()}.png", "wb") as f:
                f.write(res.content)
        except (requests.RequestException, OSError) as e:
            print(e)


if __name__ == '__main__':
    # Thread that collects page URLs.
    gets = threading.Thread(target=get_image, args=(
        "https://www.keaitupian.net/article/202389.html",))
    gets.start()

    # Thread that downloads the images.
    save = threading.Thread(target=save_image)
    save.start()