# Recovered from patch 5f0d52450f4db91510f8853adf1efc2f281144ea (hihell, 2021-07-17,
# subject: "案例2修复装修美图" — case 2, fixed version, decoration photos).
"""Multi-threaded crawler for decoration-photo albums on home.fang.com.

Pipeline — three thread groups sharing module-level lists, all guarded by a
single lock (``g_lock``):

1. ``Producer``  pops list-page URLs from ``all_urls`` and pushes detail-page
   links onto ``all_img_urls``.
2. ``Consumer``  pops detail-page URLs from ``all_img_urls``, scrapes the album
   title and image sources, and pushes ``{title: [img_src, ...]}`` dicts onto
   ``pic_links``.
3. ``DownPic``   polls ``pic_links`` and saves every image into a directory
   named after its album title.

NOTE(review): two ``re.findall`` patterns below are empty placeholders — the
original author's comment says they still "need rewriting".  ``re.findall('')``
matches at every character position and returns empty strings, so real
patterns must be supplied before production use.
"""

import os         # directory creation / existence checks
import re         # title and link extraction
import threading  # worker threads + shared lock
import time       # polite crawl delays

import requests   # third-party HTTP client (existing project dependency)

# Detail-page URLs harvested from the list pages.
all_img_urls = []
# Single lock guarding every shared list in this module.
g_lock = threading.Lock()
# Pre-built list-page URLs waiting to be crawled.
all_urls = []
# Completed scrape results: [{title: [image URL, ...]}, ...].
pic_links = []


class Spider():
    """Builds the list-page URLs that the Producer threads will crawl."""

    def __init__(self, target_url, headers):
        # target_url is a ``%d`` template, e.g. 'https://home.fang.com/album/s24/%d/'
        self.target_url = target_url
        self.headers = headers

    def getUrls(self, start_page, page_num):
        """Append pages ``start_page`` .. ``page_num`` (inclusive) to ``all_urls``."""
        global all_urls
        for i in range(start_page, page_num + 1):
            all_urls.append(self.target_url % i)


class Producer(threading.Thread):
    """Drains ``all_urls``, producing detail-page links into ``all_img_urls``."""

    def run(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'HOST': 'home.fang.com'
        }
        global all_urls, all_img_urls
        while True:
            # Check-and-pop atomically.  The original tested len(all_urls)
            # OUTSIDE the lock and popped inside it, so two threads could race
            # for the last element and one would pop from an empty list.
            with g_lock:
                if not all_urls:
                    break
                page_url = all_urls.pop()
            try:
                print("分析" + page_url)
                response = requests.get(page_url, headers=headers, timeout=3)
                # NOTE(review): placeholder pattern — see module docstring.
                all_pic_link = re.findall(
                    '', response.text)
                with g_lock:
                    # += extends the shared list in place; the scraped links are
                    # site-relative and must be completed with the host prefix.
                    all_img_urls += [
                        f"https://home.fang.com{link}" for link in all_pic_link]
                time.sleep(0.5)  # be polite to the server
            except requests.RequestException as e:
                # Narrowed from a bare ``except: pass`` that silently swallowed
                # every error (including the pop race above); still best-effort.
                print(e)


class Consumer(threading.Thread):
    """Drains ``all_img_urls``, producing ``{title: [img_src, ...]}`` into ``pic_links``."""

    def run(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'HOST': 'home.fang.com'
        }
        global all_img_urls, pic_links
        # The original printed the bound function object (missing call
        # parentheses); call it so the running thread is actually shown.
        print("%s is running " % threading.current_thread())
        while True:
            # Atomic check-and-pop, same race fix as in Producer.run.
            with g_lock:
                if not all_img_urls:
                    break
                img_url = all_img_urls.pop()
            try:
                response = requests.get(img_url, headers=headers)
                # Pages are UTF-8; set it explicitly before regex extraction.
                response.encoding = 'utf-8'
                title = re.search(
                    '(.*?)-房天下家居装修网', response.text).group(1)
                # NOTE(review): placeholder pattern — see module docstring.
                all_pic_src = re.findall(
                    '', response.text)
                pic_dict = {title: all_pic_src}
                with g_lock:
                    pic_links.append(pic_dict)
                    print(title + " 获取成功")
            except Exception as e:
                # Deliberately broad best-effort: a page without the expected
                # title raises AttributeError on ``.group``; just skip it.
                print(e)
            time.sleep(0.5)


class DownPic(threading.Thread):
    """Polls ``pic_links`` forever and downloads each album into its own directory."""

    def run(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'HOST': 'imgwcs3.soufunimg.com'

        }
        global pic_links
        # Deliberate infinite monitor loop: Consumers keep appending to
        # pic_links while these threads run (original author's intent).
        while True:
            with g_lock:
                pic = pic_links.pop() if pic_links else None
            if pic is None:
                # The original ``continue``d immediately here, busy-waiting and
                # pegging a CPU core; yield briefly instead.
                time.sleep(0.2)
                continue
            for key, values in pic.items():
                # The album title doubles as the directory name.
                # NOTE(review): a title containing os.sep would silently create
                # nested directories — confirm titles are safe, or sanitize.
                path = key.rstrip("\\")
                if not os.path.exists(path):
                    os.makedirs(path)
                    print(path + '目录创建成功')
                else:
                    print(path + ' 目录已存在')
                # Renamed loop variable from ``pic``, which shadowed the dict
                # being iterated.
                for img_src in values:
                    filename = path + "/" + img_src.split('/')[-1]
                    if os.path.exists(filename):
                        continue  # already downloaded
                    try:
                        response = requests.get(img_src, headers=headers)
                        with open(filename, 'wb') as f:
                            f.write(response.content)
                    except Exception as e:
                        # Best-effort: skip this image on network or file errors.
                        print(e)


if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'HOST': 'home.fang.com'
    }
    # List-page URL template for the album index.
    target_url = 'https://home.fang.com/album/s24/%d/'

    spider = Spider(target_url, headers)
    # First 5 pages only, for testing.
    spider.getUrls(1, 5)

    # Two producer threads drain the list pages; wait for them to finish so
    # all_img_urls is fully populated before the consumers start.
    threads = []
    for _ in range(2):
        t = Producer()
        t.start()
        threads.append(t)
    for tt in threads:
        tt.join()

    print("进行到我这里了")

    # Ten consumers scrape the detail pages...
    for _ in range(10):
        Consumer().start()

    # ...and ten downloaders save the images as results arrive.
    for _ in range(10):
        DownPic().start()