diff --git "a/\346\241\210\344\276\2135/demo.py" "b/\346\241\210\344\276\2135/demo.py" new file mode 100644 index 0000000000000000000000000000000000000000..65e159bd668687e7ff14361be94f15e3efdc8332 --- /dev/null +++ "b/\346\241\210\344\276\2135/demo.py" @@ -0,0 +1,210 @@ +import http_help as hh +import re +import threading +import time +import os +import requests + +urls_lock = threading.Lock() # url操作锁 +imgs_lock = threading.Lock() # 图片操作锁 + +imgs_start_urls = [] + + +class Consumer(threading.Thread): + + def __init__(self): + + threading.Thread.__init__(self) + + self.__headers = {"Referer": "http://www.jj20.com/bz/ktmh", + "Host": "www.jj20.com"} + + self.__res = hh.R(headers=self.__headers) + + def download_img(self, filder, img_down_url, filename): + file_path = "./downs/{}".format(filder) + + if not os.path.exists(file_path): + os.mkdir(file_path) # 创建目录 + + if os.path.exists("./downs/{}/{}".format(filder, filename)): + return + else: + + try: + # 由于图片比较大,请求时间调整到10秒 + img = requests.get(img_down_url, headers={ + "Host": "img.jj20.com"}, timeout=10) + except Exception as e: + print(e) + + print("{}写入图片".format(img_down_url)) + try: + with open("./downs/{}/{}".format(filder, filename), "wb+") as f: + f.write(img.content) + except Exception as e: + print(e) + return + + def run(self): + + while True: + global imgs_start_urls, imgs_lock + + if len(imgs_start_urls) > 0: + if imgs_lock.acquire(): # 锁定 + img_url = imgs_start_urls[0] # 获取到链接之后 + del imgs_start_urls[0] # 删掉第0项 + imgs_lock.release() # 解锁 + else: + continue + + # print("图片开始下载") + img_url = "http://www.jj20.com"+img_url[0] + title = img_url[1] + start_index = 1 + base_url = img_url[0:img_url.rindex(".")] + + while True: + if start_index > 1: + img_url = "{}_{}.html".format(base_url, start_index) + + content = self.__res.get_content(img_url, charset="gb2312") + + if content is not None: + + pattern = re.compile("") + + img_down_url = pattern.search(content) # 获取到了图片地址 + + if img_down_url is not None: + filder = title + img_down_url = "http://www.jj20.com" + \ + img_down_url.group(1) + + filename = img_down_url[img_down_url.rindex("/")+1:] + + self.download_img( + filder, img_down_url, filename) # 下载图片 + + else: + print("-"*100) + print(content) + break # 终止循环体 + + else: + print("{}链接加载失败".format(img_url)) + + if imgs_lock.acquire(): # 锁定 + imgs_start_urls.append(img_url) + imgs_lock.release() # 解锁 + + start_index += 1 + # time.sleep(3) + + +class Product(threading.Thread): + + def __init__(self, urls): + threading.Thread.__init__(self) + self.__urls = urls + self.__headers = {"Referer": "http://www.jj20.com/bz/ktmh", + "Host": "www.jj20.com" + } + + self.__res = hh.R(headers=self.__headers) + + def add_fail_url(self, url): + + print("{}该URL抓取失败".format(url)) + global urls_lock + if urls_lock.acquire(): + self.__urls.insert(0, url) + urls_lock.release() # 解锁 + + def run(self): + print("*"*100) + while True: + global urls_lock, imgs_start_urls + if len(self.__urls) > 0: + if urls_lock.acquire(): # 锁定 + last_url = self.__urls.pop() + urls_lock.release() # 解锁 + + print("正在操作{}".format(last_url)) + + content = self.__res.get_content(last_url, "gb2312") + if content is not None: + html = self.get_page_list(content) + + if len(html) == 0: + self.add_fail_url(last_url) + else: + if imgs_lock.acquire(): + imgs_start_urls.extend(html) + imgs_lock.release() + + time.sleep(5) + else: + self.add_fail_url(last_url) + + else: + print("所有链接已经运行完毕") + break + + def get_page_list(self, content): + + pattern = re.compile( + '(.*?)') + list_page = 
+class Product(threading.Thread):
+
+    def __init__(self, urls):
+        threading.Thread.__init__(self)
+        self.__urls = urls
+        self.__headers = {"Referer": "http://www.jj20.com/bz/ktmh",
+                          "Host": "www.jj20.com"}
+        self.__res = hh.R(headers=self.__headers)
+
+    def add_fail_url(self, url):
+        print("{} could not be scraped".format(url))
+        global urls_lock
+        with urls_lock:  # push the failed URL back to the front
+            self.__urls.insert(0, url)
+
+    def run(self):
+        print("*" * 100)
+        while True:
+            global urls_lock, imgs_start_urls
+            if len(self.__urls) > 0:
+                with urls_lock:
+                    last_url = self.__urls.pop()
+
+                print("processing {}".format(last_url))
+
+                content = self.__res.get_content(last_url, "gb2312")
+                if content is not None:
+                    html = self.get_page_list(content)
+
+                    if len(html) == 0:
+                        self.add_fail_url(last_url)
+                    else:
+                        with imgs_lock:
+                            imgs_start_urls.extend(html)
+
+                    time.sleep(5)
+                else:
+                    self.add_fail_url(last_url)
+            else:
+                print("all list pages have been processed")
+                break
+
+    def get_page_list(self, content):
+        # NOTE: the original expression was lost with the embedded HTML;
+        # reconstructed from context: findall must yield (link, title) tuples
+        pattern = re.compile(
+            r'<a href="(/bz/ktmh/.*?\.html)" target="_blank">(.*?)</a>')
+        list_page = re.findall(pattern, content)
+
+        return list_page
+
+
+class ImageList():
+    def __init__(self):
+        self.__start = "http://www.jj20.com/bz/ktmh/list_16_{}.html"  # URL template
+        self.__headers = {"Referer": "http://www.jj20.com/bz/ktmh",
+                          "Host": "www.jj20.com"}
+        self.__res = hh.R(headers=self.__headers)  # shared request helper
+
+    def run(self):
+        page_count = 43  # int(self.get_page_count())
+
+        if page_count == 0:
+            return
+        urls = [self.__start.format(i) for i in range(1, page_count)]
+        print(urls)
+        return urls
+
+    # deprecated: the total page count is read off the site by eye instead
+    def get_page_count(self):
+        content = self.__res.get_content(self.__start.format("1"), "gb2312")
+        # NOTE: the original expression was lost with the embedded HTML;
+        # reconstructed from context: it captures the number of the last page
+        # from the "末页" (last page) link
+        pattern = re.compile(r"list_16_(\d+)\.html'>末页")
+
+        search_text = pattern.search(content)
+
+        if search_text is not None:
+            count = search_text.group(1)
+            return count
+        else:
+            return 0
+
+
+if __name__ == '__main__':
+
+    img = ImageList()
+    urls = img.run()
+
+    for i in range(1, 2):
+        p = Product(urls)
+        p.start()
+
+    for i in range(1, 2):
+        c = Consumer()
+        c.start()
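Note on the design above: demo.py hands work from Product to Consumer through a plain list guarded by explicit Locks. The standard library's queue.Queue provides the same handoff with blocking semantics and no manual locking; a minimal sketch of that alternative (illustrative only, not part of this commit):

    import queue
    import threading

    tasks = queue.Queue()

    def producer():
        # stand-in for Product.run: push (relative link, title) tuples
        for n in range(5):
            tasks.put(("/bz/ktmh/{}.html".format(n), "title-{}".format(n)))
        tasks.put(None)  # sentinel: tell the consumer to stop

    def consumer():
        # stand-in for Consumer.run: get() blocks until an item arrives
        while True:
            item = tasks.get()
            if item is None:
                break
            link, title = item
            print("would download", link, "into", title)

    threading.Thread(target=producer).start()
    threading.Thread(target=consumer).start()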
  • ") + + search_text = pattern.search(content) + + if search_text is not None: + count = search_text.group(1) + return count + else: + return 0 + + +if __name__ == '__main__': + + img = ImageList() + urls = img.run() + + for i in range(1, 2): + p = Product(urls) + p.start() + + for i in range(1, 2): + c = Consumer() + c.start() diff --git "a/\346\241\210\344\276\2135/downs/\345\233\276\347\211\207\344\270\213\350\275\275\347\233\256\345\275\225.txt" "b/\346\241\210\344\276\2135/downs/\345\233\276\347\211\207\344\270\213\350\275\275\347\233\256\345\275\225.txt" new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git "a/\346\241\210\344\276\2135/http_help.py" "b/\346\241\210\344\276\2135/http_help.py" new file mode 100644 index 0000000000000000000000000000000000000000..89c3ce17a10e9394ae1cb8dac70ab441d6414f81 --- /dev/null +++ "b/\346\241\210\344\276\2135/http_help.py" @@ -0,0 +1,67 @@ +import requests +from retrying import retry +import random +import datetime + +class R: + # 类的初始化方法 + def __init__(self,method="get",params=None,headers=None,cookies=None): + self.__method = method + myheaders = self.get_headers() + if headers is not None: + myheaders.update(headers) + self.__headers = myheaders + self.__cookies = cookies + self.__params = params + + + def get_headers(self): + user_agent_list = [ \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" \ + "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \ + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \ + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \ + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \ + "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" + ] + UserAgent = random.choice(user_agent_list) + headers = {'User-Agent': UserAgent} + return headers + + + @retry(stop_max_attempt_number=3) + def 
+    @retry(stop_max_attempt_number=3)
+    def __retrying_requests(self, url):
+        # give up after three attempts; the callers turn failures into None
+        if self.__method == "get":
+            response = requests.get(url, headers=self.__headers, cookies=self.__cookies, timeout=3)
+        else:
+            response = requests.post(url, params=self.__params, headers=self.__headers, cookies=self.__cookies, timeout=3)
+        return response.content
+
+    # fetch a page and decode it with the given charset
+    def get_content(self, url, charset="utf-8"):
+        try:
+            html_str = self.__retrying_requests(url).decode(charset)
+        except Exception:
+            html_str = None
+        return html_str
+
+    # fetch raw bytes, e.g. an image file
+    def get_file(self, file_url):
+        try:
+            file = self.__retrying_requests(file_url)
+        except Exception:
+            file = None
+        return file
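For reference, a minimal usage sketch of the http_help.R helper (assumes both files sit in the same directory; the list URL is the one demo.py starts from, and the image URL is purely hypothetical):

    import http_help as hh

    res = hh.R(headers={"Host": "www.jj20.com"})

    # pages on this site are gb2312-encoded, so pass the charset explicitly
    html = res.get_content("http://www.jj20.com/bz/ktmh/list_16_1.html", charset="gb2312")
    print(html is not None)

    # binary payloads such as images go through get_file (hypothetical URL)
    data = res.get_file("http://img.jj20.com/up/example.jpg")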