Commit 546ca312 authored by 梦想橡皮擦

Case 5 retrospective

Parent 3d15bb1e
import http_help as hh
import re
import threading
import time
import os
import requests

urls_lock = threading.Lock()  # lock guarding the list-page URL list
imgs_lock = threading.Lock()  # lock guarding the image (path, title) list

imgs_start_urls = []


class Consumer(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.__headers = {"Referer": "http://www.jj20.com/bz/ktmh",
                          "Host": "www.jj20.com"}
        self.__res = hh.R(headers=self.__headers)

    def download_img(self, filder, img_down_url, filename):
        file_path = "./downs/{}".format(filder)
        if not os.path.exists(file_path):
            os.makedirs(file_path)  # create the target directory (and any parents)
        if os.path.exists("./downs/{}/{}".format(filder, filename)):
            return
        else:
            try:
                # the images are fairly large, so the request timeout is raised to 10 seconds
                img = requests.get(img_down_url, headers={
                    "Host": "img.jj20.com"}, timeout=10)
            except Exception as e:
                print(e)
                return  # the request failed, so there is nothing to write
            print("writing image {}".format(img_down_url))
            try:
                with open("./downs/{}/{}".format(filder, filename), "wb+") as f:
                    f.write(img.content)
            except Exception as e:
                print(e)
            return

    def run(self):
        while True:
            global imgs_start_urls, imgs_lock
            if len(imgs_start_urls) > 0:
                if imgs_lock.acquire():  # lock
                    img_url = imgs_start_urls[0]  # take the first (path, title) pair
                    del imgs_start_urls[0]  # remove item 0
                    imgs_lock.release()  # unlock
            else:
                continue
            # print("image download starting")
            title = img_url[1]  # keep the title before img_url is overwritten below
            img_url = "http://www.jj20.com" + img_url[0]
            start_index = 1
            base_url = img_url[0:img_url.rindex(".")]
            while True:
                if start_index > 1:
                    img_url = "{}_{}.html".format(base_url, start_index)
                content = self.__res.get_content(img_url, charset="gb2312")
                if content is not None:
                    pattern = re.compile("<script>var id='(.*?)';</script>")
                    img_down_url = pattern.search(content)  # extract the image address
                    if img_down_url is not None:
                        filder = title
                        img_down_url = "http://www.jj20.com" + \
                            img_down_url.group(1)
                        filename = img_down_url[img_down_url.rindex("/") + 1:]
                        self.download_img(
                            filder, img_down_url, filename)  # download the image
                    else:
                        print("-" * 100)
                        print(content)
                        break  # no image id found, stop paging through this set
                else:
                    print("{} failed to load".format(img_url))
                    if imgs_lock.acquire():  # lock
                        # re-queue the failed page as a (path, title) pair so it can be retried
                        imgs_start_urls.append((img_url.replace("http://www.jj20.com", ""), title))
                        imgs_lock.release()  # unlock
                start_index += 1
                # time.sleep(3)


class Product(threading.Thread):
    def __init__(self, urls):
        threading.Thread.__init__(self)
        self.__urls = urls
        self.__headers = {"Referer": "http://www.jj20.com/bz/ktmh",
                          "Host": "www.jj20.com"
                          }
        self.__res = hh.R(headers=self.__headers)

    def add_fail_url(self, url):
        print("{} failed to scrape, re-queuing".format(url))
        global urls_lock
        if urls_lock.acquire():  # lock
            self.__urls.insert(0, url)
            urls_lock.release()  # unlock

    def run(self):
        print("*" * 100)
        while True:
            global urls_lock, imgs_start_urls
            if len(self.__urls) > 0:
                if urls_lock.acquire():  # lock
                    last_url = self.__urls.pop()
                    urls_lock.release()  # unlock
                print("processing {}".format(last_url))
                content = self.__res.get_content(last_url, "gb2312")
                if content is not None:
                    html = self.get_page_list(content)
                    if len(html) == 0:
                        self.add_fail_url(last_url)
                    else:
                        if imgs_lock.acquire():  # lock
                            imgs_start_urls.extend(html)
                            imgs_lock.release()  # unlock
                    time.sleep(5)
                else:
                    self.add_fail_url(last_url)
            else:
                print("all list-page URLs have been processed")
                break

    def get_page_list(self, content):
        # each match is a (detail-page path, title) pair
        pattern = re.compile(
            '<a href="(.*?)" target="_blank"><img src=".*?" width="270" height="151" alt="(.*?)"></a>')
        list_page = re.findall(pattern, content)
        return list_page


class ImageList():
    def __init__(self):
        self.__start = "http://www.jj20.com/bz/ktmh/list_16_{}.html"  # URL template
        self.__headers = {"Referer": "http://www.jj20.com/bz/ktmh",
                          "Host": "www.jj20.com"
                          }
        self.__res = hh.R(headers=self.__headers)  # initialise the request helper

    def run(self):
        page_count = 43  # hard-coded; was int(self.get_page_count())
        if page_count == 0:
            return
        urls = [self.__start.format(i) for i in range(1, page_count)]
        print(urls)
        return urls

    # deprecated: the total page count was read off the site manually instead
    def get_page_count(self):
        content = self.__res.get_content(self.__start.format("1"), "gb2312")
        pattern = re.compile(
            r"<li><a href='list_11_(\d+?).html' target='_self'>末页</a></li>")
        search_text = pattern.search(content)
        if search_text is not None:
            count = search_text.group(1)
            return count
        else:
            return 0


if __name__ == '__main__':
    img = ImageList()
    urls = img.run()
    for i in range(1, 2):
        p = Product(urls)
        p.start()

    for i in range(1, 2):
        c = Consumer()
        c.start()
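
The producer and consumer threads above hand work to each other through the module-level imgs_start_urls list, guarded by imgs_lock. Below is a minimal, self-contained sketch of that same lock-protected handoff, with placeholder page paths instead of the real jj20.com data; the standard-library queue.Queue would give equivalent behaviour without manual locking.

# Minimal sketch of the lock-protected producer/consumer handoff used above.
# The paths and titles here are placeholders, not the real site data.
import threading
import time

task_lock = threading.Lock()   # plays the role of imgs_lock
tasks = []                     # plays the role of imgs_start_urls


def producer():
    for i in range(5):
        with task_lock:  # equivalent to acquire()/release()
            tasks.append(("/page_{}.html".format(i), "title-{}".format(i)))
        time.sleep(0.1)


def consumer():
    handled = 0
    while handled < 5:
        item = None
        with task_lock:
            if tasks:
                item = tasks.pop(0)  # same take-from-the-front behaviour as above
        if item is None:
            time.sleep(0.05)  # nothing queued yet, back off briefly
            continue
        print("consuming", item)
        handled += 1


p = threading.Thread(target=producer)
c = threading.Thread(target=consumer)
p.start(); c.start()
p.join(); c.join()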


# http_help.py: the request helper module imported above as hh
import requests
from retrying import retry
import random
import datetime


class R:
    # class initialiser
    def __init__(self, method="get", params=None, headers=None, cookies=None):
        self.__method = method
        myheaders = self.get_headers()
        if headers is not None:
            myheaders.update(headers)
        self.__headers = myheaders
        self.__cookies = cookies
        self.__params = params

    def get_headers(self):
        # pick a random User-Agent for each R instance
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        UserAgent = random.choice(user_agent_list)
        headers = {'User-Agent': UserAgent}
        return headers

    # retry a failed request up to 3 times before giving up
    @retry(stop_max_attempt_number=3)
    def __retrying_requests(self, url):
        if self.__method == "get":
            response = requests.get(url, headers=self.__headers, cookies=self.__cookies, timeout=3)
        else:
            response = requests.post(url, params=self.__params, headers=self.__headers, cookies=self.__cookies, timeout=3)
        return response.content

    # GET the page and decode it with the given charset; returns None on failure
    def get_content(self, url, charset="utf-8"):
        try:
            html_str = self.__retrying_requests(url).decode(charset)
        except:
            html_str = None
        return html_str

    # fetch raw bytes (e.g. an image file); returns None on failure
    def get_file(self, file_url):
        try:
            file = self.__retrying_requests(file_url)
        except:
            file = None
        return file
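
For reference, this is roughly how the scraper above drives the helper. The list-page URL comes from the ImageList template and is shown here only as an example; the site's pages are GB2312-encoded, hence the explicit charset.

# Illustrative usage of the R helper, mirroring how Product and Consumer call it above.
import http_help as hh

res = hh.R(headers={"Referer": "http://www.jj20.com/bz/ktmh",
                    "Host": "www.jj20.com"})
content = res.get_content("http://www.jj20.com/bz/ktmh/list_16_1.html", charset="gb2312")
if content is None:
    print("request failed after 3 retries")
else:
    print("page length:", len(content))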