# 女生头像网采集.py — avatar-site scraper (gevent + requests + BeautifulSoup)
from gevent import monkey

# Monkey-patching must happen before any other network-related import.
monkey.patch_all()

import threading
from pathlib import Path

import gevent
import requests
from bs4 import BeautifulSoup
import lxml


def get_page(this_urls):
    """Worker loop: pop listing-page URLs from the shared list and scrape avatars.

    Args:
        this_urls: shared mutable list of listing-page URLs; each greenlet
            pops from it until the list is empty.
    """
    while True:
        # Bug fix: the shared list is drained by pop() and never becomes
        # None — stop when it is empty to avoid IndexError on pop().
        if not this_urls:
            break
        url = this_urls.pop()
        print('正在抓取:{},当前的虚拟线程为:{}'.format(url, threading.current_thread().getName()))
        res = requests.get(url=url)
        # The site serves pages in a legacy Chinese encoding.
        res.encoding = "gb2312"
        if res.status_code != 200:
            continue
        soup = BeautifulSoup(res.text, 'lxml')
        content = soup.find(attrs={'class': 'g-gxlist-imgbox'})
        if content is None:
            # Page layout changed or empty page — skip rather than crash.
            continue
        for img_tag in content.find_all('img'):
            img_src = img_tag['src']
            alt = img_tag.get('alt')
            if alt is None:
                # Bug fix: a missing 'alt' attribute raises KeyError, not the
                # OSError the original caught; skip tags with no usable name.
                continue
            # Strip characters that are illegal or awkward in file names.
            name = alt.replace('/', '').replace('+', '').replace('?', '').replace('*', '')
            save_img(img_src, name)


def save_img(img_src, name):
    """Download one image and save it as imgs/<name>.jpg.

    Args:
        img_src: absolute URL of the image.
        name: sanitized file-name stem (no extension).
    """
    res = requests.get(img_src)
    # Skip failed downloads instead of writing an HTML error page as a .jpg.
    if res.status_code != 200:
        return
    # Ensure the output directory exists so the script works on a fresh checkout.
    out_dir = Path('imgs')
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / f'{name}.jpg').write_bytes(res.content)


if __name__ == '__main__':
    # Listing pages 1..243 of the girls' avatar section.
    page_urls = [f"https://www.qqtn.com/tx/nvshengtx_{n}.html" for n in range(1, 244)]
    # Spawn five cooperative workers that share one URL queue.
    workers = [gevent.spawn(get_page, page_urls) for _ in range(5)]
    gevent.joinall(workers)

    print("爬取完毕")