diff --git "a/NO11/19LOU\347\233\270\344\272\262\347\210\254\350\231\253.py" "b/NO11/19LOU\347\233\270\344\272\262\347\210\254\350\231\253.py" new file mode 100644 index 0000000000000000000000000000000000000000..509c745fe1176c107fce204bfec0b667efd7c053 --- /dev/null +++ "b/NO11/19LOU\347\233\270\344\272\262\347\210\254\350\231\253.py" @@ -0,0 +1,53 @@ +import requests +from lxml import etree +from fake_useragent import UserAgent +import time + + +def save(src, title): + try: + res = requests.get(src) + with open(f"imgs/{title}.jpg", "wb+") as f: + f.write(res.content) + except Exception as e: + print(e) + + +def run(url): + # ua = UserAgent(cache=False) + ua = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36" + headers = { + "User-Agent": ua, + "Host": "www.19lou.com", + "Referer": "https://www.19lou.com/r/1/19lnsxq-233.html", + "Cookie": "_Z3nY0d4C_=37XgPK9h" # 从反爬代码中获取到的值 + } + try: + res = requests.get(url=url, headers=headers) + text = res.text + # 将 html 转换成 Element 对象 + html = etree.HTML(text) + # xpath 路径提取 @class 为选取 class 属性 + divs = html.xpath("//div[@class='pics']") + # print(len(divs)) + # 遍历 Elements 节点 + for div in divs: + # 提取地址,注意提取的属性为 data-src 而不是 src + src = div.xpath("./img/@data-src")[0] + # 提取标题 + title = div.xpath("./img/@alt")[0] + save(src, title) + except Exception as e: + print(e) + + +if __name__ == '__main__': + urls = ["https://www.19lou.com/r/1/19lnsxq.html"] + for i in range(114, 243): + urls.append(f"https://www.19lou.com/r/1/19lnsxq-{i}.html") + for url in urls: + print(f"正在抓取{url}") + run(url) + # time.sleep(5) + + print("全部爬取完毕")