# Recovered from a whitespace-collapsed git patch: commit
# 75a3dd6e452b8c71300bf21b6975f701a5002644 (2021-06-10, subject
# "Ultraman scraper"), which creates NO4/index.py.
import re
import time

import requests

# User-Agent header sent with the requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
}
# Stores the paths that raised errors, as a safeguard against failed crawls.
error_list = []


# Crawler entry point
def run():
    """Fetch the Ultraman index page and hand its HTML to the list parser.

    Only the network call is guarded: connection errors, timeouts and HTTP
    error statuses are reported and turned into a ``None`` result. Errors
    raised while parsing are no longer mislabelled as request errors.

    Returns:
        Whatever ``get_detail_list`` returns for the page HTML, or ``None``
        when the index page could not be fetched.
    """
    url = "http://www.ultramanclub.com/allultraman/"
    try:
        # The site responds slowly, so a timeout is required.
        res = requests.get(url=url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        res.raise_for_status()
    except requests.RequestException as e:
        print("请求异常", e)
        return None

    # Decode the body as gb2312 instead of relying on the detected encoding.
    res.encoding = "gb2312"
    return get_detail_list(res.text)


# NOTE(review): get_detail_list(html) ("get every Ultraman detail page") begins
# right after run() in the original, but its body is cut off in this view (it
# stops at `start_index = '`), so it is not reproduced here. Restore it from
# the full patch before running this module.