From a30c1d3cb212708ac7e2ba2062f2b81555d971cd Mon Sep 17 00:00:00 2001
From: hihell
Date: Sun, 5 Sep 2021 21:40:32 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A4=96=E5=9B=BD=E7=BD=91=E7=AB=99=E6=8E=92?=
 =?UTF-8?q?=E8=A1=8C=E6=A6=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...77\347\250\213\347\211\210\346\234\254.py" | 22 ++++++
 ...77\347\250\213\347\211\210\346\234\254.py" | 79 +++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 "NO30/\345\215\225\347\272\277\347\250\213\347\211\210\346\234\254.py"
 create mode 100644 "NO30/\345\244\232\347\272\277\347\250\213\347\211\210\346\234\254.py"

diff --git "a/NO30/\345\215\225\347\272\277\347\250\213\347\211\210\346\234\254.py" "b/NO30/\345\215\225\347\272\277\347\250\213\347\211\210\346\234\254.py"
new file mode 100644
index 0000000..3a1fe92
--- /dev/null
+++ "b/NO30/\345\215\225\347\272\277\347\250\213\347\211\210\346\234\254.py"
@@ -0,0 +1,22 @@
+"""Single-threaded scraper for the world68.com five-star site ranking.
+
+Prompts for the number of listing pages, fetches each page in turn and
+appends one "name,url" line per link found to webs1.txt.
+"""
+from requests_html import HTMLSession
+
+session = HTMLSession()
+
+page_size = int(input("请输入总页码:"))
+# Open the output file once instead of re-opening it for every row.
+with open('webs1.txt', "a+", encoding="utf-8") as f:
+    for page in range(1, page_size + 1):
+        # The timeout keeps one stalled request from hanging the whole run.
+        world = session.get(
+            f'http://www.world68.com/top.asp?t=5star&page={page}', timeout=10)
+        # Decode the page as GB2312; set before .html is first accessed.
+        world.encoding = 'gb2312'
+        print("正在采集数据", world.url)
+        # Every anchor matched by the selector yields one output row.
+        for item in world.html.find('dl>dt>a'):
+            f.write(f"{item.text},{item.attrs['href']}\n")
diff --git "a/NO30/\345\244\232\347\272\277\347\250\213\347\211\210\346\234\254.py" "b/NO30/\345\244\232\347\272\277\347\250\213\347\211\210\346\234\254.py"
new file mode 100644
index 0000000..fdc345e
--- /dev/null
+++ "b/NO30/\345\244\232\347\272\277\347\250\213\347\211\210\346\234\254.py"
@@ -0,0 +1,79 @@
+"""Multi-threaded scraper for the world68.com five-star site ranking.
+
+Worker threads share a global page counter guarded by a lock; each worker
+claims the next page number, fetches that page and appends its "name,url"
+rows to thread_webs.txt.
+"""
+import requests_html
+import threading
+import time
+import fcntl
+
+
+class MyThread(threading.Thread):
+    """Worker that claims and scrapes pages until none are left.
+
+    Relies on the module globals ``page`` (last page number claimed),
+    ``page_size`` (last page to fetch) and ``lock`` (guards ``page``).
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def run(self):
+        """Claim page numbers one at a time and scrape each claimed page."""
+        global page, lock, page_size
+        # NOTE(review): presumably meant to make requests_html decode pages as
+        # GB18030 -- confirm that this module-level override takes effect.
+        requests_html.DEFAULT_ENCODING = "gb18030"
+        # One session per worker, reused for every page the worker fetches.
+        session = requests_html.HTMLSession()
+        while True:
+            # Claim the next page while holding the lock and keep it in a
+            # local: another worker may advance the shared counter as soon as
+            # the lock is released, so ``page`` must not be re-read below.
+            with lock:
+                if page >= page_size:
+                    break
+                page += 1
+                current_page = page
+
+            print("正在采集第{}页".format(current_page), "*" * 50)
+            page_url = f'http://www.world68.com/top.asp?t=5star&page={current_page}'
+            try:
+                world = session.get(page_url, timeout=10)
+                print("正在采集数据", world.url)
+                title_a = world.html.find('dl>dt>a')
+                print(title_a)
+                # One "name,url" line per link found on the page.
+                my_str = "".join(
+                    f"{item.text},{item.attrs['href']}\n" for item in title_a)
+
+                with open('thread_webs.txt', "a+", encoding="utf-8") as f:
+                    # Take an exclusive lock on the file before writing; it is
+                    # released when the file is closed.
+                    fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                    f.write(my_str)
+
+            except Exception as e:
+                # Best effort: report the failed page and carry on.
+                print(e, page_url)
+
+
+if "__main__" == __name__:
+    page_size = int(input("请输入总页码:"))
+    page = 0
+
+    # Record the start time.
+    start = time.perf_counter()
+
+    lock = threading.Lock()
+    # range(1, 5) yields four worker threads.
+    thread_list = [MyThread() for _ in range(1, 5)]
+    for t in thread_list:
+        t.start()
+    for t in thread_list:
+        t.join()
+    # Compute the elapsed time.
+    elapsed = (time.perf_counter() - start)
+    print("程序运行完毕,总耗时为:", elapsed)
-- 
GitLab