From d46883c3279c070c356a10b6382209c28363be81 Mon Sep 17 00:00:00 2001
From: wizardforcel <562826179@qq.com>
Date: Sat, 20 Aug 2022 21:24:58 +0800
Subject: [PATCH] 2022-08-20 21:24:58

---
 EpubCrawler/__main__.py     |  34 ++------
 EpubCrawler/config.py       |   2 +
 EpubCrawler/get_article.py  |  33 ++++++++
 EpubCrawler/sele_crawler.py | 160 ++++++++++++++++++++++++++++++++++++
 EpubCrawler/util.py         |   2 +
 requirements.txt            |   3 +-
 6 files changed, 204 insertions(+), 30 deletions(-)
 create mode 100644 EpubCrawler/get_article.py
 create mode 100644 EpubCrawler/sele_crawler.py

diff --git a/EpubCrawler/__main__.py b/EpubCrawler/__main__.py
index 7ae6fdb..af81312 100644
--- a/EpubCrawler/__main__.py
+++ b/EpubCrawler/__main__.py
@@ -16,6 +16,8 @@ from . import *
 from .util import *
 from .img import *
 from .config import config
+from .sele_crawler import crawl_sele
+from .get_article import get_article
 
 def get_toc_from_cfg():
     if config['list'] and len(config['list']) > 0:
@@ -63,35 +65,7 @@ def get_toc(html, base):
         res.append(url)
     return res
-
-def get_article(html, url):
-    # strip the XML declaration and namespaces up front
-    html = re.sub(r'<\?xml[^>]*\?>', '', html)
-    html = re.sub(r'xmlns=".+?"', '', html)
-    root = pq(html)
-
-    if config['remove']:
-        root(config['remove']).remove()
-
-    el_title = root(config['title']).eq(0)
-    title = el_title.text().strip()
-    el_title.remove()
-
-    if config['content']:
-        el_co = root(config['content'])
-        co = '\r\n'.join([
-            el_co.eq(i).html()
-            for i in range(len(el_co))
-        ])
-    else:
-        co = Document(str(root)).summary()
-        co = pq(co).find('body').html()
-
-    if config['credit']:
-        credit = f"<blockquote>原文：<a href='{url}'>{url}</a></blockquote>"
-        co = credit + co
-
-    return {'title': title, 'content': co}
+    
 
 def tr_download_page_safe(url, art, imgs):
     try:
@@ -171,6 +145,8 @@ def main():
     user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
     update_config(user_cfg)
 
+    if config.get('selenium'): return crawl_sele(args)
+    
     toc = get_toc_from_cfg()
     articles = []
     imgs = {}
diff --git a/EpubCrawler/config.py b/EpubCrawler/config.py
index 0fab588..be21cfe 100644
--- a/EpubCrawler/config.py
+++ b/EpubCrawler/config.py
@@ -27,4 +27,6 @@ config = {
     'external': None,
     'checkStatus': False,
     'cache': True,
+    'waitContent': False,
+    'debug': False,
 }
\ No newline at end of file
diff --git a/EpubCrawler/get_article.py b/EpubCrawler/get_article.py
new file mode 100644
index 0000000..0463f10
--- /dev/null
+++ b/EpubCrawler/get_article.py
@@ -0,0 +1,33 @@
+import re
+from pyquery import PyQuery as pq
+from .config import config
+from readability import Document
+
+def get_article(html, url):
+    # strip the XML declaration and namespaces up front
+    html = re.sub(r'<\?xml[^>]*\?>', '', html)
+    html = re.sub(r'xmlns=".+?"', '', html)
+    root = pq(html)
+
+    if config['remove']:
+        root(config['remove']).remove()
+
+    el_title = root(config['title']).eq(0)
+    title = el_title.text().strip()
+    el_title.remove()
+
+    if config['content']:
+        el_co = root(config['content'])
+        co = '\r\n'.join([
+            el_co.eq(i).html()
+            for i in range(len(el_co))
+        ])
+    else:
+        co = Document(str(root)).summary()
+        co = pq(co).find('body').html()
+
+    if config['credit']:
+        credit = f"<blockquote>原文：<a href='{url}'>{url}</a></blockquote>"
+        co = credit + co
+
+    return {'title': title, 'content': co}
\ No newline at end of file
diff --git a/EpubCrawler/sele_crawler.py b/EpubCrawler/sele_crawler.py
new file mode 100644
index 0000000..52aaf3c
--- /dev/null
+++ b/EpubCrawler/sele_crawler.py
@@ -0,0 +1,160 @@
+from pyquery import PyQuery as pq
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.wait import WebDriverWait
+from GenEpub import gen_epub
+from urllib.parse import urljoin
+import sys
+import json
+import re
+import hashlib
+import base64
+import time
+from concurrent.futures import ThreadPoolExecutor
+import threading
+import traceback
+from .util import *
+from .img import process_img
+from .config import config
+from .get_article import get_article
+
+trlocal = threading.local()  # one WebDriver per worker thread
+drivers = []                 # all drivers ever created, for final cleanup
+
+JS_GET_IMG_B64 = '''
+function getImageBase64(img_stor) {
+    var img = document.querySelector(img_stor)
+    if (!img) return ''
+    var canvas = document.createElement("canvas");
+    canvas.width = img.width;
+    canvas.height = img.height;
+    var ctx = canvas.getContext("2d");
+    ctx.drawImage(img, 0, 0, img.width, img.height);
+    var dataURL = canvas.toDataURL("image/png");
+    return dataURL;
+}
+'''
+
+'''
+def get_img_src(el_img):
+    url = ''
+    for prop in config['imgSrc']:
+        url = el_img.attr(prop)
+        if url: break
+    return url
+
+
+def process_img_data_url(url, el_img, imgs, **kw):
+    if not re.search(RE_DATA_URL, url):
+        return False
+    picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
+    print(f'pic: {url} => {picname}')
+    if picname not in imgs:
+        enco_data = re.sub(RE_DATA_URL, '', url)
+        data = base64.b64decode(enco_data.encode('utf-8'))
+        data = opti_img(data, config['optiMode'], config['colors'])
+        imgs[picname] = data
+    el_img.attr('src', kw['img_prefix'] + picname)
+    return True
+
+def process_img(driver, html, imgs, **kw):
+    kw.setdefault('img_prefix', 'img/')
+
+    root = pq(html)
+    el_imgs = root('img')
+
+    for i in range(len(el_imgs)):
+        el_img = el_imgs.eq(i)
+        url = get_img_src(el_img)
+        if not url: continue
+        if process_img_data_url(url, el_img, imgs, **kw):
+            continue
+        if not url.startswith('http'):
+            if kw.get('page_url'):
+                url = urljoin(kw.get('page_url'), url)
+            else: continue
+
+        picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
+        print(f'pic: {url} => {picname}')
+        if picname not in imgs:
+            try:
+                driver.get(url)
+                b64 = driver.execute_script(
+                    JS_GET_IMG_B64 + '\nreturn getImageBase64("body>img")')
+                print(b64[:100])
+                process_img_data_url(b64, el_img, imgs, **kw)
+                time.sleep(config['wait'])
+            except Exception as ex: print(ex)
+
+    return root.html()
+'''
+
+def wait_content_cb(driver):
+    return driver.execute_script('''
+        var titlePresent = document.querySelector(arguments[0]) != null
+        var contPresent = document.querySelector(arguments[1]) != null
+        return titlePresent && contPresent
+    ''', config['title'], config['content'])
+
+def download_page(url, art, imgs):
+    print(url)
+
+    if not hasattr(trlocal, 'driver'):
+        trlocal.driver = create_driver()
+        drivers.append(trlocal.driver)
+    driver = trlocal.driver
+
+    if not re.search(r'^https?://', url):
+        art.update({'title': url, 'content': ''})
+        return
+    driver.get(url)
+    # explicit wait until the title and content selectors are both present
+    if config['waitContent']:
+        WebDriverWait(driver, config['waitContent'], 0.5) \
+            .until(wait_content_cb, "无法获取标题或内容")
+    html = driver.find_element('css selector', 'body').get_attribute('outerHTML')
+    art.update(get_article(html, url))
+    art['content'] = process_img(art['content'], 
+        imgs, page_url=url, img_prefix='../Images/')
+    time.sleep(config['wait'])
+
+def download_page_safe(url, art, imgs):
+    try: download_page(url, art, imgs)
+    except: traceback.print_exc()
+
+def create_driver():
+    options = Options()
+    if not config['debug']:
+        options.add_argument('--headless')
+        options.add_argument('--disable-gpu')
+    options.add_argument('--log-level=3')
+    driver = webdriver.Chrome(options=options)
+    driver.get(config['url'])  # first load, so cookies can be attached to this domain
+
+    for kv in config.get('headers', {}).get('Cookie', '').split('; '):
+        kv = kv.split('=')
+        if len(kv) < 2: continue
+        driver.add_cookie({'name': kv[0], 'value': kv[1]})
+    driver.get(config['url'])  # reload, now with cookies applied
+
+    return driver
+
+def crawl_sele(args):
+    articles = [{
+        'title': config['name'],
+        'content': f"<blockquote>来源：<a href='{config['url']}'>{config['url']}</a></blockquote>"
+    }]
+    imgs = {}
+    pool = ThreadPoolExecutor(config['textThreads'])
+    hdls = []
+    for url in config['list']:
+        art = {}
+        articles.append(art)
+        h = pool.submit(download_page_safe, url, art, imgs)
+        hdls.append(h)
+        # download_page_safe(driver, url, articles, imgs)
+    for h in hdls: h.result()
+
+    articles = [art for art in articles if art]
+    gen_epub(articles, imgs)
+
+    for d in drivers: d.quit()
diff --git a/EpubCrawler/util.py b/EpubCrawler/util.py
index 424e130..a78fe20 100644
--- a/EpubCrawler/util.py
+++ b/EpubCrawler/util.py
@@ -12,6 +12,8 @@ import uuid
 import tempfile
 import json
 
+RE_DATA_URL = r'^data:image/\w+;base64,'
+
 bundle_dir = tempfile.gettempdir()
 cache_dir = 'epubcralwer'
 
diff --git a/requirements.txt b/requirements.txt
index c87ad7b..6e2b53a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ requests
 pyquery
 GenEpub
 imgyaso
-readability-lxml
\ No newline at end of file
+readability-lxml
+selenium
\ No newline at end of file
-- 
GitLab