Commit d46883c3 authored by wizardforcel

2022-08-20 21:24:58

Parent 4b1fffa0
@@ -16,6 +16,8 @@ from . import *
from .util import *
from .img import *
from .config import config
from .sele_crawler import crawl_sele
from .get_article import get_article
def get_toc_from_cfg():
    if config['list'] and len(config['list']) > 0:
@@ -63,35 +65,7 @@ def get_toc(html, base):
        res.append(url)
    return res
def get_article(html, url):
    # Strip the XML declaration and namespace attributes up front
    html = re.sub(r'<\?xml[^>]*\?>', '', html)
    html = re.sub(r'xmlns=".+?"', '', html)
    root = pq(html)
    if config['remove']:
        root(config['remove']).remove()
    el_title = root(config['title']).eq(0)
    title = el_title.text().strip()
    el_title.remove()
    if config['content']:
        el_co = root(config['content'])
        co = '\r\n'.join([
            el_co.eq(i).html()
            for i in range(len(el_co))
        ])
    else:
        co = Document(str(root)).summary()
        co = pq(co).find('body').html()
    if config['credit']:
        credit = f"<blockquote>原文:<a href='{url}'>{url}</a></blockquote>"
        co = credit + co
    return {'title': title, 'content': co}
def tr_download_page_safe(url, art, imgs):
    try:
@@ -171,6 +145,8 @@ def main():
    user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
    update_config(user_cfg)
    if config['selenium']: crawl_sele(args)
    toc = get_toc_from_cfg()
    articles = []
    imgs = {}
......
@@ -27,4 +27,6 @@ config = {
    'external': None,
    'checkStatus': False,
    'cache': True,
    'waitContent': False,
    'debug': False,
}
\ No newline at end of file
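For orientation, here is a minimal sketch of a user config that exercises the new keys. Apart from selenium, waitContent, and debug, every key and value below is illustrative; only the fact that main() loads a JSON file and passes it to update_config() comes from this diff.

# Hypothetical user config, shown as the dict main() would hold after
# json.loads(); values are placeholders, not repo defaults
user_cfg = {
    'name': 'example-book',
    'url': 'https://example.com/book',         # start page, also used for cookies
    'list': ['https://example.com/book/ch1'],  # chapter URLs to crawl
    'title': 'h1',                             # CSS selector for the title
    'content': '#content',                     # CSS selector for the body
    'selenium': True,                          # route the crawl through crawl_sele()
    'waitContent': 10,                         # WebDriverWait timeout in seconds
    'debug': False,                            # True keeps Chrome visible (no --headless)
}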
import re
from pyquery import PyQuery as pq
from .config import config
from readability import Document
def get_article(html, url):
    # Strip the XML declaration and namespace attributes up front
    html = re.sub(r'<\?xml[^>]*\?>', '', html)
    html = re.sub(r'xmlns=".+?"', '', html)
    root = pq(html)
    if config['remove']:
        root(config['remove']).remove()
    el_title = root(config['title']).eq(0)
    title = el_title.text().strip()
    el_title.remove()
    if config['content']:
        el_co = root(config['content'])
        co = '\r\n'.join([
            el_co.eq(i).html()
            for i in range(len(el_co))
        ])
    else:
        co = Document(str(root)).summary()
        co = pq(co).find('body').html()
    if config['credit']:
        credit = f"<blockquote>原文:<a href='{url}'>{url}</a></blockquote>"
        co = credit + co
    return {'title': title, 'content': co}
\ No newline at end of file
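A quick usage sketch for the relocated get_article, run in the context of this module. The sample HTML and config values are assumptions for illustration, not part of the commit.

# Hypothetical call, assuming config has been populated first
config.update({'title': 'h1', 'content': 'p', 'remove': None, 'credit': True})
html = "<html><body><h1>Ch 1</h1><p>Hello.</p></body></html>"
art = get_article(html, 'https://example.com/ch1')
# art == {'title': 'Ch 1',
#         'content': "<blockquote>原文:<a href='https://example.com/ch1'>"
#                    "https://example.com/ch1</a></blockquote>Hello."}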
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from GenEpub import gen_epub
from urllib.parse import urljoin
import sys
import json
import re
import hashlib
import base64
import time
from concurrent.futures import ThreadPoolExecutor
import threading
import traceback
from .util import *
from .img import process_img
from .config import config
from .get_article import get_article
trlocal = threading.local()
drivers = []
JS_GET_IMG_B64 = '''
function getImageBase64(img_stor) {
    var img = document.querySelector(img_stor)
    if (!img) return ''
    var canvas = document.createElement("canvas");
    canvas.width = img.width;
    canvas.height = img.height;
    var ctx = canvas.getContext("2d");
    ctx.drawImage(img, 0, 0, img.width, img.height);
    var dataURL = canvas.toDataURL("image/png");
    return dataURL;
}
'''
'''
def get_img_src(el_img):
    url = ''
    for prop in config['imgSrc']:
        url = el_img.attr(prop)
        if url: break
    return url

def process_img_data_url(url, el_img, imgs, **kw):
    if not re.search(RE_DATA_URL, url):
        return False
    picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
    print(f'pic: {url} => {picname}')
    if picname not in imgs:
        enco_data = re.sub(RE_DATA_URL, '', url)
        data = base64.b64decode(enco_data.encode('utf-8'))
        data = opti_img(data, config['optiMode'], config['colors'])
        imgs[picname] = data
    el_img.attr('src', kw['img_prefix'] + picname)
    return True

def process_img(driver, html, imgs, **kw):
    kw.setdefault('img_prefix', 'img/')
    root = pq(html)
    el_imgs = root('img')
    for i in range(len(el_imgs)):
        el_img = el_imgs.eq(i)
        url = get_img_src(el_img)
        if not url: continue
        if process_img_data_url(url, el_img, imgs, **kw):
            continue
        if not url.startswith('http'):
            if kw.get('page_url'):
                url = urljoin(kw.get('page_url'), url)
            else: continue
        picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
        print(f'pic: {url} => {picname}')
        if picname not in imgs:
            try:
                driver.get(url)
                b64 = driver.execute_script(
                    JS_GET_IMG_B64 + '\nreturn getImageBase64("body>img")')
                print(b64[:100])
                process_img_data_url(b64, el_img, imgs, **kw)
                time.sleep(config['wait'])
            except Exception as ex: print(ex)
    return root.html()
'''
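The commented-out block above preserves the pre-Selenium image pipeline. Its core step is peeling the base64 payload off a data: URL and deriving a stable file name from an md5 of the URL. A standalone sketch of just that step; the helper name is ours, not the repo's:

import re, base64, hashlib

RE_DATA_URL = r'^data:image/\w+;base64,'   # same pattern util.py defines below

def decode_data_url(url):
    # Hypothetical helper: returns (picname, raw image bytes), or None
    # when the string is not a base64 image data URL
    if not re.search(RE_DATA_URL, url):
        return None
    picname = hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'
    data = base64.b64decode(re.sub(RE_DATA_URL, '', url))
    return picname, data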
def wait_content_cb(driver):
    return driver.execute_script('''
        var titlePresent = document.querySelector(arguments[0]) != null
        var contPresent = document.querySelector(arguments[1]) != null
        return titlePresent && contPresent
    ''', config['title'], config['content'])
def download_page(url, art, imgs):
    print(url)
    # One WebDriver per worker thread, tracked in `drivers` for cleanup
    if not hasattr(trlocal, 'driver'):
        trlocal.driver = create_driver()
        drivers.append(trlocal.driver)
    driver = trlocal.driver
    if not re.search(r'^https?://', url):
        # Non-URL entries become empty placeholder chapters for this slot
        art.update({'title': url, 'content': ''})
        return
    driver.get(url)
    # Explicit wait until both the title and content selectors are present
    if config['waitContent']:
        WebDriverWait(driver, config['waitContent'], 0.5) \
            .until(wait_content_cb, "无法获取标题或内容")
    html = driver.find_element_by_css_selector('body').get_attribute('outerHTML')
    art.update(get_article(html, url))
    art['content'] = process_img(art['content'], imgs, page_url=url, img_prefix='../Images/')
    time.sleep(config['wait'])
def download_page_safe(url, art, imgs):
    try: download_page(url, art, imgs)
    except: traceback.print_exc()
def create_driver():
    options = Options()
    if not config['debug']:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--log-level=3')
    driver = webdriver.Chrome(options=options)
    # Load the site once so add_cookie() can attach cookies to its domain
    driver.get(config['url'])
    for kv in config.get('headers', {}).get('Cookie', '').split('; '):
        kv = kv.split('=', 1)  # split once; cookie values may contain '='
        if len(kv) < 2: continue
        driver.add_cookie({'name': kv[0], 'value': kv[1]})
    # Reload with the cookies in place
    driver.get(config['url'])
    return driver
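create_driver() replays cookies from a single Cookie header string. A sketch of the config fragment it expects; names and values are placeholders:

# Hypothetical fragment: create_driver() splits the header on '; ' and
# re-adds each name=value pair via driver.add_cookie()
config['headers'] = {
    'Cookie': 'sessionid=abc123; csrftoken=xyz789',
}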
def crawl_sele(args):
    articles = [{
        'title': config['name'],
        'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>"
    }]
    imgs = {}
    pool = ThreadPoolExecutor(config['textThreads'])
    hdls = []
    for url in config['list']:
        art = {}
        articles.append(art)
        h = pool.submit(download_page_safe, url, art, imgs)
        hdls.append(h)
        # download_page_safe(driver, url, articles, imgs)
    for h in hdls: h.result()
    articles = [art for art in articles if art]
    gen_epub(articles, imgs)
    for d in drivers: d.close()
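Putting the new file together: once main() has populated config (see the main() hunk above), a truthy config['selenium'] hands the crawl to crawl_sele(), which fans chapter downloads out over textThreads worker threads, each with its own thread-local ChromeDriver, and finally feeds articles and imgs to GenEpub. A hedged end-to-end sketch; the package import path and all values are assumptions:

# Hypothetical driver script; assumes the package is importable as EpubCrawler
from EpubCrawler.config import config
from EpubCrawler.sele_crawler import crawl_sele

config.update({
    'name': 'example-book',
    'url': 'https://example.com/book',
    'list': ['https://example.com/book/ch1'],
    'title': 'h1', 'content': '#content',
    'textThreads': 4,   # thread-pool size
    'wait': 1,          # per-page politeness delay in seconds
})
crawl_sele(None)        # finishes by calling gen_epub(articles, imgs)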
@@ -12,6 +12,8 @@ import uuid
import tempfile
import json
RE_DATA_URL = r'^data:image/\w+;base64,'
bundle_dir = tempfile.gettempdir()
cache_dir = 'epubcralwer'
......
@@ -2,4 +2,5 @@ requests
pyquery
GenEpub
imgyaso
readability-lxml
\ No newline at end of file
readability-lxml
selenium
\ No newline at end of file