Commit cd022ec6 authored by crj1998

refactor

Parent fe9ac759
# meetslut
> A fancy command line tool to download slut👠 pictures from the internet.

<!-- ![010116-220-C](https://upload-images.jianshu.io/upload_images/13843118-f46b965a1c878a67.png) -->
## 🚀 Support website
> 📢 Checked on 2022.11.05

|Website|Status|Desc|
|---|---|---|
| [Caitlin](https://caitlin.top/)|✔️|comics|
| [zipai](https://99zipai.com/)|✔️|Chinese amateur photos|
| [motherless]() |✔️|photos|
| [Imagefap]()|❌|photos|
| [pictoa](https://www.pictoa.com/)|⭕️|photos|

✔️ supported ❌ failed ⭕️ in development
❗️ a proxy is required for users in mainland China
## 🔨 Installation
```
pip install .
```
## 💡 Usage
```bash
meetslut image https://www.99zipai.com/selfies/202010/110590.html -o ./saved
```
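
Images land in a subfolder named after the parsed gallery title, alongside a `metadata.json` dump of the parsed data (see `process_image` in `meetslut/cli.py`). A sketch of the resulting layout, assuming an indexed gallery titled `some_title`:

```
saved/
└── some_title/
    ├── metadata.json
    ├── 1.jpg
    ├── 2.jpg
    └── ...
```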
## 📑 License
## For Developers
parser: input a URL, output a dict `{"url": ..., "name": ...}` per image.
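
A minimal sketch of a new parser against the `AbstractParser` interface in `meetslut/webparser.py` (the `Example` class, its title, and the regex are placeholders, not a real site integration):

```python
import re
from meetslut.webparser import AbstractParser

class Example(AbstractParser):
    def __init__(self):
        super().__init__('Example')
        self.indexed = True  # downloaded files are named 1.jpg, 2.jpg, ...

    def parse(self, url: str) -> dict:
        html = self.get(url)  # requests.get wrapped with GET_CFG + status check
        srcs = re.findall(r'<img src="([^"]+)"', html)  # placeholder extraction
        return {
            'title': 'example',
            'url': url,
            'images': [{'name': str(i), 'url': s} for i, s in enumerate(srcs, start=1)]
        }
```

To make it reachable from the CLI, a matching branch also has to be added to `ParserFactory.create`.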
### Test
```
python meetslut/cli.py image https://www.99zipai.com/selfies/202010/110590.html -o ~/work/saved
python meetslut/cli.py image 'https://caitlin.top/index.php?route=comic/readOnline&comic_id=697352&host_id=0&page=3&gallery_brightness=100&gallery_contrast=100' -o ~/work/saved
python meetslut/cli.py image https://motherless.com/GICB9A5D8?page=2 -o ~/work/saved
```

# meetslut/cli.py
"""
The Command Line Interface (CLI) for the downloader
"""
import os, argparse, logging, json
import sys
sys.path.append(r'D:\workspace\meetslut')  # local dev path; raw string keeps backslashes literal
from meetslut.webparser import ParserFactory
from meetslut.download import download
def setup_logger():
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s [%(levelname)-5s] [%(filename)s:%(lineno)d] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)

def main():
    parser = argparse.ArgumentParser(description="A fancy downloader for slut.")
    subparsers = parser.add_subparsers(help='sub-command help')

    image_parser = subparsers.add_parser('image', help='image tool')
    image_parser.add_argument("url", type=str, help="The URL of the resource to be downloaded.")
    image_parser.add_argument(
        "--output", "-o", default=os.getcwd(),
        help=("Destination local file path. If not set, the resource "
              "is downloaded to the current working directory, with a filename "
              "equal to the basename of the URL.")
    )
    image_parser.set_defaults(func=process_image)

    args = parser.parse_args()
    args.func(args)

def process_image(args):
    url = args.url
    output = args.output
    app = ParserFactory.create(url)
    data = app.parse(url)
    folder = os.path.join(output, data['title'])
    os.makedirs(folder, exist_ok=True)  # the folder must exist before metadata is written
    metafile = os.path.join(folder, 'metadata.json')
    # if os.path.exists(metafile):
    #     # reuse cached metadata
    #     with open(metafile, 'r', encoding='utf8') as f:
    #         data = json.load(f)
    # else:
    #     # save metadata
    #     with open(metafile, 'w', encoding='utf8') as f:
    #         json.dump(data, f, ensure_ascii=False)
    # save metadata
    with open(metafile, 'w', encoding='utf8') as f:
        json.dump(data, f, ensure_ascii=False)
    download(data['images'], folder, indexed=app.indexed)
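
# Shape of metadata.json (a sketch; exact values depend on the parser):
# {"title": "...", "url": "...", "images": [{"name": "1", "url": "https://..."}, ...]}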

if __name__ == "__main__":
    setup_logger()
    logging.info("🚀 Start...")
    main()

# meetslut/config.py
NUM_THREADS = 4
# requests max retry
RETRY = 3
# requests config
GET_CFG = {
    'headers': {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
    },
    # 'proxies': None,
    'proxies': {
        'http': 'socks5h://127.0.0.1:2801',
        'https': 'socks5h://127.0.0.1:2801'
    },
    'timeout': 5
}
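# GET_CFG is unpacked straight into requests calls, e.g.
#   r = requests.get(url, **GET_CFG)
# (see AbstractParser.get in meetslut/webparser.py).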
ROOT_ZIPAI = "https://99zipai.com"

# meetslut/download.py
import os, logging, requests, re
from functools import partial
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
from meetslut.config import GET_CFG, NUM_THREADS, RETRY

def saveImage(image, folder):
    url, filename = image
    filepath = os.path.join(folder, filename)
    r, i = None, 0
    while r is None and i < RETRY:
        i += 1
        try:
            r = requests.get(url, headers=GET_CFG['headers'], proxies=GET_CFG['proxies'], timeout=10)
        except requests.RequestException:
            continue
        if r.headers["Content-Type"].strip().startswith("image/"):
            break
    if r is None:
        raise Exception("Max Retries!")
    with open(filepath, 'wb') as f:
        f.write(r.content)
    filesize = len(r.content)
    return filename, filesize

def amend_suffix(s):
    # fall back to .jpg for unknown or missing extensions
    return s if s in [".gif", ".jpg", ".jpeg", ".png"] else '.jpg'

def rename(urls, names, indexed):
    res = []
    max_length = len(str(len(urls)))
    for idx, (url, name) in enumerate(zip(urls, names), start=1):
        filename = os.path.basename(url)
        filename, suffix = os.path.splitext(filename)
        suffix = amend_suffix(suffix)
        # zero-padded running index (01, 02, ...) or the parsed image name
        filename = str(idx).zfill(max_length) if indexed else name
        # replace characters that are illegal in file names
        filename = re.sub(r"[\/\\\:\*\?\"\<\>\|]", " ", filename)
        if f"{filename}{suffix}" in res:
            filename = filename + str(idx)  # de-duplicate colliding names
        res.append(f"{filename}{suffix}")
    return res
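
# Example (assumed inputs):
#   rename(["http://x/a.webp", "http://x/b.png"], ["a", "b"], indexed=True)
#   -> ["1.jpg", "2.png"]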

def download(images, folder, indexed=True):
    """ Multi-threaded downloader. """
    os.makedirs(folder, exist_ok=True)
    urls = [i['url'] for i in images]
    names = [i['name'] for i in images]
    files = rename(urls, names, indexed)
    save = partial(saveImage, folder=folder)
    results = ThreadPool(NUM_THREADS).imap(save, zip(urls, files))
    logging.info(f"Start download in {folder}...")
    with tqdm(results, total=len(urls), ncols=100, desc="Download") as t:
        for filename, filesize in t:
            t.set_postfix({"name": filename, "size": f"{round(filesize/1024)}kb"})

# meetslut/webparser.py
import sys
sys.path.append(r'D:\workspace\meetslut')  # local dev path; raw string keeps backslashes literal
from typing import Union, Optional
from abc import ABCMeta, abstractmethod
import os, re, json
import time, random
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from bs4.element import Tag
from lxml import etree
from meetslut.config import GET_CFG
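
# Each concrete parser below is a per-class singleton: __new__ caches the first
# instance on the class, so repeated ParserFactory.create calls reuse one object.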
class AbstractParser(metaclass=ABCMeta):
    _instance = None
    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, name):
        self.name = name

    @abstractmethod
    def parse(self, url):
        pass

    @staticmethod
    def get(url, **kwargs):
        r = requests.get(url, **kwargs, **GET_CFG)
        assert r.status_code == 200, f'Status code: {r.status_code} @ url: {url}'
        r.encoding = r.apparent_encoding
        return r.text

    def __repr__(self):
        return f'{self.__class__.__name__}()'

class Caitlin(AbstractParser):
    def __init__(self):
        super().__init__('Caitlin')
        self.indexed = True

    def _fetch(self, comic_id: Union[str, int]) -> dict:
        """ Get the title and image urls for a comic id. """
        params = {
            "route": "comic/readOnline",
            "comic_id": str(comic_id)
        }
        html = self.get("https://caitlin.top/index.php", params=params)
        title = re.search(r'<span class="d">(.+)<span>', html).group(1)
        title = title.replace(" ", "")
        root_url = re.search(r'HTTP_IMAGE = "(//.+)";', html).group(1)
        images = re.search(r'Image_List = (\[.+\]);', html).group(1)
        images = json.loads(images)
        data = {
            'title': title,
            'images': [
                {'name': str(i), 'url': f"https:{root_url}{image['sort']}.{image['extension']}"}
                for i, image in enumerate(images, start=1)
            ]
        }
        return data

    def parse(self, url: str) -> dict:
        # extract the comic id
        if url.startswith('https://caitlin.top/index.php'):
            matched = re.search(r'comic_id=(\d+)', url)
            assert matched is not None, f'No valid comic_id in url: {url}.'
            comic_id = matched.group(1)
        elif url.isdigit():
            comic_id = url
        else:
            raise ValueError(f'Invalid url: {url}. It should be a full url or a comic id.')
        data = self._fetch(comic_id)
        data['url'] = url
        return data

class Zipai(AbstractParser):
    def __init__(self):
        super().__init__('Zipai')
        self.indexed = True

    # def worklist(uid):
    #     # url = f"{ROOT}/my/{uid}/"
    #     url = f"https://99zipai.com/e/space/ulist.php?page=0&mid=1&line=80&tempid=10&orderby=&myorder=0&userid={uid}"
    #     r = requests.get(url, headers=HEADERS)
    #     assert r.status_code == 200, r.text
    #     r.encoding = r.apparent_encoding
    #     soup = BeautifulSoup(r.text, "html.parser")
    #     total = soup.find("a", attrs={"class": "sh_1"}).span.text.strip()
    #     user = soup.find(name="h2", attrs={"itemprop": "name"}).text.strip()
    #     ul = soup.find(name="ul", attrs={"class": "ul_author_list cl"})
    #     for idx, li in enumerate(ul.findAll("li")):
    #         a = li.a
    #         yield a.text, a.get("href")

    def _fetch(self, url: str) -> dict:
        p = urlparse(url)
        root = f"{p.scheme}://{p.netloc}"  # e.g. https://www.99zipai.com
        html = self.get(url)
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("div", attrs={"class": "item_title"}).h1.text.strip()
        div = soup.find("div", attrs={"class": "content_left"})
        images = []
        for i, ele in enumerate(div.contents, start=1):
            if isinstance(ele, Tag) and ele.name == "img":
                src = ele.get("src")
                if "d/file/selfies" not in src:
                    continue
                if src.startswith("/"):
                    src = root + src  # resolve relative image paths
                images.append({'name': str(i), 'url': src})
        data = {
            'title': title,
            'images': images
        }
        return data

    def parse(self, url: str) -> dict:
        if not url.startswith("https://www.99zipai.com/selfies"):
            raise ValueError(f'Invalid url: {url}. It should be a full article url.')
        data = self._fetch(url)
        data['url'] = url
        return data

class Motherless(AbstractParser):
    def __init__(self):
        super().__init__('Motherless')
        self.indexed = False

    def _fetch(self, gid: str, sleep: Optional[Union[int, float]] = None) -> dict:
        url = f"https://motherless.com/{gid}"
        page = 1
        codes = []
        def get(p):
            params = {'page': p} if p > 1 else {}
            html = self.get(url, params=params)
            return etree.HTML(html)
        html = get(page)
        title = html.xpath("//h2[@id='view-upload-title']/text()")[0].strip()
        amount = int(re.search(r"Images \(([0-9]*)\)", html.xpath("//span[@class='active']/text()")[0]).group(1))
        srcs = html.xpath("//img[@class='static']/@data-strip-src")
        names = html.xpath("//img[@class='static']/@alt")
        # thumbnails live under .../thumbs/...; full images under .../images/...
        codes.extend([{'name': name, 'url': src.replace("thumbs", "images")} for src, name in zip(srcs, names)])
        while len(srcs) > 0 and len(codes) <= amount:
            if sleep:
                time.sleep(sleep if isinstance(sleep, (int, float)) else random.random())
            page += 1
            html = get(page)
            srcs = html.xpath("//img[@class='static']/@data-strip-src")
            names = html.xpath("//img[@class='static']/@alt")
            codes.extend([{'name': name, 'url': src.replace("thumbs", "images")} for src, name in zip(srcs, names)])
        data = {
            'title': title,
            'images': codes
        }
        return data

    def parse(self, url: str) -> dict:
        if url.startswith("https://motherless.com"):
            p = urlparse(url)
            gid = p.path.split("/")[-1]
        else:
            raise ValueError(f'Invalid url: {url}. It should be a full gallery url.')
        data = self._fetch(gid)
        data['url'] = url
        return data

class ImageFap(AbstractParser):
    def __init__(self):
        super().__init__('ImageFap')
        self.indexed = False

    def _fetch(self, gid: str) -> dict:
        # ImageFap support is currently broken (see the README status table);
        # the legacy code below is unreachable until it is revived.
        return
        url = parse.urljoin(ROOT, os.path.join("pictures", f"{str(gid)}/"))
        params = {"view": "2"}
        r = requests.get(url, params=params, headers=HEADERS, timeout=3)
        assert r.status_code == 200, f"Status code {r.status_code} error"
        html = etree.HTML(r.text)
        title = html.xpath("//*[@id='menubar']/table/tr[1]/td[2]/table/tr/td[1]/b[1]/font/text()")[0]
        table = html.xpath("//div[@class='expp-container']/form/table")[0]
        pid = table.xpath("tr/td/table/tr[1]/td/a/@name")
        name = table.xpath("tr/td/table/tr[2]/td/font[2]/i/text()")
        assert len(pid) == len(name)
        total = len(pid)
        current = 0
        src = []
        while current < total:
            url = parse.urljoin(ROOT, os.path.join("photo", f"{str(pid[current])}/"))
            params = {
                "gid": str(gid),
                "idx": str(current),
                "partial": "true"
            }
            r = requests.get(url, headers=HEADERS, timeout=3)
            html = etree.HTML(r.text)
            href = html.xpath("//div[@id='navigation']/ul/li/a/@href")
            current += len(href)
            src.extend(href)
            print(f"Images url fetching [{current}/{total}]...")
        assert len(src) == total
        return list(zip(pid, src, name))

    def parse(self, url: str) -> dict:
        if url.startswith("https://imagefap.com"):
            p = urlparse(url)
            gid = p.path.split("/")[-1]
        else:
            raise ValueError(f'Invalid url: {url}. It should be a full gallery url.')
        data = self._fetch(gid)
        data['url'] = url
        return data

class Pictoa(AbstractParser):
    def __init__(self):
        super().__init__('Pictoa')
        self.indexed = False

    def _fetch(self, gid: str) -> dict:
        # Pictoa support is still in development; the draft below is unreachable.
        return
        r = requests.get(url, headers=HEADERS, proxies=PROXIES, timeout=5)
        assert r.status_code == 200
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        album = soup.find("div", attrs={"id": "album"})
        title = album.div.h1.text.strip()
        wrapper = album.findAll("a", attrs={"class": "gallery-link"})
        res = {
            "title": title,
            "data": [{'name': a.img.attrs['alt'], 'url': a.img.attrs['data-lazy-src']} for a in wrapper]
        }
        return res

    def parse(self, url: str) -> dict:
        if url.startswith("https://www.pictoa.com"):
            p = urlparse(url)
            gid = p.path.split("/")[-1]
        else:
            raise ValueError(f'Invalid url: {url}. It should be a full album url.')
        data = self._fetch(gid)
        data['url'] = url
        return data

class ParserFactory:
    @staticmethod
    def create(website):
        domain = urlparse(website).netloc.strip().replace(' ', '').lower()
        if "caitlin" in domain:
            p = Caitlin()
        elif "zipai" in domain:
            p = Zipai()
        elif "motherless" in domain:
            p = Motherless()
        elif "imagefap" in domain:
            p = ImageFap()
        elif "pictoa" in domain:
            p = Pictoa()  # still in development
        else:
            raise ValueError(f'Unknown website {website}')
        return p

if __name__ == '__main__':
    # app = Caitlin()
    # data = app.parse('https://caitlin.top/index.php?route=comic/readOnline&comic_id=697352&host_id=0&page=3&gallery_brightness=100&gallery_contrast=100')
    # print(data)
    # app = Zipai()
    # data = app.parse('https://www.99zipai.com/selfies/202010/110590.html')
    # print(data)
    app = Motherless()
    data = app.parse('https://motherless.com/GI6E08B47')
    print(data)