提交 f88c1951 编写于 作者: 梦想橡皮擦's avatar 梦想橡皮擦 💬

中药数据采集

上级 27ef4202
from pyquery import PyQuery as pq
import time
def remove(str):
    """Strip markup and separators that would corrupt a CSV field.

    Args:
        str: Raw cell text. The name shadows the builtin but is kept so
            callers passing it by keyword keep working.

    Returns:
        The text with ``<br/>`` tags and newlines removed and every ASCII
        comma replaced by a full-width comma (U+FF0C).
    """
    # The cleaned fields end up in a comma-separated file, so an ASCII comma
    # inside a field would split it into extra columns. The original call
    # replaced "," with an identical "," (a no-op) -- presumably a full-width
    # comma that was lost to character normalization -- so the target is
    # spelled as an escape here to make it unambiguous.
    return str.replace("<br/>", "").replace("\n", "").replace(",", "\uff0c")
def get_data(page):
    """Scrape one page of the highest-bridges wiki list and save each row.

    Prints the requested URL and the page title, then passes a 7-field tuple
    (rank, name, height, length, completed, location, country) to ``save``
    for every data row of the sortable wiki table.

    Args:
        page: Page number substituted into the wiki URL.
    """
    url = "http://www.highestbridges.com/wiki/index.php?title=List_of_Highest_International_Bridges/Page_{}".format(
        page)
    print(url)
    doc = pq(url=url, encoding='utf-8')
    print(doc('title'))
    # Select every row of the table with a CSS (jQuery-style) selector.
    items = doc.find('table.wikitable.sortable tr').items()
    for item in items:
        td_list = item.find('td')
        if not td_list:
            # Rows without <td> cells (presumably the <th> header row) would
            # otherwise be saved as a line of empty fields.
            continue
        rank = td_list.eq(1).find("span.sorttext").text()
        name = remove(td_list.eq(2).find("a").text())
        height = remove(td_list.eq(3).text())
        length = remove(td_list.eq(4).text())
        completed = remove(td_list.eq(5).text())
        location = remove(td_list.eq(6).text())
        country = remove(td_list.eq(7).text())
        data_tuple = (rank, name, height, length, completed, location, country)
        save(data_tuple)
def save(data_tuple):
    """Append one record to ``./data.csv`` as a comma-separated line.

    Fields that contain commas, quotes or newlines are quoted by the csv
    writer, so they can no longer shift the columns of a row. Fields without
    such characters are written exactly as a plain ``",".join`` would.

    Write failures are reported on stderr rather than raised, so a single bad
    row does not abort the crawl.

    Args:
        data_tuple: Iterable holding the field values of a single row.
    """
    import csv
    import sys

    try:
        # newline="" hands line-ending control to the csv writer, which is
        # told to end rows with a bare "\n".
        with open("./data.csv", "a", encoding="utf-8", newline="") as f:
            csv.writer(f, lineterminator="\n").writerow(data_tuple)
        print("写入完毕")
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"failed to write row {data_tuple!r}: {e}", file=sys.stderr)
if __name__ == '__main__':
    # Crawl list pages 1 through 13, pausing 3 seconds after each request.
    first_page, last_page = 1, 13
    for page_no in range(first_page, last_page + 1):
        get_data(page_no)
        time.sleep(3)
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
from pyquery import PyQuery as pq
import time
import requests
import os
import re
def get_html(page):
    """Fetch one listing page from zhongyoo.com and save each linked page.

    Every ``strong > a.title`` anchor on the listing page is followed: its
    ``href`` is printed and passed to ``save``. Anchors without an ``href``
    are skipped.

    Args:
        page: Listing page number substituted into the URL.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
    }
    res = requests.get(url='http://www.zhongyoo.com/name/page_{}.html'.format(page), headers=headers, timeout=5)
    # Decode as GBK, the codec save() also uses for the detail pages; GBK is
    # a superset of GB2312, so it can only decode more characters, not fewer.
    res.encoding = "gbk"
    pq_data = pq(res.text)
    titles = pq_data.find("strong>a.title")
    for item in titles:
        # item.attrib['href'] would also read the attribute, but raises
        # KeyError when it is missing; .attr() yields None instead.
        link = pq(item).attr('href')
        if not link:
            continue
        print(link)
        save(link)
def save(link):
    """Download one detail page and store its HTML under ``./html/``.

    The file is named after the part of the page ``<title>`` that precedes
    the first underscore. Pages with no ``<title>`` or no markup are skipped,
    and the output directory is created when it does not exist yet.

    Args:
        link: URL of the detail page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
    }
    doc = pq(url=link, headers=headers, encoding='gbk')
    title = doc('title')
    print(title.text())
    if not title:
        # Without a title there is nothing to name the file after.
        return
    text = doc.html()
    if text is None:
        # html() yields None when the document holds no elements.
        return
    name = title.text().split('_')[0]
    os.makedirs("./html", exist_ok=True)
    # NOTE(review): the file is opened with the platform default encoding,
    # matching how extract_data() reads it back. The GBK round-trip below
    # drops characters GBK cannot represent -- presumably so the write cannot
    # fail where the default encoding is GBK; confirm before changing either.
    with open("./html/{}.html".format(name), "w+") as f:
        f.write(text.encode("gbk", "ignore").decode("gbk"))
# Each pattern captures the text between a 【label】 marker and the next 【.
# Raw strings keep "\s" / "\S" from being read as (invalid) string escapes.
_NAME_RE = re.compile(r'【(正名|中药名|药名)】([\s\S]*?)【')
_ALIAS_RE = re.compile(r'【别名】([\s\S]*?)【')
_EN_NAME_RE = re.compile(r'【英文名】([\s\S]*?)【')


def extract_data():
    """Print the name, alias and English-name fields of every saved page.

    For each file in ``./html/`` the text of all ``div.text p`` paragraphs is
    concatenated and searched with the three label patterns above. The
    paragraph count and the ``findall`` results are printed per file; nothing
    is returned.

    A field is matched only when another ``【`` follows it, so a field that is
    the last one in the text is not captured.
    """
    for file in os.listdir("./html/"):
        # NOTE(review): read with the platform default encoding, matching how
        # save() writes these files.
        with open(f"./html/{file}", "r") as f:
            html_content = f.read()
        # Parse the stored page and collect its content paragraphs.
        pq_obj = pq(html_content)
        items = pq_obj.find('div.text p')
        print("获取到的段落数为,", len(items))
        # Single string the patterns are run against.
        item_str = "".join(pq(item).text() for item in items)
        # Canonical name / Chinese medicine name / drug name.
        name = _NAME_RE.findall(item_str)
        # Alias.
        alias = _ALIAS_RE.findall(item_str)
        # English name.
        en_name = _EN_NAME_RE.findall(item_str)
        print(name, alias, en_name)
if __name__ == '__main__':
    # Step 1 (disabled): crawl listing pages 1-45 and store the static pages.
    #
    #     page_data = list(range(1, 46))
    #     ret = map(get_html, page_data)
    #     for item in ret:
    #         print(item)
    #
    # Step 2: extract the data from the pages stored on disk.
    extract_data()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册