# Commit 90c810ec — author: 梦想橡皮擦
#
# 代理服务器采集 (proxy-server collection)
#
# Parent commit: 76835302
# Sample proxy: 27.14.86.171:8000
import requests
from lxml import etree
import random
import telnetlib
# Proxy checking function (method 1: raw TCP reachability probe).
def check_ip_port(ip_port):
    """Probe each proxy with a TCP (telnet) connection and record the live ones.

    ip_port: iterable of dicts with string keys "ip" and "port".
    Reachable proxies are printed with [+] and appended to ipporxy.txt;
    unreachable ones are printed with [-].
    """
    for item in ip_port:
        ip = item["ip"]
        port = item["port"]
        try:
            # 3-second timeout so dead proxies don't stall the whole scan.
            # NOTE(review): telnetlib is deprecated (removed in Python 3.13);
            # a plain socket.create_connection would be the forward-compatible swap.
            tn = telnetlib.Telnet(ip, port=port, timeout=3)
        except OSError:
            # Narrowed from a bare except: connection refused / timed out /
            # DNS failure all surface as OSError (socket.timeout is a subclass).
            print('[-] ip:{}:{}'.format(ip, port))
        else:
            print('[+] ip:{}:{}'.format(ip, port))
            with open('ipporxy.txt', 'a') as f:
                f.write(ip+':'+port+'\n')
    print("阶段性检测完毕")
# Second proxy-check method: route a real HTTP request through the proxy.
def check_proxy(ip_port):
    """Validate proxies by fetching an IP-echo service through each one.

    ip_port: iterable of dicts with string keys "ip" and "port".
    Prints the response object for proxies that answer with HTTP 200,
    and the exception for proxies that fail.
    """
    # Loop-invariant target URL, hoisted out of the loop.
    url = "http://icanhazip.com/"
    for item in ip_port:
        ip = item["ip"]
        port = item["port"]
        # The proxy-URL scheme describes how to talk to the proxy itself.
        # Free HTTP proxies speak plain HTTP even when tunnelling https
        # targets, so both map entries use an http:// proxy URL (the
        # original "https://ip:port" value made requests attempt TLS to
        # the proxy and fail).
        proxies = {
            "http": "http://{}:{}".format(ip, port),
            "https": "http://{}:{}".format(ip, port),
        }
        try:
            res = requests.get(url, proxies=proxies, timeout=3)
            if res.status_code == 200:
                print(res)
        except Exception as e:
            # Best-effort check: report the failure and move on.
            print(e)
def ip89(pagesize):
    """Scrape pages 1..pagesize of www.89ip.cn and check the proxies found."""
    base = "https://www.89ip.cn/index_{}.html"
    for page_no in range(1, pagesize + 1):
        markup = get_html(base.format(page_no))
        candidates = format_html(
            markup,
            '//tbody/tr/td[1]/text()',
            '//tbody/tr/td[2]/text()',
        )
        # Verify which of the scraped proxies actually accept connections.
        check_ip_port(candidates)
def ip66(pagesize):
    """Scrape pages 1..pagesize of www.66ip.cn and check the proxies found."""
    for page_no in range(1, pagesize + 1):
        page_url = "http://www.66ip.cn/{}.html".format(page_no)
        markup = get_html(page_url)
        # position()>1 skips the table's header row.
        candidates = format_html(
            markup,
            '//table/tr[position()>1]/td[1]/text()',
            '//table/tr[position()>1]/td[2]/text()',
        )
        check_ip_port(candidates)
def ip3366(pagesize):
    """Scrape pages 1..pagesize of proxy.ip3366.net (China list) and check them."""
    template = "https://proxy.ip3366.net/free/?action=china&page={}"
    for idx in range(1, pagesize + 1):
        rows = format_html(
            get_html(template.format(idx)),
            '//td[@data-title="IP"]/text()',
            '//td[@data-title="PORT"]/text()',
        )
        # Verify which of the scraped proxies actually accept connections.
        check_ip_port(rows)
def ip_huan():
    """Scrape a fixed set of ip.ihuan.me pages and check the proxies found."""
    # The site uses opaque page tokens instead of page numbers.
    page_urls = (
        "https://ip.ihuan.me/?page=b97827cc",
        "https://ip.ihuan.me/?page=4ce63706",
        "https://ip.ihuan.me/?page=5crfe930",
        "https://ip.ihuan.me/?page=f3k1d581",
        "https://ip.ihuan.me/?page=ce1d45977",
        "https://ip.ihuan.me/?page=881aaf7b5",
    )
    for page_url in page_urls:
        entries = format_html(
            get_html(page_url),
            '//tbody/tr/td[1]/a/text()',
            '//tbody/tr/td[2]/text()',
        )
        check_ip_port(entries)
def ip_kuai(pagesize):
    """Scrape pages 1..pagesize of kuaidaili.com and check the proxies found."""
    for n in range(1, pagesize + 1):
        listing = get_html("https://www.kuaidaili.com/free/inha/{}/".format(n))
        found = format_html(
            listing,
            '//td[@data-title="IP"]/text()',
            '//td[@data-title="PORT"]/text()',
        )
        check_ip_port(found)
def ip_jiangxi(pagesize):
    """Scrape pages 1..pagesize of ip.jiangxianli.com and check the proxies found."""
    template = "https://ip.jiangxianli.com/?page={}"
    for page_no in range(1, pagesize + 1):
        body = get_html(template.format(page_no))
        # position()!=7 excludes row 7 — presumably an ad/spacer row on
        # this site; TODO confirm against the live page layout.
        candidates = format_html(
            body,
            '//tbody/tr[position()!=7]/td[1]/text()',
            '//tbody/tr[position()!=7]/td[2]/text()',
        )
        check_ip_port(candidates)
def ip_kaixin(pagesize):
    """Scrape pages 1..pagesize of www.kxdaili.com and check the proxies found."""
    for page_no in range(1, pagesize + 1):
        page_url = "http://www.kxdaili.com/dailiip/1/{}.html".format(page_no)
        document = get_html(page_url)
        parsed = format_html(
            document,
            '//tbody/tr/td[1]/text()',
            '//tbody/tr/td[2]/text()',
        )
        check_ip_port(parsed)
def ip_nima(pagesize):
    """Scrape pages 1..pagesize of nimadaili.com and check the proxies found."""
    for page_no in range(1, pagesize + 1):
        document = get_html("http://www.nimadaili.com/putong/{}/".format(page_no))
        # This site puts "ip:port" in a single table cell, so the
        # combined-cell parser is used instead of format_html.
        check_ip_port(format_html_ext(document, '//tbody/tr/td[1]/text()'))
def format_html_ext(text, ip_xpath):
    """Extract "ip:port" strings from HTML where both live in one table cell.

    text: raw HTML (or None when the upstream request failed).
    ip_xpath: XPath selecting the combined "ip:port" text nodes.
    Returns a list of {"ip": ..., "port": ...} dicts.
    """
    # List of IP/port dicts to return.
    ret = []
    # get_html returns None on request failure; etree.HTML(None) would raise.
    if not text:
        return ret
    html = etree.HTML(text)
    ips = html.xpath(ip_xpath)
    print(ips)
    for ip in ips:
        # Split once and guard: an entry without ":" previously raised
        # IndexError and aborted the whole page.
        addr, sep, port = ip.partition(":")
        if sep:
            ret.append({"ip": addr, "port": port})
    return ret
def format_html(text, ip_xpath, port_xpath):
    """Extract IP and port columns from HTML via two parallel XPaths.

    text: raw HTML (or None when the upstream request failed).
    ip_xpath / port_xpath: XPaths selecting the IP and port text nodes.
    Returns a list of {"ip": ..., "port": ...} dicts (whitespace-stripped).
    """
    # List of IP/port dicts to return.
    ret = []
    # get_html returns None on request failure; etree.HTML(None) would raise.
    if not text:
        return ret
    html = etree.HTML(text)
    ips = html.xpath(ip_xpath)
    ports = html.xpath(port_xpath)
    # (Debug print removed — the original comment marked it for deletion
    # before production runs.)
    for ip, port in zip(ips, ports):
        ret.append({
            "ip": ip.strip(),
            "port": port.strip()
        })
    return ret
def get_html(url):
    """Fetch *url* with rotating headers; return body text, or None on failure."""
    request_headers = get_headers()
    try:
        return requests.get(url, headers=request_headers, timeout=5).text
    except Exception as e:
        # Best-effort fetch: report the problem and let callers handle None.
        print("请求网址异常", e)
        return None
def get_headers():
    """Build request headers with a randomly chosen search-engine-crawler UA."""
    user_agents = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    # Crawler UAs are less likely to be blocked by these proxy-listing sites.
    return {
        "user-agent": random.choice(user_agents),
        "referer": "https://www.baidu.com"
    }
def run():
    """Run every supported proxy-source scraper in sequence.

    The integer argument is the number of listing pages to fetch from
    each site; ip_huan takes none because its page URLs are hard-coded.
    """
    ip89(10)
    ip66(10)
    ip3366(2)
    ip_huan()
    ip_kuai(4)
    ip_jiangxi(4)
    ip_kaixin(10)
    ip_nima(5)
# Script entry point: only start scraping when executed directly, not on import.
if __name__ == "__main__":
    run()
# --- Web-page scrape residue below (not part of the program) ---
# Markdown is supported
# 0% .
# You are about to add 0 people to the discussion. Proceed with caution.
# 先完成此消息的编辑! (Finish editing this message first!)
# 想要评论请 注册 (Register to comment)