Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
亦蔚然
Project Crawler Elasticsearch
提交
33aebeea
P
Project Crawler Elasticsearch
项目概览
亦蔚然
/
Project Crawler Elasticsearch
通知
4
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Project Crawler Elasticsearch
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
33aebeea
编写于
5月 24, 2021
作者:
亦蔚然
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
删除主分支的Main文件
上级
2ac09d01
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
0 addition
and
182 deletion
+0
-182
src/main/java/com/github/weiranyi/Main.java
src/main/java/com/github/weiranyi/Main.java
+0
-182
未找到文件。
src/main/java/com/github/weiranyi/Main.java
已删除
100644 → 0
浏览文件 @
2ac09d01
package
com.github.weiranyi
;
import
edu.umd.cs.findbugs.annotations.SuppressFBWarnings
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
java.io.IOException
;
import
java.sql.*
;
import
java.util.ArrayList
;
import
java.util.stream.Collectors
;
public
class
Main
{
private
static
final
String
USER_NAME
=
"root"
;
private
static
final
String
USER_PASSWORD
=
"123456"
;
@SuppressFBWarnings
(
"DMI_CONSTANT_DB_PASSWORD"
)
public
static
void
main
(
String
[]
args
)
throws
IOException
,
SQLException
{
Connection
connection
=
DriverManager
.
getConnection
(
"jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news"
,
USER_NAME
,
USER_PASSWORD
);
String
link
=
null
;
// 从数据库中加载下一个链接,若能加载到则进行下一个循环
while
((
link
=
getNextLinkThenDelete
(
connection
))
!=
null
)
{
// 若链接已经处理过了就跳到下一次循环
if
(
isLinkProcessed
(
connection
,
link
))
{
continue
;
}
// 判断是否是感兴趣滴内容【新浪站内的网页】
if
(
isInterestingLink
(
link
))
{
Document
doc
=
httpGetAndParseHtml
(
link
);
// 分析页面url将它们放到即将处理的url池子中去
parseUrlsFromAndStoreIntoDatabase
(
connection
,
doc
);
storeIntoDatabaseIfItIsNewsPage
(
connection
,
doc
,
link
);
updataDatabase
(
connection
,
link
,
"insert into LINKS_ALREADY_PROCESSED(link) values (?)"
);
}
else
{
// 不感兴趣
continue
;
}
}
}
/*
* 4、优化主干逻辑,进一步重构
*/
private
static
String
getNextLinkThenDelete
(
Connection
connection
)
throws
SQLException
{
String
link
=
getNextLink
(
connection
,
"select link from LINKS_TO_BE_PROCESSED limit 1;"
);
if
(
link
!=
null
)
{
updataDatabase
(
connection
,
link
,
"delete FROM LINKS_TO_BE_PROCESSED where LINK=?"
);
}
return
link
;
}
private
static
void
parseUrlsFromAndStoreIntoDatabase
(
Connection
connection
,
Document
doc
)
throws
SQLException
{
for
(
Element
aTag
:
doc
.
select
(
"a"
))
{
String
href
=
aTag
.
attr
(
"href"
);
if
(
href
.
startsWith
(
"//"
))
{
href
=
"https:"
+
href
;
}
if
(
href
.
toLowerCase
().
startsWith
(
"javascript"
))
{
continue
;
}
updataDatabase
(
connection
,
href
,
"insert into LINKS_TO_BE_PROCESSED(link) values (?)"
);
}
}
/*
* 3、重构对数据库操作部分的代码
*/
private
static
String
getNextLink
(
Connection
connection
,
String
sql
)
throws
SQLException
{
ResultSet
resultSet
=
null
;
try
(
PreparedStatement
statement
=
connection
.
prepareStatement
(
sql
))
{
resultSet
=
statement
.
executeQuery
();
while
(
resultSet
.
next
())
{
return
resultSet
.
getString
(
1
);
}
}
finally
{
if
(
resultSet
!=
null
)
{
resultSet
.
close
();
}
}
return
null
;
}
private
static
void
updataDatabase
(
Connection
connection
,
String
link
,
String
sql
)
throws
SQLException
{
try
(
PreparedStatement
statement
=
connection
.
prepareStatement
(
sql
))
{
statement
.
setString
(
1
,
link
);
statement
.
executeUpdate
();
}
}
private
static
boolean
isLinkProcessed
(
Connection
connection
,
String
link
)
throws
SQLException
{
ResultSet
resultSet
=
null
;
try
(
PreparedStatement
statement
=
connection
.
prepareStatement
(
"select link from LINKS_ALREADY_PROCESSED where LINK=?;"
))
{
statement
.
setString
(
1
,
link
);
// 从数据库加载即将处理的代码
resultSet
=
statement
.
executeQuery
();
while
(
resultSet
.
next
())
{
return
true
;
}
}
finally
{
if
(
resultSet
!=
null
)
{
resultSet
.
close
();
}
}
return
false
;
}
/*
* 2、将表达不同逻辑的代码抽象为短方法
* 优点:
* a.便于人脑理解
* b.越短越容易复用
* c.对于Java来说可以方便的对方法进行覆盖
*/
// 通过http请求拿到HTML文档
private
static
Document
httpGetAndParseHtml
(
String
link
)
throws
IOException
{
try
(
CloseableHttpClient
httpclient
=
HttpClients
.
createDefault
())
{
HttpGet
httpGet
=
new
HttpGet
(
link
);
httpGet
.
addHeader
(
"user-agent"
,
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
);
try
(
CloseableHttpResponse
response1
=
httpclient
.
execute
(
httpGet
))
{
System
.
out
.
println
(
response1
.
getStatusLine
());
System
.
out
.
println
(
link
);
HttpEntity
entity1
=
response1
.
getEntity
();
String
html
=
EntityUtils
.
toString
(
entity1
);
return
Jsoup
.
parse
(
html
);
}
}
}
// 若是新闻页面就存到数据库中
private
static
void
storeIntoDatabaseIfItIsNewsPage
(
Connection
connection
,
Document
doc
,
String
link
)
throws
SQLException
{
ArrayList
<
Element
>
articleTags
=
doc
.
select
(
"article"
);
if
(!
articleTags
.
isEmpty
())
{
for
(
Element
articleTag
:
articleTags
)
{
String
title
=
articleTags
.
get
(
0
).
child
(
0
).
text
();
// Collectors.joining("\n")得到的字符串用换行符分隔
String
content
=
articleTag
.
select
(
"p"
).
stream
().
map
(
Element:
:
text
).
collect
(
Collectors
.
joining
(
"\n"
));
System
.
out
.
println
(
title
);
try
(
PreparedStatement
statement
=
connection
.
prepareStatement
(
"insert into news(url,title,content,created_at,MODIFIED_AT)VALUES ( ?,?,?,now(),now() )"
))
{
statement
.
setString
(
1
,
link
);
statement
.
setString
(
2
,
title
);
statement
.
setString
(
3
,
content
);
statement
.
executeUpdate
();
}
}
}
}
/*
* 1、将长的判断条件抽取为不同的方法
*/
// 感兴趣的链接
private
static
boolean
isInterestingLink
(
String
link
)
{
return
(
isNewsPage
(
link
)
||
isIndexPage
(
link
)
&&
isNotLoginPage
(
link
));
}
// 首页
private
static
boolean
isIndexPage
(
String
link
)
{
return
"https://sina.cn"
.
equals
(
link
);
}
// 新闻页
private
static
boolean
isNewsPage
(
String
link
)
{
return
link
.
contains
(
"news.sina.cn"
);
}
// 登录页
private
static
boolean
isNotLoginPage
(
String
link
)
{
return
!
link
.
contains
(
"passport.sina.cn"
);
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录