提交 024c6eff 编写于 作者: 亦蔚然's avatar 亦蔚然

第二次重构

上级 76f28cd4
......@@ -70,6 +70,11 @@
<version>1.4.199</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.google.code.findbugs</groupId>
<artifactId>annotations</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
......
package com.github.weiranyi;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
......@@ -13,42 +14,41 @@ import org.jsoup.nodes.Element;
import java.io.IOException;
import java.sql.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class Main {
private static final String USER_NAME = "root";
private static final String USER_PASSWORD = "123456";
@SuppressFBWarnings("DMI_CONSTANT_DB_PASSWORD")
public static void main(String[] args) throws IOException, SQLException {
// 创建一个数据库链接
Connection connection = connection = DriverManager.getConnection("jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news", "root", "123456");
// 【待处理】存放待处理的链接的池子
List<String> linkPool = loadUrlsFromDatabase(connection, "select link from LINKS_TO_BE_PROCESSED;");
// 【已处理】存放已经处理的链接
Set<String> processedLinks = new HashSet<>(loadUrlsFromDatabase(connection, "select link from LINKS_ALREADY_PROCESSED;"));
Connection connection = DriverManager.getConnection("jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news", USER_NAME, USER_PASSWORD);
while (true) {
// 【待处理】存放待处理的链接的池子
List<String> linkPool = loadUrlsFromDatabase(connection, "select link from LINKS_TO_BE_PROCESSED;");
// 链接池是空的就退出循环
if (linkPool.isEmpty()) {
break;
}
// 获取并移除最后一个链接,对于ArrayList来说更有效率
String link = linkPool.remove(linkPool.size() - 1);
insertIntoDatabase(connection, link, "delete FROM LINKS_TO_BE_PROCESSED where LINK=?");
// 若链接已经处理过了就跳到下一次循环
if (processedLinks.contains(link)) {
if (isLinkProcessed(connection, link)) {
continue;
}
// 判断是否是感兴趣滴内容【新浪站内的网页】
if (isInterestingLink(link)) {
Document doc = httpGetAndParseHtml(link);
// 使用CSS选择器,html中去获取
ArrayList<Element> links = doc.select("a");
// 用Java8引入的特性对代码进行简化,过程式语言变成描述式语言
links.stream().map(aTag -> aTag.attr("href")).forEach(linkPool::add);
// 假设这是一个新闻的详情页,就存入数据库,否则,就什么都不做
// 分析页面url将它们放到即将处理的url池子中去
parseUrlsFromAndStoreIntoDatabase(connection, doc);
storeIntoDatabaseIfItIsNewsPage(doc);
processedLinks.add(link);
insertIntoDatabase(connection, link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
} else {
// 不感兴趣
continue;
......@@ -57,22 +57,68 @@ public class Main {
}
private static void parseUrlsFromAndStoreIntoDatabase(Connection connection, Document doc) throws SQLException {
for (Element aTag : doc.select("a")) {
String href = aTag.attr("href");
insertIntoDatabase(connection, href, "insert into LINKS_TO_BE_PROCESSED(link) values (?)");
}
}
/*
* 3、重构对数据库操作部分的代码
*
*/
private static List<String> loadUrlsFromDatabase(Connection connection, String sql) throws SQLException {
List<String> results = new ArrayList<>();
ResultSet resultSet = null;
try (PreparedStatement statement = connection.prepareStatement(sql)) {
// 从数据库加载即将处理的代码
ResultSet resultSet = statement.executeQuery();
resultSet = statement.executeQuery();
while (resultSet.next()) {
results.add(resultSet.getString(1));
}
} finally {
if (resultSet != null) {
resultSet.close();
}
}
return results;
}
private static void insertIntoDatabase(Connection connection, String link, String sql) throws SQLException {
try (PreparedStatement statement = connection.prepareStatement(sql)) {
statement.setString(1, link);
statement.executeUpdate();
}
}
private static boolean isLinkProcessed(Connection connection, String link) throws SQLException {
ResultSet resultSet = null;
try (PreparedStatement statement = connection.prepareStatement("select link from LINKS_ALREADY_PROCESSED where LINK=?;")) {
statement.setString(1, link);
// 从数据库加载即将处理的代码
resultSet = statement.executeQuery();
while (resultSet.next()) {
return true;
}
} finally {
if (resultSet != null) {
resultSet.close();
}
}
return false;
}
// private static List<String> deleteFromDatabase(Connection connection, String sql) throws SQLException {
// List<String> results = new ArrayList<>();
// try (PreparedStatement statement = connection.prepareStatement("delete FROM LINKS_TO_BE_PROCESSED where LINK=?")) {
// statement.setString(1,link);
// // 从数据库加载即将处理的代码
// statement.executeUpdate();
// }
// return results;
// }
/*
* 2、将表达不同逻辑的代码抽象为短方法
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册