提交 8c3c3ad7 作者: 亦蔚然

ORM实现业务逻辑的分离

上级 84552027
package com.github.weiranyi;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
......@@ -17,48 +16,36 @@ import java.util.ArrayList;
import java.util.stream.Collectors;
public class Main {
private static final String USER_NAME = "root";
private static final String USER_PASSWORD = "123456";
public class Crawler {
CrawlerDao dao = new JdbcCrawlerDao();
@SuppressFBWarnings("DMI_CONSTANT_DB_PASSWORD")
public static void main(String[] args) throws IOException, SQLException {
Connection connection = DriverManager.getConnection("jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news", USER_NAME, USER_PASSWORD);
public void run() throws SQLException, IOException {
String link = null;
// 从数据库中加载下一个链接,若能加载到则进行下一个循环
while ((link = getNextLinkThenDelete(connection)) != null) {
while ((link = dao.getNextLinkThenDelete()) != null) {
// 若链接已经处理过了就跳到下一次循环
if (isLinkProcessed(connection, link)) {
if (dao.isLinkProcessed(link)) {
continue;
}
// 判断是否是感兴趣的内容【新浪站内的网页】
if (isInterestingLink(link)) {
Document doc = httpGetAndParseHtml(link);
// 分析页面url将它们放到即将处理的url池子中去
parseUrlsFromAndStoreIntoDatabase(connection, doc);
storeIntoDatabaseIfItIsNewsPage(connection, doc, link);
updataDatabase(connection, link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
parseUrlsFromAndStoreIntoDatabase(doc);
storeIntoDatabaseIfItIsNewsPage(doc, link);
dao.updataDatabase(link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
} else {
// 不感兴趣
continue;
}
}
}
/*
* 4、优化主干逻辑,进一步重构
*/
private static String getNextLinkThenDelete(Connection connection) throws SQLException {
String link = getNextLink(connection, "select link from LINKS_TO_BE_PROCESSED limit 1;");
if (link != null) {
updataDatabase(connection, link, "delete FROM LINKS_TO_BE_PROCESSED where LINK=?");
}
return link;
public static void main(String[] args) throws IOException, SQLException {
new Crawler().run();
}
private static void parseUrlsFromAndStoreIntoDatabase(Connection connection, Document doc) throws SQLException {
private void parseUrlsFromAndStoreIntoDatabase(Document doc) throws SQLException {
for (Element aTag : doc.select("a")) {
String href = aTag.attr("href");
if (href.startsWith("//")) {
......@@ -67,62 +54,10 @@ public class Main {
if (href.toLowerCase().startsWith("javascript")) {
continue;
}
updataDatabase(connection, href, "insert into LINKS_TO_BE_PROCESSED(link) values (?)");
}
}
/*
* 3、重构对数据库操作部分的代码
*/
private static String getNextLink(Connection connection, String sql) throws SQLException {
ResultSet resultSet = null;
try (PreparedStatement statement = connection.prepareStatement(sql)) {
resultSet = statement.executeQuery();
while (resultSet.next()) {
return resultSet.getString(1);
}
} finally {
if (resultSet != null) {
resultSet.close();
}
dao.updataDatabase(href, "insert into LINKS_TO_BE_PROCESSED(link) values (?)");
}
return null;
}
/**
 * Runs a one-parameter SQL statement, binding {@code link} to its first placeholder.
 *
 * @param connection JDBC connection used to prepare the statement
 * @param link       value bound to parameter 1
 * @param sql        statement text handed to {@code prepareStatement}
 * @throws SQLException if preparing, binding or executing the statement fails
 */
private static void updataDatabase(Connection connection, String link, String sql) throws SQLException {
    final int linkParameterIndex = 1;
    // The statement is closed automatically when the try block exits.
    try (PreparedStatement update = connection.prepareStatement(sql)) {
        update.setString(linkParameterIndex, link);
        update.executeUpdate();
    }
}
/**
 * Checks whether {@code link} is present in the LINKS_ALREADY_PROCESSED table.
 *
 * @param connection JDBC connection used to run the lookup
 * @param link       link to look up
 * @return {@code true} if the query returns at least one row, {@code false} otherwise
 * @throws SQLException if the query cannot be prepared or executed
 */
private static boolean isLinkProcessed(Connection connection, String link) throws SQLException {
    try (PreparedStatement statement = connection.prepareStatement("select link from LINKS_ALREADY_PROCESSED where LINK=?;")) {
        statement.setString(1, link);
        // Let try-with-resources close the result set instead of a hand-written finally block.
        try (ResultSet resultSet = statement.executeQuery()) {
            // A single matching row is enough; there is no need to iterate.
            return resultSet.next();
        }
    }
}
/*
* 2、将表达不同逻辑的代码抽象为短方法
* 优点:
* a.便于人脑理解
* b.越短越容易复用
* c.对于Java来说可以方便的对方法进行覆盖
*/
// 通过http请求拿到HTML文档
private static Document httpGetAndParseHtml(String link) throws IOException {
try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet(link);
......@@ -137,8 +72,7 @@ public class Main {
}
}
// 若是新闻页面就存到数据库中
private static void storeIntoDatabaseIfItIsNewsPage(Connection connection, Document doc, String link) throws SQLException {
private void storeIntoDatabaseIfItIsNewsPage(Document doc, String link) throws SQLException {
ArrayList<Element> articleTags = doc.select("article");
if (!articleTags.isEmpty()) {
for (Element articleTag : articleTags) {
......@@ -146,37 +80,24 @@ public class Main {
// Collectors.joining("\n")得到的字符串用换行符分隔
String content = articleTag.select("p").stream().map(Element::text).collect(Collectors.joining("\n"));
System.out.println(title);
try (PreparedStatement statement = connection.prepareStatement("insert into news(url,title,content,created_at,MODIFIED_AT)VALUES ( ?,?,?,now(),now() )")) {
statement.setString(1, link);
statement.setString(2, title);
statement.setString(3, content);
statement.executeUpdate();
}
dao.insertNewsIntoDataBase(link, title, content);
}
}
}
/*
* 1、将长的判断条件抽取为不同的方法
*/
/**
 * Decides whether a link should be crawled: it must be a news page or the index page,
 * and it must not be a login page.
 *
 * @param link link to classify
 * @return {@code true} if the link should be fetched and parsed
 */
private static boolean isInterestingLink(String link) {
    // The explicit grouping matters: && binds tighter than ||, so without the inner
    // parentheses the login-page exclusion applied only to the index page, and a link
    // containing both "news.sina.cn" and "passport.sina.cn" was accepted.
    return (isNewsPage(link) || isIndexPage(link)) && isNotLoginPage(link);
}
/**
 * Tells whether the link is the site index page.
 *
 * @param link link to test; {@code null} yields {@code false}
 * @return {@code true} only when the link equals the index URL exactly
 */
private static boolean isIndexPage(String link) {
    final String indexUrl = "https://sina.cn";
    // Calling equals on the constant keeps a null link from throwing.
    return indexUrl.equals(link);
}
/**
 * Tells whether the link points at a news page.
 *
 * @param link link to test
 * @return {@code true} when the link contains the news host name
 */
private static boolean isNewsPage(String link) {
    final String newsHost = "news.sina.cn";
    return link.indexOf(newsHost) >= 0;
}
/**
 * Tells whether the link is anything other than a login page.
 *
 * @param link link to test
 * @return {@code false} when the link contains the login host name, {@code true} otherwise
 */
private static boolean isNotLoginPage(String link) {
    final boolean onLoginHost = link.contains("passport.sina.cn");
    return !onLoginHost;
}
}
package com.github.weiranyi;
import java.sql.SQLException;
/**
 * Data-access operations used by the crawler: fetching pending links, checking processed
 * links, running single-parameter link statements and storing news.
 */
public interface CrawlerDao {

    /**
     * Returns the next link to process and removes it from the pending set.
     *
     * @return the link, or {@code null} when none is available (callers loop until null)
     * @throws SQLException if the underlying store fails
     */
    String getNextLinkThenDelete() throws SQLException;

    /**
     * Returns a link obtained by running the given query.
     *
     * @param sql query to run
     * @return the link; the JDBC implementation returns {@code null} when the query has no rows
     * @throws SQLException if the underlying store fails
     */
    String getNextLink(String sql) throws SQLException;

    /**
     * Tells whether the link has already been processed.
     *
     * @param link link to look up
     * @return {@code true} if the link is recorded as processed
     * @throws SQLException if the underlying store fails
     */
    boolean isLinkProcessed(String link) throws SQLException;

    /**
     * Runs a statement that takes the link as its single parameter.
     *
     * @param link value for the statement's parameter
     * @param sql  statement to run
     * @throws SQLException if the underlying store fails
     */
    void updataDatabase(String link, String sql) throws SQLException;

    /**
     * Stores one news item.
     *
     * @param url     address the news was fetched from
     * @param title   news title
     * @param content news body text
     * @throws SQLException if the underlying store fails
     */
    void insertNewsIntoDataBase(String url, String title, String content) throws SQLException;
}
package com.github.weiranyi;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import java.sql.*;
/**
 * {@link CrawlerDao} implementation backed by a JDBC connection to a file-based H2 database.
 *
 * <p>The connection is opened once in the constructor and reused by every method; nothing in
 * this class closes it.
 */
public class JdbcCrawlerDao implements CrawlerDao {
    // NOTE(review): the credentials and the absolute database path are hard-coded; consider
    // moving them to configuration.
    private static final String USER_NAME = "root";
    private static final String USER_PASSWORD = "123456";

    private final Connection connection;

    /**
     * Opens the database connection.
     *
     * @throws RuntimeException wrapping the {@link SQLException} if the connection cannot be opened
     */
    @SuppressFBWarnings("DMI_CONSTANT_DB_PASSWORD")
    public JdbcCrawlerDao() {
        try {
            this.connection = DriverManager.getConnection("jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news", USER_NAME, USER_PASSWORD);
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Reads one link from LINKS_TO_BE_PROCESSED and deletes the rows holding that link.
     *
     * @return the link that was read, or {@code null} when the table yields no row
     * @throws SQLException if either statement fails
     */
    public String getNextLinkThenDelete() throws SQLException {
        String link = getNextLink("select link from LINKS_TO_BE_PROCESSED limit 1;");
        if (link != null) {
            updataDatabase(link, "delete FROM LINKS_TO_BE_PROCESSED where LINK=?");
        }
        return link;
    }

    /**
     * Runs a parameterless query and returns the first column of its first row.
     *
     * @param sql query to run
     * @return the first column of the first row, or {@code null} when the query returns no rows
     * @throws SQLException if the query cannot be prepared or executed
     */
    public String getNextLink(String sql) throws SQLException {
        // Both resources are closed by try-with-resources, result set first.
        try (PreparedStatement statement = connection.prepareStatement(sql);
             ResultSet resultSet = statement.executeQuery()) {
            return resultSet.next() ? resultSet.getString(1) : null;
        }
    }

    /**
     * Runs a statement whose single parameter is the given link.
     *
     * @param link value bound to parameter 1
     * @param sql  statement to run
     * @throws SQLException if the statement cannot be prepared or executed
     */
    public void updataDatabase(String link, String sql) throws SQLException {
        try (PreparedStatement statement = connection.prepareStatement(sql)) {
            statement.setString(1, link);
            statement.executeUpdate();
        }
    }

    /**
     * Inserts one row into the news table; both timestamps are set by the database's now().
     *
     * @param url     value for the url column
     * @param title   value for the title column
     * @param content value for the content column
     * @throws SQLException if the insert cannot be prepared or executed
     */
    public void insertNewsIntoDataBase(String url, String title, String content) throws SQLException {
        try (PreparedStatement statement = connection.prepareStatement("insert into news (url, title, content, created_at,MODIFIED_AT)values(?,?,?,now(),now())")) {
            statement.setString(1, url);
            statement.setString(2, title);
            statement.setString(3, content);
            statement.executeUpdate();
        }
    }

    /**
     * Checks whether the link is present in LINKS_ALREADY_PROCESSED.
     *
     * @param link link to look up
     * @return {@code true} if the query returns at least one row
     * @throws SQLException if the query cannot be prepared or executed
     */
    public boolean isLinkProcessed(String link) throws SQLException {
        try (PreparedStatement statement = connection.prepareStatement("select link from LINKS_ALREADY_PROCESSED where LINK=?;")) {
            statement.setString(1, link);
            try (ResultSet resultSet = statement.executeQuery()) {
                // A single matching row is enough; there is no need to iterate.
                return resultSet.next();
            }
        }
    }
}
package com.github.weiranyi;
/**
 * Mutable data holder for one news item: an id, the source url, the title and the content.
 *
 * @author https://github.com/weiranyi
 * @version 1.0
 */
public class News {
    private Integer id;
    private String url;
    private String content;
    private String title;

    /** Creates a news item with every field left {@code null}. */
    public News() {
        this(null, null, null);
    }

    /**
     * Creates a news item with the given values; the id stays {@code null}.
     *
     * @param url     source url
     * @param content body text
     * @param title   headline
     */
    public News(String url, String content, String title) {
        setUrl(url);
        setContent(content);
        setTitle(title);
    }

    // --- id ---

    /** @return the id, or {@code null} if none has been set */
    public final Integer getId() {
        return this.id;
    }

    /** @param id new id value */
    public final void setId(Integer id) {
        this.id = id;
    }

    // --- url ---

    /** @return the source url */
    public final String getUrl() {
        return this.url;
    }

    /** @param url new source url */
    public final void setUrl(String url) {
        this.url = url;
    }

    // --- title ---

    /** @return the headline */
    public final String getTitle() {
        return this.title;
    }

    /** @param title new headline */
    public final void setTitle(String title) {
        this.title = title;
    }

    // --- content ---

    /** @return the body text */
    public final String getContent() {
        return this.content;
    }

    /** @param content new body text */
    public final void setContent(String content) {
        this.content = content;
    }
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册