From 0901951a63dda77568cf6e2e1d998c74689cf884 Mon Sep 17 00:00:00 2001
From: yiweiran <168578203@qq.com>
Date: Tue, 25 May 2021 08:19:24 +0800
Subject: [PATCH] =?UTF-8?q?v4=5F=E4=BD=BF=E7=94=A8=E4=BA=86=E5=A4=9A?=
 =?UTF-8?q?=E7=BA=BF=E7=A8=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (free-text area of the patch; not part of the commit message):

* The encoded subject reads "v4_" followed by Chinese for "uses
  multithreading".
* Crawler.java: Crawler now extends Thread and receives its CrawlerDao
  through the constructor instead of creating a MyBatisCrawlerDao itself.
  run() no longer declares checked exceptions; the loop body is wrapped in
  try/catch (Exception) and rethrown as RuntimeException. The old main()
  is removed from this class.
* Main.java (new): creates one MyBatisCrawlerDao and starts five Crawler
  threads that all share it.
* MyBatisCrawlerDao.java: getNextLinkThenDelete() becomes synchronized, and
  a stray comment line above isLinkProcessed() is dropped.
* NOTE(review): because run() rethrows, the first exception raised inside
  the loop presumably ends that worker thread; confirm this is intended.
* NOTE(review): only getNextLinkThenDelete() is synchronized in this patch;
  isLinkProcessed() followed later by insertProcessedLinked() is a
  check-then-act sequence, so two threads may be able to handle the same
  link. Verify against the DAO and mapper, which are not shown here.
* Comments on added lines were translated to English; context and removed
  lines are untouched so the patch still applies to the original tree.

 .../java/com/github/weiranyi/Crawler.java     | 56 ++++++++++---------
 src/main/java/com/github/weiranyi/Main.java   | 13 +++++
 .../github/weiranyi/MyBatisCrawlerDao.java    |  5 +-
 3 files changed, 46 insertions(+), 28 deletions(-)
 create mode 100644 src/main/java/com/github/weiranyi/Main.java

diff --git a/src/main/java/com/github/weiranyi/Crawler.java b/src/main/java/com/github/weiranyi/Crawler.java
index 30beec0..c031f10 100644
--- a/src/main/java/com/github/weiranyi/Crawler.java
+++ b/src/main/java/com/github/weiranyi/Crawler.java
@@ -16,34 +16,40 @@ import java.util.ArrayList;
 import java.util.stream.Collectors;
 
 
-public class Crawler {
-    CrawlerDao dao = new MyBatisCrawlerDao();
+public class Crawler extends Thread {
+    private CrawlerDao dao;
 
-    public void run() throws SQLException, IOException {
-        String link = null;
-        // 从数据库中加载下一个链接,若能加载到则进行下一个循环
-        while ((link = dao.getNextLinkThenDelete()) != null) {
-            // 若链接已经处理过了就跳到下一次循环
-            if (dao.isLinkProcessed(link)) {
-                continue;
-            }
-            // 判断是否是感兴趣滴内容【新浪站内的网页】
-            if (isInterestingLink(link)) {
-                Document doc = httpGetAndParseHtml(link);
-                // 分析页面url将它们放到即将处理的url池子中去
-                parseUrlsFromAndStoreIntoDatabase(doc);
-                storeIntoDatabaseIfItIsNewsPage(doc, link);
-                dao.insertProcessedLinked(link);
-                // dao.updataDatabase(link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
-            } else {
-                // 不感兴趣
-                continue;
-            }
-        }
+    public Crawler(CrawlerDao dao) {
+        // This way every thread shares the same connection
+        this.dao = dao;
     }
 
-    public static void main(String[] args) throws IOException, SQLException {
-        new Crawler().run();
+    @Override
+    public void run() {
+        try {
+            String link;
+            // Load the next link from the database; keep looping while one is available
+            while ((link = dao.getNextLinkThenDelete()) != null) {
+                // If the link has already been processed, skip to the next iteration
+                if (dao.isLinkProcessed(link)) {
+                    continue;
+                }
+                // Check whether this is content we are interested in [pages within the Sina site]
+                if (isInterestingLink(link)) {
+                    Document doc = httpGetAndParseHtml(link);
+                    // Parse the URLs on the page and put them into the pool of URLs to be processed
+                    parseUrlsFromAndStoreIntoDatabase(doc);
+                    storeIntoDatabaseIfItIsNewsPage(doc, link);
+                    dao.insertProcessedLinked(link);
+                    // dao.updataDatabase(link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
+                } else {
+                    // Not interesting
+                    continue;
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
     }
 
     private void parseUrlsFromAndStoreIntoDatabase(Document doc) throws SQLException {
diff --git a/src/main/java/com/github/weiranyi/Main.java b/src/main/java/com/github/weiranyi/Main.java
new file mode 100644
index 0000000..1b0a8ed
--- /dev/null
+++ b/src/main/java/com/github/weiranyi/Main.java
@@ -0,0 +1,13 @@
+package com.github.weiranyi;
+
+import java.io.IOException;
+import java.sql.SQLException;
+
+public class Main {
+    public static void main(String[] args) throws IOException, SQLException {
+        CrawlerDao dao = new MyBatisCrawlerDao();
+        for (int i = 0; i < 5; i++) {
+            new Crawler(dao).start();
+        }
+    }
+}
diff --git a/src/main/java/com/github/weiranyi/MyBatisCrawlerDao.java b/src/main/java/com/github/weiranyi/MyBatisCrawlerDao.java
index 469eb17..ba84e41 100644
--- a/src/main/java/com/github/weiranyi/MyBatisCrawlerDao.java
+++ b/src/main/java/com/github/weiranyi/MyBatisCrawlerDao.java
@@ -26,9 +26,9 @@ public class MyBatisCrawlerDao implements CrawlerDao {
         sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream);
     }
 
-    // 获取下一个链接再删除
+    // [synchronized turns this into an atomic operation] Get the next link, then delete it
     @Override
-    public String getNextLinkThenDelete() throws SQLException {
+    public synchronized String getNextLinkThenDelete() throws SQLException {
         // SqlSession openSession(boolean autoCommit);这里设计事务,必须提交才生效,要设置参数为true
         try (SqlSession session = sqlSessionFactory.openSession(true)) {
             String url = session.selectOne("com.github.weiranyi.MyMapper.selectNextAvailableLink");
@@ -47,7 +47,6 @@ public class MyBatisCrawlerDao implements CrawlerDao {
         }
     }
 
-    //
     @Override
     public boolean isLinkProcessed(String link) throws SQLException {
         try (SqlSession session = sqlSessionFactory.openSession(true)) {
-- 
GitLab