// AutoRefreshService.java (committed by liulei06)
package com.llthunder.csdn;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;

import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.TimeUnit;

public class AutoRefreshService {

    private  CopyOnWriteArraySet<String> allArticleUrl = new CopyOnWriteArraySet<>();

    //博主的博客管理也地址,eg:https://blog.csdn.net/123123/
    private String url = "";

    public AutoRefreshService(String url) {
        this.url = url;
    }

    /**
     * 获取博主所有的分类专栏访问地址
     */
    public CopyOnWriteArraySet<String> getAllArticleUrl(){
        ResponseEntity<String> htmlString = new RestTemplate().getForEntity(url, String.class);
        String html = htmlString.toString();//将获取的网页转换成字符串
        //获取html元素
        Document doc = Jsoup.parse(html);
        // 获取id=asideCategory的标签,这个标签下存放的是包含分类专栏的标签,缩小范围
        Element asideCategory = doc.getElementById("asideCategory");
        //  获取id=asideCategory的标签下的ul标签
        Elements ultag = asideCategory.getElementsByTag("ul");
        // 获取id=asideCategory的标签下的ul标签的a标签===>这个标签的href存放的就是分类专栏地址
        Elements as = ultag.get(0).getElementsByTag("a");
        //遍历a标签,获取a标签中的href属性值
        as.stream().forEach(a -> {
            String href = a.attr("href");
            if (!href.isEmpty()) {
                addAllArticle_By_CategoryURL(href);
            }
        });
        System.out.println("获取url地址共有:" + allArticleUrl.size());
        return allArticleUrl;
    }

    /**
     * 根据分类专栏的地址获取分类专栏下的所有文章
     *
     * @param url
     * @return
     */
    public boolean addAllArticle_By_CategoryURL(String url) {
        ResponseEntity<String> htmlString = new RestTemplate().getForEntity(url, String.class);
        String html = htmlString.toString();//将获取的网页转换成字符串
        // 获取html元素
        Document doc = Jsoup.parse(html);
        String title = doc.getElementById("column").getElementsByClass("column_title oneline").get(0).text();//获取专栏标题

        //1、 获取类名为column_article_list的html标签,这个标签内存放的就是文章的列表
        Elements column_article_list = doc.getElementsByAttributeValue("class", "column_article_list");
        //2、 获取专栏中所有文章的li元素
        Elements li_s = column_article_list.get(0).getElementsByTag("li");
        //3、并行流处理,遍历获取文章url
L
commit  
liulei06 已提交
68
        allArticleUrl.clear();
L
liulei06 已提交
69 70 71 72 73 74 75 76 77 78 79 80 81
        li_s.parallelStream().forEach((li) -> {
            String href = li.getElementsByTag("a").attr("href");
            if (!href.isEmpty()) {
                allArticleUrl.add(href);//存入成员变量中
            }
        });
        return true;
    }

    public void doRefresh(Long sleepSecondNum) throws InterruptedException {
        this.getAllArticleUrl();
        long i = 1;
        while (true){
L
commit  
liulei06 已提交
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
            try {
                System.out.println("第" + i + "遍访问开始");
                allArticleUrl.parallelStream().forEach(articleUrl -> {
                    ResponseEntity<String> forEntity = new RestTemplate().getForEntity(articleUrl, String.class);
                    String msg = forEntity.getStatusCode() == HttpStatus.OK ? "访问成功" :  "访问失败";
                    System.out.println(articleUrl + msg);
                });
                System.out.println("第" + i + "遍访问结束");
                TimeUnit.SECONDS.sleep(sleepSecondNum);
                if(i % 10 == 0){
                    this.getAllArticleUrl();
                }
                i++;
            }catch (Exception e){
                System.out.println(e.getMessage());
L
liulei06 已提交
97 98 99 100
            }
        }
    }
}