提交 7e32bbb8 编写于 作者: _Fighter's avatar _Fighter

8月8日

上级 4999f12d
package engine
import (
"learngo/crawler/fetcher"
"log"
)
// Run drives a sequential crawl starting from seeds.
//
// Requests are processed first-in, first-out: each URL is fetched, the body
// is handed to the request's ParserFunc, the requests the parser returns are
// appended to the queue, and every parsed item is logged. A URL that fails
// to fetch is logged and skipped. Run returns once the queue is empty.
func Run(seeds ...Request) {
	// Pending-request queue, initialised with a copy of the seeds so that
	// later appends never write into the caller's slice.
	requests := append([]Request(nil), seeds...)

	for len(requests) > 0 {
		// Pop the request at the head of the queue.
		r := requests[0]
		requests = requests[1:]

		log.Printf(" parserUrl :%s", r.Url)
		body, err := fetcher.Fetch(r.Url)
		if err != nil {
			log.Printf("Fetcher :error fetcher url %s : %v", r.Url, err)
			continue
		}

		// Hand the body to the parser and enqueue whatever it discovered.
		parserResult := r.ParserFunc(body)
		requests = append(requests, parserResult.Requests...)

		// Log every item the parser extracted.
		for _, item := range parserResult.Items {
			log.Printf(" got item %v", item)
		}
	}
}
package engine
// Request is one unit of crawl work: a URL together with the function that
// parses the body fetched from that URL.
type Request struct {
// Url is the address handed to the fetcher.
Url string
// ParserFunc receives the fetched body and returns the parse result.
ParserFunc func([]byte) ParserResult
}
// ParserResult is the value a parser function returns for one fetched body.
type ParserResult struct {
// Requests holds follow-up requests produced by the parser.
Requests []Request
// Items holds the values the parser extracted; they may be of any type.
Items []interface{}
}
// NilParser is a placeholder parser: it ignores its input and returns the
// zero ParserResult, i.e. no follow-up requests and no items.
func NilParser([]byte) ParserResult {
	var none ParserResult
	return none
}
package fetcher
import (
"bufio"
"fmt"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"io"
"io/ioutil"
"log"
"net/http"
)
// Fetch downloads url and returns the response body transcoded to UTF-8.
//
// It returns an error if the request fails, if the status code is not
// 200 OK, or if reading/decoding the body fails. The page encoding is
// detected from the first bytes of the body by determineEncoding.
func Fetch(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// The error is returned to the caller, which decides how to report
		// it; it is not also printed here.
		return nil, fmt.Errorf(" wrong status code: %d", resp.StatusCode)
	}

	// Encoding detection peeks at the body through a buffered reader. The
	// same buffered reader must then be used to read the body: bytes it has
	// already pulled from resp.Body would otherwise be lost and the page
	// would be missing its beginning. bufio.NewReader returns its argument
	// unchanged when that argument is already a large-enough *bufio.Reader,
	// so the peek inside determineEncoding goes through bodyReader itself.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)

	// Decode from the detected encoding into UTF-8 while reading.
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}
// determineEncoding sniffs the character encoding of the content in r from
// up to its first 1024 bytes.
//
// The bytes are peeked through a bufio.Reader. If r is already a
// *bufio.Reader with a large enough buffer, bufio.NewReader reuses it, so the
// caller can continue reading from r without losing the peeked bytes;
// otherwise the peeked data is buffered in a reader private to this function.
//
// It returns unicode.UTF8 when nothing can be read.
func determineEncoding(r io.Reader) encoding.Encoding {
	peeked, err := bufio.NewReader(r).Peek(1024)
	// Peek reports io.EOF together with the available bytes when the content
	// is shorter than 1024 bytes; that is not a failure, the short prefix is
	// still usable for detection.
	if err != nil && err != io.EOF {
		log.Printf("Fetcher error : %v", err)
		return unicode.UTF8 // fall back to UTF-8
	}
	if len(peeked) == 0 {
		// Empty content: nothing to sniff, keep the UTF-8 default.
		return unicode.UTF8
	}

	// Only the encoding is needed; its name and certainty flag are ignored.
	e, _, _ := charset.DetermineEncoding(peeked, "")
	return e
}
......@@ -8,11 +8,21 @@ import (
"golang.org/x/text/transform"
"io"
"io/ioutil"
"learngo/crawler/engine"
"learngo/crawler/zhenai/paser"
"net/http"
"regexp"
)
// main starts the crawler engine with a single seed: the zhenai.com
// city-list page, parsed by paser.ParseCityList.
func main() {
	seed := engine.Request{
		Url:        "https://www.zhenai.com/zhenghun",
		ParserFunc: paser.ParseCityList,
	}
	engine.Run(seed)
}
func mains() {
resp, err := http.Get("https://www.zhenai.com/zhenghun")
if err != nil {
......@@ -35,7 +45,9 @@ func main() {
panic(err)
}
fmt.Printf("%s\n", all)
//fmt.Printf("%s\n", all)
printCityList(all)
}
......@@ -52,3 +64,13 @@ func determineEncoding(r io.Reader) encoding.Encoding {
fmt.Printf(" name %s certain %s", name, certain)
return e
}
// printCityList scans contents for city links and prints the number of links
// found followed by one "City: <name> ,URL: <url>" line per link.
func printCityList(contents []byte) {
	re := regexp.MustCompile(`<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)
	// Each match holds: [0] the whole anchor, [1] the URL, [2] the city name.
	matches := re.FindAllSubmatch(contents, -1)

	// len returns an int, so the verb must be %d (%s printed "%!s(int=N)").
	fmt.Printf("查找%d 城市", len(matches))
	for _, m := range matches {
		fmt.Printf("City: %s ,URL: %s\n", m[2], m[1])
	}
}
package paser
import (
"learngo/crawler/engine"
"regexp"
)
// cityListRe matches one city link on the city-list page. Submatch 1 is the
// city page URL and submatch 2 is the link text (the city name).
const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`

// cityListPattern is cityListRe compiled once at package initialisation, so
// ParseCityList does not recompile the expression on every call.
var cityListPattern = regexp.MustCompile(cityListRe)

// ParseCityList is the city-list parser. For every city link found in
// contents it adds the city name to the result's Items and a Request for the
// city's URL, with engine.NilParser as its parser, to the result's Requests.
func ParseCityList(contents []byte) engine.ParserResult {
	// Each match holds: [0] the whole anchor, [1] the URL, [2] the city name.
	matches := cityListPattern.FindAllSubmatch(contents, -1)

	result := engine.ParserResult{}
	for _, m := range matches {
		// The city name is reported as an item.
		result.Items = append(result.Items, string(m[2]))
		// The city page is queued as a follow-up request.
		result.Requests = append(result.Requests, engine.Request{
			Url:        string(m[1]),
			ParserFunc: engine.NilParser,
		})
	}
	return result
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册