diff --git a/crawler/engine/engine.go b/crawler/engine/engine.go
new file mode 100644
index 0000000000000000000000000000000000000000..f469048e7c95f166bfe8951e89958c7358892c19
--- /dev/null
+++ b/crawler/engine/engine.go
@@ -0,0 +1,33 @@
+package engine
+
+import (
+	"learngo/crawler/fetcher"
+	"log"
+)
+
+// Run drains the request queue: each URL is fetched and its body
+// handed to the request's parser; new requests go back on the queue.
+func Run(seeds ...Request) {
+	var requests []Request // the request queue
+	// enqueue the seed requests
+	for _, r := range seeds {
+		requests = append(requests, r)
+	}
+	for len(requests) > 0 {
+		r := requests[0]        // take the first request
+		requests = requests[1:] // and dequeue it
+		log.Printf("fetching %s", r.Url)
+		body, err := fetcher.Fetch(r.Url)
+		if err != nil {
+			log.Printf("fetcher: error fetching url %s: %v", r.Url, err)
+			continue
+		}
+		// pass the body to the parser and collect the result
+		parserResult := r.ParserFunc(body)
+		requests = append(requests, parserResult.Requests...)
+		// print the parsed items (here: city names)
+		for _, item := range parserResult.Items {
+			log.Printf("got item %v", item)
+		}
+	}
+}
diff --git a/crawler/engine/types.go b/crawler/engine/types.go
new file mode 100644
index 0000000000000000000000000000000000000000..12a9ddc6b83c8b3e421eaf96bf3f8652f15963b7
--- /dev/null
+++ b/crawler/engine/types.go
@@ -0,0 +1,18 @@
+package engine
+
+// Request pairs a URL to fetch with the parser for its response body.
+type Request struct {
+	Url        string
+	ParserFunc func([]byte) ParserResult
+}
+
+// ParserResult holds the follow-up requests and the items a parser produced.
+type ParserResult struct {
+	Requests []Request
+	Items    []interface{}
+}
+
+// NilParser is a parser that produces nothing.
+func NilParser([]byte) ParserResult {
+	return ParserResult{}
+}
diff --git a/crawler/fetcher/fetcher.go b/crawler/fetcher/fetcher.go
new file mode 100644
index 0000000000000000000000000000000000000000..1e4c6989b21cdc01090f33fd19959e5610b6f360
--- /dev/null
+++ b/crawler/fetcher/fetcher.go
@@ -0,0 +1,46 @@
+package fetcher
+
+import (
+	"bufio"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"net/http"
+
+	"golang.org/x/net/html/charset"
+	"golang.org/x/text/encoding"
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
+)
+
+// Fetch downloads url and returns the body converted to UTF-8.
+func Fetch(url string) ([]byte, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
+	}
+	// buffer the body so the bytes peeked during encoding detection
+	// are not lost to the decoder below
+	bodyReader := bufio.NewReader(resp.Body)
+	// auto-detect the page encoding
+	e := determineEncoding(bodyReader)
+	// decode the content (e.g. GBK) into UTF-8
+	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
+	return ioutil.ReadAll(utf8Reader)
+}
+
+func determineEncoding(r *bufio.Reader) encoding.Encoding {
+	// Peek returns the leading bytes without consuming them
+	bytes, err := r.Peek(1024)
+	if err != nil {
+		log.Printf("fetcher error: %v", err)
+		return unicode.UTF8 // fall back to UTF-8
+	}
+	e, _, _ := charset.DetermineEncoding(bytes, "")
+	return e
+}
diff --git a/crawler/main.go b/crawler/main.go
index cdda725290d83135d4a9b81c32b97bc24ad28894..194f63e564fcb1b898e0c0f785da39c55bb32bbe 100644
--- a/crawler/main.go
+++ b/crawler/main.go
@@ -8,11 +8,21 @@ import (
 	"golang.org/x/text/transform"
 	"io"
 	"io/ioutil"
+	"learngo/crawler/engine"
+	"learngo/crawler/zhenai/paser"
 	"net/http"
+	"regexp"
 )
 
 func main() {
+	engine.Run(
+		engine.Request{
+			Url:        "https://www.zhenai.com/zhenghun",
+			ParserFunc: paser.ParseCityList,
+		})
+}
+func mains() {
 
 	resp, err := http.Get("https://www.zhenai.com/zhenghun")
 
 	if err != nil {
@@ -35,7 +45,9 @@
 
 		panic(err)
 	}
 
-	fmt.Printf("%s\n", all)
+	//fmt.Printf("%s\n", all)
+
+	printCityList(all)
 
 }
@@ -52,3 +64,12 @@
 	fmt.Printf(" name %s certain %s", name, certain)
 	return e
 }
+
+func printCityList(contents []byte) {
+	re := regexp.MustCompile(`<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)
+	matches := re.FindAllSubmatch(contents, -1)
+	fmt.Printf("found %d cities\n", len(matches))
+	for _, m := range matches {
+		fmt.Printf("City: %s, URL: %s\n", m[2], m[1])
+	}
+}
diff --git a/crawler/zhenai/paser/citylist.go b/crawler/zhenai/paser/citylist.go
new file mode 100644
index 0000000000000000000000000000000000000000..622f3155f78c21f2262ddf9ca191eef6ef8fc6bc
--- /dev/null
+++ b/crawler/zhenai/paser/citylist.go
@@ -0,0 +1,26 @@
+package paser
+
+import (
+	"learngo/crawler/engine"
+	"regexp"
+)
+
+// cityListRe matches one city anchor: group 1 is the URL, group 2 the name.
+const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`
+
+// ParseCityList is the parser for the city list page.
+func ParseCityList(contents []byte) engine.ParserResult {
+	re := regexp.MustCompile(cityListRe)
+	matches := re.FindAllSubmatch(contents, -1)
+	result := engine.ParserResult{}
+	for _, m := range matches {
+		// the city name becomes an item
+		result.Items = append(result.Items, string(m[2]))
+		// the city URL becomes a follow-up request
+		result.Requests = append(result.Requests, engine.Request{
+			Url:        string(m[1]),
+			ParserFunc: engine.NilParser,
+		})
+	}
+	return result
+}
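
Aside: a minimal sketch of how ParseCityList could be exercised in isolation, assuming the city-list regex reconstructed above; the file name (citylist_test.go) and the anchor fragment are hypothetical fixtures, not captured from the live page:

package paser

import "testing"

// TestParseCityList feeds ParseCityList one hypothetical city anchor and
// checks it yields one item (the name) and one follow-up request (the URL).
func TestParseCityList(t *testing.T) {
	contents := []byte(`<a href="http://www.zhenai.com/zhenghun/shanghai" class="city">上海</a>`)
	result := ParseCityList(contents)
	if len(result.Items) != 1 || result.Items[0] != "上海" {
		t.Errorf("items: got %v, want [上海]", result.Items)
	}
	if len(result.Requests) != 1 ||
		result.Requests[0].Url != "http://www.zhenai.com/zhenghun/shanghai" {
		t.Errorf("requests: got %+v", result.Requests)
	}
}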