示例#1
0
文件: weixin.go 项目: aosen/spiders
// Parse html dom here and record the parse result that we want to Page.
// Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html.
func (this *MyPageProcesser) Process(p *robot.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	query := p.GetHtmlParser()

	query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) {
		name := s.Find("div.txt-box > h3").Text()
		href, _ := s.Attr("href")

		fmt.Printf("WeName:%v link:http://http://weixin.sogou.com%v \r\n", name, href)
		// the entity we want to save by Pipeline
		p.AddField("name", name)
		p.AddField("href", href)
	})

	next_page_href, _ := query.Find("#sogou_next").Attr("href")
	if next_page_href == "" {
		p.SetSkip(true)
	} else {
		p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+next_page_href, "html", "weixin.sogou.com.json")
	}

}