// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { query := p.GetHtmlParser() var urls []string query.Find("#threadlisttableid tbody").Each(func(i int, s *goquery.Selection) { if s.HasClass("emptb") { return } href, _ := s.Find("tbody tr .icn a").Attr("href") urls = append(urls, href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") title := query.Find("#thread_subject").Text() title = strings.Trim(title, "\t\n\r") author := query.Find("#postlist div .authi").Eq(0).Text() author = strings.Trim(author, "\t\r\n") if title == "" || author == "" { p.SetSkip(true) } p.AddField("title", title) p.AddField("author", author) }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) { name := s.Find("div.txt-box > h3").Text() href, _ := s.Attr("href") fmt.Printf("WeName:%v link:http://http://weixin.sogou.com%v \r\n", name, href) // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("href", href) }) next_page_href, _ := query.Find("#sogou_next").Attr("href") if next_page_href == "" { p.SetSkip(true) } else { p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+next_page_href, "html", "weixin.sogou.com.json") } }
func (this *PlantProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() if !this.isPlant(query, p) { p.SetSkip(true) } this.getName(query, p) this.getSummary(query, p) this.getCatalog(query, p) p.AddTargetRequests(this.getUrls(query), "html") }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if p.GetUrlTag() == "site_login" { //fmt.Printf("%v\n", p.GetCookies()) this.cookies = p.GetCookies() // AddTargetRequestWithParams Params: // 1. Url. // 2. Responce type is "html" or "json" or "jsonp" or "text". // 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline. // 4. The method is POST or GET. // 5. The postdata is body string sent to sever. // 6. The header is header for http request. // 7. Cookies // 8. Http redirect function if len(this.cookies) != 0 { p.AddField("info", "get cookies success") req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil, nil) p.AddTargetRequestWithParams(req) } else { p.AddField("info", "get cookies failed") } } else { //fmt.Printf("%v\n", p.GetBodyStr()) query := p.GetHtmlParser() pageTitle := query.Find(".page-content .page-title").Text() if len(pageTitle) != 0 { p.AddField("page_title", pageTitle) p.AddField("info", "login success") } else { p.AddField("info", "login failed") } } return if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }