// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) { name := s.Find("div.txt-box > h3").Text() href, _ := s.Attr("href") fmt.Printf("WeName:%v link:http://http://weixin.sogou.com%v \r\n", name, href) // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("href", href) }) next_page_href, _ := query.Find("#sogou_next").Attr("href") if next_page_href == "" { p.SetSkip(true) } else { p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+next_page_href, "html", "weixin.sogou.com.json") } }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }
func (this *MyProcessor) Process(p *page.Page) { if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } u, err := url.Parse(p.GetRequest().GetUrl()) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !strings.HasSuffix(u.Host, "jiexieyin.org") { return } var urls []string query := p.GetHtmlParser() query.Find("a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") reJavascript := regexp.MustCompile("^javascript\\:") reLocal := regexp.MustCompile("^\\#") reMailto := regexp.MustCompile("^mailto\\:") if reJavascript.MatchString(href) || reLocal.MatchString(href) || reMailto.MatchString(href) { return } //处理相对路径 var absHref string urlHref, err := url.Parse(href) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !urlHref.IsAbs() { urlPrefix := p.GetRequest().GetUrl() absHref = urlPrefix + href urls = append(urls, absHref) } else { urls = append(urls, href) } }) p.AddTargetRequests(urls, "html") }
// Parse html dom here and record the parse result that we want to crawl. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() name := query.Find(".lemmaTitleH1").Text() name = strings.Trim(name, " \t\n") summary := query.Find(".card-summary-content .para").Text() summary = strings.Trim(summary, " \t\n") // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("summary", summary) }
func TestDownloadHtml(t *testing.T) { //return //request := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1") var req *request.Request req = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil, nil) var dl downloader.Downloader dl = downloader.NewHttpDownloader() var p *page.Page p = dl.Download(req) var doc *goquery.Document doc = p.GetHtmlParser() //fmt.Println(doc) //body := p.GetBodyStr() //fmt.Println(body) var s *goquery.Selection s = doc.Find("body") if s.Length() < 1 { t.Error("html parse failed!") } /* doc, err := goquery.NewDocument("http://live.sina.com.cn/zt/l/v/finance/globalnews1/") if err != nil { fmt.Printf("%v",err) } s := doc.Find("meta"); fmt.Println(s.Length()) resp, err := http.Get("http://live.sina.com.cn/zt/l/v/finance/globalnews1/") if err != nil { fmt.Printf("%v",err) } defer resp.Body.Close() doc, err = goquery.NewDocumentFromReader(resp.Body) s = doc.Find("meta"); fmt.Println(s.Length()) */ }
func (this MyPageProcesser) Process(p *page.Page) { query := p.GetHtmlParser() if p.GetUrlTag() == "index" { query.Find(`div[class="main area"] div[class="lc"] ul li a`).Each(func(i int, s *goquery.Selection) { url, isExsit := s.Attr("href") if isExsit { reg := regexp.MustCompile(`^do not know what is this`) var fmtStr string if rxYule.MatchString(url) { reg = rxYule fmtStr = wkSohuYule } if rxPic.MatchString(url) { reg = rxPic fmtStr = wkSohuPic } regxpArrag := reg.FindStringSubmatch(url) if len(regxpArrag) == 2 { addRequest(p, "changyan", fmt.Sprintf(fmtStr, regxpArrag[1]), "", s.Text()) } } }) } if p.GetUrlTag() == "changyan" { jsonMap := ChangyanJson{} err := json.NewDecoder(strings.NewReader(p.GetBodyStr())).Decode(&jsonMap) if err == nil { content, ok := p.GetRequest().GetMeta().(string) if ok { fmt.Println("Title:", content, " CommentCount:", jsonMap.ListData.OuterCmtSum, " ParticipationCount:", jsonMap.ListData.ParticipationSum) } } } }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if p.GetUrlTag() == "site_login" { //fmt.Printf("%v\n", p.GetCookies()) this.cookies = p.GetCookies() // AddTargetRequestWithParams Params: // 1. Url. // 2. Responce type is "html" or "json" or "jsonp" or "text". // 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline. // 4. The method is POST or GET. // 5. The postdata is body string sent to sever. // 6. The header is header for http request. // 7. Cookies // 8. Http redirect function if len(this.cookies) != 0 { p.AddField("info", "get cookies success") req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil, nil) p.AddTargetRequestWithParams(req) } else { p.AddField("info", "get cookies failed") } } else { //fmt.Printf("%v\n", p.GetBodyStr()) query := p.GetHtmlParser() pageTitle := query.Find(".page-content .page-title").Text() if len(pageTitle) != 0 { p.AddField("page_title", pageTitle) p.AddField("info", "login success") } else { p.AddField("info", "login failed") } } return if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }