// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *robot.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }
func (self *MyProcessor) Process(p *robot.Page) { if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } u, err := url.Parse(p.GetRequest().GetUrl()) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !strings.HasSuffix(u.Host, "jiexieyin.org") { return } var urls []string query := p.GetHtmlParser() query.Find("a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") reJavascript := regexp.MustCompile("^javascript\\:") reLocal := regexp.MustCompile("^\\#") reMailto := regexp.MustCompile("^mailto\\:") if reJavascript.MatchString(href) || reLocal.MatchString(href) || reMailto.MatchString(href) { return } //处理相对路径 var absHref string urlHref, err := url.Parse(href) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !urlHref.IsAbs() { urlPrefix := p.GetRequest().GetUrl() absHref = urlPrefix + href urls = append(urls, absHref) } else { urls = append(urls, href) } }) p.AddTargetRequests(initrequests(urls)) p.AddField("test1", p.GetRequest().GetUrl()) p.AddField("test2", p.GetRequest().GetUrl()) }