// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() var urls []string query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urls = append(urls, "http://github.com/"+href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") name := query.Find(".entry-title .author").Text() name = strings.Trim(name, " \t\n") repository := query.Find(".entry-title .js-current-repository").Text() repository = strings.Trim(repository, " \t\n") //readme, _ := query.Find("#readme").Html() if name == "" { p.SetSkip(true) } // the entity we want to save by Pipeline p.AddField("author", name) p.AddField("project", repository) //p.AddField("readme", readme) }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) { name := s.Find("div.txt-box > h3").Text() href, _ := s.Attr("href") fmt.Printf("WeName:%v link:http://http://weixin.sogou.com%v \r\n", name, href) // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("href", href) }) next_page_href, _ := query.Find("#sogou_next").Attr("href") if next_page_href == "" { p.SetSkip(true) } else { p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+next_page_href, "html", "weixin.sogou.com.json") } }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { query := p.GetHtmlParser() var urls []string query.Find("#threadlisttableid tbody").Each(func(i int, s *goquery.Selection) { if s.HasClass("emptb") { return } href, _ := s.Find("tbody tr .icn a").Attr("href") urls = append(urls, href) }) // these urls will be saved and crawed by other coroutines. p.AddTargetRequests(urls, "html") title := query.Find("#thread_subject").Text() title = strings.Trim(title, "\t\n\r") author := query.Find("#postlist div .authi").Eq(0).Text() author = strings.Trim(author, "\t\r\n") if title == "" || author == "" { p.SetSkip(true) } p.AddField("title", title) p.AddField("author", author) }
// Parse html dom here and record the parse result that we want to crawl. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { query := p.GetHtmlParser() name := query.Find(".lemmaTitleH1").Text() name = strings.Trim(name, " \t\n") summary := query.Find(".card-summary-content .para").Text() summary = strings.Trim(summary, " \t\n") // the entity we want to save by Pipeline p.AddField("name", name) p.AddField("summary", summary) }
func (this *PlantProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() if !this.isPlant(query, p) { p.SetSkip(true) } this.getName(query, p) this.getSummary(query, p) this.getCatalog(query, p) p.AddTargetRequests(this.getUrls(query), "html") }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } query := p.GetHtmlParser() currentUrl := p.GetRequest().GetUrl() var urls []string query.Find("a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") urlHref, err := url.Parse(href) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !urlHref.IsAbs() { href = currentUrl + href } // Temporarily check in crawler.go, it will be implemented in pattern package. if checkMatchPattern(base, href) { visited, _ := rep.CheckIfVisited(href) if !visited { rep.VisitedNewNode(href) // urls = append(urls, href) urlstr.UploadURL(href) } } }) // store content to db fmt.Printf("====store & commit : %s====\n\n\n", currentUrl) content, _ := query.Html() // content := "" storage.StoreInsert(collection, storage.StoreFormat{currentUrl, content}) urlstr.CommitURL(currentUrl) releaseSlot <- 1 url := GetOneURL() if url != "" { urls = append(urls, url) } p.AddTargetRequests(urls, "html") }
func (this *MyProcessor) Process(p *page.Page) { if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } u, err := url.Parse(p.GetRequest().GetUrl()) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !strings.HasSuffix(u.Host, "jiexieyin.org") { return } var urls []string query := p.GetHtmlParser() query.Find("a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") reJavascript := regexp.MustCompile("^javascript\\:") reLocal := regexp.MustCompile("^\\#") reMailto := regexp.MustCompile("^mailto\\:") if reJavascript.MatchString(href) || reLocal.MatchString(href) || reMailto.MatchString(href) { return } //处理相对路径 var absHref string urlHref, err := url.Parse(href) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !urlHref.IsAbs() { urlPrefix := p.GetRequest().GetUrl() absHref = urlPrefix + href urls = append(urls, absHref) } else { urls = append(urls, href) } }) p.AddTargetRequests(urls, "html") }
func TestDownloadHtml(t *testing.T) { //return //request := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1") var req *request.Request req = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil, nil) var dl downloader.Downloader dl = downloader.NewHttpDownloader() var p *page.Page p = dl.Download(req) var doc *goquery.Document doc = p.GetHtmlParser() //fmt.Println(doc) //body := p.GetBodyStr() //fmt.Println(body) var s *goquery.Selection s = doc.Find("body") if s.Length() < 1 { t.Error("html parse failed!") } /* doc, err := goquery.NewDocument("http://live.sina.com.cn/zt/l/v/finance/globalnews1/") if err != nil { fmt.Printf("%v",err) } s := doc.Find("meta"); fmt.Println(s.Length()) resp, err := http.Get("http://live.sina.com.cn/zt/l/v/finance/globalnews1/") if err != nil { fmt.Printf("%v",err) } defer resp.Body.Close() doc, err = goquery.NewDocumentFromReader(resp.Body) s = doc.Find("meta"); fmt.Println(s.Length()) */ }
/*
** Parse the page: store follower information into DynamoDB, and push the
** next URLs to crawl into SQS. (translated from Chinese)
*/
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		glog.Errorln(p.Errormsg())
		return
	}
	/*
	** Log the crawled page. (translated from Chinese)
	*/
	glog.Infoln(p)
	query := p.GetHtmlParser()
	// NOTE(review): i, Urls, UrlsLevel and header_num look like mutable
	// package-level state shared across calls — confirm this processer is
	// never invoked concurrently, otherwise these reads/writes race.
	if Urls[i] == "weibo.cn" {
		i = i + 1
	}
	// UrlsLevel 0: a follower-list page — paginate and collect friend URLs.
	// UrlsLevel 1: a profile page — extract the friend's info.
	if UrlsLevel[i] == 0 {
		glog.Infoln("layer:", crawlUrl.Layer)
		this.w.GetNextPageUrl(query, p)
		this.w.GetFriendsUrl(query, p)
	} else if UrlsLevel[i] == 1 {
		this.w.GetFriendsInfo(query)
	}
	// if crawlUrl.Layer == 0 {
	// } else if crawlUrl.Layer == 1 {
	// 	glog.Infoln("layer:", crawlUrl.Layer)
	// 	this.w.GetNextPageUrl(query, p)
	// 	this.w.GetFFUrl(query)
	// } else if crawlUrl.Layer == 2 {
	// 	glog.Infoln("layer:", crawlUrl.Layer)
	// 	this.w.GetFFInfo(query)
	// }
	// header_num := rand.Intn(9)
	header_json := headerJson[header_num]
	// NOTE(review): i is incremented before indexing Urls — confirm Urls
	// always has a next element here, otherwise this panics at the end of
	// the list.
	i = i + 1
	p.AddTargetRequestWithHeaderFile(Urls[i], "html", header_json)
}
func (this MyPageProcesser) Process(p *page.Page) { query := p.GetHtmlParser() if p.GetUrlTag() == "index" { query.Find(`div[class="main area"] div[class="lc"] ul li a`).Each(func(i int, s *goquery.Selection) { url, isExsit := s.Attr("href") if isExsit { reg := regexp.MustCompile(`^do not know what is this`) var fmtStr string if rxYule.MatchString(url) { reg = rxYule fmtStr = wkSohuYule } if rxPic.MatchString(url) { reg = rxPic fmtStr = wkSohuPic } regxpArrag := reg.FindStringSubmatch(url) if len(regxpArrag) == 2 { addRequest(p, "changyan", fmt.Sprintf(fmtStr, regxpArrag[1]), "", s.Text()) } } }) } if p.GetUrlTag() == "changyan" { jsonMap := ChangyanJson{} err := json.NewDecoder(strings.NewReader(p.GetBodyStr())).Decode(&jsonMap) if err == nil { content, ok := p.GetRequest().GetMeta().(string) if ok { fmt.Println("Title:", content, " CommentCount:", jsonMap.ListData.OuterCmtSum, " ParticipationCount:", jsonMap.ListData.ParticipationSum) } } } }
func (this SitePageProcesser) Process(p *page.Page) { fmt.Println("Site Page Processer") if p.GetUrlTag() == "index" { query := p.GetHtmlParser() query.Find("ul[class='audioList fontYaHei'] li a").Each(func(i int, s *goquery.Selection) { strTitle, _ := s.Attr("title") strUrl, _ := s.Attr("data-url") if !IsFileExist(strTitle) { strFileName := fmt.Sprintf("%s.mp3", strTitle) fmt.Println(strFileName) cmd := exec.Command("/usr/local/bin/wget", strUrl, "-O", strFileName) err := cmd.Run() if err != nil { fmt.Println(err) } d, _ := cmd.Output() fmt.Println(string(d)) } }) } }
// Parse html dom here and record the parse result that we want to Page. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } var fetch_content string query := p.GetHtmlParser() content := p.GetBodyStr() reg := regexp.MustCompile(`class="([0-9a-zA-Z_-]*content[0-9a-zA-Z_-]*)"`) reg_res := reg.FindAllStringSubmatch(content, -1) class_content := make([]string, 0) for _, class := range reg_res { submatch := class[1] class_content = append(class_content, submatch) } removeDuplicate(&class_content) for _, class := range class_content { query.Find("." + class).Each(func(i int, s *goquery.Selection) { text := strings.Trim(s.Text(), " \t\n") text = strings.Replace(text, " ", "", -1) text = strings.Replace(text, "\n", "", -1) text = strings.Replace(text, "\t", "", -1) if text != "" { fetch_content = fetch_content + text } }) } if fetch_content != "" { p.AddField("content", fetch_content) } }
// Process handles a two-step login flow. On the "site_login" response it
// captures the session cookies and, if any were set, replays them in a GET
// to the admin index page; on any other tag it checks for the page title
// to decide whether the login succeeded.
// Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html.
func (this *MyPageProcesser) Process(p *page.Page) {
	if p.GetUrlTag() == "site_login" {
		//fmt.Printf("%v\n", p.GetCookies())
		this.cookies = p.GetCookies()
		// AddTargetRequestWithParams Params:
		// 1. Url.
		// 2. Responce type is "html" or "json" or "jsonp" or "text".
		// 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline.
		// 4. The method is POST or GET.
		// 5. The postdata is body string sent to sever.
		// 6. The header is header for http request.
		// 7. Cookies
		// 8. Http redirect function
		if len(this.cookies) != 0 {
			p.AddField("info", "get cookies success")
			req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil, nil)
			p.AddTargetRequestWithParams(req)
		} else {
			p.AddField("info", "get cookies failed")
		}
	} else {
		//fmt.Printf("%v\n", p.GetBodyStr())
		query := p.GetHtmlParser()
		pageTitle := query.Find(".page-content .page-title").Text()
		if len(pageTitle) != 0 {
			p.AddField("page_title", pageTitle)
			p.AddField("info", "login success")
		} else {
			p.AddField("info", "login failed")
		}
	}
	return
	// NOTE(review): everything below this unconditional return is
	// unreachable dead code (a leftover copy of the GitHub-search
	// processer). It should be deleted — but note that removing it may
	// orphan this file's strings/goquery imports; verify before deleting.
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}
	query := p.GetHtmlParser()
	var urls []string
	query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		urls = append(urls, "http://github.com/"+href)
	})
	// these urls will be saved and crawed by other coroutines.
	p.AddTargetRequests(urls, "html")
	name := query.Find(".entry-title .author").Text()
	name = strings.Trim(name, " \t\n")
	repository := query.Find(".entry-title .js-current-repository").Text()
	repository = strings.Trim(repository, " \t\n")
	//readme, _ := query.Find("#readme").Html()
	if name == "" {
		p.SetSkip(true)
	}
	// the entity we want to save by Pipeline
	p.AddField("author", name)
	p.AddField("project", repository)
	//p.AddField("readme", readme)
}