func TestCharSetChange(t *testing.T) { var req *request.Request //req = request.NewRequest("http://stock.finance.sina.com.cn/usstock/api/jsonp.php/t/US_CategoryService.getList?page=1&num=60", "jsonp") req = request.NewRequest("http://soft.chinabyte.com/416/13164916.shtml", "html", "", "GET", "", nil, nil, nil, nil) var dl downloader.Downloader dl = downloader.NewHttpDownloader() var p *page.Page p = dl.Download(req) //hp := p.GetHtmlParser() //fmt.Printf("%v", jsonMap) //fmt.Println(doc) p.GetBodyStr() body := p.GetBodyStr() fmt.Println(body) }
// Parse html dom here and record the parse result that we want to crawl. // Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html. func (this *MyPageProcesser) Process(p *page.Page) { if !p.IsSucc() { println(p.Errormsg()) return } html := p.GetBodyStr() newUrls := urlutil.GetAllUrlIn(p.GetRequest().GetUrl(), html) for _, newUrl := range newUrls { newUrl = strings.Replace(newUrl, "//weibo.com/", "//tw.weibo.com/", -1) p.AddTargetRequest(newUrl, "html") } mailAddrList := mailaddrutil.GetAllMailAddrIn(html) for _, mailAddr := range mailAddrList { if _, ok := this.mailAddrMap[mailAddr]; !ok { this.mailAddrMap[mailAddr] = true this.mailLogger.WriteString(mailAddr + "\n") this.MailHandle.Push(mailAddr) } } }
func (this MyPageProcesser) Process(p *page.Page) { query := p.GetHtmlParser() if p.GetUrlTag() == "index" { query.Find(`div[class="main area"] div[class="lc"] ul li a`).Each(func(i int, s *goquery.Selection) { url, isExsit := s.Attr("href") if isExsit { reg := regexp.MustCompile(`^do not know what is this`) var fmtStr string if rxYule.MatchString(url) { reg = rxYule fmtStr = wkSohuYule } if rxPic.MatchString(url) { reg = rxPic fmtStr = wkSohuPic } regxpArrag := reg.FindStringSubmatch(url) if len(regxpArrag) == 2 { addRequest(p, "changyan", fmt.Sprintf(fmtStr, regxpArrag[1]), "", s.Text()) } } }) } if p.GetUrlTag() == "changyan" { jsonMap := ChangyanJson{} err := json.NewDecoder(strings.NewReader(p.GetBodyStr())).Decode(&jsonMap) if err == nil { content, ok := p.GetRequest().GetMeta().(string) if ok { fmt.Println("Title:", content, " CommentCount:", jsonMap.ListData.OuterCmtSum, " ParticipationCount:", jsonMap.ListData.ParticipationSum) } } } }