func (this *HttpDownloader) downloadHtml(p *page.Page, req *request.Request) *page.Page { var err error p, destbody := this.downloadFile(p, req) //fmt.Printf("Destbody %v \r\n", destbody) if !p.IsSucc() { //fmt.Print("Page error \r\n") return p } bodyReader := bytes.NewReader([]byte(destbody)) var doc *goquery.Document if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil { mlog.LogInst().LogError(err.Error()) p.SetStatus(true, err.Error()) return p } var body string if body, err = doc.Html(); err != nil { mlog.LogInst().LogError(err.Error()) p.SetStatus(true, err.Error()) return p } p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "") return p }
// prepScrapeCinemaMovies prepares the actual URL for movie showtimes at a particular cinema, then // calls the actual scraping function. func prepScrapeCinemaMovies(url string, context interface{}, cinemas chan<- []*data.Cinema, movies chan<- []*data.Movie) { var doc *gq.Document var err error log.Println("Retrieving document for " + url) if doc, err = gq.NewDocument(url); err != nil { log.Fatal(err) } allText, err := doc.Html() startIdx := strings.Index(allText, "buyTickets2") if startIdx > -1 { locIdx := strings.Index(allText[startIdx:], "loc=") endLoc := strings.Index(allText[startIdx+locIdx:], "&") loc := allText[startIdx+locIdx+4 : startIdx+locIdx+endLoc] go scrapeCinemaMovies(BASE+"/buyTickets2.jsp?loc="+loc+"&date="+time.Now().Format("02-01-2006"), context, cinemas, movies) } else { log.Fatalf("No available source URL") } }
func (self *HttpDownloader) downloadHtml(p *context.Response, req *context.Request) *context.Response { var err error p, destbody := self.downloadFile(p, req) //fmt.Printf("Destbody %v \r\n", destbody) if !p.IsSucc() { //fmt.Print("Response error \r\n") return p } bodyReader := bytes.NewReader([]byte(destbody)) var doc *goquery.Document if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil { reporter.Log.Println(err.Error()) p.SetStatus(true, err.Error()) return p } var body string if body, err = doc.Html(); err != nil { reporter.Log.Println(err.Error()) p.SetStatus(true, err.Error()) return p } p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "") return p }
func (this *HttpDownloader) downloadHtml(req *request.Request) *page.Page { var err error var url string if url = req.GetUrl(); len(url) == 0 { return nil } var resp *http.Response if resp, err = http.Get(url); err != nil { mlog.LogInst().LogError(err.Error()) return nil } defer resp.Body.Close() var doc *goquery.Document if doc, err = goquery.NewDocumentFromReader(resp.Body); err != nil { mlog.LogInst().LogError(err.Error()) return nil } var body string if body, err = doc.Html(); err != nil { mlog.LogInst().LogError(err.Error()) return nil } // create Page var p *page.Page = page.NewPage(req). SetBodyStr(body). SetHtmlParser(doc) return p }
//获取店铺掌柜 func GetShopBoss(p *goquery.Document) string { name := p.Find(".tb-seller-name").Text() if name != "" { return strings.TrimSpace(name) } reg, _ := regexp.Compile(`"sellerNickName":"([\w%%]+)"`) a, _ := p.Html() res := reg.FindStringSubmatch(a) if len(res) >= 1 { b, _ := url.QueryUnescape(res[1]) return strings.TrimSpace(b) } return "" }
func GetSourceList(document *goquery.Document) string { key, _ := document.Find("a[play_key]").First().Attr("play_key") fmt.Println("detail.Length():%s", key) if key != "" { return "http://v.nhaccuatui.com/flash/xml?key=" + key } else { contentString, _ := document.Html() if strings.Contains(contentString, "key2=") { startIndex := strings.Index(contentString, "key2=") + 5 endIndex := startIndex + 32 keyString := contentString[startIndex:endIndex] return "http://v.nhaccuatui.com/flash/xml?key2=" + keyString } if strings.Contains(contentString, "key3=") { startIndex := strings.Index(contentString, "key3=") + 5 endIndex := startIndex + 32 keyString := contentString[startIndex:endIndex] return "http://v.nhaccuatui.com/flash/xml?key3=" + keyString } return "" } }
func dumpDoc(doc *goquery.Document) { html, _ := doc.Html() pt("%s\n", html) }