// downloadHtml fetches the content for req through downloadFile and, if that
// step left p in a successful state, parses the content as HTML with goquery.
//
// On success the re-rendered HTML string and the parsed document are stored
// on p and its status is set with SetStatus(false, ""). If parsing or
// rendering fails, the error is logged and recorded on p with
// SetStatus(true, <error text>). The (possibly updated) page is always
// returned.
func (this *HttpDownloader) downloadHtml(p *page.Page, req *request.Request) *page.Page {
	p, raw := this.downloadFile(p, req)
	if !p.IsSucc() {
		// The download itself failed; hand the page back unchanged.
		return p
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(raw)))
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	html, err := doc.Html()
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	p.SetBodyStr(html).SetHtmlParser(doc).SetStatus(false, "")
	return p
}
Example #2
0
// prepScrapeCinemaMovies prepares the actual URL for movie showtimes at a
// particular cinema, then calls the actual scraping function.
//
// It downloads the document at url, looks in its HTML for the first
// "buyTickets2" reference, takes the value of the "loc" query parameter that
// follows it, and starts scrapeCinemaMovies in a new goroutine for that
// location and today's date (formatted as DD-MM-YYYY). context, cinemas and
// movies are passed through to scrapeCinemaMovies unchanged.
//
// Every failure (download, HTML rendering, or a page that does not contain a
// usable "loc" value) terminates the program through log.Fatal/log.Fatalf.
func prepScrapeCinemaMovies(url string, context interface{}, cinemas chan<- []*data.Cinema, movies chan<- []*data.Movie) {
	log.Println("Retrieving document for " + url)
	doc, err := gq.NewDocument(url)
	if err != nil {
		log.Fatal(err)
	}

	allText, err := doc.Html()
	if err != nil {
		log.Fatal(err)
	}

	loc, ok := buyTicketsLoc(allText)
	if !ok {
		log.Fatalf("No available source URL")
	}

	go scrapeCinemaMovies(BASE+"/buyTickets2.jsp?loc="+loc+"&date="+time.Now().Format("02-01-2006"), context, cinemas, movies)
}

// buyTicketsLoc extracts the text between the first "loc=" that follows the
// first "buyTickets2" in html and the next "&". It reports false when any of
// the three markers is missing, instead of slicing with a -1 index.
func buyTicketsLoc(html string) (string, bool) {
	const (
		marker = "buyTickets2"
		param  = "loc="
	)

	start := strings.Index(html, marker)
	if start < 0 {
		return "", false
	}
	rest := html[start:]

	paramIdx := strings.Index(rest, param)
	if paramIdx < 0 {
		return "", false
	}
	rest = rest[paramIdx+len(param):]

	end := strings.Index(rest, "&")
	if end < 0 {
		return "", false
	}
	return rest[:end], true
}
Example #3
0
// downloadHtml fetches the content for req through downloadFile and, if that
// step left p in a successful state, parses the content as HTML with goquery.
//
// On success the re-rendered HTML string and the parsed document are stored
// on p and its status is set with SetStatus(false, ""). If parsing or
// rendering fails, the error is written to reporter.Log and recorded on p
// with SetStatus(true, <error text>). The (possibly updated) response is
// always returned.
func (self *HttpDownloader) downloadHtml(p *context.Response, req *context.Request) *context.Response {
	p, raw := self.downloadFile(p, req)
	if !p.IsSucc() {
		// The download itself failed; hand the response back unchanged.
		return p
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(raw)))
	if err != nil {
		reporter.Log.Println(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	html, err := doc.Html()
	if err != nil {
		reporter.Log.Println(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	p.SetBodyStr(html).SetHtmlParser(doc).SetStatus(false, "")
	return p
}
Example #4
0
// downloadHtml performs an HTTP GET on the URL carried by req, parses the
// response body as HTML with goquery, and returns a new page built from req
// that holds both the re-rendered HTML string and the parsed document.
//
// It returns nil when the request URL is empty or when the GET, the parse or
// the rendering fails; in the last three cases the error is logged first.
func (this *HttpDownloader) downloadHtml(req *request.Request) *page.Page {
	target := req.GetUrl()
	if target == "" {
		return nil
	}

	resp, err := http.Get(target)
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		return nil
	}
	defer resp.Body.Close()

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		return nil
	}

	html, err := doc.Html()
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		return nil
	}

	// Build the page from the request and attach both representations.
	return page.NewPage(req).SetBodyStr(html).SetHtmlParser(doc)
}
Example #5
0
File: tao.go Project: qgweb/new
// sellerNickNameRe captures the value of a "sellerNickName" entry embedded in
// the page source. It is compiled once at package initialization instead of on
// every GetShopBoss call.
var sellerNickNameRe = regexp.MustCompile(`"sellerNickName":"([\w%%]+)"`)

// GetShopBoss returns the shop owner's (seller's) name found in p.
//
// It first reads the text of the ".tb-seller-name" element. If that is empty,
// it falls back to the "sellerNickName" value embedded in the page HTML and
// query-unescapes it. The result is trimmed of surrounding whitespace. An
// empty string is returned when neither source yields a name, or when the
// HTML cannot be rendered or the value cannot be unescaped.
func GetShopBoss(p *goquery.Document) string {
	if name := p.Find(".tb-seller-name").Text(); name != "" {
		return strings.TrimSpace(name)
	}

	html, err := p.Html()
	if err != nil {
		return ""
	}

	// A successful match always has two entries: the full match and the
	// captured nickname.
	match := sellerNickNameRe.FindStringSubmatch(html)
	if len(match) < 2 {
		return ""
	}

	nick, err := url.QueryUnescape(match[1])
	if err != nil {
		return ""
	}
	return strings.TrimSpace(nick)
}
Example #6
0
// GetSourceList builds the flash XML URL for the media described by document.
//
// The key is taken from the "play_key" attribute of the first "a[play_key]"
// element. When that attribute is absent or empty, the raw HTML is searched
// for "key2=" and then "key3=", and the 32 bytes following the first marker
// that has a complete key are used. An empty string is returned when no key
// can be found.
func GetSourceList(document *goquery.Document) string {
	const (
		baseURL = "http://v.nhaccuatui.com/flash/xml?"
		keyLen  = 32
	)

	// A missing attribute yields an empty key, which is handled below.
	key, _ := document.Find("a[play_key]").First().Attr("play_key")
	fmt.Printf("detail.Length():%s\n", key)
	if key != "" {
		return baseURL + "key=" + key
	}

	contentString, err := document.Html()
	if err != nil {
		return ""
	}

	for _, marker := range []string{"key2=", "key3="} {
		idx := strings.Index(contentString, marker)
		if idx < 0 {
			continue
		}
		start := idx + len(marker)
		end := start + keyLen
		if end > len(contentString) {
			// Fewer than keyLen bytes follow the marker; slicing would panic.
			continue
		}
		return baseURL + marker + contentString[start:end]
	}
	return ""
}
Example #7
0
// dumpDoc renders doc back to an HTML string and passes it to pt with a
// "%s\n" format. A rendering error is ignored, in which case whatever string
// Html returned is passed on.
func dumpDoc(doc *goquery.Document) {
	markup, _ := doc.Html()
	pt("%s\n", markup)
}