示例#1
0
// Process extracts follow-up links and the configured fields from a
// downloaded HTML page.
//
// If the page was not downloaded successfully, its error message is printed
// and nothing else is done. Otherwise:
//
//   - Every element matching this.page["rule"] contributes one target URL,
//     built as this.page["pre"] plus either the element text (when
//     this.page["fun"] is "text") or the attribute named by this.page["fun"].
//     Elements lacking that attribute are skipped so that the bare prefix is
//     never queued as a request.
//   - For every key in this.rule the selector is evaluated. When
//     this.num[key] is "ALL", the values of all matches are joined with "|";
//     otherwise the single extracted value is trimmed of spaces, tabs and
//     newlines. An empty value marks the page as skipped; the field is added
//     to the page either way.
//
// Finally all collected page items are printed.
func (this *PageProcesserHtml) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	// extract returns the text of s when fun is "text", otherwise the value
	// of the attribute named fun. ok is false only when that attribute is
	// absent.
	extract := func(s *goquery.Selection, fun string) (val string, ok bool) {
		if fun == "text" {
			return s.Text(), true
		}
		return s.Attr(fun)
	}

	query := p.GetHtmlParser()

	var urls []string
	query.Find(this.page["rule"]).Each(func(_ int, s *goquery.Selection) {
		href, ok := extract(s, this.page["fun"])
		if !ok {
			// No such attribute on this element: there is no link to follow.
			return
		}
		urls = append(urls, this.page["pre"]+href)
	})
	p.AddMyTargetRequests(urls, this.conf["texttype"], "", this.conf["resqType"], this.conf["postdata"], this.conf["proxy"], this.conf["heardefile"], this.conf["cookie"])

	for k, v := range this.rule {
		var value string
		if this.num[k] == "ALL" {
			var items []string
			query.Find(v).Each(func(_ int, s *goquery.Selection) {
				// A missing attribute deliberately yields "" so that every
				// matched element keeps its slot in the joined output.
				item, _ := extract(s, this.fun[k])
				items = append(items, item)
			})
			value = strings.Join(items, "|")
		} else {
			// A missing attribute yields "", which is handled by the
			// empty-value check below.
			value, _ = extract(query.Find(v), this.fun[k])
			value = strings.Trim(value, " \t\n")
		}

		if value == "" {
			p.SetSkip(true)
		}
		p.AddField(k, value)
	}

	for k, v := range p.GetPageItems().GetAll() {
		println(k, v)
	}
}
// TestDownloadHtml downloads a live HTML page through the HTTP downloader and
// checks that the parsed document contains a <body> element.
//
// The test needs network access. A failed download or a missing parsed
// document aborts the test with a descriptive failure instead of
// dereferencing a nil document.
func TestDownloadHtml(t *testing.T) {
	//return
	//request := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1")
	req := request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil, nil)

	dl := downloader.NewHttpDownloader()

	p := dl.Download(req)
	if !p.IsSucc() {
		t.Fatalf("download failed: %s", p.Errormsg())
	}

	doc := p.GetHtmlParser()
	if doc == nil {
		t.Fatal("html parser is nil after a successful download")
	}
	//fmt.Println(doc)
	//body := p.GetBodyStr()
	//fmt.Println(body)

	s := doc.Find("body")
	if s.Length() < 1 {
		t.Error("html parse failed!")
	}

	// Reference: fetching the same page directly with goquery / net/http.
	/*
	   doc, err := goquery.NewDocument("http://live.sina.com.cn/zt/l/v/finance/globalnews1/")
	   if err != nil {
	       fmt.Printf("%v",err)
	   }
	   s := doc.Find("meta");
	   fmt.Println(s.Length())

	   resp, err := http.Get("http://live.sina.com.cn/zt/l/v/finance/globalnews1/")
	   if err != nil {
	       fmt.Printf("%v",err)
	   }
	   defer resp.Body.Close()
	   doc, err = goquery.NewDocumentFromReader(resp.Body)
	   s = doc.Find("meta");
	   fmt.Println(s.Length())
	*/
}