Exemple #1
0
// Process walks the parsed HTML of a fetched page (via goquery,
// http://godoc.org/github.com/PuerkitoBio/goquery), queues every repository
// link it finds as a new "html" target request, and hands the author and
// project names to the page through AddField.
//
// A page for which p.IsSucc() reports false is only logged with println.
// When no author name is found the page is flagged with SetSkip(true); the
// (empty) fields are still added afterwards.
func (this *MyPageProcesser) Process(p *page.Page) {
	if ok := p.IsSucc(); !ok {
		println(p.Errormsg())
		return
	}

	doc := p.GetHtmlParser()

	// Collect the repository links of the listing; they are handed back to
	// the page as further requests to crawl.
	var repoLinks []string
	anchors := doc.Find("h3[class='repo-list-name'] a")
	anchors.Each(func(_ int, anchor *goquery.Selection) {
		path, _ := anchor.Attr("href")
		repoLinks = append(repoLinks, "http://github.com/"+path)
	})
	p.AddTargetRequests(repoLinks, "html")

	author := strings.Trim(doc.Find(".entry-title .author").Text(), " \t\n")
	project := strings.Trim(doc.Find(".entry-title .js-current-repository").Text(), " \t\n")

	if author == "" {
		p.SetSkip(true)
	}

	// The values below are what the page carries on for saving.
	p.AddField("author", author)
	p.AddField("project", project)
}
Exemple #2
0
// Process parses the HTML of a fetched page with goquery
// (http://godoc.org/github.com/PuerkitoBio/goquery). It queues the thread
// links found in the "#threadlisttableid" table as new "html" target requests
// and adds the current thread's title and first author to the page via
// AddField.
//
// A page for which p.IsSucc() reports false is logged with println and not
// parsed, matching the other processors. When either the title or the author
// is empty the page is flagged with SetSkip(true); the fields are still added.
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	query := p.GetHtmlParser()

	var urls []string
	query.Find("#threadlisttableid tbody").Each(func(i int, s *goquery.Selection) {
		if s.HasClass("emptb") {
			return
		}
		// Rows without a usable link must not enqueue an empty URL.
		href, ok := s.Find("tbody tr .icn a").Attr("href")
		if !ok || href == "" {
			return
		}
		urls = append(urls, href)
	})

	// These URLs are handed back to the page as further requests to crawl.
	p.AddTargetRequests(urls, "html")

	title := query.Find("#thread_subject").Text()
	title = strings.Trim(title, "\t\n\r")
	author := query.Find("#postlist div .authi").Eq(0).Text()
	author = strings.Trim(author, "\t\r\n")

	if title == "" || author == "" {
		p.SetSkip(true)
	}

	p.AddField("title", title)
	p.AddField("author", author)
}
Exemple #3
0
// getName reads the text of the <h1> inside ".lemmaWgt-lemmaTitle-title",
// strips leading and trailing spaces, tabs and newlines, and passes the result
// to p.AddField under the key "name".
func (this *PlantProcesser) getName(query *goquery.Document, p *page.Page) {
	heading := query.Find(".lemmaWgt-lemmaTitle-title").Find("h1")
	p.AddField("name", strings.Trim(heading.Text(), " \t\n"))
}
Exemple #4
0
// Process parses the HTML of a fetched result page with goquery
// (http://godoc.org/github.com/PuerkitoBio/goquery). For every matching result
// box it prints the account name and link and adds both to the page via
// AddField. It then either queues the next result page or, when no
// "#sogou_next" link exists, flags the page with SetSkip(true).
//
// A page for which p.IsSucc() reports false is logged with println and not
// parsed.
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	query := p.GetHtmlParser()

	query.Find(`div[class="wx-rb bg-blue wx-rb_v1 _item"]`).Each(func(i int, s *goquery.Selection) {
		name := s.Find("div.txt-box > h3").Text()
		// A missing href attribute simply yields an empty string here.
		href, _ := s.Attr("href")

		fmt.Printf("WeName:%v link:http://weixin.sogou.com%v \r\n", name, href)
		// NOTE(review): every match writes the same "name"/"href" keys;
		// whether later calls replace earlier ones depends on AddField — confirm.
		p.AddField("name", name)
		p.AddField("href", href)
	})

	// A missing "#sogou_next" link or href yields an empty string, which is
	// treated as "no further page".
	nextPageHref, _ := query.Find("#sogou_next").Attr("href")
	if nextPageHref == "" {
		p.SetSkip(true)
		return
	}
	p.AddTargetRequestWithHeaderFile("http://weixin.sogou.com/weixin"+nextPageHref, "html", "weixin.sogou.com.json")
}
Exemple #5
0
// Process parses the HTML of a fetched page with goquery
// (http://godoc.org/github.com/PuerkitoBio/goquery) and adds the entry's name
// (".lemmaTitleH1") and summary (".card-summary-content .para") to the page
// via AddField, each with surrounding spaces, tabs and newlines trimmed.
//
// A page for which p.IsSucc() reports false is logged with println and not
// parsed, matching the other processors.
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	query := p.GetHtmlParser()

	name := query.Find(".lemmaTitleH1").Text()
	name = strings.Trim(name, " \t\n")

	summary := query.Find(".card-summary-content .para").Text()
	summary = strings.Trim(summary, " \t\n")

	// The values below are what the page carries on for saving.
	p.AddField("name", name)
	p.AddField("summary", summary)
}
Exemple #6
0
// Process reads the JSON body of a fetched feed page (parsed with simplejson,
// https://github.com/bitly/go-simplejson), adds every entry whose id is
// greater than this.startNewsId to the page via AddField, and queues the next
// feed request.
//
// Behaviour:
//   - A page for which p.IsSucc() reports false is logged with println and
//     skipped.
//   - A non-zero "result.status.code", or a failure to read it, panics via
//     log.Panicf.
//   - When "result.pageStr.pageSize" is zero or unreadable, the feed is
//     requested again from the current this.startNewsId.
//   - Otherwise entries are scanned in order until one with an id not greater
//     than this.startNewsId is met. The first new id encountered becomes the
//     new this.startNewsId; if there is none, the current value is kept.
//     NOTE(review): this presumes the feed lists newest entries first —
//     confirm against the API.
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	const (
		feedPrefix = "http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id="
		feedSuffix = "&pagesize=10&dire=f"
	)

	query := p.GetJson()
	status, err := query.GetPath("result", "status", "code").Int()
	if status != 0 || err != nil {
		// err may be nil when only the status is non-zero, so format it with
		// %v instead of calling err.Error().
		log.Panicf("page is crawled error : errorinfo=%v : status=%d : startNewsId=%d", err, status, this.startNewsId)
	}
	num, err := query.GetPath("result", "pageStr", "pageSize").Int()
	if num == 0 || err != nil {
		// Nothing to read: ask for the same position again.
		p.AddTargetRequest(feedPrefix+strconv.Itoa(this.startNewsId)+feedSuffix, "json")
		return
	}

	// Default to the current position so that a page without any new entry
	// neither resets startNewsId nor produces a URL with an empty id.
	nextID := this.startNewsId
	nextIDStr := strconv.Itoa(this.startNewsId)
	found := false

	data := query.Get("result").Get("data")
	for i := 0; i < num; i++ {
		item := data.GetIndex(i)

		id, err := item.Get("id").String()
		if id == "" || err != nil {
			continue
		}
		idint, err := strconv.Atoi(id)
		if err != nil {
			continue
		}
		if idint <= this.startNewsId {
			break
		}
		if !found {
			nextID, nextIDStr, found = idint, id, true
		}

		content, err := item.Get("content").String()
		if content == "" || err != nil {
			continue
		}
		createdAt, err := item.Get("created_at").String()
		if err != nil {
			continue
		}

		p.AddField(id+"_id", id)
		p.AddField(id+"_content", content)
		p.AddField(id+"_time", createdAt)
	}

	// Queue the next crawl from the newest id seen.
	this.startNewsId = nextID
	p.AddTargetRequest(feedPrefix+nextIDStr+feedSuffix, "json")
}
Exemple #7
0
// Process gathers the text of every element whose class attribute, as written
// in the raw page body, is a single name containing "content". The HTML is
// queried with goquery (http://godoc.org/github.com/PuerkitoBio/goquery).
//
// The class names are collected with a regular expression over the body
// string, passed through removeDuplicate, and then used as class selectors.
// Spaces, tabs and newlines are removed from each element's text; the pieces
// are concatenated and, if the result is non-empty, added to the page as
// "content". A page for which p.IsSucc() reports false is logged with println
// and skipped.
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	doc := p.GetHtmlParser()
	body := p.GetBodyStr()

	classPattern := regexp.MustCompile(`class="([0-9a-zA-Z_-]*content[0-9a-zA-Z_-]*)"`)
	classNames := make([]string, 0)
	for _, match := range classPattern.FindAllStringSubmatch(body, -1) {
		// match[1] is the captured class name.
		classNames = append(classNames, match[1])
	}
	removeDuplicate(&classNames)

	var collected string
	for _, className := range classNames {
		doc.Find("." + className).Each(func(_ int, sel *goquery.Selection) {
			text := strings.Trim(sel.Text(), " \t\n")
			for _, ws := range []string{" ", "\n", "\t"} {
				text = strings.Replace(text, ws, "", -1)
			}
			if text == "" {
				return
			}
			collected = collected + text
		})
	}

	if collected == "" {
		return
	}
	p.AddField("content", collected)
}
Exemple #8
0
// Process handles the two pages of a login flow, told apart by the page's URL
// tag. HTML is queried with goquery
// (http://godoc.org/github.com/PuerkitoBio/goquery).
//
//   - Tag "site_login": the page's cookies are stored on the processor. If any
//     were returned, a follow-up GET request tagged "site_index" carrying those
//     cookies is queued; otherwise the failure is recorded in the "info" field.
//   - Any other tag: the text of ".page-content .page-title" is looked up; a
//     non-empty title is recorded together with "login success", an empty one
//     as "login failed".
func (this *MyPageProcesser) Process(p *page.Page) {
	if p.GetUrlTag() == "site_login" {
		this.cookies = p.GetCookies()
		if len(this.cookies) == 0 {
			p.AddField("info", "get cookies failed")
			return
		}
		p.AddField("info", "get cookies success")

		// Request arguments, in order, as described by the original note:
		//  1. URL.
		//  2. Response type: "html", "json", "jsonp" or "text".
		//  3. URL tag, used to tell requests apart in the PageProcesser and
		//     the Pipeline.
		//  4. HTTP method, POST or GET.
		//  5. Post data: the body string sent to the server.
		//  6. Header for the HTTP request.
		//  7. Cookies.
		//  8. HTTP redirect function.
		// NOTE(review): the call passes a ninth argument that the original
		// note does not describe — confirm against request.NewRequest.
		req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil, nil)
		p.AddTargetRequestWithParams(req)
		return
	}

	query := p.GetHtmlParser()
	pageTitle := query.Find(".page-content .page-title").Text()

	if len(pageTitle) != 0 {
		p.AddField("page_title", pageTitle)
		p.AddField("info", "login success")
	} else {
		p.AddField("info", "login failed")
	}
}
Exemple #9
0
// getSummary reads the combined text of ".lemma-summary .para", strips leading
// and trailing spaces, tabs and newlines, and passes the result to p.AddField
// under the key "summary".
func (this *PlantProcesser) getSummary(query *goquery.Document, p *page.Page) {
	paragraphs := query.Find(".lemma-summary .para")
	p.AddField("summary", strings.Trim(paragraphs.Text(), " \t\n"))
}
Exemple #10
0
// getCatalog reads the combined text of the "span.text" elements inside
// ".lemma-catalog", strips leading and trailing spaces, tabs and newlines, and
// passes the result to p.AddField under the key "catalog".
func (this *PlantProcesser) getCatalog(query *goquery.Document, p *page.Page) {
	entries := query.Find(".lemma-catalog").Find("span.text")
	p.AddField("catalog", strings.Trim(entries.Text(), " \t\n"))
}