Example #1
0
// Patterns used by Process to cut the interesting fragments out of a list
// page. They are constant, so they are compiled once at package
// initialisation instead of on every processed page.
var (
	// youxiduoInfroListRe matches from the opening "infroList" div, across a
	// closing </ul>, up to the start of a "pagebreak" div. (\s|.) is used so
	// the match can cross line breaks.
	youxiduoInfroListRe = regexp.MustCompile(`<div class=\"infroList\">(\s|.)*<\/ul>(\s|.)*<div class=\"pagebreak\">`)
	// youxiduoPageBreakRe matches from the "pagebreak" div up to the
	// "lastPage" list item, i.e. the pagination fragment.
	youxiduoPageBreakRe = regexp.MustCompile(`<div class=\"pagebreak\">(\s|.)+<li class=\"lastPage\">`)
)

// Process handles one downloaded page.
//
// A failed download only prints the page's error message. Otherwise the URL
// tag of the originating request is copied onto the page and used to decide
// how the body is handled:
//
//   - tag "list": the "infroList" fragment is handed to parseNewsBreifInfo,
//     and, unless the page reports IsBreak() afterwards, the pagination
//     fragment is handed to parseNewsLinkListInfo;
//   - any other tag: the whole body is handed to parseNewsDetail.
//
// The previous implementation compiled both patterns on every call and, had a
// compilation failed, logged the error and then dereferenced the nil regexp.
// Compiling at package scope removes that path entirely.
func (this *YouxiduoProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.GetErrMsg())
		return
	}

	body := p.GetBody()
	urlTag := p.GetRequest().GetUrlTag()
	p.SetUrlTag(urlTag)

	// Everything that is not tagged as a list page is treated as a content page.
	if urlTag != "list" {
		this.parseNewsDetail(body, p)
		return
	}

	// 1. Locate the list fragment. Only the first (leftmost) match is used;
	// a match can never be empty because the pattern contains literal text,
	// so an empty result means "no match".
	if infroList := youxiduoInfroListRe.FindString(body); infroList != "" {
		this.parseNewsBreifInfo(infroList, p)
	} else {
		logs.GetFirstLogger().Info("No more list items")
	}

	// 2. Look for further list pages, unless the page has been flagged as
	// "break". NOTE(review): presumably parseNewsBreifInfo sets that flag to
	// stop pagination — confirm against its implementation.
	if p.IsBreak() {
		return
	}
	if pageBreak := youxiduoPageBreakRe.FindString(body); pageBreak != "" {
		this.parseNewsLinkListInfo(pageBreak, p)
	} else {
		logs.GetFirstLogger().Info("No more links")
	}
}
Example #2
0
// youxiduoAnchorRe matches a single <a href ...>...</a> element. The
// quantifier is non-greedy so that several anchors on the same line produce
// one match each instead of being merged into one long match.
var youxiduoAnchorRe = regexp.MustCompile(`<a href(.)*?<\/a>`)

// youxiduoHrefValue returns the value of the href attribute found in the
// opening tag of anchor (the text before the first '>'). It reports false
// when there is no opening tag end, no "href=" inside it, or the value is
// empty or has an unterminated quote.
func youxiduoHrefValue(anchor string) (string, bool) {
	const attr = "href="

	tagEnd := strings.Index(anchor, ">")
	if tagEnd < 0 {
		return "", false
	}
	openTag := anchor[:tagEnd]

	start := strings.Index(openTag, attr)
	if start < 0 {
		return "", false
	}
	value := openTag[start+len(attr):]
	if value == "" {
		return "", false
	}

	quote := value[0]
	if quote != '"' && quote != '\'' {
		// Unquoted value: it runs up to the next whitespace (or the tag end).
		if i := strings.IndexAny(value, " \t\r\n"); i >= 0 {
			value = value[:i]
		}
		return value, value != ""
	}

	// Quoted value: stop at the matching quote so that attributes following
	// href (class, target, ...) are not swallowed into the URL.
	end := strings.IndexByte(value[1:], quote)
	if end <= 0 {
		return "", false
	}
	return value[1 : 1+end], true
}

// parseNewsLinkListInfo scans content (the pagination fragment of a list
// page) for anchors and queues every link as a new "list" URL on p.
//
// Nothing is queued when p reports IsBreak(). Links whose href contains
// "http://" are skipped; every other href is resolved against the URL of the
// request that produced p via util.GetRealUrl before being added. The page is
// returned unchanged to allow chaining.
//
// NOTE(review): hrefs containing "https://" are not skipped and are passed to
// util.GetRealUrl — confirm that this is intended.
func (this *YouxiduoProcesser) parseNewsLinkListInfo(content string, p *page.Page) *page.Page {
	if p.IsBreak() {
		return p
	}

	for _, anchor := range youxiduoAnchorRe.FindAllString(content, -1) {
		href, ok := youxiduoHrefValue(anchor)
		if !ok || strings.Contains(href, "http://") {
			continue
		}
		p.AddNewUrl(util.GetRealUrl(p.GetRequest().GetUrl(), href), "list")
	}
	return p
}