Example #1
// parseNewsLinkListInfo extracts the href target of every <a ...>...</a> anchor
// in content, skips absolute http:// links, resolves the remaining relative
// paths against the current request URL and queues them as "list" pages.
func (this *YouxiduoProcesser) parseNewsLinkListInfo(content string, p *page.Page) *page.Page {
	//println("B LINK URLS")
	if p.IsBreak() {
		return p
	}
	reg, _ := regexp.Compile(`<a href(.)*<\/a>`)
	urlStr := reg.FindAllString(content, -1)
	for _, tmp := range urlStr {
		pos1 := strings.Index(tmp, "href=")
		pos2 := strings.Index(tmp, ">")
		if (pos2 - 1) > (pos1 + 6) {
			// Slice out the value between href=" and the closing quote before >.
			tmp = tmp[pos1+6 : pos2-1]
			// Skip absolute http:// links; only site-relative paths are followed.
			if strings.Contains(tmp, "http://") {
				continue
			}
			tmp = util.GetRealUrl(p.GetRequest().GetUrl(), tmp)
			p.AddNewUrl(tmp, "list")
			//	println("list url = " + tmp)
		}
	}
	//println("E LINK URLS")
	return p
}
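
util.GetRealUrl, page.AddNewUrl and the other helpers above belong to the surrounding crawler project. As a rough, self-contained illustration of the same extract-and-resolve step using only the standard library, a sketch like the following could be used; the function name resolveAnchors, the sample HTML and the base URL are assumptions for the example, not part of the original code.

package main

import (
	"fmt"
	"net/url"
	"regexp"
	"strings"
)

// resolveAnchors is a hypothetical helper, not part of the project above: it
// pulls href values out of <a> tags and resolves relative paths against base,
// mirroring what parseNewsLinkListInfo does with util.GetRealUrl.
func resolveAnchors(base, content string) []string {
	baseURL, err := url.Parse(base)
	if err != nil {
		return nil
	}
	reg := regexp.MustCompile(`<a href="([^"]*)"[^>]*>`)
	var out []string
	for _, m := range reg.FindAllStringSubmatch(content, -1) {
		href := m[1]
		// Skip absolute links, as the original snippet does.
		if strings.Contains(href, "http://") {
			continue
		}
		ref, err := url.Parse(href)
		if err != nil {
			continue
		}
		out = append(out, baseURL.ResolveReference(ref).String())
	}
	return out
}

func main() {
	html := `<a href="/news/1.html">one</a> <a href="http://other.site/x">two</a>`
	fmt.Println(resolveAnchors("http://www.youxiduo.com/news/", html))
}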
Example #2
// parseNewsBreifInfo walks the <li> blocks of a news list page, extracts the
// publication time, title, thumbnail, summary and detail URL of each entry,
// stores them as page items and queues each detail URL as a "content" page.
func (this *YouxiduoProcesser) parseNewsBreifInfo(content string, p *page.Page) *page.Page {
	logs.GetFirstLogger().Trace("B TEST LIST ITEMS")
	pos1 := strings.Index(content, "<li>")
	pos2 := strings.Index(content, "</li>")
	count := 1

	for pos1 >= 0 && pos2 >= 0 && (pos2 > pos1) {
		item := page.NewPageItems("")
		// Cut the current <li>...</li> block off the front of content and
		// advance past it before locating the next block.
		tmpStr := content[pos1 : pos2+5]
		content = content[pos2+5:]

		pos1 = strings.Index(content, "<li>")
		pos2 = strings.Index(content, "</li>")
		logs.GetFirstLogger().Trace("B================>")
		// Pull the yyyy-mm-dd publication date out of the <span> element.
		reg, _ := regexp.Compile(`<span>(.)*[\d]{4}-[\d]{2}-[\d]{2}`)
		timeStr := reg.FindString(tmpStr)
		reg, _ = regexp.Compile(`[\d]{4}-[\d]{2}-[\d]{2}`)
		timeStr = reg.FindString(timeStr)
		// Entries older than exitDate mark the page as finished and are skipped.
		if this.exitDate > timeStr {
			p.SetBreak(true)
			continue
		}
		item.AddItem("time", timeStr)

		// The title sits in the title="..." attribute of the anchor.
		reg, _ = regexp.Compile("title=\"(.)*\"")
		title := reg.FindString(tmpStr)
		title = title[strings.Index(title, "\"")+1:]
		title = title[0:strings.Index(title, "\"")]
		logs.GetFirstLogger().Trace("title = " + title)
		//p.AddResultItem("title", title)
		item.AddItem("title", title)
		// The thumbnail URL is the src="..." attribute of the <img> tag.
		reg, _ = regexp.Compile("<img src=(.)*alt")
		pic := reg.FindString(tmpStr)
		pic = pic[strings.Index(pic, "\"")+1:]
		pic = pic[0:strings.Index(pic, "\"")]

		if util.IsRelativePath(pic) {
			pic = util.GetRealUrl(p.GetRequest().GetUrl(), pic)
		}
		logs.GetFirstLogger().Trace("pic = " + pic)
		//p.AddResultItem("pic", pic)
		item.AddItem("pic", pic)

		reg, _ = regexp.Compile("<p>(.)*</p>")
		info := reg.FindString(tmpStr)
		logs.GetFirstLogger().Trace("info = " + info)
		//p.AddResultItem("info", info)
		info = strings.Replace(info, "'", "\"", -1)
		info = strings.Replace(info, "&#39;", "\"", -1)

		item.AddItem("info", info)

		// The detail URL is the href of the anchor inside the trailing <span>.
		reg, _ = regexp.Compile("<span(.)*<a(.)*</span>")
		detailurl := reg.FindString(tmpStr)
		reg, _ = regexp.Compile("href(.)*\">")
		detailurl = reg.FindString(detailurl)
		detailurl = detailurl[strings.Index(detailurl, "\"")+1:]
		detailurl = detailurl[0:strings.Index(detailurl, "\"")]
		logs.GetFirstLogger().Trace("detailurl = " + detailurl)
		//p.AddResultItem("detailurl", detailurl)
		item.AddItem("detailurl", detailurl)
		//p.AddResultItem("key", detailurl)
		item.SetKey(detailurl)
		p.AddNewUrl(detailurl, "content")

		logs.GetFirstLogger().Trace("E================>")
		logs.GetFirstLogger().Tracef("count = %d", count)
		count = count + 1
		logs.GetFirstLogger().Warn(title)

		p.AddPageItems(item)
	}

	return p
}
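
page.NewPageItems, the logs calls and util.IsRelativePath above are part of the host crawler. As a rough, self-contained sketch of the same slice-and-advance loop over <li> blocks, keeping only the date cutoff and title extraction, something like the following could be used; the type listItem, the function parseNewsList and the sample HTML are assumptions for illustration, not part of the original code.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// listItem is a hypothetical stand-in for the project's PageItems container.
type listItem struct {
	Time, Title string
}

// parseNewsList walks <li>...</li> blocks the same way the snippet above does:
// slice the current block off the front of content, extract fields with
// regexps, and skip entries older than exitDate.
func parseNewsList(content, exitDate string) []listItem {
	var items []listItem
	dateRe := regexp.MustCompile(`[\d]{4}-[\d]{2}-[\d]{2}`)
	titleRe := regexp.MustCompile(`title="([^"]*)"`)

	pos1 := strings.Index(content, "<li>")
	pos2 := strings.Index(content, "</li>")
	for pos1 >= 0 && pos2 >= 0 && pos2 > pos1 {
		block := content[pos1 : pos2+5]
		content = content[pos2+5:]
		pos1 = strings.Index(content, "<li>")
		pos2 = strings.Index(content, "</li>")

		date := dateRe.FindString(block)
		if exitDate > date {
			continue // entry is older than the cutoff, skip it
		}
		if m := titleRe.FindStringSubmatch(block); m != nil {
			items = append(items, listItem{Time: date, Title: m[1]})
		}
	}
	return items
}

func main() {
	html := `<li><span>2015-06-01</span><a title="new game"></a></li>` +
		`<li><span>2015-05-01</span><a title="old game"></a></li>`
	fmt.Println(parseNewsList(html, "2015-05-15"))
}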