func (this *YouxiduoProcesser) parseNewsLinkListInfo(content string, p *page.Page) *page.Page { //println("B LINK URLS") if p.IsBreak() { return p } reg, _ := regexp.Compile(`<a href(.)*<\/a>`) urlStr := reg.FindAllString(content, -1) for _, tmp := range urlStr { var pos1 int = strings.Index(tmp, "href=") var pos2 int = strings.Index(tmp, ">") if (pos2 - 1) > (pos1 + 6) { tmp = string(tmp[pos1+6 : pos2-1]) if strings.Index(tmp, "http://") >= 0 { continue } tmp = util.GetRealUrl(p.GetRequest().GetUrl(), tmp) p.AddNewUrl(tmp, "list") // println("list url = " + tmp) } } //println("E LINK URLS") return p }
func (this *YouxiduoProcesser) parseNewsBreifInfo(content string, p *page.Page) *page.Page { logs.GetFirstLogger().Trace("B TEST LIST ITEMS") var pos1 int = strings.Index(content, "<li>") var pos2 int = strings.Index(content, "</li>") var count int = 1 for pos1 >= 0 && pos2 >= 0 && (pos2 > pos1) { item := page.NewPageItems("") tmpStr := string(content[pos1 : pos2+5]) content = string(content[pos2+5 : len(content)]) pos1 = strings.Index(content, "<li>") pos2 = strings.Index(content, "</li>") logs.GetFirstLogger().Trace("B================>") reg, _ := regexp.Compile(`<span>(.)*[\d]{4}-[\d]{2}-[\d]{2}`) timeStr := reg.FindString(tmpStr) reg, _ = regexp.Compile(`[\d]{4}-[\d]{2}-[\d]{2}`) timeStr = reg.FindString(timeStr) if this.exitDate > timeStr { p.SetBreak(true) continue } item.AddItem("time", timeStr) reg, _ = regexp.Compile("title=\"(.)*\"") title := reg.FindString(tmpStr) title = string(title[strings.Index(title, "\"")+1 : len(title)]) title = string(title[0:strings.Index(title, "\"")]) logs.GetFirstLogger().Trace("title = " + title) //p.AddResultItem("title", title) item.AddItem("title", title) reg, _ = regexp.Compile("<img src=(.)*alt") pic := reg.FindString(tmpStr) pic = string(pic[strings.Index(pic, "\"")+1 : len(pic)]) pic = string(pic[0:strings.Index(pic, "\"")]) if util.IsRelativePath(pic) { pic = util.GetRealUrl(p.GetRequest().GetUrl(), pic) } logs.GetFirstLogger().Trace("pic = " + pic) //p.AddResultItem("pic", pic) item.AddItem("pic", pic) reg, _ = regexp.Compile("<p>(.)*</p>") info := reg.FindString(tmpStr) logs.GetFirstLogger().Trace("info = " + info) //p.AddResultItem("info", info) info = strings.Replace(info, "'", "\"", -1) info = strings.Replace(info, "'", "\"", -1) item.AddItem("info", info) reg, _ = regexp.Compile("<span(.)*<a(.)*</span>") detailurl := reg.FindString(tmpStr) reg, _ = regexp.Compile("href(.)*\">") detailurl = reg.FindString(detailurl) detailurl = detailurl[strings.Index(detailurl, "\"")+1 : len(detailurl)] detailurl = detailurl[0:strings.Index(detailurl, "\"")] logs.GetFirstLogger().Trace("detailurl = " + detailurl) //p.AddResultItem("detailurl", detailurl) item.AddItem("detailurl", detailurl) //p.AddResultItem("key", detailurl) item.SetKey(detailurl) p.AddNewUrl(detailurl, "content") logs.GetFirstLogger().Trace("E================>") logs.GetFirstLogger().Tracef("count = %d", count) count = count + 1 logs.GetFirstLogger().Warn(title) pos1 = strings.Index(content, "<li>") pos2 = strings.Index(content, "</li>") p.AddPageItems(item) } return p }