func (self *Www79xsComProcessor) Process(p *robot.Page) { //判断页面是否抓取成功 if !p.IsSucc() { log.Println(p.Errormsg()) return } meta := p.GetRequest().GetMeta() handler, ok := meta.(map[string]interface{})["handler"] //如果meta中没有handler处理方法,则说明是入口页面,否则直接执行对应callback if ok { switch handler { case "mainParse": self.mainParse(p) case "urlListParse": self.urlListParse(p) case "classParse": self.classParse(p) case "introParse": self.introParse(p) case "chaperParse": self.chaperParse(p) case "contentParse": self.contentParse(p) default: return } } }
//小说内容解析 func (self *Www79xsComProcessor) contentParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]interface{}) //开始解析页面 query := p.GetHtmlParser() html, _ := query.Find(".contentbox").Html() meta["content"] = strings.Replace(strings.Replace(html, "<br/><br/>", "\n", -1), "<br/>", "\n", -1) p.AddField("code", "0") for k, v := range meta { p.AddField(k, v.(string)) } }
//解析小说详情页 func (self *Www79xsComProcessor) introParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() intro := query.Find("#info h3 p").Eq(1).Text() img, _ := query.Find(".img img").Attr("src") // 小说章节列表地址 chaptersource, _ := query.Find(".b1 a").Attr("href") tmp := utils.MapCopy(meta) tmp["introduction"] = intro tmp["img"] = utils.BaseUrl + img tmp["chaptersource"] = utils.BaseUrl + chaptersource p.AddTargetRequest(utils.InitRequest(utils.BaseUrl+chaptersource, tmp, self.chaperParse)) }
func (self *Www79xsComProcessor) Process(p *robot.Page) { //判断页面是否抓取成功 if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } //如果callback为空,则说明是入口页面,否则直接执行对应callback callback := p.GetRequest().GetCallBack() if callback == nil { self.mainParse(p) } else { callback(p) } }
func (this *MyProcessor) Process(p *robot.Page) { if !p.IsSucc() { mlog.LogInst().LogError(p.Errormsg()) return } u, err := url.Parse(p.GetRequest().GetUrl()) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !strings.HasSuffix(u.Host, "jiexieyin.org") { return } var urls []string query := p.GetHtmlParser() query.Find("a").Each(func(i int, s *goquery.Selection) { href, _ := s.Attr("href") reJavascript := regexp.MustCompile("^javascript\\:") reLocal := regexp.MustCompile("^\\#") reMailto := regexp.MustCompile("^mailto\\:") if reJavascript.MatchString(href) || reLocal.MatchString(href) || reMailto.MatchString(href) { return } //处理相对路径 var absHref string urlHref, err := url.Parse(href) if err != nil { mlog.LogInst().LogError(err.Error()) return } if !urlHref.IsAbs() { urlPrefix := p.GetRequest().GetUrl() absHref = urlPrefix + href urls = append(urls, absHref) } else { urls = append(urls, href) } }) p.AddTargetRequests(urls, "html") }
//小说章节解析 func (self *Www79xsComProcessor) chaperParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() query.Find(".insert_list li").Each(func(i int, s *goquery.Selection) { tmp := utils.MapCopy(meta) tmp["chapter"] = strconv.Itoa(i) tmp["subtitle"] = s.Find("strong a").Text() addr, _ := s.Find("strong a").Attr("href") tmp["contenturl"] = p.GetRequest().GetBaseUrl() + addr //检测contenturl, 如果数据库中存在,则跳过本次抓取,如果不存在则将url加入调度队列 //这个需求有时间再做 if len(tmp["subtitle"]) != 0 { p.AddTargetRequest(utils.InitRequest(tmp["contenturl"], tmp, self.contentParse)) } }) }
//分类列表解析 func (self *Www79xsComProcessor) classParse(p *robot.Page) { meta := p.GetRequest().GetMeta().(map[string]string) //开始解析页面 query := p.GetHtmlParser() query.Find("div .yl_nr_lt2 ul").Each(func(i int, s *goquery.Selection) { //获取二级分类, 小说标题,作者 second := s.Find(".ynl2 a").Text() title := s.Find(".ynl3 a").Eq(1).Text() author := s.Find(".ynl6 a").Text() novelsource := utils.BaseUrl + func() string { addr, _ := s.Find(".ynl3 a").Eq(1).Attr("href") return addr }() tmp := make(map[string]string) tmp["first"] = meta["first"] tmp["second"] = second tmp["title"] = title tmp["author"] = author tmp["novelsource"] = novelsource p.AddTargetRequest(utils.InitRequest(novelsource, tmp, self.introParse)) }) }
//获取分类页面的url list,并解析 func (self *Www79xsComProcessor) urlListParse(p *robot.Page) { meta := p.GetRequest().GetMeta() //开始解析页面 query := p.GetHtmlParser() //获取尾页addr lastaddr, ok := query.Find("tbody a").Last().Attr("href") if ok { //解析addr kv := goutils.GetKVInRelaPath(lastaddr) //url拼接 maxpage, _ := strconv.Atoi(kv["page"]) for i := 1; i <= maxpage; i++ { page := strconv.Itoa(i) p.AddTargetRequest(utils.InitRequest( "http://www.79xs.com/Book/ShowBookList.aspx?tclassid="+kv["tclassid"]+"&page="+page, meta.(map[string]string), self.classParse)) } } else { p.AddTargetRequest(utils.InitRequest(p.GetRequest().GetUrl(), meta.(map[string]string), self.classParse)) } }
func (this MyPageProcesser) Process(p *robot.Page) { query := p.GetHtmlParser() if p.GetUrlTag() == "index" { query.Find(`div[class="main area"] div[class="lc"] ul li a`).Each(func(i int, s *goquery.Selection) { url, isExsit := s.Attr("href") if isExsit { reg := regexp.MustCompile(`^do not know what is this`) var fmtStr string if rxYule.MatchString(url) { reg = rxYule fmtStr = wkSohuYule } if rxPic.MatchString(url) { reg = rxPic fmtStr = wkSohuPic } regxpArrag := reg.FindStringSubmatch(url) if len(regxpArrag) == 2 { addRequest(p, "changyan", fmt.Sprintf(fmtStr, regxpArrag[1]), "", s.Text()) } } }) } if p.GetUrlTag() == "changyan" { jsonMap := ChangyanJson{} err := json.NewDecoder(strings.NewReader(p.GetBodyStr())).Decode(&jsonMap) if err == nil { content, ok := p.GetRequest().GetMeta().(string) if ok { fmt.Println("Title:", content, " CommentCount:", jsonMap.ListData.OuterCmtSum, " ParticipationCount:", jsonMap.ListData.ParticipationSum) } } } }