func (self *Spider) AddUrl(urlstr string) { if self.downloader == nil { panic("downloader instance is nil, please init downloader.") } if self.scheduler == nil { panic("scheduler instance is nil, please init scheduler.") } if self.pipeliner == nil { panic("pipeliner instance is nil, please init pipeliner.") } elemItem := common.NewElementItem(urlstr) self.scheduler.AddElementItem(elemItem, false) }
func (process *MyProcess) Do(page *cm.Page) { bytes, _ := ioutil.ReadAll(page.Res.Body) bodyStr := string(bytes[:]) reader := strings.NewReader(bodyStr) reqUrl := page.Req.URL doc, _ := query.NewDocumentFromReader(reader) /**------------------------------波奇的产品爬虫------------------------------**/ if reqUrl.String() == "http://shop.boqii.com/" { doc.Find(".menu .menu_body a").Each(func(i int, s *query.Selection) { href, exists := s.Attr("href") if exists { url, err := reqUrl.Parse(href) if err == nil { urlStr := url.String() page.AddElem(cm.NewElementItem(urlStr)) } } }) } else { doc.Find(".product_list .product_list_container").Each(func(i int, s *query.Selection) { tmp := "" reg := regexp.MustCompile("\\d+") filter := regexp.MustCompile("\\s+") name := filter.ReplaceAllString(s.Find(".product_name a").Text(), "") price := filter.ReplaceAllString(s.Find(".product_price strong").Text(), "") sellNum := s.Find(".product_status .product_sold").Text() sellNum = filter.ReplaceAllString(reg.FindString(sellNum), "") tmp = strings.Join([]string{tmp, sellNum, name, price}, " ") fileMutex.Lock() fmt.Println(strings.Join([]string{tmp, "\r\n"}, "")) file.WriteString(strings.Join([]string{tmp, "\r\n"}, "")) fileMutex.Unlock() }) doc.Find(".pagination a").Each(func(i int, s *query.Selection) { href, exists := s.Attr("href") if exists { url, err := reqUrl.Parse(href) if err == nil { urlStr := url.String() page.AddElem(cm.NewElementItem(urlStr)) } } }) } /**------------------------------易宠的产品爬虫------------------------------**/ // if reqUrl.String() == "http://www.epet.com/" { // doc.Find(".catelist h3 a").Each(func(i int, s *query.Selection) { // href, exists := s.Attr("href") // if exists { // url, err := reqUrl.Parse(href) // if err == nil { // urlStr := url.String() // page.AddElem(cm.NewElementItem(urlStr)) // } // } // }) // } else { // count := 0 // doc.Find(".list_box-li").Each(func(i int, s *query.Selection) { // tmp := "" // reg := regexp.MustCompile("\\d+") // filter := regexp.MustCompile("\\s+") // sellNum := "" // // name := filter.ReplaceAllString(s.Find(".gtitle").Text(), " ") // price := filter.ReplaceAllString(s.Find(".gprice .price").Text(), " ") // s.Find(".c999 em").Each(func(i int, s *query.Selection) { // if i == 0 { // sellNum = filter.ReplaceAllString(reg.FindString(s.Text()), " ") // } // }) // // tmp = strings.Join([]string{tmp, sellNum, name, price}, " ") // // fileMutex.Lock() // fmt.Println(strings.Join([]string{tmp, "\r\n"}, "")); // file.WriteString(strings.Join([]string{tmp, "\r\n"}, "")) // fileMutex.Unlock() // // }) // // doc.Find(".pages a").Each(func(i int, s *query.Selection) { // href, exists := s.Attr("href") // if exists { // url, err := reqUrl.Parse(href) // if err == nil { // count++ // urlStr := url.String() // page.AddElem(cm.NewElementItem(urlStr)) // } // } // }) // // if count == 0 { // fmt.Println(strings.Join([]string{reqUrl.String(), "lose pager data!!!"}, " ")) // } // } }