Beispiel #1
0
// AddUrl hands urlstr to the spider's scheduler.
//
// The spider must be fully wired up first: the method panics if the
// downloader, the scheduler or the pipeliner is nil (checked in that order).
// Otherwise urlstr is wrapped with common.NewElementItem and passed to the
// scheduler's AddElementItem together with the flag value false.
func (self *Spider) AddUrl(urlstr string) {
	// Guard clauses: the first missing dependency aborts with its own message.
	switch {
	case self.downloader == nil:
		panic("downloader instance is nil, please init downloader.")
	case self.scheduler == nil:
		panic("scheduler instance is nil, please init scheduler.")
	case self.pipeliner == nil:
		panic("pipeliner instance is nil, please init pipeliner.")
	}

	self.scheduler.AddElementItem(common.NewElementItem(urlstr), false)
}
Beispiel #2
0
// Do processes one downloaded page of the Boqii shop.
//
// For the shop front page (http://shop.boqii.com/) every link found under
// ".menu .menu_body a" is resolved against the request URL and added to the
// page via page.AddElem. For any other page, each
// ".product_list .product_list_container" entry is emitted as a
// " <sold> <name> <price>" line (note the leading space) followed by "\r\n",
// both to stdout and to the package-level file while holding fileMutex, and
// every ".pagination a" link is added to the page.
//
// If the response body cannot be read or parsed, the error is printed and the
// page is skipped instead of continuing with a partial or nil document.
//
// NOTE(review): the response body is not closed here; confirm that the
// downloader that produced page.Res owns and closes it.
func (process *MyProcess) Do(page *cm.Page) {
	reqUrl := page.Req.URL

	body, err := ioutil.ReadAll(page.Res.Body)
	if err != nil {
		fmt.Println(strings.Join([]string{reqUrl.String(), "read body failed:", err.Error()}, " "))
		return
	}

	doc, err := query.NewDocumentFromReader(strings.NewReader(string(body)))
	if err != nil {
		fmt.Println(strings.Join([]string{reqUrl.String(), "parse body failed:", err.Error()}, " "))
		return
	}

	// enqueueLink resolves the href of an anchor against the request URL and
	// adds it to the page. Anchors without an href, or with an href that does
	// not parse, are skipped silently.
	enqueueLink := func(_ int, s *query.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			return
		}
		link, parseErr := reqUrl.Parse(href)
		if parseErr != nil {
			return
		}
		page.AddElem(cm.NewElementItem(link.String()))
	}

	/**------------------------------Boqii product crawler------------------------------**/

	if reqUrl.String() == "http://shop.boqii.com/" {
		doc.Find(".menu .menu_body a").Each(enqueueLink)
	} else {
		// Compiled once per page rather than once per product entry.
		digits := regexp.MustCompile(`\d+`)
		spaces := regexp.MustCompile(`\s+`)

		doc.Find(".product_list .product_list_container").Each(func(_ int, s *query.Selection) {
			name := spaces.ReplaceAllString(s.Find(".product_name a").Text(), "")
			price := spaces.ReplaceAllString(s.Find(".product_price strong").Text(), "")
			// The first run of digits in the "sold" text; a \d+ match cannot
			// contain whitespace, so no further filtering is needed.
			sold := digits.FindString(s.Find(".product_status .product_sold").Text())

			// The empty first element keeps the leading space of the
			// original output format.
			line := strings.Join([]string{"", sold, name, price}, " ") + "\r\n"

			fileMutex.Lock()
			defer fileMutex.Unlock()
			fmt.Println(line)
			if _, writeErr := file.WriteString(line); writeErr != nil {
				fmt.Println(strings.Join([]string{reqUrl.String(), "write product line failed:", writeErr.Error()}, " "))
			}
		})

		doc.Find(".pagination a").Each(enqueueLink)
	}

	/**------------------------------Epet product crawler (disabled)------------------------------**/

	//	if reqUrl.String() == "http://www.epet.com/" {
	//		doc.Find(".catelist h3 a").Each(func(i int, s *query.Selection) {
	//			href, exists := s.Attr("href")
	//			if exists {
	//				url, err := reqUrl.Parse(href)
	//				if err == nil {
	//					urlStr := url.String()
	//					page.AddElem(cm.NewElementItem(urlStr))
	//				}
	//			}
	//		})
	//	} else {
	//		count := 0
	//		doc.Find(".list_box-li").Each(func(i int, s *query.Selection) {
	//			tmp := ""
	//			reg := regexp.MustCompile("\\d+")
	//			filter := regexp.MustCompile("\\s+")
	//			sellNum := ""
	//
	//			name := filter.ReplaceAllString(s.Find(".gtitle").Text(), " ")
	//			price := filter.ReplaceAllString(s.Find(".gprice .price").Text(), " ")
	//			s.Find(".c999 em").Each(func(i int, s *query.Selection) {
	//				if i == 0 {
	//					sellNum = filter.ReplaceAllString(reg.FindString(s.Text()), " ")
	//				}
	//			})
	//
	//			tmp = strings.Join([]string{tmp, sellNum, name, price}, " ")
	//
	//			fileMutex.Lock()
	//			fmt.Println(strings.Join([]string{tmp, "\r\n"}, ""));
	//			file.WriteString(strings.Join([]string{tmp, "\r\n"}, ""))
	//			fileMutex.Unlock()
	//
	//		})
	//
	//		doc.Find(".pages a").Each(func(i int, s *query.Selection) {
	//			href, exists := s.Attr("href")
	//			if exists {
	//				url, err := reqUrl.Parse(href)
	//				if err == nil {
	//					count++
	//					urlStr := url.String()
	//					page.AddElem(cm.NewElementItem(urlStr))
	//				}
	//			}
	//		})
	//
	//		if count == 0 {
	//			fmt.Println(strings.Join([]string{reqUrl.String(), "lose pager data!!!"}, " "))
	//		}
	//	}
}