Example #1
// DoCrawl drives the crawl: seedSpider wraps the seed URL, spiderCrawlFunc fetches a page,
// spiderAnalystFunc processes a fetched response, and maxConcurrencyNum caps concurrent fetches.
func DoCrawl(seedSpider *Spider, spiderCrawlFunc SpiderFunc, spiderAnalystFunc SpiderFunc, maxConcurrencyNum int) {
	logger := logger4go.GetDefaultLogger()

	// visited is read and written by every fetch goroutine, so guard it with a mutex.
	var visitedMu sync.Mutex
	visited := map[string]bool{}

	// The crawler uses two channels: one for URLs to crawl, one for fetched responses.
	var spiderChannel = make(chan *Spider) // spiders waiting to be crawled
	var dataChannel = make(chan *Spider)   // responses waiting to be analyzed

	var wg sync.WaitGroup
	logger.Info("Crawl start!")

	go func() {
		spiderChannel <- seedSpider
	}()

	// Start maxConcurrencyNum worker goroutines; this bounds fetch concurrency.
	for i := 0; i < maxConcurrencyNum; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for spider := range spiderChannel {
				// Claim the URL under the lock; skip it if another worker already took it.
				visitedMu.Lock()
				seen := visited[spider.Url]
				if !seen {
					visited[spider.Url] = true
				}
				visitedMu.Unlock()
				if seen {
					continue
				}
				if err := spiderCrawlFunc(spider, dataChannel); err != nil {
					logger.Error(err)
				}
			}
		}()
	}

	// Each response arriving on dataChannel gets its own analyst goroutine.
	// Note: this range never ends unless dataChannel is closed, so DoCrawl blocks
	// for the life of the crawl and wg.Wait() below is only reached if the
	// channel is closed elsewhere.
	for data := range dataChannel {
		wg.Add(1)
		go func(data *Spider) {
			defer wg.Done()
			if err := spiderAnalystFunc(data, spiderChannel); err != nil {
				logger.Error(err)
			}
		}(data)
	}

	wg.Wait()

}
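The SpiderFunc type is not shown in this example; its shape can be inferred from the calls spiderCrawlFunc(spider, dataChannel) and spiderAnalystFunc(data, spiderChannel), both of which return an error. A minimal sketch, assuming that inferred signature:

// Inferred from usage in DoCrawl; the real declaration may differ.
// The second argument is the channel the callback feeds its output into.
type SpiderFunc func(spider *Spider, out chan *Spider) error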
Example #2
func main() {
	logger := logger4go.GetDefaultLogger()

	crawlFunc := func(spider *crawler.Spider, dataChannel chan *crawler.Spider) error {

		time.Sleep(spider.CrawlDelay)
		var client *http.Client

		// Route the request through a proxy when one is configured on the spider.
		if len(spider.ProxyUrl) > 0 {
			proxyUrl, err := url.Parse(spider.ProxyUrl)

			if nil != err {
				return err
			}

			client = &http.Client{Transport: &http.Transport{Proxy: http.ProxyURL(proxyUrl)}, Timeout: spider.TTL}
		} else {
			client = &http.Client{Timeout: spider.TTL}
		}

		logger.Info(spider.Url)
		logger.Info(spider.Generation)
		logger.Info(runtime.NumGoroutine())
		req, err := http.NewRequest(spider.Method, spider.Url, nil)
		if err != nil {
			return err
		}

		req.Header.Set("Content-Type", spider.ContentType)
		//req.Header.Set("Cookie", "name=anny")

		resp, err := client.Do(req)

		if err != nil {
			return err
		}

		defer resp.Body.Close()
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			return err
		}

		spider.RequestURL = resp.Request.URL
		spider.ResponseData = string(body)
		//logger.Info(spider.ResponseData)
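		// Hand off on a goroutine so a busy dataChannel cannot block this fetch worker.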
		go func() {
			dataChannel <- spider
		}()

		return nil
	}

	analystFunc := func(spider *crawler.Spider, spiderChannel chan *crawler.Spider) error {

		if len(spider.ResponseData) == 0 {
			return nil
		}
		reader := strings.NewReader(spider.ResponseData)

		doc, err := goquery.NewDocumentFromReader(reader)
		if err != nil {
			return err
		}
		//fmt.Println(strconv.FormatInt(spider.Generation, 10) + "---" + spider.Url)

		doc.Url = spider.RequestURL

		// Find every link in the response, build a child spider for it, and queue it on the spider channel.
		doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
			link, exist := s.Attr("href")
			if exist {
				if len(link) > 0 && !strings.HasPrefix(link, "#") {
					if parsed, e := url.Parse(link); e == nil {
						parsed = doc.Url.ResolveReference(parsed)
						var parsedLink = parsed.String()
						//fmt.Println("parsing...")

						// Follow only absolute links that stay on the crawled site's host.
						if strings.HasPrefix(parsedLink, "http") && strings.Contains(parsedLink, spider.RequestURL.Host) {
							childSpider := spider.NewChildSpider(parsedLink)
							go func() {
								spiderChannel <- childSpider
							}()
						}
					}
				}
			}

		})

		// Look for pages whose answers have more than 10,000 upvotes.
		/*if strings.Index(spider.Url, "question") > 0 && strings.Index(spider.Url, "answer") > 0 {
			doc.Find("span.count").Each(func(i int, s *goquery.Selection) {
				upCountStr, _ := s.Html()
				upCountStr = strings.Replace(upCountStr, "K", "000", -1)
				upCount, err := strconv.ParseInt(upCountStr, 10, 32)
				if err != nil {
					logger.Error(err)
				}
				fmt.Println(upCount)
				if upCount > 10000 {
					logger.Info(spider.Url)
				}
			})
		}*/

		//fmt.Println("analys end")
		// zhihu-specific: if the current page is a question with an answer, try to build the
		// answer's comment-box URL and add it to the crawl.
		/*if strings.Index(spider.Url, "question") > 0 && strings.Index(spider.Url, "#answer") > 0 {
			//l := "http://www.zhihu.com/node/AnswerCommentBoxV2?params=%7B%22answer_id%22%3A%22"
			//l += spider.Url[strings.LastIndex(spider.Url, "answer\\"):]
			re, _ := regexp.Compile("[0-9]+")
			submatch := re.FindAllString(spider.Url, -1)
			//allindex := re.FindAllIndex([]byte(spider.Url), -1)
			l := "http://www.zhihu.com/node/AnswerCommentBoxV2?params=%7B%22answer_id%22%3A%22"
			l += submatch[1]
			l += "%22%2C%22load_all%22%3Atrue%7D"
			childSpider := spider.NewChildSpider(l)
			go func() {
				spiderChannel <- childSpider
			}()
			fmt.Println(spider.Url)
			fmt.Println(l)
		}

		// zhihu-specific: if this is a comment-box URL, look up the answer/question link map stored earlier in the cache.
		if strings.Index(spider.Url, "AnswerCommentBoxV2") > 0 {
			fmt.Println("comment")
		}*/

		// Other analysis hooks can be added here.
		/*doc.Find("zm-comment-content").Each(func(i int, s *goquery.Selection) {
			html, _ := s.Html()
			fmt.Println(html)
		})*/

		return nil

	}

	seedSpider := crawler.NewDefaultGetSpider("http://www.zhihu.com")
	crawler.DoCrawl(seedSpider, crawlFunc, analystFunc, 1000)
}
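Neither the Spider type nor its constructors appear in these examples. A minimal sketch of what the fields and helpers used above might look like, reconstructed from usage only (every name and default below is an assumption, not the library's actual definition):

// Hypothetical reconstruction: only the fields and methods exercised above.
type Spider struct {
	Url          string        // URL to fetch
	Method       string        // HTTP method, e.g. "GET"
	ContentType  string        // value for the Content-Type request header
	ProxyUrl     string        // optional proxy address; empty means a direct connection
	TTL          time.Duration // HTTP client timeout
	CrawlDelay   time.Duration // politeness delay before each request
	Generation   int64         // crawl depth, incremented for each child
	RequestURL   *url.URL      // final URL after redirects
	ResponseData string        // fetched response body
}

// NewDefaultGetSpider might build a generation-zero GET spider for the seed URL.
func NewDefaultGetSpider(rawurl string) *Spider {
	return &Spider{Url: rawurl, Method: "GET", TTL: 30 * time.Second}
}

// NewChildSpider might copy the parent's settings and bump the generation.
func (s *Spider) NewChildSpider(link string) *Spider {
	child := *s
	child.Url = link
	child.Generation = s.Generation + 1
	child.ResponseData = ""
	return &child
}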
Example #3
func sign() {
	logger := logger4go.GetDefaultLogger()

	options := cookiejar.Options{
		PublicSuffixList: publicsuffix.List,
	}
	jar, err := cookiejar.New(&options)
	if err != nil {
		logger.Error(err)
		return
	}

	client := &http.Client{
		CheckRedirect: nil,
		Jar:           jar,
	}
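	// Hit the login page first so the cookie jar picks up the initial session cookies.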
	client.Get("http://www.zimuzu.tv/user/login")

	// login form data
	formValue := url.Values{}
	config, err := ParseConfig("config.json")
	if err != nil {
		logger.Error(err)
		return
	}
	for key, val := range config {
		formValue.Set(key, val)
	}

	req, err := http.NewRequest("POST", "http://www.zimuzu.tv/User/Login/ajaxLogin", bytes.NewBufferString(formValue.Encode()))
	if err != nil {
		logger.Error(err)
		return
	}
	req.Header.Set("Accept", "application/json, text/javascript, */*; q=0.01")
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	resp, err := client.Do(req)
	if err != nil {
		logger.Error(err)
		return
	}
	/*content, err := ioutil.ReadAll(resp.Body)

	if err != nil {
		logger.Error(err)
	}
	fmt.Println("login : " + string(content))*/

	resp, err = client.Get("http://www.zimuzu.tv/user/sign")
	/*content, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		logger.Error("get page: ", err)
	}
	fmt.Println(string(content))*/

	time.Sleep(3 * time.Second)
	resp, err = client.Get("http://www.zimuzu.tv/user/login/getCurUserTopInfo")

	/*content, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		logger.Error("get Cur User Top Info : ", err)
	}

	fmt.Println(string(content))*/
	resp, err = client.Get("http://www.zimuzu.tv/user/&")

	// Daily sign-in is no longer needed; logging in is enough, so stop here.
	// The dosign request below is kept for reference but is now unreachable.
	return

	time.Sleep(15 * time.Second)
	resp, err = client.Get("http://www.zimuzu.tv/user/sign/dosign")
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		logger.Error(err)
		return
	}
	logger.Info(string(body))
}
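ParseConfig is not part of this listing; the call above only shows that it takes a path and returns something ranged over as string key/value pairs, plus an error. A minimal sketch under that assumption, treating config.json as a flat JSON object of form-field names to values (the real helper may differ):

// Hypothetical reconstruction, inferred from usage in sign().
// Requires "encoding/json" and "io/ioutil".
func ParseConfig(path string) (map[string]string, error) {
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return nil, err
	}
	config := map[string]string{}
	if err := json.Unmarshal(data, &config); err != nil {
		return nil, err
	}
	return config, nil
}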