// DoCrawl runs the crawl loop.
// seedSpider: spider for the seed URL; spiderCrawlFunc: how a spider fetches its URL;
// spiderAnalystFunc: how a spider handles the response; maxConcurrencyNum: number of crawler goroutines.
func DoCrawl(seedSpider *Spider, spiderCrawlFunc SpiderFunc, spiderAnalystFunc SpiderFunc, maxConcurrencyNum int) {
	logger := logger4go.GetDefaultLogger()

	// The crawler has two channels: one carries URLs to crawl, one carries fetched data.
	spiderChannel := make(chan *Spider) // spiders waiting to be crawled
	dataChannel := make(chan *Spider)   // fetched spiders waiting to be analyzed

	// visited is shared by all crawler goroutines, so guard it with a mutex.
	visited := map[string]bool{}
	var mu sync.Mutex

	logger.Info("Crawl start!")
	go func() { spiderChannel <- seedSpider }()

	// Start maxConcurrencyNum goroutines to crawl.
	for i := 0; i < maxConcurrencyNum; i++ {
		go func() {
			for spider := range spiderChannel {
				mu.Lock()
				seen := visited[spider.Url]
				visited[spider.Url] = true
				mu.Unlock()
				if seen {
					continue
				}
				if err := spiderCrawlFunc(spider, dataChannel); err != nil {
					logger.Error(err)
				}
			}
		}()
	}

	// Start an analyst goroutine for each response that arrives on dataChannel.
	// This loop runs forever, so DoCrawl never returns.
	for data := range dataChannel {
		go func(data *Spider) {
			if err := spiderAnalystFunc(data, spiderChannel); err != nil {
				logger.Error(err)
			}
		}(data)
	}
}
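// DoCrawl relies on a Spider struct, a SpiderFunc type, and a NewChildSpider
// method that are not shown in this excerpt. The sketch below is inferred from
// how the fields are used in DoCrawl and main (e.g. Generation is formatted
// with strconv.FormatInt, so it is taken to be int64); the actual definitions
// in the crawler package may differ. Assumes imports "net/url" and "time".
type Spider struct {
	Url          string        // URL this spider will fetch
	Method       string        // HTTP method, e.g. "GET"
	ContentType  string        // value for the Content-Type request header
	ProxyUrl     string        // optional proxy URL; empty means no proxy
	TTL          time.Duration // HTTP client timeout
	CrawlDelay   time.Duration // politeness delay before each request
	Generation   int64         // depth of this spider from the seed
	RequestURL   *url.URL      // final URL after redirects, set by the crawl func
	ResponseData string        // response body, set by the crawl func
}

// SpiderFunc is the shape shared by the crawl and analyst callbacks: it takes
// the spider to work on and the channel to forward results to.
type SpiderFunc func(spider *Spider, out chan *Spider) error

// NewChildSpider derives a spider for a discovered link, inheriting the
// parent's settings and bumping Generation (hypothetical implementation;
// the real method is not shown in this excerpt).
func (s *Spider) NewChildSpider(link string) *Spider {
	child := *s
	child.Url = link
	child.Generation = s.Generation + 1
	child.RequestURL = nil
	child.ResponseData = ""
	return &child
}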
func main() {
	logger := logger4go.GetDefaultLogger()

	crawlFunc := func(spider *crawler.Spider, dataChannel chan *crawler.Spider) error {
		time.Sleep(spider.CrawlDelay)

		// Use a proxy if one is configured.
		var client *http.Client
		if len(spider.ProxyUrl) > 0 {
			proxyUrl, err := url.Parse(spider.ProxyUrl)
			if err != nil {
				return err
			}
			client = &http.Client{
				Transport: &http.Transport{Proxy: http.ProxyURL(proxyUrl)},
				Timeout:   spider.TTL,
			}
		} else {
			client = &http.Client{Timeout: spider.TTL}
		}

		logger.Info(spider.Url)
		logger.Info(spider.Generation)
		logger.Info(runtime.NumGoroutine())

		req, err := http.NewRequest(spider.Method, spider.Url, nil)
		if err != nil {
			return err
		}
		req.Header.Set("Content-Type", spider.ContentType)

		resp, err := client.Do(req)
		if err != nil {
			return err
		}
		defer resp.Body.Close()

		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			return err
		}
		spider.RequestURL = resp.Request.URL
		spider.ResponseData = string(body)

		// Hand the response to the analyst without blocking this crawler goroutine.
		go func() { dataChannel <- spider }()
		return nil
	}

	analystFunc := func(spider *crawler.Spider, spiderChannel chan *crawler.Spider) error {
		if len(spider.ResponseData) == 0 {
			return nil
		}
		reader := strings.NewReader(spider.ResponseData)
		doc, err := goquery.NewDocumentFromReader(reader)
		if err != nil {
			return err
		}
		doc.Url = spider.RequestURL

		// Find links in the response, generate child spiders, and send them to the spider channel.
		doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
			link, exist := s.Attr("href")
			if exist && len(link) > 0 && !strings.HasPrefix(link, "#") {
				if parsed, e := url.Parse(link); e == nil {
					parsedLink := doc.Url.ResolveReference(parsed).String()
					// Only follow absolute links that stay on the same host.
					if strings.HasPrefix(parsedLink, "http") && strings.Index(parsedLink, spider.RequestURL.Host) > 0 {
						childSpider := spider.NewChildSpider(parsedLink)
						go func() { spiderChannel <- childSpider }()
					}
				}
			}
		})

		// Find pages whose answers have more than 10k upvotes.
		/*if strings.Index(spider.Url, "question") > 0 && strings.Index(spider.Url, "answer") > 0 {
			doc.Find("span.count").Each(func(i int, s *goquery.Selection) {
				upCountStr, _ := s.Html()
				upCountStr = strings.Replace(upCountStr, "K", "000", -1)
				upCount, err := strconv.ParseInt(upCountStr, 10, 32)
				if err != nil {
					logger.Error(err)
				}
				fmt.Println(upCount)
				if upCount > 10000 {
					logger.Info(spider.Url)
				}
			})
		}*/

		// Zhihu-specific: if the current page contains a question and an answer,
		// try to build the comment-box URL for the answer and add it to the crawl.
		/*if strings.Index(spider.Url, "question") > 0 && strings.Index(spider.Url, "#answer") > 0 {
			re, _ := regexp.Compile("[0-9]+")
			submatch := re.FindAllString(spider.Url, -1)
			l := "http://www.zhihu.com/node/AnswerCommentBoxV2?params=%7B%22answer_id%22%3A%22"
			l += submatch[1]
			l += "%22%2C%22load_all%22%3Atrue%7D"
			childSpider := spider.NewChildSpider(l)
			go func() { spiderChannel <- childSpider }()
			fmt.Println(spider.Url)
			fmt.Println(l)
		}

		// Zhihu-specific: if this is a comment URL, look up the answer-to-question
		// link map stored earlier in the cache.
		if strings.Index(spider.Url, "AnswerCommentBoxV2") > 0 {
			fmt.Println("comment")
		}*/

		// Other analysis could go here.
		/*doc.Find("zm-comment-content").Each(func(i int, s *goquery.Selection) {
			html, _ := s.Html()
			fmt.Println(html)
		})*/

		return nil
	}

	seedSpider := crawler.NewDefaultGetSpider("http://www.zhihu.com")
	crawler.DoCrawl(seedSpider, crawlFunc, analystFunc, 1000)
}
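// The same-host link resolution inside analystFunc is what keeps the crawl on
// one site. Below is a standalone, stdlib-only sketch of just that step; the
// URLs are illustrative, not taken from the crawler.
package main

import (
	"fmt"
	"net/url"
	"strings"
)

func main() {
	base, _ := url.Parse("http://www.zhihu.com/question/12345")
	for _, link := range []string{
		"/question/67890",            // relative: resolved against base, followed
		"http://www.zhihu.com/topic", // absolute, same host: followed
		"http://example.com/other",   // absolute, different host: skipped
		"#comments",                  // fragment only: skipped before parsing
	} {
		if strings.HasPrefix(link, "#") {
			continue
		}
		parsed, err := url.Parse(link)
		if err != nil {
			continue
		}
		// ResolveReference turns a relative href into an absolute URL.
		resolved := base.ResolveReference(parsed).String()
		if strings.HasPrefix(resolved, "http") && strings.Contains(resolved, base.Host) {
			fmt.Println("follow:", resolved)
		}
	}
}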
func sign() {
	logger := logger4go.GetDefaultLogger()

	// Use a cookie jar so the session cookie from login is sent on later requests.
	options := cookiejar.Options{
		PublicSuffixList: publicsuffix.List,
	}
	jar, err := cookiejar.New(&options)
	if err != nil {
		logger.Error(err)
		return
	}
	client := &http.Client{
		CheckRedirect: nil,
		Jar:           jar,
	}
	client.Get("http://www.zimuzu.tv/user/login")

	// Build the login form data from config.json.
	formValue := url.Values{}
	config, err := ParseConfig("config.json")
	if err != nil {
		logger.Error(err)
		return
	}
	for key, val := range config {
		formValue.Set(key, val)
	}

	req, _ := http.NewRequest("POST", "http://www.zimuzu.tv/User/Login/ajaxLogin", bytes.NewBufferString(formValue.Encode()))
	req.Header.Set("Accept", "application/json, text/javascript, */*; q=0.01")
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	resp, _ := client.Do(req)
	/*content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		logger.Error(err)
	}
	fmt.Println("login : " + string(content))*/

	resp, err = client.Get("http://www.zimuzu.tv/user/sign")
	/*content, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		logger.Error("get page: ", err)
	}
	fmt.Println(string(content))*/

	time.Sleep(3 * time.Second)
	resp, err = client.Get("http://www.zimuzu.tv/user/login/getCurUserTopInfo")
	/*content, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		logger.Error("get Cur User Top Info : ", err)
	}
	fmt.Println(string(content))*/

	resp, err = client.Get("http://www.zimuzu.tv/user/&")
	return

	// Signing in is no longer necessary; logging in is enough, so the code
	// below is unreachable.
	time.Sleep(15 * time.Second)
	resp, err = client.Get("http://www.zimuzu.tv/user/sign/dosign")
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		logger.Error(err)
		return
	}
	logger.Info(string(body))
}
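// ParseConfig is referenced above but not shown. A minimal sketch, assuming
// config.json is a flat JSON object mapping form-field names to values; the
// field names below are hypothetical, since the login endpoint's expected
// fields are not shown in this excerpt. Assumes imports "encoding/json" and
// "io/ioutil".
//
//	{"account": "user@example.com", "password": "secret", "remember": "1"}
func ParseConfig(path string) (map[string]string, error) {
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return nil, err
	}
	// Every key/value pair in the JSON object becomes one form field.
	config := map[string]string{}
	if err := json.Unmarshal(data, &config); err != nil {
		return nil, err
	}
	return config, nil
}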