// It creates a robot type, appends it to hostinfo and RootURL to the Queue. func (f *Fetch) SeedSlice(jsn string) { args := jsnLinks(jsn) cln := resolv(args) for _, str := range cln { res, err := f.DoRequest(BotCmd(str)) if err != nil { cognilog.Log("red", err) continue } robot := MakeBot(res) if !robot.FullDisallow { // if not FullDisallow add f.HostInfo = append(f.HostInfo, robot) f.addSeed(NewCmd(str)) // add RootURL to Queue or Frontier. cognilog.LogINFO("green", "Root added", str) } else { cognilog.LogINFO("red", "Root Not added", str) } } }
// Seed creates a robot type, appends it to hostinfo and appends RootURL to // the Queue. func (f *Fetch) Seed(args ...string) { for _, str := range args { res, err := f.DoRequest(BotCmd(str)) if err != nil { cognilog.Log("red", err) } robot := MakeBot(res) if !robot.FullDisallow { // if not FullDisallow add f.HostInfo = append(f.HostInfo, robot) f.addSeed(NewCmd(str)) // add RootURL to Queue or Frontier. cognilog.LogINFO("green", "added to Queue", str) } } }
// MakeBot takes an http.Response and returns a Robot func MakeBot(res *http.Response) *Robot { var robot = new(Robot) // treat all 4xx errors in the same way. Assume that there are no restrictions. if res.StatusCode >= 400 && res.StatusCode > 500 { robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL() robot.FullAllow = true robot.CrawDelay = DefaultCrawlDelay return robot } else if res.StatusCode == 200 { byt, err := ioutil.ReadAll(res.Body) if err != nil { cognilog.LogINFO("red", "Body read error", err) } redr := strings.NewReader(string(byt)) scanner := bufio.NewScanner(redr) var groups = make(map[string][]string) var key string var status bool for scanner.Scan() { txt := strings.ToLower(scanner.Text()) if txt == "" { continue } // new group if strings.HasPrefix(txt, "user-agent:") { key = trimSpaces(txt) status = true continue } if status && key != "" { groups[key] = append(groups[key], trimSpaces(txt)) } } if err := scanner.Err(); err != nil { cognilog.Log("red", err) } if isAllowAll(groups) { robot.FullAllow = true } else if isDisallowAll(groups) { robot.FullDisallow = true } robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL() robot.Groups = groups robot.CrawDelay = DefaultCrawlDelay return robot } robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL() return robot }
// Start runs all the specified crawl goroutines. func (f *Fetch) Start(num int) { c := make(chan bool) go f.watch(c) for i := 1; i <= num; i++ { go f.crawl(i) f.crawCount = f.crawCount + i } for { if <-c { cognilog.LogINFO("yellow", f.UserAgent, "Closed all Crawl threads.") break } time.Sleep(time.Duration(1) * time.Second) } }
// watch over f.Queue and f.Visited. func (f *Fetch) watch(c chan bool) { for { queue := len(f.Queue) visit := len(f.Visited) if visit > 0 && queue > 0 { if visit >= queue { c <- true break } } cognilog.LogINFO("yellow", "status", fmt.Sprintf("Queue[%d] Visited[%d]", queue, visit)) time.Sleep(time.Duration(1) * time.Second) } }
// crawl thread that gets the body of a web pages and do lots of things with it, // such as collectiong all the links from it and saving it to disk for future use. func (f *Fetch) crawl(cr int) { for { // Get cmd from the queue and update the index num once done. var lnk Cmder f.mu.RLock() if len(f.Queue) == 0 { // if the queue is empty cognilog.FatalINFO("red", "panic", "Empty Queue exiting Now!") } else if len(f.Queue) == f.index { break } num := f.index lnk = f.Queue[num] num++ f.index = num f.mu.RUnlock() cognilog.LogINFO("cyan", lnk.Method(), fmt.Sprintf("%v", lnk.URL().String())) res, err := f.DoRequest(lnk) if err != nil || res.StatusCode > 300 { if err == nil { cognilog.LogINFO("red", fmt.Sprintf("Crawl %d 404", cr), " [Page not found]") // append cmd to Visited f.tex.RLock() f.Visited = append(f.Visited, lnk) f.tex.RUnlock() continue } cognilog.LogINFO("red", fmt.Sprintf("Crawl %d [request not 200 ok]", cr), err) // append cmd to Visited f.tex.RLock() f.Visited = append(f.Visited, lnk) f.tex.RUnlock() continue } // write the page to disk byt, err := ioutil.ReadAll(res.Body) if err != nil { cognilog.LogINFO("red", fmt.Sprintf("Crawl %d body", cr), err) // append cmd to Visited f.tex.RLock() f.Visited = append(f.Visited, lnk) f.tex.RUnlock() continue } err = ioutil.WriteFile("docs/"+docName(lnk.URL()), byt, 0755) if err != nil { cognilog.LogINFO("red", fmt.Sprintf("Crawl %d write ", cr), err) // append cmd to Visited f.tex.RLock() f.Visited = append(f.Visited, lnk) f.tex.RUnlock() continue } redr := strings.NewReader(string(byt)) anchColl := collectlinks.All(redr) for _, a := range anchColl { cmd, err := parseCmd(a, lnk.URL()) if err != nil { continue } // skip if Host maxed out if count(f.HostCount, cmd.URL().Host) >= f.MaxPages { continue } // skip if resource not available res, err := f.DoRequest(cmd) if err != nil || res.StatusCode > 300 { if err == nil { cognilog.LogINFO("red", fmt.Sprintf("Crawl %d error", cr), "page not found") res.Body.Close() continue } cognilog.LogINFO("red", fmt.Sprintf("Crawl %d response not 200", cr), err) continue } // lock queue again f.mu.RLock() if checkURL(f.Queue, cmd.URL()) { // if the url is present in the queue, continue cognilog.LogINFO("magenta", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Already in the Queue skip.]", cmd.URL().String())) f.mu.RUnlock() // unlock before continue continue } // robot exclusion if !robExcl(cmd, f.HostInfo) { cognilog.LogINFO("magenta", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Disallowed by robot.]", cmd.URL().String())) f.mu.RUnlock() // unlock before continue continue } // if its a blob or pdf skip if filter(cmd) { cognilog.LogINFO("red", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Not Accepted]", cmd.URL().String())) f.mu.RUnlock() // unlock before continue continue } // appends cmd to Queue func(cd Cmder) { var host bool for _, rob := range f.HostInfo { // check for existing host if rob.RootURL.Host == cd.URL().Host { host = true break } } if host == false { cognilog.LogINFO("magenta", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Host Not white listed]", cd.URL().Host)) f.mu.RUnlock() return } if count(f.HostCount, cd.URL().Host) >= f.MaxPages { cognilog.LogINFO("white", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Host slots maxed out]", cd.URL().Host)) f.mu.RUnlock() return } f.HostCount = append(f.HostCount, cd.URL().Host) f.Queue = append(f.Queue, cd) cognilog.LogINFO("green", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [URL add to Queue]", cd.URL().String())) f.mu.RUnlock() }(cmd) } // append cmd to Visited f.tex.RLock() f.Visited = append(f.Visited, lnk) f.tex.RUnlock() res.Body.Close() time.Sleep(DefaultCrawlDelay) } }