Example #1
// SeedSlice parses a JSON string of links, creates a Robot for each host,
// appends it to HostInfo, and adds the RootURL to the Queue.
func (f *Fetch) SeedSlice(jsn string) {
	args := jsnLinks(jsn)
	cln := resolv(args)
	for _, str := range cln {
		res, err := f.DoRequest(BotCmd(str))
		if err != nil {
			cognilog.Log("red", err)
			continue
		}
		robot := MakeBot(res)
		if !robot.FullDisallow { // if not FullDisallow add
			f.HostInfo = append(f.HostInfo, robot)
			f.addSeed(NewCmd(str)) // add RootURL to Queue or Frontier.
			cognilog.LogINFO("green", "Root added", str)
		} else {
			cognilog.LogINFO("red", "Root Not added", str)
		}
	}
}
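
A minimal usage sketch. The NewFetch constructor, the user-agent string, and the JSON shape consumed by jsnLinks are assumptions for illustration; the source does not show them, and the JSON is guessed here to be a flat array of root URLs.

	// Hypothetical usage; NewFetch and the JSON shape are assumptions.
	f := NewFetch("examplebot/1.0")
	f.SeedSlice(`["https://example.com/", "https://example.org/"]`)
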
Example #2
// Seed creates a Robot for each URL argument, appends it to HostInfo, and
// adds the RootURL to the Queue.
func (f *Fetch) Seed(args ...string) {
	for _, str := range args {
		res, err := f.DoRequest(BotCmd(str))
		if err != nil {
			cognilog.Log("red", err)
			continue // skip this URL; res is nil on error
		}
		robot := MakeBot(res)
		if !robot.FullDisallow { // if not FullDisallow add
			f.HostInfo = append(f.HostInfo, robot)
			f.addSeed(NewCmd(str)) // add RootURL to Queue or Frontier.
			cognilog.LogINFO("green", "added to Queue", str)
		}
	}
}
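
Seed is the variadic counterpart of SeedSlice, for when the root URLs are already in hand. A sketch under the same assumed constructor:

	// Hypothetical usage; NewFetch is an assumption.
	f := NewFetch("examplebot/1.0")
	f.Seed("https://example.com/", "https://example.org/")
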
Example #3
// MakeBot takes an http.Response and returns a Robot
func MakeBot(res *http.Response) *Robot {
	var robot = new(Robot)
	// Treat all 4xx errors the same way: assume the host imposes no restrictions.
	if res.StatusCode >= 400 && res.StatusCode < 500 {
		robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL()
		robot.FullAllow = true
		robot.CrawDelay = DefaultCrawlDelay
		return robot
	} else if res.StatusCode == 200 {
		byt, err := ioutil.ReadAll(res.Body)
		if err != nil {
			cognilog.LogINFO("red", "Body read error", err)
		}
		redr := strings.NewReader(string(byt))
		scanner := bufio.NewScanner(redr)
		var groups = make(map[string][]string)
		var key string
		var status bool
		for scanner.Scan() {
			txt := strings.ToLower(scanner.Text())
			if txt == "" {
				continue
			}
			// new group
			if strings.HasPrefix(txt, "user-agent:") {
				key = trimSpaces(txt)
				status = true
				continue
			}
			if status && key != "" {
				groups[key] = append(groups[key], trimSpaces(txt))
			}
		}
		if err := scanner.Err(); err != nil {
			cognilog.Log("red", err)
		}
		if isAllowAll(groups) {
			robot.FullAllow = true
		} else if isDisallowAll(groups) {
			robot.FullDisallow = true
		}
		robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL()
		robot.Groups = groups
		robot.CrawDelay = DefaultCrawlDelay
		return robot
	}

	robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL()
	return robot
}
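
A self-contained test sketch for MakeBot using net/http/httptest: it serves a robots.txt that disallows everything and checks the resulting flag. Whether isDisallowAll recognizes this exact body depends on helpers not shown here, so treat the assertion as illustrative rather than definitive.

	// Hypothetical test; assumes imports "fmt", "net/http",
	// "net/http/httptest", and "testing".
	func TestMakeBotDisallowAll(t *testing.T) {
		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			fmt.Fprint(w, "User-agent: *\nDisallow: /\n")
		}))
		defer srv.Close()

		res, err := http.Get(srv.URL + "/robots.txt")
		if err != nil {
			t.Fatal(err)
		}
		defer res.Body.Close()

		robot := MakeBot(res)
		if !robot.FullDisallow {
			t.Errorf("expected FullDisallow, got %+v", robot)
		}
	}
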
Example #4
// Start runs all the specified crawl goroutines.
func (f *Fetch) Start(num int) {
	c := make(chan bool)
	go f.watch(c)
	for i := 1; i <= num; i++ {
		go f.crawl(i)
		f.crawCount++ // count the goroutines actually started
	}
	// watch sends a single value once every queued URL has been visited,
	// so one blocking receive is enough here.
	<-c
	cognilog.LogINFO("yellow", f.UserAgent, "Closed all Crawl threads.")
}
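
Typical call sequence, assuming the fetcher has already been constructed as in the earlier examples:

	f.Seed("https://example.com/")
	f.Start(4) // four crawl goroutines; Start blocks until watch signals completion
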
Example #5
// watch polls f.Queue and f.Visited and signals on c once every queued URL
// has been visited.
func (f *Fetch) watch(c chan bool) {
	for {
		f.mu.RLock()
		queue := len(f.Queue)
		f.mu.RUnlock()
		f.tex.RLock()
		visit := len(f.Visited)
		f.tex.RUnlock()
		if visit > 0 && queue > 0 {
			if visit >= queue {
				c <- true
				break
			}
		}

		cognilog.LogINFO("yellow", "status", fmt.Sprintf("Queue[%d]  Visited[%d]", queue, visit))
		time.Sleep(time.Duration(1) * time.Second)
	}
}
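
The watcher is a plain polling loop over two shared counters that signals on a channel once the visited count catches up with the queue. A standalone, runnable sketch of the same pattern with illustrative names (not code from the crawler itself):

	package main

	import (
		"fmt"
		"sync/atomic"
		"time"
	)

	func main() {
		var queued, visited int64 = 5, 0
		done := make(chan bool)

		go func() { // watcher: fires once visited catches up with queued
			for {
				q := atomic.LoadInt64(&queued)
				v := atomic.LoadInt64(&visited)
				if v > 0 && q > 0 && v >= q {
					done <- true
					return
				}
				time.Sleep(100 * time.Millisecond)
			}
		}()

		for i := 0; i < 5; i++ { // simulate workers finishing pages
			atomic.AddInt64(&visited, 1)
			time.Sleep(50 * time.Millisecond)
		}
		<-done
		fmt.Println("all queued pages visited")
	}
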
Example #6
// crawl is a worker goroutine that fetches the body of a web page and does
// several things with it, such as collecting all the links from it and
// saving it to disk for future use.
func (f *Fetch) crawl(cr int) {
	for {
		// Get cmd from the queue and update the index num once done.
		var lnk Cmder
		f.mu.Lock() // full lock: f.index is mutated below
		if len(f.Queue) == 0 { // if the queue is empty
			cognilog.FatalINFO("red", "panic", "Empty Queue exiting Now!")
		} else if len(f.Queue) == f.index {
			f.mu.Unlock() // release the lock before leaving the loop
			break
		}
		num := f.index
		lnk = f.Queue[num]
		num++
		f.index = num
		f.mu.Unlock()
		cognilog.LogINFO("cyan", lnk.Method(), fmt.Sprintf("%v", lnk.URL().String()))
		res, err := f.DoRequest(lnk)
		if err != nil || res.StatusCode > 300 {
			if err == nil {
				cognilog.LogINFO("red", fmt.Sprintf("Crawl %d 404", cr), " [Page not found]")
				res.Body.Close() // close the body before skipping this URL
				// append cmd to Visited
				f.tex.Lock()
				f.Visited = append(f.Visited, lnk)
				f.tex.Unlock()
				continue
			}
			cognilog.LogINFO("red", fmt.Sprintf("Crawl %d [request not 200 ok]", cr), err)
			// append cmd to Visited
			f.tex.Lock()
			f.Visited = append(f.Visited, lnk)
			f.tex.Unlock()
			continue
		}
		// write the page to disk
		byt, err := ioutil.ReadAll(res.Body)
		res.Body.Close() // the body is fully consumed here; close it on every path
		if err != nil {
			cognilog.LogINFO("red", fmt.Sprintf("Crawl %d body", cr), err)
			// append cmd to Visited
			f.tex.Lock()
			f.Visited = append(f.Visited, lnk)
			f.tex.Unlock()
			continue
		}
		err = ioutil.WriteFile("docs/"+docName(lnk.URL()), byt, 0755)
		if err != nil {
			cognilog.LogINFO("red", fmt.Sprintf("Crawl %d write ", cr), err)
			// append cmd to Visited
			f.tex.Lock()
			f.Visited = append(f.Visited, lnk)
			f.tex.Unlock()
			continue
		}
		redr := strings.NewReader(string(byt))
		anchColl := collectlinks.All(redr)
		for _, a := range anchColl {
			cmd, err := parseCmd(a, lnk.URL())
			if err != nil {
				continue
			}

			// skip if Host maxed out
			if count(f.HostCount, cmd.URL().Host) >= f.MaxPages {
				continue
			}

			// skip if resource not available
			res, err := f.DoRequest(cmd)
			if err != nil || res.StatusCode > 300 {
				if err == nil {
					cognilog.LogINFO("red", fmt.Sprintf("Crawl %d error", cr), "page not found")
					res.Body.Close()
					continue
				}
				cognilog.LogINFO("red", fmt.Sprintf("Crawl %d response not 200", cr), err)
				continue
			}
			res.Body.Close() // only the status code matters for this probe; close the body

			// lock the queue again; we may append to it below, so take a full write lock
			f.mu.Lock()
			if checkURL(f.Queue, cmd.URL()) { // if the url is present in the queue, continue
				cognilog.LogINFO("magenta", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Already in the Queue skip.]", cmd.URL().String()))
				f.mu.Unlock() // unlock before continue
				continue
			}

			// robot exclusion
			if !robExcl(cmd, f.HostInfo) {
				cognilog.LogINFO("magenta", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Disallowed by robot.]", cmd.URL().String()))
				f.mu.Unlock() // unlock before continue
				continue
			}

			// if its a blob or pdf skip
			if filter(cmd) {
				cognilog.LogINFO("red", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Not Accepted]", cmd.URL().String()))
				f.mu.Unlock() // unlock before continue
				continue
			}
			// appends cmd to Queue
			func(cd Cmder) {
				var host bool
				for _, rob := range f.HostInfo { // check for existing host
					if rob.RootURL.Host == cd.URL().Host {
						host = true
						break
					}
				}
				if !host {
					cognilog.LogINFO("magenta", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Host Not white listed]", cd.URL().Host))
					f.mu.Unlock()
					return
				}

				if count(f.HostCount, cd.URL().Host) >= f.MaxPages {
					cognilog.LogINFO("white", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [Host slots maxed out]", cd.URL().Host))
					f.mu.Unlock()
					return
				}
				f.HostCount = append(f.HostCount, cd.URL().Host)

				f.Queue = append(f.Queue, cd)
				cognilog.LogINFO("green", fmt.Sprintf("Crawl %d", cr), fmt.Sprintf("%v [URL add to Queue]", cd.URL().String()))
				f.mu.Unlock()
			}(cmd)
		}

		// append cmd to Visited
		f.tex.Lock()
		f.Visited = append(f.Visited, lnk)
		f.tex.Unlock()

		time.Sleep(DefaultCrawlDelay)
	}
}
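
End to end, the crawler is driven by Seed followed by Start. A hypothetical driver, assuming the package is imported under the name fetch and exposes a NewFetch constructor (neither the import path nor the constructor appears in the source):

	package main

	import "github.com/example/fetch" // assumed import path

	func main() {
		f := fetch.NewFetch("examplebot/1.0") // hypothetical constructor
		f.Seed("https://example.com/")        // robots.txt check + enqueue root
		f.Start(4)                            // blocks until the watcher reports done
	}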