Example #1
0
// ClearDocs deletes the directory "./" + docs together with its contents and
// then creates it again with mode 0755. Errors from either step are logged in
// red; nothing is returned to the caller.
func ClearDocs() {
	dir := "./" + docs
	// Remove the existing tree.
	if rmErr := os.RemoveAll(dir); rmErr != nil {
		cognilog.Log("red", rmErr)
	}
	// Recreate the directory; this is attempted even when the removal failed.
	if mkErr := os.Mkdir(dir, 0755); mkErr != nil {
		cognilog.Log("red", mkErr)
	}
}
Example #2
0
// jsnLinks reads the JSON file at path s and returns the string slice stored
// under its "links" key.
//
// The file is decoded as a JSON object whose values are arrays of strings.
// If the file cannot be read, the error is logged in red and nil is returned
// without attempting to decode. If decoding fails, the error is logged in red
// and whatever was decoded under "links" (possibly nil) is returned.
func jsnLinks(s string) []string {
	config, err := ioutil.ReadFile(s)
	if err != nil {
		cognilog.Log("red", err)
		// There is nothing to decode; unmarshalling nil data would only log
		// a second, misleading "unexpected end of JSON input" error.
		return nil
	}

	jsn := make(map[string][]string)
	if err := json.Unmarshal(config, &jsn); err != nil {
		cognilog.Log("red", err)
	}
	return jsn["links"]
}
Example #3
0
// NewCmd builds a Cmd with method "GET" whose U field is the result of
// parsing s with url.Parse. A parse failure is logged in red and the Cmd is
// still returned, with U set to whatever url.Parse produced.
func NewCmd(s string) *Cmd {
	parsed, parseErr := url.Parse(s)
	if parseErr != nil {
		cognilog.Log("red", parseErr)
	}
	cmd := new(Cmd)
	cmd.U = parsed
	cmd.M = "GET"
	return cmd
}
Example #4
0
// BotCmd returns a Cmd with method "GET" whose U field is robotPath resolved
// against the URL parsed from s.
//
// If s cannot be parsed, the error is logged in red and the returned Cmd has
// a nil U, which mirrors what NewCmd returns for an unparsable string.
func BotCmd(s string) *Cmd {
	lnk, err := url.Parse(s)
	if err != nil {
		cognilog.Log("red", err)
		// url.Parse yields a nil *URL on failure; calling ResolveReference on
		// it would panic, so stop here.
		return &Cmd{M: "GET"}
	}
	return &Cmd{
		U: lnk.ResolveReference(robotPath),
		M: "GET",
	}
}
Example #5
0
// parseCmd parses s as a URL reference, resolves it against lnk and returns
// the resolved URL wrapped in a Cmd with method "GET". When s cannot be
// parsed, the error is logged in red and returned alongside a nil Cmd.
func parseCmd(s string, lnk *url.URL) (*Cmd, error) {
	ref, parseErr := url.Parse(s)
	if parseErr != nil {
		cognilog.Log("red", parseErr)
		return nil, parseErr
	}

	cmd := new(Cmd)
	cmd.U = lnk.ResolveReference(ref)
	cmd.M = "GET"
	return cmd, nil
}
Example #6
0
// Seed processes each string in args as a crawl root. For every entry it
// issues the request built by BotCmd, turns the response into a Robot with
// MakeBot and, unless that Robot is marked FullDisallow, appends the Robot to
// f.HostInfo and hands NewCmd(str) to f.addSeed.
//
// An entry whose request fails is logged in red and skipped.
func (f *Fetch) Seed(args ...string) {
	for _, str := range args {
		res, err := f.DoRequest(BotCmd(str))
		if err != nil {
			cognilog.Log("red", err)
			// Without a usable response MakeBot would dereference nil;
			// move on to the next root.
			continue
		}
		robot := MakeBot(res)
		// MakeBot reads the body but does not close it and the Robot does not
		// keep the response, so release it here. Closing directly rather than
		// with defer keeps it per-iteration.
		if res.Body != nil {
			res.Body.Close()
		}
		if !robot.FullDisallow { // if not FullDisallow add
			f.HostInfo = append(f.HostInfo, robot)
			f.addSeed(NewCmd(str)) // add RootURL to Queue or Frontier.
			cognilog.LogINFO("green", "added to Queue", str)
		}
	}
}
Example #7
0
// MakeBot builds a Robot from the response to a robots.txt request.
//
// RootURL is always set: it is the request URL with a trailing "robots.txt"
// removed, passed through NewCmd(...).URL(). The remaining fields depend on
// the status code:
//
//   - 4xx: no restrictions are assumed. FullAllow is set and CrawDelay is
//     DefaultCrawlDelay.
//   - 200: the body is read and split into groups. Every non-empty line is
//     lowercased; a line starting with "user-agent:" opens a new group keyed
//     by trimSpaces(line), and each following line is appended to that group
//     after trimSpaces. Lines before the first user-agent line are dropped.
//     FullAllow or FullDisallow is then set according to isAllowAll /
//     isDisallowAll, Groups is stored and CrawDelay is DefaultCrawlDelay.
//   - anything else: only RootURL is set.
//
// Read and scan errors are logged in red and parsing continues with whatever
// data was obtained. The response body is not closed here.
//
// res and res.Request must be non-nil.
func MakeBot(res *http.Response) *Robot {
	robot := new(Robot)
	robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL()

	switch {
	case res.StatusCode >= 400 && res.StatusCode < 500:
		// Treat all 4xx errors in the same way: assume there are no
		// restrictions.
		robot.FullAllow = true
		robot.CrawDelay = DefaultCrawlDelay

	case res.StatusCode == 200:
		byt, err := ioutil.ReadAll(res.Body)
		if err != nil {
			cognilog.LogINFO("red", "Body read error", err)
		}
		scanner := bufio.NewScanner(strings.NewReader(string(byt)))
		groups := make(map[string][]string)
		var key string // current group; empty until a user-agent line is seen
		for scanner.Scan() {
			// NOTE(review): lowercasing the whole line also lowercases rule
			// paths — confirm the matching code expects that.
			txt := strings.ToLower(scanner.Text())
			if txt == "" {
				continue
			}
			// new group
			if strings.HasPrefix(txt, "user-agent:") {
				key = trimSpaces(txt)
				continue
			}
			if key != "" {
				groups[key] = append(groups[key], trimSpaces(txt))
			}
		}
		if err := scanner.Err(); err != nil {
			cognilog.Log("red", err)
		}
		if isAllowAll(groups) {
			robot.FullAllow = true
		} else if isDisallowAll(groups) {
			robot.FullDisallow = true
		}
		robot.Groups = groups
		robot.CrawDelay = DefaultCrawlDelay
	}

	return robot
}
Example #8
0
// SeedSlice seeds the fetcher from a JSON file. The links returned by
// jsnLinks(jsn) are passed through resolv, and each resulting string is
// handled as a crawl root: the request built by BotCmd is issued, the
// response is turned into a Robot with MakeBot and, unless that Robot is
// marked FullDisallow, the Robot is appended to f.HostInfo and NewCmd(str) is
// handed to f.addSeed.
//
// A root whose request fails is logged in red and skipped. Roots that are
// fully disallowed are logged as "Root Not added".
func (f *Fetch) SeedSlice(jsn string) {
	args := jsnLinks(jsn)
	cln := resolv(args)
	for _, str := range cln {
		res, err := f.DoRequest(BotCmd(str))
		if err != nil {
			cognilog.Log("red", err)
			continue
		}
		robot := MakeBot(res)
		// MakeBot reads the body but does not close it and the Robot does not
		// keep the response, so release it here. Closing directly rather than
		// with defer keeps it per-iteration.
		if res.Body != nil {
			res.Body.Close()
		}
		if !robot.FullDisallow { // if not FullDisallow add
			f.HostInfo = append(f.HostInfo, robot)
			f.addSeed(NewCmd(str)) // add RootURL to Queue or Frontier.
			cognilog.LogINFO("green", "Root added", str)
		} else {
			cognilog.LogINFO("red", "Root Not added", str)
		}
	}
}