// ClearDocs removes old docs.
func ClearDocs() {
	// rm docs and create a new one.
	path := "./" + docs
	err := os.RemoveAll(path)
	if err != nil {
		cognilog.Log("red", err)
	}
	err = os.Mkdir(path, 0755)
	if err != nil {
		cognilog.Log("red", err)
	}
}
// jsnLinks reads a JSON config file and returns the seed links stored under
// the "links" key.
func jsnLinks(s string) []string {
	config, err := ioutil.ReadFile(s)
	if err != nil {
		cognilog.Log("red", err)
	}
	jsn := make(map[string][]string)
	err = json.Unmarshal(config, &jsn)
	if err != nil {
		cognilog.Log("red", err)
	}
	links := jsn["links"] // seed links from the config
	return links
}
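// For illustration, a seed file passed to jsnLinks could look like the
// snippet below (assumed layout; only the "links" key is read, and every
// value must be a JSON array of strings to satisfy map[string][]string):
//
//	{
//	    "links": [
//	        "https://example.com/",
//	        "https://example.org/"
//	    ]
//	}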
// NewCmd returns an initialized Cmd for the given URL string.
func NewCmd(s string) *Cmd {
	u, err := url.Parse(s)
	if err != nil {
		cognilog.Log("red", err)
	}
	return &Cmd{
		U: u,
		M: "GET",
	}
}
// BotCmd returns an initialized robots.txt Cmd for the given host URL.
func BotCmd(s string) *Cmd {
	lnk, err := url.Parse(s)
	if err != nil {
		cognilog.Log("red", err)
	}
	rob := lnk.ResolveReference(robotPath)
	return &Cmd{
		U: rob,
		M: "GET",
	}
}
// parseCmd returns a Cmd by resolving an href against its parent link.
func parseCmd(s string, lnk *url.URL) (*Cmd, error) {
	href, err := url.Parse(s)
	if err != nil {
		cognilog.Log("red", err)
		return nil, err
	}
	u := lnk.ResolveReference(href)
	return &Cmd{
		U: u,
		M: "GET",
	}, nil
}
// Seed creates a robot type, appends it to HostInfo and appends the RootURL
// to the Queue.
func (f *Fetch) Seed(args ...string) {
	for _, str := range args {
		res, err := f.DoRequest(BotCmd(str))
		if err != nil {
			cognilog.Log("red", err)
			continue // skip this host; res may be nil
		}
		robot := MakeBot(res)
		if !robot.FullDisallow { // if not FullDisallow add
			f.HostInfo = append(f.HostInfo, robot)
			f.addSeed(NewCmd(str)) // add RootURL to Queue or Frontier.
			cognilog.LogINFO("green", "added to Queue", str)
		}
	}
}
// MakeBot takes an http.Response for a robots.txt request and returns a Robot.
func MakeBot(res *http.Response) *Robot {
	var robot = new(Robot)
	// treat all 4xx errors in the same way. Assume that there are no restrictions.
	if res.StatusCode >= 400 && res.StatusCode < 500 {
		robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL()
		robot.FullAllow = true
		robot.CrawDelay = DefaultCrawlDelay
		return robot
	} else if res.StatusCode == 200 {
		byt, err := ioutil.ReadAll(res.Body)
		if err != nil {
			cognilog.LogINFO("red", "Body read error", err)
		}
		redr := strings.NewReader(string(byt))
		scanner := bufio.NewScanner(redr)
		var groups = make(map[string][]string)
		var key string
		var status bool // true once a user-agent group has been opened
		for scanner.Scan() {
			txt := strings.ToLower(scanner.Text())
			if txt == "" {
				continue
			}
			// new group
			if strings.HasPrefix(txt, "user-agent:") {
				key = trimSpaces(txt)
				status = true
				continue
			}
			// directive lines belong to the current user-agent group
			if status && key != "" {
				groups[key] = append(groups[key], trimSpaces(txt))
			}
		}
		if err := scanner.Err(); err != nil {
			cognilog.Log("red", err)
		}
		if isAllowAll(groups) {
			robot.FullAllow = true
		} else if isDisallowAll(groups) {
			robot.FullDisallow = true
		}
		robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL()
		robot.Groups = groups
		robot.CrawDelay = DefaultCrawlDelay
		return robot
	}
	// any other status: return a Robot with only the RootURL set.
	robot.RootURL = NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL()
	return robot
}
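// Illustrative sketch of what MakeBot builds from a 200 response: each line is
// lower-cased, a "user-agent:" line opens a new group, and the directives that
// follow are collected under that key (exact whitespace handling depends on
// trimSpaces, defined elsewhere). A robots.txt such as
//
//	User-agent: *
//	Disallow: /private/
//	Crawl-delay: 10
//
// would yield roughly:
//
//	groups["user-agent: *"] = []string{"disallow: /private/", "crawl-delay: 10"}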
// SeedSlice reads seed links from a JSON file. It creates a robot type,
// appends it to HostInfo and adds the RootURL to the Queue.
func (f *Fetch) SeedSlice(jsn string) {
	args := jsnLinks(jsn)
	cln := resolv(args)
	for _, str := range cln {
		res, err := f.DoRequest(BotCmd(str))
		if err != nil {
			cognilog.Log("red", err)
			continue
		}
		robot := MakeBot(res)
		if !robot.FullDisallow { // if not FullDisallow add
			f.HostInfo = append(f.HostInfo, robot)
			f.addSeed(NewCmd(str)) // add RootURL to Queue or Frontier.
			cognilog.LogINFO("green", "Root added", str)
		} else {
			cognilog.LogINFO("red", "Root Not added", str)
		}
	}
}
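// Usage sketch (assumes a *Fetch value f has been initialised elsewhere in the
// package; the URLs and file name below are placeholders):
//
//	f.Seed("https://example.com/", "https://example.org/")
//	// or seed from a JSON file of links:
//	f.SeedSlice("seeds.json")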