// parse is a helper to just get a URL object from a string we know is a safe // url (ParseURL requires us to deal with potential errors) func parse(ref string) *walker.URL { u, err := walker.ParseURL(ref) if err != nil { panic("Failed to parse walker.URL: " + ref) } return u }
func (ds *CqlModel) ListLinkHistorical(linkUrl string, seedIndex int, limit int) ([]LinkInfo, int, error) { if limit <= 0 { return nil, seedIndex, fmt.Errorf("Bad value for limit parameter %d", limit) } db := ds.Db u, err := walker.ParseURL(linkUrl) if err != nil { return nil, seedIndex, err } query := `SELECT dom, subdom, path, proto, time, stat, err, robot_ex FROM links WHERE dom = ? AND subdom = ? AND path = ? AND proto = ?` tld1, err := u.ToplevelDomainPlusOne() if err != nil { return nil, seedIndex, err } subtld1, err := u.Subdomain() if err != nil { return nil, seedIndex, err } itr := db.Query(query, tld1, subtld1, u.RequestURI(), u.Scheme).Iter() var linfos []LinkInfo var dom, sub, path, prot, getError string var crawlTime time.Time var status int var robotsExcluded bool count := 0 for itr.Scan(&dom, &sub, &path, &prot, &crawlTime, &status, &getError, &robotsExcluded) { if count < seedIndex { count++ continue } url, _ := walker.CreateURL(dom, sub, path, prot, crawlTime) linfo := LinkInfo{ Url: url.String(), Status: status, Error: getError, RobotsExcluded: robotsExcluded, CrawlTime: crawlTime, } linfos = append(linfos, linfo) if len(linfos) >= limit { break } } err = itr.Close() return linfos, seedIndex + len(linfos), err }
func TestSeedCommand(t *testing.T) { u, _ := walker.ParseURL("http://test.com") datastore := &MockDatastore{} datastore.On("StoreParsedURL", u, mock.AnythingOfType("*walker.FetchResults")).Return("") cmd.Datastore(datastore) orig := os.Args defer func() { os.Args = orig }() os.Args = []string{os.Args[0], "seed", "--url=" + u.String()} go func() { time.Sleep(5 * time.Millisecond) syscall.Kill(os.Getpid(), syscall.SIGINT) }() cmd.Execute() datastore.AssertExpectations(t) }
func (ds *CqlModel) FindLink(link string) (*LinkInfo, error) { db := ds.Db u, err := walker.ParseURL(link) if err != nil { return nil, err } query := `SELECT dom, subdom, path, proto, time, stat, err, robot_ex FROM links WHERE dom = ? AND subdom = ? AND path = ? AND proto = ?` tld1, err := u.ToplevelDomainPlusOne() if err != nil { return nil, err } subtld1, err := u.Subdomain() if err != nil { return nil, err } itr := db.Query(query, tld1, subtld1, u.RequestURI(), u.Scheme).Iter() rtimes := map[string]rememberTimes{} linfos, err := ds.collectLinkInfos(nil, rtimes, itr, 1) if err != nil { itr.Close() return nil, err } err = itr.Close() if err != nil { return nil, err } if len(linfos) == 0 { return nil, nil } else { return &linfos[0], nil } }
func TestURLTLD(t *testing.T) { for _, dt := range tldtests { u, err := walker.ParseURL(dt.URL) if err != nil { if !dt.ErrorExpected { t.Errorf("Did not expect error parsing %v: %v", dt.URL, err) } continue } dom, err := u.ToplevelDomainPlusOne() if err != nil && !dt.ErrorExpected { t.Errorf("Did not expect error getting TLD+1: %v", err) } if dom != dt.ExpectedTLDPlusOne { t.Errorf("Expected ToplevelDomainPlusOne to be %v\nBut got: %v", dt.ExpectedTLDPlusOne, dom) } subdom, err := u.Subdomain() if err != nil && !dt.ErrorExpected { t.Errorf("Did not expect error getting subdomain: %v", err) } if subdom != dt.ExpectedSubdomain { t.Errorf("Expected Subdomain to be %v\nBut got: %v", dt.ExpectedSubdomain, subdom) } dom2, subdom2, err := u.TLDPlusOneAndSubdomain() if err != nil && !dt.ErrorExpected { t.Errorf("Did not expect error getting TLD+1 and subdomain: %v", err) } if dom2 != dt.ExpectedTLDPlusOne { t.Errorf("Expected TLDPlusOneAndSubdomain to give domain %v\nBut got: %v", dt.ExpectedTLDPlusOne, dom2) } if subdom2 != dt.ExpectedSubdomain { t.Errorf("Expected TLDPlusOneAndSubdomain to give subdomain %v\nBut got: %v", dt.ExpectedSubdomain, subdom2) } } }
func TestURLCreation(t *testing.T) { url1, err := url.Parse("http://sub1.test.com/thepath?query=blah") if err != nil { t.Fatal(err) } wurl1, err := walker.ParseURL("http://sub1.test.com/thepath?query=blah") if err != nil { t.Fatal(err) } if url1.String() != wurl1.String() { t.Errorf("URLs should be the same: %v\nAnd: %v") } created, err := walker.CreateURL("test.com", "sub1", "thepath?query=blah", "http", walker.NotYetCrawled) if err != nil { t.Fatal(err) } if created.String() != wurl1.String() { t.Errorf("Expected CreateURL to return %v\nBut got: %v", wurl1, created) } }
func (ds *CqlModel) ListLinks(domain string, seedUrl string, limit int) ([]LinkInfo, error) { if limit <= 0 { return nil, fmt.Errorf("Bad value for limit parameter %d", limit) } db := ds.Db var linfos []LinkInfo rtimes := map[string]rememberTimes{} var table []queryEntry if seedUrl == "" { table = []queryEntry{ queryEntry{ query: `SELECT dom, subdom, path, proto, time, stat, err, robot_ex FROM links WHERE dom = ?`, args: []interface{}{domain}, }, } } else { u, err := walker.ParseURL(seedUrl) if err != nil { return linfos, err } dom, err := u.ToplevelDomainPlusOne() if err != nil { return linfos, err } sub, err := u.Subdomain() if err != nil { return linfos, err } pat := u.RequestURI() pro := u.Scheme table = []queryEntry{ queryEntry{ query: `SELECT dom, subdom, path, proto, time, stat, err, robot_ex FROM links WHERE dom = ? AND subdom = ? AND path = ? AND proto > ?`, args: []interface{}{dom, sub, pat, pro}, }, queryEntry{ query: `SELECT dom, subdom, path, proto, time, stat, err, robot_ex FROM links WHERE dom = ? AND subdom = ? AND path > ?`, args: []interface{}{dom, sub, pat}, }, queryEntry{ query: `SELECT dom, subdom, path, proto, time, stat, err, robot_ex FROM links WHERE dom = ? AND subdom > ?`, args: []interface{}{dom, sub}, }, } } var err error for _, qt := range table { itr := db.Query(qt.query, qt.args...).Iter() linfos, err = ds.collectLinkInfos(linfos, rtimes, itr, limit) if err != nil { return linfos, err } err = itr.Close() if err != nil { return linfos, err } else if len(linfos) >= limit { return linfos, nil } } return linfos, nil }
//NOTE: InsertLinks should try to insert as much information as possible //return errors for things it can't handle func (ds *CqlModel) InsertLinks(links []string) []error { // // Collect domains // var domains []string var errList []error var urls []*walker.URL for i := range links { link := links[i] url, err := walker.ParseURL(link) if err != nil { errList = append(errList, fmt.Errorf("%v # ParseURL: %v", link, err)) domains = append(domains, "") urls = append(urls, nil) continue } else if url.Scheme == "" { errList = append(errList, fmt.Errorf("%v # ParseURL: undefined scheme (http:// or https://)", link)) domains = append(domains, "") urls = append(urls, nil) continue } domain, err := url.ToplevelDomainPlusOne() if err != nil { errList = append(errList, fmt.Errorf("%v # ToplevelDomainPlusOne: bad domain: %v", link, err)) domains = append(domains, "") urls = append(urls, nil) continue } domains = append(domains, domain) urls = append(urls, url) } // // Push domain information to table. The only trick to this, is I don't add links unless // the domain can be added // db := ds.Db var seen = map[string]bool{} for i := range links { link := links[i] d := domains[i] u := urls[i] // if you already had an error, keep going if u == nil { continue } if !seen[d] { err := ds.addDomainIfNew(d) if err != nil { errList = append(errList, fmt.Errorf("%v # addDomainIfNew: %v", link, err)) continue } } seen[d] = true subdom, err := u.Subdomain() if err != nil { errList = append(errList, fmt.Errorf("%v # Subdomain(): %v", link, err)) continue } err = db.Query(`INSERT INTO links (dom, subdom, path, proto, time) VALUES (?, ?, ?, ?, ?)`, d, subdom, u.RequestURI(), u.Scheme, walker.NotYetCrawled).Exec() if err != nil { errList = append(errList, fmt.Errorf("%v # `insert query`: %v", link, err)) continue } } return errList }
func init() { walkerCommand := &cobra.Command{ Use: "walker", } var config string walkerCommand.PersistentFlags().StringVarP(&config, "config", "c", "", "path to a config file to load") readConfig := func() { if config != "" { if err := walker.ReadConfigFile(config); err != nil { panic(err.Error()) } } } var noConsole bool = false crawlCommand := &cobra.Command{ Use: "crawl", Short: "start an all-in-one crawler", Run: func(cmd *cobra.Command, args []string) { readConfig() if commander.Datastore == nil { ds, err := walker.NewCassandraDatastore() if err != nil { fatalf("Failed creating Cassandra datastore: %v", err) } commander.Datastore = ds commander.Dispatcher = &walker.CassandraDispatcher{} } if commander.Handler == nil { commander.Handler = &walker.SimpleWriterHandler{} } manager := &walker.FetchManager{ Datastore: commander.Datastore, Handler: commander.Handler, } go manager.Start() if commander.Dispatcher != nil { go func() { err := commander.Dispatcher.StartDispatcher() if err != nil { panic(err.Error()) } }() } if !noConsole { console.Start() } sig := make(chan os.Signal) signal.Notify(sig, syscall.SIGINT) <-sig if commander.Dispatcher != nil { commander.Dispatcher.StopDispatcher() } manager.Stop() }, } crawlCommand.Flags().BoolVarP(&noConsole, "no-console", "C", false, "Do not start the console") walkerCommand.AddCommand(crawlCommand) fetchCommand := &cobra.Command{ Use: "fetch", Short: "start only a walker fetch manager", Run: func(cmd *cobra.Command, args []string) { readConfig() if commander.Datastore == nil { ds, err := walker.NewCassandraDatastore() if err != nil { fatalf("Failed creating Cassandra datastore: %v", err) } commander.Datastore = ds commander.Dispatcher = &walker.CassandraDispatcher{} } if commander.Handler == nil { commander.Handler = &walker.SimpleWriterHandler{} } manager := &walker.FetchManager{ Datastore: commander.Datastore, Handler: commander.Handler, } go manager.Start() sig := make(chan os.Signal) signal.Notify(sig, syscall.SIGINT) <-sig manager.Stop() }, } walkerCommand.AddCommand(fetchCommand) dispatchCommand := &cobra.Command{ Use: "dispatch", Short: "start only a walker dispatcher", Run: func(cmd *cobra.Command, args []string) { readConfig() if commander.Dispatcher == nil { commander.Dispatcher = &walker.CassandraDispatcher{} } go func() { err := commander.Dispatcher.StartDispatcher() if err != nil { panic(err.Error()) } }() sig := make(chan os.Signal) signal.Notify(sig, syscall.SIGINT) <-sig commander.Dispatcher.StopDispatcher() }, } walkerCommand.AddCommand(dispatchCommand) var seedURL string seedCommand := &cobra.Command{ Use: "seed", Short: "add a seed URL to the datastore", Long: `Seed is useful for: - Adding starter links to bootstrap a broad crawl - Adding links when add_new_domains is false - Adding any other link that needs to be crawled soon This command will insert the provided link and also add its domain to the crawl, regardless of the add_new_domains configuration setting.`, Run: func(cmd *cobra.Command, args []string) { readConfig() orig := walker.Config.AddNewDomains defer func() { walker.Config.AddNewDomains = orig }() walker.Config.AddNewDomains = true if seedURL == "" { fatalf("Seed URL needed to execute; add on with --url/-u") } u, err := walker.ParseURL(seedURL) if err != nil { fatalf("Could not parse %v as a url: %v", seedURL, err) } if commander.Datastore == nil { ds, err := walker.NewCassandraDatastore() if err != nil { fatalf("Failed creating Cassandra datastore: %v", err) } commander.Datastore = ds } commander.Datastore.StoreParsedURL(u, nil) }, } seedCommand.Flags().StringVarP(&seedURL, "url", "u", "", "URL to add as a seed") walkerCommand.AddCommand(seedCommand) var outfile string schemaCommand := &cobra.Command{ Use: "schema", Short: "output the walker schema", Long: `Schema prints the walker schema to stdout, substituting schema-relevant configuration items (ex. keyspace, replication factor). Useful for something like: $ <edit walker.yaml as desired> $ walker schema -o schema.cql $ <edit schema.cql further as desired> $ cqlsh -f schema.cql `, Run: func(cmd *cobra.Command, args []string) { readConfig() if outfile == "" { fatalf("An output file is needed to execute; add with --out/-o") } out, err := os.Create(outfile) if err != nil { panic(err.Error()) } defer out.Close() schema, err := walker.GetCassandraSchema() if err != nil { panic(err.Error()) } fmt.Fprint(out, schema) }, } schemaCommand.Flags().StringVarP(&outfile, "out", "o", "", "File to write output to") walkerCommand.AddCommand(schemaCommand) consoleCommand := &cobra.Command{ Use: "console", Short: "Start up the walker console", Run: func(cmd *cobra.Command, args []string) { readConfig() console.Run() }, } walkerCommand.AddCommand(consoleCommand) commander.Command = walkerCommand }