Beispiel #1
0
// parse is a helper that returns a URL object for a string we know is a safe
// url (ParseURL requires us to deal with potential errors).
//
// It panics if ref cannot be parsed; the panic message carries both the
// offending string and the underlying parse error so a failure is diagnosable.
func parse(ref string) *walker.URL {
	u, err := walker.ParseURL(ref)
	if err != nil {
		panic("Failed to parse walker.URL: " + ref + ": " + err.Error())
	}
	return u
}
Beispiel #2
0
// ListLinkHistorical returns rows of the links table that match the single
// link identified by linkUrl (same dom, subdom, path and proto).
//
// Paging is done on the client side: the first seedIndex rows produced by the
// query are skipped, and at most limit rows are collected after that.
//
// It returns the collected LinkInfo values, the index to pass as seedIndex to
// fetch the following page (seedIndex plus the number of rows returned), and
// any error. An error is returned when limit is not positive, when linkUrl
// cannot be parsed or decomposed into domain/subdomain, when a row cannot be
// turned back into a URL, or when closing the iterator reports a failure.
func (ds *CqlModel) ListLinkHistorical(linkUrl string, seedIndex int, limit int) ([]LinkInfo, int, error) {
	if limit <= 0 {
		return nil, seedIndex, fmt.Errorf("Bad value for limit parameter %d", limit)
	}
	db := ds.Db
	u, err := walker.ParseURL(linkUrl)
	if err != nil {
		return nil, seedIndex, err
	}

	query := `SELECT dom, subdom, path, proto, time, stat, err, robot_ex 
              FROM links
              WHERE dom = ? AND subdom = ? AND path = ? AND proto = ?`
	tld1, err := u.ToplevelDomainPlusOne()
	if err != nil {
		return nil, seedIndex, err
	}
	subtld1, err := u.Subdomain()
	if err != nil {
		return nil, seedIndex, err
	}

	itr := db.Query(query, tld1, subtld1, u.RequestURI(), u.Scheme).Iter()

	var linfos []LinkInfo
	var dom, sub, path, prot, getError string
	var crawlTime time.Time
	var status int
	var robotsExcluded bool
	count := 0
	for itr.Scan(&dom, &sub, &path, &prot, &crawlTime, &status, &getError, &robotsExcluded) {
		// Skip the rows that belong to earlier pages.
		if count < seedIndex {
			count++
			continue
		}

		rowURL, createErr := walker.CreateURL(dom, sub, path, prot, crawlTime)
		if createErr != nil {
			// The row cannot be represented as a URL. Release the iterator
			// and report the creation error; a close error here would only
			// mask the more specific failure.
			itr.Close()
			return nil, seedIndex, createErr
		}
		linfo := LinkInfo{
			Url:            rowURL.String(),
			Status:         status,
			Error:          getError,
			RobotsExcluded: robotsExcluded,
			CrawlTime:      crawlTime,
		}
		linfos = append(linfos, linfo)
		if len(linfos) >= limit {
			break
		}
	}
	// Close reports any error encountered while iterating.
	err = itr.Close()

	return linfos, seedIndex + len(linfos), err
}
Beispiel #3
0
// TestSeedCommand runs the "seed" sub-command against a mock datastore and
// verifies that StoreParsedURL is invoked with the URL passed via --url.
func TestSeedCommand(t *testing.T) {
	u, err := walker.ParseURL("http://test.com")
	if err != nil {
		// Fail clearly instead of dereferencing a nil URL below.
		t.Fatalf("Failed to parse seed URL: %v", err)
	}
	datastore := &MockDatastore{}
	datastore.On("StoreParsedURL", u, mock.AnythingOfType("*walker.FetchResults")).Return("")
	cmd.Datastore(datastore)

	// Substitute the process arguments for the duration of the test.
	orig := os.Args
	defer func() { os.Args = orig }()
	os.Args = []string{os.Args[0], "seed", "--url=" + u.String()}

	// NOTE(review): presumably a safety net that interrupts cmd.Execute() in
	// case the command blocks waiting for SIGINT — confirm it is still needed
	// for the seed command.
	go func() {
		time.Sleep(5 * time.Millisecond)
		syscall.Kill(os.Getpid(), syscall.SIGINT)
	}()
	cmd.Execute()

	datastore.AssertExpectations(t)
}
Beispiel #4
0
// FindLink looks up a single link in the links table.
//
// The link string is parsed and split into its TLD+1 domain, subdomain,
// request URI and scheme, which are used as the lookup key. At most one
// LinkInfo is collected from the matching rows.
//
// It returns (nil, nil) when nothing matches, a pointer to the collected
// LinkInfo on success, and (nil, err) when parsing, collecting or closing the
// iterator fails.
func (ds *CqlModel) FindLink(link string) (*LinkInfo, error) {
	session := ds.Db

	parsed, err := walker.ParseURL(link)
	if err != nil {
		return nil, err
	}

	const lookup = `SELECT dom, subdom, path, proto, time, stat, err, robot_ex 
                      FROM links 
                      WHERE dom = ? AND 
                            subdom = ? AND 
                            path = ? AND 
                            proto = ?`

	domain, err := parsed.ToplevelDomainPlusOne()
	if err != nil {
		return nil, err
	}
	subdomain, err := parsed.Subdomain()
	if err != nil {
		return nil, err
	}

	rows := session.Query(lookup, domain, subdomain, parsed.RequestURI(), parsed.Scheme).Iter()
	found, err := ds.collectLinkInfos(nil, map[string]rememberTimes{}, rows, 1)
	if err != nil {
		// Release the iterator; the collection error is the one to report.
		rows.Close()
		return nil, err
	}
	if closeErr := rows.Close(); closeErr != nil {
		return nil, closeErr
	}

	if len(found) == 0 {
		return nil, nil
	}
	return &found[0], nil
}
Beispiel #5
0
// TestURLTLD walks the tldtests table and checks that ToplevelDomainPlusOne,
// Subdomain and TLDPlusOneAndSubdomain each yield the expected domain and
// subdomain, and that errors occur only for entries that expect them.
func TestURLTLD(t *testing.T) {
	for _, tc := range tldtests {
		// surprising reports whether err is an error this entry did not expect.
		surprising := func(err error) bool {
			return err != nil && !tc.ErrorExpected
		}

		parsed, parseErr := walker.ParseURL(tc.URL)
		if parseErr != nil {
			if !tc.ErrorExpected {
				t.Errorf("Did not expect error parsing %v: %v", tc.URL, parseErr)
			}
			// Nothing further can be checked without a parsed URL.
			continue
		}

		// Individual accessors.
		gotDomain, domErr := parsed.ToplevelDomainPlusOne()
		if surprising(domErr) {
			t.Errorf("Did not expect error getting TLD+1: %v", domErr)
		}
		if gotDomain != tc.ExpectedTLDPlusOne {
			t.Errorf("Expected ToplevelDomainPlusOne to be %v\nBut got: %v",
				tc.ExpectedTLDPlusOne, gotDomain)
		}

		gotSub, subErr := parsed.Subdomain()
		if surprising(subErr) {
			t.Errorf("Did not expect error getting subdomain: %v", subErr)
		}
		if gotSub != tc.ExpectedSubdomain {
			t.Errorf("Expected Subdomain to be %v\nBut got: %v",
				tc.ExpectedSubdomain, gotSub)
		}

		// Combined accessor must agree with the expectations as well.
		bothDomain, bothSub, bothErr := parsed.TLDPlusOneAndSubdomain()
		if surprising(bothErr) {
			t.Errorf("Did not expect error getting TLD+1 and subdomain: %v", bothErr)
		}
		if bothDomain != tc.ExpectedTLDPlusOne {
			t.Errorf("Expected TLDPlusOneAndSubdomain to give domain %v\nBut got: %v",
				tc.ExpectedTLDPlusOne, bothDomain)
		}
		if bothSub != tc.ExpectedSubdomain {
			t.Errorf("Expected TLDPlusOneAndSubdomain to give subdomain %v\nBut got: %v",
				tc.ExpectedSubdomain, bothSub)
		}
	}
}
Beispiel #6
0
// TestURLCreation checks that walker.ParseURL renders the same string as the
// standard library's url.Parse for the same input, and that walker.CreateURL
// rebuilds an equivalent URL from its domain, subdomain, path and protocol
// parts.
func TestURLCreation(t *testing.T) {
	url1, err := url.Parse("http://sub1.test.com/thepath?query=blah")
	if err != nil {
		t.Fatal(err)
	}
	wurl1, err := walker.ParseURL("http://sub1.test.com/thepath?query=blah")
	if err != nil {
		t.Fatal(err)
	}
	if url1.String() != wurl1.String() {
		// Both verbs need their operands; without them the failure message
		// would only print %!v(MISSING).
		t.Errorf("URLs should be the same: %v\nAnd: %v", url1, wurl1)
	}

	created, err := walker.CreateURL("test.com", "sub1", "thepath?query=blah", "http",
		walker.NotYetCrawled)
	if err != nil {
		t.Fatal(err)
	}
	if created.String() != wurl1.String() {
		t.Errorf("Expected CreateURL to return %v\nBut got: %v", wurl1, created)
	}
}
Beispiel #7
0
// ListLinks returns up to limit links from the links table.
//
// When seedUrl is empty, links are selected by the domain argument alone.
// When seedUrl is given it acts as a paging cursor: the domain, subdomain,
// path and protocol are derived from seedUrl (the domain argument is not used
// in that case) and three queries are run in turn, selecting rows with
//   - the same subdomain and path but a greater proto,
//   - the same subdomain but a greater path,
//   - a greater subdomain,
//
// until limit results have been collected.
//
// An error is returned when limit is not positive, when seedUrl cannot be
// parsed or decomposed, or when a query fails. On a query failure the links
// collected so far are returned alongside the error.
func (ds *CqlModel) ListLinks(domain string, seedUrl string, limit int) ([]LinkInfo, error) {
	if limit <= 0 {
		return nil, fmt.Errorf("Bad value for limit parameter %d", limit)
	}
	db := ds.Db
	var linfos []LinkInfo
	rtimes := map[string]rememberTimes{}
	var table []queryEntry

	if seedUrl == "" {
		table = []queryEntry{
			{
				query: `SELECT dom, subdom, path, proto, time, stat, err, robot_ex
                      FROM links 
                      WHERE dom = ?`,
				args: []interface{}{domain},
			},
		}
	} else {
		u, err := walker.ParseURL(seedUrl)
		if err != nil {
			return linfos, err
		}

		dom, err := u.ToplevelDomainPlusOne()
		if err != nil {
			return linfos, err
		}

		sub, err := u.Subdomain()
		if err != nil {
			return linfos, err
		}

		pat := u.RequestURI()
		pro := u.Scheme

		// Each query covers the rows that follow the seed at one level of the
		// (subdom, path, proto) key, from the most to the least specific.
		table = []queryEntry{
			{
				query: `SELECT dom, subdom, path, proto, time, stat, err, robot_ex
                      FROM links 
                      WHERE dom = ? AND 
                            subdom = ? AND 
                            path = ? AND 
                            proto > ?`,
				args: []interface{}{dom, sub, pat, pro},
			},
			{
				query: `SELECT dom, subdom, path, proto, time, stat, err, robot_ex 
                      FROM links 
                      WHERE dom = ? AND 
                            subdom = ? AND 
                            path > ?`,
				args: []interface{}{dom, sub, pat},
			},
			{
				query: `SELECT dom, subdom, path, proto, time, stat, err, robot_ex 
                      FROM links 
                      WHERE dom = ? AND 
                            subdom > ?`,
				args: []interface{}{dom, sub},
			},
		}
	}

	var err error
	for _, qt := range table {
		itr := db.Query(qt.query, qt.args...).Iter()
		linfos, err = ds.collectLinkInfos(linfos, rtimes, itr, limit)
		if err != nil {
			// Release the iterator before bailing out; the collection error
			// is the one worth reporting.
			itr.Close()
			return linfos, err
		}

		if err = itr.Close(); err != nil {
			return linfos, err
		}
		if len(linfos) >= limit {
			return linfos, nil
		}
	}

	return linfos, nil
}
Beispiel #8
0
// InsertLinks stores each of the given links in the links table, marked as
// not yet crawled, registering each link's domain first.
//
// NOTE: InsertLinks tries to insert as much information as possible. A link
// that cannot be handled does not abort the call; instead an error describing
// the link and the failing step is appended to the returned slice. Errors
// found while validating the links come first, followed by errors found while
// writing to the datastore.
func (ds *CqlModel) InsertLinks(links []string) []error {
	var errList []error

	//
	// Pass 1: parse every link and work out its domain. Entries that fail
	// keep a nil URL (and empty domain) so pass 2 can skip them.
	//
	parsedURLs := make([]*walker.URL, len(links))
	domainOf := make([]string, len(links))
	for i, link := range links {
		parsed, err := walker.ParseURL(link)
		if err != nil {
			errList = append(errList, fmt.Errorf("%v # ParseURL: %v", link, err))
			continue
		}
		if parsed.Scheme == "" {
			errList = append(errList, fmt.Errorf("%v # ParseURL: undefined scheme (http:// or https://)", link))
			continue
		}

		tldPlusOne, err := parsed.ToplevelDomainPlusOne()
		if err != nil {
			errList = append(errList, fmt.Errorf("%v # ToplevelDomainPlusOne: bad domain: %v", link, err))
			continue
		}

		parsedURLs[i] = parsed
		domainOf[i] = tldPlusOne
	}

	//
	// Pass 2: write to the datastore. A link is only inserted once its domain
	// has been added successfully.
	//
	session := ds.Db
	added := map[string]bool{}
	for i, link := range links {
		target := parsedURLs[i]
		if target == nil {
			// Already reported during pass 1.
			continue
		}
		domain := domainOf[i]

		// Only attempt to add each domain once per successful addition.
		if !added[domain] {
			if err := ds.addDomainIfNew(domain); err != nil {
				errList = append(errList, fmt.Errorf("%v # addDomainIfNew: %v", link, err))
				continue
			}
			added[domain] = true
		}

		subdomain, err := target.Subdomain()
		if err != nil {
			errList = append(errList, fmt.Errorf("%v # Subdomain(): %v", link, err))
			continue
		}

		insert := session.Query(`INSERT INTO links (dom, subdom, path, proto, time)
                                     VALUES (?, ?, ?, ?, ?)`, domain, subdomain,
			target.RequestURI(), target.Scheme, walker.NotYetCrawled)
		if err := insert.Exec(); err != nil {
			errList = append(errList, fmt.Errorf("%v # `insert query`: %v", link, err))
		}
	}

	return errList
}
Beispiel #9
0
// init builds the "walker" command tree (crawl, fetch, dispatch, seed, schema
// and console sub-commands) and stores the root command in commander.Command.
//
// Every sub-command first loads the config file named by the persistent
// --config/-c flag, if one was given.
func init() {
	walkerCommand := &cobra.Command{
		Use: "walker",
	}

	var config string
	walkerCommand.PersistentFlags().StringVarP(&config,
		"config", "c", "", "path to a config file to load")
	// readConfig loads the file given with --config, panicking if it cannot
	// be read. It is a no-op when the flag was not set.
	readConfig := func() {
		if config != "" {
			if err := walker.ReadConfigFile(config); err != nil {
				panic(err.Error())
			}
		}
	}

	// waitForInterrupt blocks until the process receives SIGINT. The channel
	// is buffered because the signal package does not block when delivering:
	// with an unbuffered channel a signal arriving before the receive below
	// is ready would be dropped.
	waitForInterrupt := func() {
		sig := make(chan os.Signal, 1)
		signal.Notify(sig, syscall.SIGINT)
		<-sig
	}

	var noConsole bool
	crawlCommand := &cobra.Command{
		Use:   "crawl",
		Short: "start an all-in-one crawler",
		Run: func(cmd *cobra.Command, args []string) {
			readConfig()

			// Fall back to Cassandra-backed components unless the caller
			// supplied a datastore up front.
			if commander.Datastore == nil {
				ds, err := walker.NewCassandraDatastore()
				if err != nil {
					fatalf("Failed creating Cassandra datastore: %v", err)
				}
				commander.Datastore = ds
				commander.Dispatcher = &walker.CassandraDispatcher{}
			}

			if commander.Handler == nil {
				commander.Handler = &walker.SimpleWriterHandler{}
			}

			manager := &walker.FetchManager{
				Datastore: commander.Datastore,
				Handler:   commander.Handler,
			}
			go manager.Start()

			if commander.Dispatcher != nil {
				go func() {
					err := commander.Dispatcher.StartDispatcher()
					if err != nil {
						panic(err.Error())
					}
				}()
			}

			if !noConsole {
				console.Start()
			}

			waitForInterrupt()

			if commander.Dispatcher != nil {
				commander.Dispatcher.StopDispatcher()
			}
			manager.Stop()
		},
	}
	crawlCommand.Flags().BoolVarP(&noConsole, "no-console", "C", false, "Do not start the console")
	walkerCommand.AddCommand(crawlCommand)

	fetchCommand := &cobra.Command{
		Use:   "fetch",
		Short: "start only a walker fetch manager",
		Run: func(cmd *cobra.Command, args []string) {
			readConfig()

			if commander.Datastore == nil {
				ds, err := walker.NewCassandraDatastore()
				if err != nil {
					fatalf("Failed creating Cassandra datastore: %v", err)
				}
				commander.Datastore = ds
				commander.Dispatcher = &walker.CassandraDispatcher{}
			}

			if commander.Handler == nil {
				commander.Handler = &walker.SimpleWriterHandler{}
			}

			manager := &walker.FetchManager{
				Datastore: commander.Datastore,
				Handler:   commander.Handler,
			}
			go manager.Start()

			waitForInterrupt()

			manager.Stop()
		},
	}
	walkerCommand.AddCommand(fetchCommand)

	dispatchCommand := &cobra.Command{
		Use:   "dispatch",
		Short: "start only a walker dispatcher",
		Run: func(cmd *cobra.Command, args []string) {
			readConfig()

			if commander.Dispatcher == nil {
				commander.Dispatcher = &walker.CassandraDispatcher{}
			}

			go func() {
				err := commander.Dispatcher.StartDispatcher()
				if err != nil {
					panic(err.Error())
				}
			}()

			waitForInterrupt()

			commander.Dispatcher.StopDispatcher()
		},
	}
	walkerCommand.AddCommand(dispatchCommand)

	var seedURL string
	seedCommand := &cobra.Command{
		Use:   "seed",
		Short: "add a seed URL to the datastore",
		Long: `Seed is useful for:
    - Adding starter links to bootstrap a broad crawl
    - Adding links when add_new_domains is false
    - Adding any other link that needs to be crawled soon

This command will insert the provided link and also add its domain to the
crawl, regardless of the add_new_domains configuration setting.`,
		Run: func(cmd *cobra.Command, args []string) {
			readConfig()

			// Force AddNewDomains on for this command only and restore the
			// configured value afterwards.
			orig := walker.Config.AddNewDomains
			defer func() { walker.Config.AddNewDomains = orig }()
			walker.Config.AddNewDomains = true

			if seedURL == "" {
				fatalf("Seed URL needed to execute; add on with --url/-u")
			}
			u, err := walker.ParseURL(seedURL)
			if err != nil {
				fatalf("Could not parse %v as a url: %v", seedURL, err)
			}

			if commander.Datastore == nil {
				ds, err := walker.NewCassandraDatastore()
				if err != nil {
					fatalf("Failed creating Cassandra datastore: %v", err)
				}
				commander.Datastore = ds
			}

			commander.Datastore.StoreParsedURL(u, nil)
		},
	}
	seedCommand.Flags().StringVarP(&seedURL, "url", "u", "", "URL to add as a seed")
	walkerCommand.AddCommand(seedCommand)

	var outfile string
	schemaCommand := &cobra.Command{
		Use:   "schema",
		Short: "output the walker schema",
		Long: `Schema prints the walker schema to stdout, substituting
schema-relevant configuration items (ex. keyspace, replication factor).
Useful for something like:
    $ <edit walker.yaml as desired>
    $ walker schema -o schema.cql
    $ <edit schema.cql further as desired>
    $ cqlsh -f schema.cql
`,
		Run: func(cmd *cobra.Command, args []string) {
			readConfig()
			if outfile == "" {
				fatalf("An output file is needed to execute; add with --out/-o")
			}

			out, err := os.Create(outfile)
			if err != nil {
				panic(err.Error())
			}
			defer out.Close()

			schema, err := walker.GetCassandraSchema()
			if err != nil {
				panic(err.Error())
			}
			// Surface write failures the same way as the other failures in
			// this command rather than leaving a truncated schema file.
			if _, err := fmt.Fprint(out, schema); err != nil {
				panic(err.Error())
			}
		},
	}
	schemaCommand.Flags().StringVarP(&outfile, "out", "o", "", "File to write output to")
	walkerCommand.AddCommand(schemaCommand)

	consoleCommand := &cobra.Command{
		Use:   "console",
		Short: "Start up the walker console",
		Run: func(cmd *cobra.Command, args []string) {
			readConfig()
			console.Run()
		},
	}
	walkerCommand.AddCommand(consoleCommand)

	commander.Command = walkerCommand
}