// ListLinkHistorical returns the crawl history of the given link. seedIndex
// is the offset into the history to start at; the returned int is the next
// seedIndex to pass for pagination.
func (ds *CqlModel) ListLinkHistorical(linkUrl string, seedIndex int, limit int) ([]LinkInfo, int, error) {
	if limit <= 0 {
		return nil, seedIndex, fmt.Errorf("Bad value for limit parameter %d", limit)
	}
	db := ds.Db
	u, err := walker.ParseURL(linkUrl)
	if err != nil {
		return nil, seedIndex, err
	}

	query := `SELECT dom, subdom, path, proto, time, stat, err, robot_ex
				FROM links
				WHERE dom = ? AND subdom = ? AND path = ? AND proto = ?`
	tld1, err := u.ToplevelDomainPlusOne()
	if err != nil {
		return nil, seedIndex, err
	}
	subtld1, err := u.Subdomain()
	if err != nil {
		return nil, seedIndex, err
	}

	itr := db.Query(query, tld1, subtld1, u.RequestURI(), u.Scheme).Iter()
	var linfos []LinkInfo
	var dom, sub, path, prot, getError string
	var crawlTime time.Time
	var status int
	var robotsExcluded bool
	count := 0
	for itr.Scan(&dom, &sub, &path, &prot, &crawlTime, &status, &getError, &robotsExcluded) {
		if count < seedIndex {
			count++
			continue
		}

		url, _ := walker.CreateURL(dom, sub, path, prot, crawlTime)
		linfo := LinkInfo{
			Url:            url.String(),
			Status:         status,
			Error:          getError,
			RobotsExcluded: robotsExcluded,
			CrawlTime:      crawlTime,
		}
		linfos = append(linfos, linfo)
		if len(linfos) >= limit {
			break
		}
	}
	err = itr.Close()

	return linfos, seedIndex + len(linfos), err
}
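// A minimal sketch of driving ListLinkHistorical's seed-index pagination from
// calling code. exampleWalkHistory is hypothetical, and the link URL and
// pageSize are illustrative assumptions, not values from the original code.
func exampleWalkHistory(ds *CqlModel) error {
	const pageSize = 25
	seed := 0
	for {
		linfos, nextSeed, err := ds.ListLinkHistorical("http://sub.test.com/page1.html", seed, pageSize)
		if err != nil {
			return err
		}
		for _, li := range linfos {
			fmt.Printf("%s crawled at %v (status %d)\n", li.Url, li.CrawlTime, li.Status)
		}
		// A short page means the iterator was exhausted.
		if len(linfos) < pageSize {
			return nil
		}
		seed = nextSeed
	}
}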
// collectLinkInfos populates a []LinkInfo list given a Cassandra iterator
func (ds *CqlModel) collectLinkInfos(linfos []LinkInfo, rtimes map[string]rememberTimes, itr *gocql.Iter, limit int) ([]LinkInfo, error) {
	var domain, subdomain, path, protocol, anerror string
	var crawlTime time.Time
	var robotsExcluded bool
	var status int

	for itr.Scan(&domain, &subdomain, &path, &protocol, &crawlTime, &status, &anerror, &robotsExcluded) {
		u, err := walker.CreateURL(domain, subdomain, path, protocol, crawlTime)
		if err != nil {
			return linfos, err
		}
		urlString := u.String()

		qq, yes := rtimes[urlString]
		if yes && qq.ctm.After(crawlTime) {
			continue
		}

		linfo := LinkInfo{
			Url:            urlString,
			Status:         status,
			Error:          anerror,
			RobotsExcluded: robotsExcluded,
			CrawlTime:      crawlTime,
		}

		nindex := -1
		if yes {
			nindex = qq.ind
			linfos[qq.ind] = linfo
		} else {
			// If you've reached the limit, then we're all done
			if len(linfos) >= limit {
				break
			}
			linfos = append(linfos, linfo)
			nindex = len(linfos) - 1
		}
		rtimes[urlString] = rememberTimes{ctm: crawlTime, ind: nindex}
	}

	return linfos, nil
}
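// collectLinkInfos relies on the rememberTimes bookkeeping type, whose
// definition does not appear in this excerpt. A sketch consistent with the
// usage above (qq.ctm compared as a crawl time, qq.ind used as a slice index):
type rememberTimes struct {
	ctm time.Time // latest crawl time seen for this URL
	ind int       // index of the URL's entry in the linfos slice
}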
func TestURLCreation(t *testing.T) {
	url1, err := url.Parse("http://sub1.test.com/thepath?query=blah")
	if err != nil {
		t.Fatal(err)
	}
	wurl1, err := walker.ParseURL("http://sub1.test.com/thepath?query=blah")
	if err != nil {
		t.Fatal(err)
	}
	if url1.String() != wurl1.String() {
		t.Errorf("URLs should be the same: %v\nAnd: %v", url1, wurl1)
	}

	created, err := walker.CreateURL("test.com", "sub1", "thepath?query=blah", "http", walker.NotYetCrawled)
	if err != nil {
		t.Fatal(err)
	}
	if created.String() != wurl1.String() {
		t.Errorf("Expected CreateURL to return %v\nBut got: %v", wurl1, created)
	}
}
func TestDatastoreBasic(t *testing.T) {
	db := getDB(t)
	ds := getDS(t)

	insertDomainInfo := `INSERT INTO domain_info (dom, claim_tok, priority, dispatched) VALUES (?, ?, ?, ?)`
	insertSegment := `INSERT INTO segments (dom, subdom, path, proto) VALUES (?, ?, ?, ?)`
	insertLink := `INSERT INTO links (dom, subdom, path, proto, time) VALUES (?, ?, ?, ?, ?)`
	queries := []*gocql.Query{
		db.Query(insertDomainInfo, "test.com", gocql.UUID{}, 0, true),
		db.Query(insertSegment, "test.com", "", "page1.html", "http"),
		db.Query(insertSegment, "test.com", "", "page2.html", "http"),
		db.Query(insertLink, "test.com", "", "page1.html", "http", walker.NotYetCrawled),
		db.Query(insertLink, "test.com", "", "page2.html", "http", walker.NotYetCrawled),
	}
	for _, q := range queries {
		err := q.Exec()
		if err != nil {
			t.Fatalf("Failed to insert test data: %v\nQuery: %v", err, q)
		}
	}

	host := ds.ClaimNewHost()
	if host != "test.com" {
		t.Errorf("Expected test.com but got %v", host)
	}

	links := map[url.URL]bool{}
	expectedLinks := map[url.URL]bool{
		*page1URL.URL: true,
		*page2URL.URL: true,
	}
	for u := range ds.LinksForHost("test.com") {
		links[*u.URL] = true
	}
	if !reflect.DeepEqual(links, expectedLinks) {
		t.Errorf("Expected links from LinksForHost: %v\nBut got: %v", expectedLinks, links)
	}

	ds.StoreURLFetchResults(page1Fetch)
	ds.StoreURLFetchResults(page2Fetch)

	expectedResults := map[url.URL]int{
		*page1URL.URL: 200,
		*page2URL.URL: 200,
	}
	iter := db.Query(`SELECT dom, subdom, path, proto, time, stat
						FROM links WHERE dom = 'test.com'`).Iter()
	var linkdomain, subdomain, path, protocol string
	var status int
	var crawl_time time.Time
	results := map[url.URL]int{}
	for iter.Scan(&linkdomain, &subdomain, &path, &protocol, &crawl_time, &status) {
		if !crawl_time.Equal(walker.NotYetCrawled) {
			u, _ := walker.CreateURL(linkdomain, subdomain, path, protocol, crawl_time)
			results[*u.URL] = status
		}
	}
	if !reflect.DeepEqual(results, expectedResults) {
		t.Errorf("Expected results from StoreURLFetchResults: %v\nBut got: %v", expectedResults, results)
	}

	ds.StoreParsedURL(parse("http://test2.com/page1-1.html"), page1Fetch)
	ds.StoreParsedURL(parse("http://test2.com/page2-1.html"), page2Fetch)

	var count int
	db.Query(`SELECT COUNT(*) FROM links WHERE dom = 'test2.com'`).Scan(&count)
	if count != 2 {
		t.Errorf("Expected 2 parsed links to be inserted for test2.com, found %v", count)
	}

	ds.UnclaimHost("test.com")
	db.Query(`SELECT COUNT(*) FROM segments WHERE dom = 'test.com'`).Scan(&count)
	if count != 0 {
		t.Errorf("Expected links from unclaimed domain to be deleted, found %v", count)
	}
	err := db.Query(`SELECT COUNT(*) FROM domain_info
						WHERE dom = 'test.com'
						AND claim_tok = 00000000-0000-0000-0000-000000000000
						AND dispatched = false ALLOW FILTERING`).Scan(&count)
	if err != nil {
		t.Fatalf("Failed to query for test.com in domain_info: %v", err)
	}
	if count != 1 {
		t.Fatalf("test.com has incorrect values in domain_info after unclaim")
	}
}
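// This test and getDs below repeat the same execute-or-fail loop over a batch
// of fixture queries. A hypothetical helper (not part of the original code)
// that captures the pattern:
func mustExecAll(t *testing.T, queries []*gocql.Query) {
	for _, q := range queries {
		if err := q.Exec(); err != nil {
			t.Fatalf("Failed to insert test data: %v\nQuery: %v", err, q)
		}
	}
}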
func TestDispatcherBasic(t *testing.T) {
	// These config settings MUST be here. The results of the test
	// change if these are changed.
	origMaxLinksPerSegment := walker.Config.Dispatcher.MaxLinksPerSegment
	origRefreshPercentage := walker.Config.Dispatcher.RefreshPercentage
	defer func() {
		walker.Config.Dispatcher.MaxLinksPerSegment = origMaxLinksPerSegment
		walker.Config.Dispatcher.RefreshPercentage = origRefreshPercentage
	}()
	walker.Config.Dispatcher.MaxLinksPerSegment = 9
	walker.Config.Dispatcher.RefreshPercentage = 33

	var q *gocql.Query

	for _, dt := range DispatcherTests {
		db := getDB(t) // runs between tests to reset the db

		for _, edi := range dt.ExistingDomainInfos {
			q = db.Query(`INSERT INTO domain_info (dom, claim_tok, priority, dispatched)
							VALUES (?, ?, ?, ?)`,
				edi.Dom, edi.ClaimTok, edi.Priority, edi.Dispatched)
			if err := q.Exec(); err != nil {
				t.Fatalf("Failed to insert test domain info: %v\nQuery: %v", err, q)
			}
		}

		for _, el := range dt.ExistingLinks {
			dom, subdom, _ := el.URL.TLDPlusOneAndSubdomain()
			if el.Status == -1 {
				q = db.Query(`INSERT INTO links (dom, subdom, path, proto, time, getnow)
								VALUES (?, ?, ?, ?, ?, ?)`,
					dom, subdom, el.URL.RequestURI(), el.URL.Scheme,
					el.URL.LastCrawled, el.GetNow)
			} else {
				q = db.Query(`INSERT INTO links (dom, subdom, path, proto, time, stat, getnow)
								VALUES (?, ?, ?, ?, ?, ?, ?)`,
					dom, subdom, el.URL.RequestURI(), el.URL.Scheme,
					el.URL.LastCrawled, el.Status, el.GetNow)
			}
			if err := q.Exec(); err != nil {
				t.Fatalf("Failed to insert test links: %v\nQuery: %v", err, q)
			}
		}

		d := &walker.CassandraDispatcher{}
		go d.StartDispatcher()
		time.Sleep(time.Second)
		d.StopDispatcher()

		expectedResults := map[url.URL]bool{}
		for _, esl := range dt.ExpectedSegmentLinks {
			expectedResults[*esl.URL] = true
		}

		results := map[url.URL]bool{}
		iter := db.Query(`SELECT dom, subdom, path, proto FROM segments WHERE dom = 'test.com'`).Iter()
		var linkdomain, subdomain, path, protocol string
		for iter.Scan(&linkdomain, &subdomain, &path, &protocol) {
			u, _ := walker.CreateURL(linkdomain, subdomain, path, protocol, walker.NotYetCrawled)
			results[*u.URL] = true
		}
		if !reflect.DeepEqual(results, expectedResults) {
			t.Errorf("For tag %q expected results in segments: %v\nBut got: %v",
				dt.Tag, expectedResults, results)
		}

		for _, edi := range dt.ExistingDomainInfos {
			q = db.Query(`SELECT dispatched FROM domain_info WHERE dom = ?`, edi.Dom)
			var dispatched bool
			if err := q.Scan(&dispatched); err != nil {
				t.Fatalf("For tag %q failed to find domain info: %v\nQuery: %v", dt.Tag, err, q)
			}
			if !dispatched {
				t.Errorf("For tag %q `dispatched` flag not set on domain: %v", dt.Tag, edi.Dom)
			}
		}
	}
}
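// The DispatcherTests table itself is not part of this excerpt. The shapes
// below are a sketch inferred from how the loop above reads its fields; all
// type choices here are assumptions, not the original declarations.
type ExistingDomainInfo struct {
	Dom        string
	ClaimTok   gocql.UUID
	Priority   int
	Dispatched bool
}

type ExistingLink struct {
	URL    walker.URL
	Status int // -1 means "no status yet"; the stat column is omitted on insert
	GetNow bool
}

type DispatcherTest struct {
	Tag                  string               // labels failures in test output
	ExistingDomainInfos  []ExistingDomainInfo // rows seeded into domain_info
	ExistingLinks        []ExistingLink       // rows seeded into links
	ExpectedSegmentLinks []walker.URL         // links the dispatcher should place into segments
}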
// getDs builds a console.CqlModel against a scratch keyspace: the keyspace is
// dropped and recreated once (guarded by initdb), the tables are truncated,
// and a fixture data set is inserted for the console tests to read.
func getDs(t *testing.T) *console.CqlModel {
	modifyConfigDataSource()

	initdb.Do(func() {
		cluster := gocql.NewCluster(walker.Config.Cassandra.Hosts...)
		db, err := cluster.CreateSession()
		if err != nil {
			panic(err)
		}

		// Just want to make sure no one makes a mistake with this code
		if walker.Config.Cassandra.Keyspace == "walker" {
			panic("Not allowed to spoof the walker keyspace")
		}

		err = db.Query(fmt.Sprintf("DROP KEYSPACE IF EXISTS %s", walker.Config.Cassandra.Keyspace)).Exec()
		if err != nil {
			panic(fmt.Errorf("Failed to drop %s keyspace: %v", walker.Config.Cassandra.Keyspace, err))
		}
		err = walker.CreateCassandraSchema()
		if err != nil {
			t.Fatalf(err.Error())
		}
		db.Close()
	})

	ds, err := console.NewCqlModel()
	if err != nil {
		panic(err)
	}
	db := ds.Db

	//
	// Clear out the tables first
	//
	tables := []string{"links", "segments", "domain_info"}
	for _, table := range tables {
		err := db.Query(fmt.Sprintf(`TRUNCATE %v`, table)).Exec()
		if err != nil {
			t.Fatalf("Failed to truncate table %v: %v", table, err)
		}
	}

	//
	// Insert some data
	//
	insertDomainInfo := `INSERT INTO domain_info (dom) VALUES (?)`
	insertDomainToCrawl := `INSERT INTO domain_info (dom, claim_tok, claim_time, dispatched) VALUES (?, ?, ?, true)`
	insertSegment := `INSERT INTO segments (dom, subdom, path, proto) VALUES (?, ?, ?, ?)`
	insertLink := `INSERT INTO links (dom, subdom, path, proto, time, stat, err, robot_ex) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`
	queries := []*gocql.Query{
		db.Query(insertDomainToCrawl, "test.com", gocql.UUID{}, testTime),
		db.Query(insertLink, "test.com", "", "/page1.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "", "/page2.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "", "/page3.html", "http", walker.NotYetCrawled, 404, "", false),
		db.Query(insertLink, "test.com", "", "/page4.html", "http", walker.NotYetCrawled, 200, "An Error", false),
		db.Query(insertLink, "test.com", "", "/page5.html", "http", walker.NotYetCrawled, 200, "", true),

		db.Query(insertLink, "test.com", "sub", "/page6.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "sub", "/page7.html", "https", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "sub", "/page8.html", "https", walker.NotYetCrawled, 200, "", false),

		db.Query(insertSegment, "test.com", "", "/page1.html", "http"),
		db.Query(insertSegment, "test.com", "", "/page2.html", "http"),

		db.Query(insertDomainInfo, "foo.com"),
		db.Query(insertLink, "foo.com", "sub", "/page1.html", "http", fooTime, 200, "", false),
		db.Query(insertLink, "foo.com", "sub", "/page2.html", "http", fooTime, 200, "", false),

		db.Query(insertDomainInfo, "bar.com"),

		db.Query(insertDomainToCrawl, "baz.com", bazUuid, testTime),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[0].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[1].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[2].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[3].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[4].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[5].CrawlTime, 200, "", false),
		db.Query(insertSegment, "baz.com", "sub", "page1.html", "http"),
	}
	for _, q := range queries {
		err := q.Exec()
		if err != nil {
			t.Fatalf("Failed to insert test data: %v\nQuery: %v", err, q)
		}
	}

	//
	// Need to record the order that the test.com urls come off on
	//
	itr := db.Query("SELECT dom, subdom, path, proto FROM links WHERE dom = 'test.com'").Iter()
	var domain, subdomain, path, protocol string
	testComLinkOrder = nil
	for itr.Scan(&domain, &subdomain, &path, &protocol) {
		u, _ := walker.CreateURL(domain, subdomain, path, protocol, walker.NotYetCrawled)
		urlString := u.String()
		linfo, gotLinfo := testComLinkHash[urlString]
		if !gotLinfo {
			panic(fmt.Errorf("testComLinkOrder can't find url: %v", urlString))
		}
		testComLinkOrder = append(testComLinkOrder, linfo)
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("testComLinkOrder iterator error: %v", err))
	}

	//
	// Need to record order for baz
	//
	itr = db.Query("SELECT time FROM links WHERE dom = 'baz.com'").Iter()
	var crawlTime time.Time
	bazLinkHistoryOrder = nil
	for itr.Scan(&crawlTime) {
		// Match each stored crawl time to the closest fixture entry.
		bestIndex := -1
		var bestDiff int64 = 99999999
		for i := range bazLinkHistoryInit {
			e := &bazLinkHistoryInit[i]
			delta := crawlTime.Unix() - e.CrawlTime.Unix()
			if delta < 0 {
				delta = -delta
			}
			if delta < bestDiff {
				bestIndex = i
				bestDiff = delta
			}
		}
		if bestIndex < 0 {
			panic("UNEXPECTED ERROR")
		}
		bazLinkHistoryOrder = append(bazLinkHistoryOrder, bazLinkHistoryInit[bestIndex])
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("bazLinkHistoryOrder iterator error: %v", err))
	}

	itr = db.Query("SELECT dom, subdom, path, proto FROM links").Iter()
	var foundBaz = false
	var beforeBazComLink *walker.URL = nil
	for itr.Scan(&domain, &subdomain, &path, &protocol) {
		url, err := walker.CreateURL(domain, subdomain, path, protocol, walker.NotYetCrawled)
		if err != nil {
			panic(err)
		}

		if domain == "baz.com" {
			foundBaz = true
			break
		}

		beforeBazComLink = url
	}
	if !foundBaz {
		panic("Unable to find domain before baz.com")
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("beforeBazCom link iterator error: %v", err))
	}
	if beforeBazComLink == nil {
		bazSeed = ""
	} else {
		bazSeed = beforeBazComLink.String()
	}

	return ds
}
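// A minimal sketch of a test built on getDs. The test name, link URL, and
// expectations are illustrative assumptions, not part of the original suite.
func TestLinkHistorySketch(t *testing.T) {
	ds := getDs(t) // fresh keyspace, truncated tables, seeded fixture rows
	linfos, next, err := ds.ListLinkHistorical("http://sub.baz.com/page1.html", 0, 10)
	if err != nil {
		t.Fatal(err)
	}
	// ListLinkHistorical returns seedIndex + len(linfos) as the next seed.
	if next != len(linfos) {
		t.Errorf("Expected next seed index %d, got %d", len(linfos), next)
	}
}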