// getDB ensures the test schema exists, verifies we are pointed at the
// walker_test keyspace, and returns a session with the test tables truncated.
func getDB(t *testing.T) *gocql.Session {
	initdb.Do(func() {
		err := walker.CreateCassandraSchema()
		if err != nil {
			t.Fatal(err)
		}
	})

	if walker.Config.Cassandra.Keyspace != "walker_test" {
		t.Fatal("Running tests requires using the walker_test keyspace")
		return nil
	}

	config := walker.GetCassandraConfig()
	db, err := config.CreateSession()
	if err != nil {
		t.Fatalf("Could not connect to local cassandra db: %v", err)
		return nil
	}

	tables := []string{"links", "segments", "domain_info"}
	for _, table := range tables {
		err := db.Query(fmt.Sprintf(`TRUNCATE %v`, table)).Exec()
		if err != nil {
			t.Fatalf("Failed to truncate table %v: %v", table, err)
		}
	}

	return db
}
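// The test below is a minimal usage sketch, not an existing test in this
// package: it only illustrates the intended pattern of calling getDB and then
// operating on the freshly truncated tables. The test name and the COUNT(*)
// assertion are assumptions made for illustration.
func TestGetDBExample(t *testing.T) {
	db := getDB(t)
	defer db.Close()

	// getDB truncates links, segments, and domain_info, so links should be empty.
	var count int
	if err := db.Query(`SELECT COUNT(*) FROM links`).Scan(&count); err != nil {
		t.Fatalf("Failed to count links: %v", err)
	}
	if count != 0 {
		t.Errorf("Expected links to be empty after getDB, got %d rows", count)
	}
}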
func spoofDataLong() {
	if walker.Config.Cassandra.Keyspace == "walker" {
		panic("Not allowed to spoof the walker keyspace")
	}

	//
	// Drop the keyspace if it exists
	//
	{
		cluster := gocql.NewCluster(walker.Config.Cassandra.Hosts...)
		//cluster.Keyspace = walker.Config.Cassandra.Keyspace
		db, err := cluster.CreateSession()
		if err != nil {
			panic(err)
		}
		err = db.Query(fmt.Sprintf("DROP KEYSPACE IF EXISTS %s", walker.Config.Cassandra.Keyspace)).Exec()
		if err != nil {
			panic(fmt.Errorf("Failed to drop %s keyspace: %v", walker.Config.Cassandra.Keyspace, err))
		}
		db.Close()
	}

	//
	// Build the new schema
	//
	err := walker.CreateCassandraSchema()
	if err != nil {
		panic(err)
	}

	//
	// Build data store
	//
	ds, err := NewCqlModel()
	if err != nil {
		panic(fmt.Errorf("Failed to start data source: %v", err))
	}
	db := ds.Db

	//
	// Clear out the tables first
	//
	tables := []string{"links", "segments", "domain_info"}
	for _, table := range tables {
		err := db.Query(fmt.Sprintf(`TRUNCATE %v`, table)).Exec()
		if err != nil {
			panic(fmt.Errorf("Failed to truncate table %v: %v", table, err))
		}
	}

	rand.Seed(42)

	insertDomainInfo := `INSERT INTO domain_info (dom) VALUES (?)`
	insertDomainToCrawl := `INSERT INTO domain_info (dom, claim_tok, claim_time) VALUES (?, ?, ?)`
	insertSegment := `INSERT INTO segments (dom, subdom, path, proto) VALUES (?, ?, ?, ?)`
	insertLink := `INSERT INTO links (dom, subdom, path, proto, time, stat, err, robot_ex) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`

	for i := 0; i < 100; i++ {
		domain := fmt.Sprintf("x%d.com", i)
		err := db.Query(insertDomainInfo, domain).Exec()
		if err != nil {
			panic(err)
		}
		crawlTime := fakeCrawlTime()
		status := fakeStatus()
		excluded := false
		if rand.Float32() < 0.1 {
			status = http.StatusOK
			crawlTime = walker.NotYetCrawled
			excluded = true
		}
		err = db.Query(insertLink, domain, "subd", "/page1.html", "http", crawlTime, status, "", excluded).Exec()
		if err != nil {
			panic(err)
		}
	}

	for i := 0; i < 10; i++ {
		domain := fmt.Sprintf("y%d.com", i)
		err := db.Query(insertDomainInfo, domain).Exec()
		if err != nil {
			panic(err)
		}
		for i := 0; i < 100; i++ {
			crawlTime := fakeCrawlTime()
			status := fakeStatus()
			excluded := false
			if rand.Float32() < 0.1 {
				status = http.StatusOK
				crawlTime = walker.NotYetCrawled
				excluded = true
			}
			page := fmt.Sprintf("/page%d.html", i)
			err = db.Query(insertLink, domain, "link", page, "http", crawlTime, status, "", excluded).Exec()
			if err != nil {
				panic(err)
			}
		}
	}

	errorBC := []string{
		"Something very bad happened",
		"Program failed to parse message 5",
		"All your base are belong to us",
		"The Tragically Hip sensor failed",
	}
	for i := 0; i < 10; i++ {
		domain := fmt.Sprintf("h%d.com", i)
		err := db.Query(insertDomainInfo, domain).Exec()
		if err != nil {
			panic(err)
		}
		crawlTime := time.Now()
		for i := 0; i < 20; i++ {
			crawlTime = crawlTime.AddDate(0, 0, -rand.Intn(30))
			status := fakeStatus()
			fakeError := ""
			if rand.Float32() < 0.1 {
				status = http.StatusOK
				fakeError = errorBC[rand.Intn(len(errorBC))]
			}
			err = db.Query(insertLink, domain, "link", "/page1.html", "http", crawlTime, status, fakeError, false).Exec()
			if err != nil {
				panic(err)
			}
		}
	}

	for i := 0; i < 10; i++ {
		domain := fmt.Sprintf("t%d.com", i)
		uuid := fakeUuid()
		err = db.Query(insertDomainToCrawl, domain, uuid, time.Now()).Exec()
		if err != nil {
			panic(err)
		}
		for i := 0; i < 20; i++ {
			page := fmt.Sprintf("/page%d.html", i)
			err = db.Query(insertLink, domain, "link", page, "http", walker.NotYetCrawled, http.StatusOK, "", false).Exec()
			if err != nil {
				panic(err)
			}
			err = db.Query(insertSegment, domain, "", page, "http").Exec()
			if err != nil {
				panic(err)
			}
		}
	}
}
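// The fakeCrawlTime, fakeStatus, and fakeUuid helpers called by spoofDataLong
// are defined elsewhere in this package. The sketches below (with example-prefixed
// names, so they do not clash with the real helpers) show plausible behavior those
// helpers are assumed to have; they are illustrations only and may differ in detail
// from the real implementations.

// exampleFakeCrawlTime is assumed to return either NotYetCrawled or a recent random time.
func exampleFakeCrawlTime() time.Time {
	if rand.Float32() < 0.5 {
		return walker.NotYetCrawled
	}
	return time.Now().AddDate(0, 0, -rand.Intn(30))
}

// exampleFakeStatus is assumed to return a plausible HTTP status code.
func exampleFakeStatus() int {
	statuses := []int{
		http.StatusOK,
		http.StatusMovedPermanently,
		http.StatusNotFound,
		http.StatusInternalServerError,
	}
	return statuses[rand.Intn(len(statuses))]
}

// exampleFakeUuid is assumed to return a random UUID used as a claim token.
func exampleFakeUuid() gocql.UUID {
	u, _ := gocql.RandomUUID()
	return u
}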
// getDs builds a fresh console.CqlModel against the test keyspace, seeds it
// with fixture data, and records the ordering of several result sets used by
// the console tests.
func getDs(t *testing.T) *console.CqlModel {
	modifyConfigDataSource()

	initdb.Do(func() {
		cluster := gocql.NewCluster(walker.Config.Cassandra.Hosts...)
		db, err := cluster.CreateSession()
		if err != nil {
			panic(err)
		}

		// Just want to make sure no one makes a mistake with this code
		if walker.Config.Cassandra.Keyspace == "walker" {
			panic("Not allowed to spoof the walker keyspace")
		}

		err = db.Query(fmt.Sprintf("DROP KEYSPACE IF EXISTS %s", walker.Config.Cassandra.Keyspace)).Exec()
		if err != nil {
			panic(fmt.Errorf("Failed to drop %s keyspace: %v", walker.Config.Cassandra.Keyspace, err))
		}

		err = walker.CreateCassandraSchema()
		if err != nil {
			t.Fatal(err)
		}

		db.Close()
	})

	ds, err := console.NewCqlModel()
	if err != nil {
		panic(err)
	}
	db := ds.Db

	//
	// Clear out the tables first
	//
	tables := []string{"links", "segments", "domain_info"}
	for _, table := range tables {
		err := db.Query(fmt.Sprintf(`TRUNCATE %v`, table)).Exec()
		if err != nil {
			t.Fatalf("Failed to truncate table %v: %v", table, err)
		}
	}

	//
	// Insert some data
	//
	insertDomainInfo := `INSERT INTO domain_info (dom) VALUES (?)`
	insertDomainToCrawl := `INSERT INTO domain_info (dom, claim_tok, claim_time, dispatched) VALUES (?, ?, ?, true)`
	insertSegment := `INSERT INTO segments (dom, subdom, path, proto) VALUES (?, ?, ?, ?)`
	insertLink := `INSERT INTO links (dom, subdom, path, proto, time, stat, err, robot_ex) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`

	queries := []*gocql.Query{
		db.Query(insertDomainToCrawl, "test.com", gocql.UUID{}, testTime),
		db.Query(insertLink, "test.com", "", "/page1.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "", "/page2.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "", "/page3.html", "http", walker.NotYetCrawled, 404, "", false),
		db.Query(insertLink, "test.com", "", "/page4.html", "http", walker.NotYetCrawled, 200, "An Error", false),
		db.Query(insertLink, "test.com", "", "/page5.html", "http", walker.NotYetCrawled, 200, "", true),

		db.Query(insertLink, "test.com", "sub", "/page6.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "sub", "/page7.html", "https", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "sub", "/page8.html", "https", walker.NotYetCrawled, 200, "", false),

		db.Query(insertSegment, "test.com", "", "/page1.html", "http"),
		db.Query(insertSegment, "test.com", "", "/page2.html", "http"),

		db.Query(insertDomainInfo, "foo.com"),
		db.Query(insertLink, "foo.com", "sub", "/page1.html", "http", fooTime, 200, "", false),
		db.Query(insertLink, "foo.com", "sub", "/page2.html", "http", fooTime, 200, "", false),

		db.Query(insertDomainInfo, "bar.com"),

		db.Query(insertDomainToCrawl, "baz.com", bazUuid, testTime),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[0].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[1].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[2].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[3].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[4].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[5].CrawlTime, 200, "", false),

		db.Query(insertSegment, "baz.com", "sub", "page1.html", "http"),
	}
	for _, q := range queries {
		err := q.Exec()
		if err != nil {
			t.Fatalf("Failed to insert test data: %v\nQuery: %v", err, q)
		}
	}

	//
	// Need to record the order that the test.com urls come off on
	//
	itr := db.Query("SELECT dom, subdom, path, proto FROM links WHERE dom = 'test.com'").Iter()
	var domain, subdomain, path, protocol string
	testComLinkOrder = nil
	for itr.Scan(&domain, &subdomain, &path, &protocol) {
		u, _ := walker.CreateURL(domain, subdomain, path, protocol, walker.NotYetCrawled)
		urlString := u.String()
		linfo, gotLinfo := testComLinkHash[urlString]
		if !gotLinfo {
			panic(fmt.Errorf("testComLinkOrder can't find url: %v", urlString))
		}
		testComLinkOrder = append(testComLinkOrder, linfo)
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("testComLinkOrder iterator error: %v", err))
	}

	//
	// Need to record order for baz
	//
	itr = db.Query("SELECT time FROM links WHERE dom = 'baz.com'").Iter()
	var crawlTime time.Time
	bazLinkHistoryOrder = nil
	for itr.Scan(&crawlTime) {
		bestIndex := -1
		var bestDiff int64 = 99999999
		for i := range bazLinkHistoryInit {
			e := &bazLinkHistoryInit[i]
			delta := crawlTime.Unix() - e.CrawlTime.Unix()
			if delta < 0 {
				delta = -delta
			}
			if delta < bestDiff {
				bestIndex = i
				bestDiff = delta
			}
		}
		if bestIndex < 0 {
			panic("bazLinkHistoryOrder: failed to match crawl time against bazLinkHistoryInit")
		}
		bazLinkHistoryOrder = append(bazLinkHistoryOrder, bazLinkHistoryInit[bestIndex])
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("bazLinkHistoryOrder iterator error: %v", err))
	}

	itr = db.Query("SELECT dom, subdom, path, proto FROM links").Iter()
	foundBaz := false
	var beforeBazComLink *walker.URL
	for itr.Scan(&domain, &subdomain, &path, &protocol) {
		url, err := walker.CreateURL(domain, subdomain, path, protocol, walker.NotYetCrawled)
		if err != nil {
			panic(err)
		}

		if domain == "baz.com" {
			foundBaz = true
			break
		}

		beforeBazComLink = url
	}
	if !foundBaz {
		panic("Unable to find baz.com in links table")
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("beforeBazCom link iterator error: %v", err))
	}
	if beforeBazComLink == nil {
		bazSeed = ""
	} else {
		bazSeed = beforeBazComLink.String()
	}

	return ds
}
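// A minimal usage sketch (an assumption, not an existing test): a test obtains
// the seeded model from getDs and reads back one of the fixture rows through
// its session. The test name is illustrative; the expected zero claim_tok for
// test.com follows from the insertDomainToCrawl query above, which binds
// gocql.UUID{}.
func TestGetDsExample(t *testing.T) {
	ds := getDs(t)

	var claimTok gocql.UUID
	err := ds.Db.Query(`SELECT claim_tok FROM domain_info WHERE dom = 'test.com'`).Scan(&claimTok)
	if err != nil {
		t.Fatalf("Failed to read seeded domain_info row for test.com: %v", err)
	}
	if claimTok != (gocql.UUID{}) {
		t.Errorf("Expected zero claim_tok for test.com, got %v", claimTok)
	}
}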