Exemple #1
0
func getDB(t *testing.T) *gocql.Session {
	initdb.Do(func() {
		err := walker.CreateCassandraSchema()
		if err != nil {
			t.Fatalf(err.Error())
		}
	})

	if walker.Config.Cassandra.Keyspace != "walker_test" {
		t.Fatal("Running tests requires using the walker_test keyspace")
		return nil
	}
	config := walker.GetCassandraConfig()
	db, err := config.CreateSession()
	if err != nil {
		t.Fatalf("Could not connect to local cassandra db: %v", err)
		return nil
	}

	tables := []string{"links", "segments", "domain_info"}
	for _, table := range tables {
		err := db.Query(fmt.Sprintf(`TRUNCATE %v`, table)).Exec()
		if err != nil {
			t.Fatalf("Failed to truncate table %v: %v", table, err)
		}
	}

	return db
}
Exemple #2
0
func spoofDataLong() {
	if walker.Config.Cassandra.Keyspace == "walker" {
		panic("Not allowed to spoof the walker keyspace")
	}

	//
	// Drop the keyspace if it exists
	//
	{
		cluster := gocql.NewCluster(walker.Config.Cassandra.Hosts...)
		//cluster.Keyspace = walker.Config.Cassandra.Keyspace
		db, err := cluster.CreateSession()
		if err != nil {
			panic(err)
		}

		err = db.Query(fmt.Sprintf("DROP KEYSPACE IF EXISTS %s", walker.Config.Cassandra.Keyspace)).Exec()
		if err != nil {
			panic(fmt.Errorf("Failed to drop %s keyspace: %v", walker.Config.Cassandra.Keyspace, err))
		}
		db.Close()
	}

	//
	// Build the new schema
	//
	err := walker.CreateCassandraSchema()
	if err != nil {
		panic(err)
	}

	//
	// Build data store
	//
	ds, err := NewCqlModel()
	if err != nil {
		panic(fmt.Errorf("Failed to start data source: %v", err))
	}
	db := ds.Db

	//
	// Clear out the tables first
	//
	tables := []string{"links", "segments", "domain_info"}
	for _, table := range tables {
		err := db.Query(fmt.Sprintf(`TRUNCATE %v`, table)).Exec()
		if err != nil {
			panic(fmt.Errorf("Failed to truncate table %v: %v", table, err))
		}
	}

	rand.Seed(42)

	insertDomainInfo := `INSERT INTO domain_info (dom) VALUES (?)`
	insertDomainToCrawl := `INSERT INTO domain_info (dom, claim_tok, claim_time) VALUES (?, ?, ?)`
	insertSegment := `INSERT INTO segments (dom, subdom, path, proto) VALUES (?, ?, ?, ?)`
	insertLink := `INSERT INTO links (dom, subdom, path, proto, time, stat, err, robot_ex) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`

	for i := 0; i < 100; i++ {
		domain := fmt.Sprintf("x%d.com", i)
		err := db.Query(insertDomainInfo, domain).Exec()
		if err != nil {
			panic(err)
		}
		crawlTime := fakeCrawlTime()
		status := fakeStatus()
		excluded := false
		if rand.Float32() < 0.1 {
			status = http.StatusOK
			crawlTime = walker.NotYetCrawled
			excluded = true
		}
		err = db.Query(insertLink, domain, "subd", "/page1.html", "http", crawlTime, status, "", excluded).Exec()
		if err != nil {
			panic(err)
		}
	}

	for i := 0; i < 10; i++ {
		domain := fmt.Sprintf("y%d.com", i)
		err := db.Query(insertDomainInfo, domain).Exec()
		if err != nil {
			panic(err)
		}

		for i := 0; i < 100; i++ {
			crawlTime := fakeCrawlTime()
			status := fakeStatus()
			excluded := false
			if rand.Float32() < 0.1 {
				status = http.StatusOK
				crawlTime = walker.NotYetCrawled
				excluded = true
			}
			page := fmt.Sprintf("/page%d.html", i)
			err = db.Query(insertLink, domain, "link", page, "http", crawlTime, status, "", excluded).Exec()
			if err != nil {
				panic(err)
			}
		}
	}

	errorBC := []string{
		"Something very bad happened",
		"Program failed to parse message 5",
		"All your base are belong to us",
		"The Tragically Hip sensor failed",
	}

	for i := 0; i < 10; i++ {
		domain := fmt.Sprintf("h%d.com", i)
		err := db.Query(insertDomainInfo, domain).Exec()
		if err != nil {
			panic(err)
		}

		crawlTime := time.Now()
		for i := 0; i < 20; i++ {
			crawlTime = crawlTime.AddDate(0, 0, -rand.Intn(30))
			status := fakeStatus()
			fakeError := ""
			if rand.Float32() < 0.1 {
				status = http.StatusOK
				fakeError = errorBC[rand.Intn(len(errorBC))]
			}
			err = db.Query(insertLink, domain, "link", "/page1.html", "http", crawlTime, status, fakeError, false).Exec()
			if err != nil {
				panic(err)
			}
		}
	}

	for i := 0; i < 10; i++ {
		domain := fmt.Sprintf("t%d.com", i)
		uuid := fakeUuid()
		err = db.Query(insertDomainToCrawl, domain, uuid, time.Now()).Exec()
		if err != nil {
			panic(err)
		}

		for i := 0; i < 20; i++ {
			page := fmt.Sprintf("/page%d.html", i)
			err = db.Query(insertLink, domain, "link", page, "http", walker.NotYetCrawled, http.StatusOK, "", false).Exec()
			if err != nil {
				panic(err)
			}

			err = db.Query(insertSegment, domain, "", page, "http").Exec()
			if err != nil {
				panic(err)
			}
		}
	}

	return
}
Exemple #3
0
func getDs(t *testing.T) *console.CqlModel {
	modifyConfigDataSource()

	initdb.Do(func() {
		cluster := gocql.NewCluster(walker.Config.Cassandra.Hosts...)
		db, err := cluster.CreateSession()
		if err != nil {
			panic(err)
		}

		// Just want to make sure no one makes a mistake with this code
		if walker.Config.Cassandra.Keyspace == "walker" {
			panic("Not allowed to spoof the walker keyspace")
		}
		err = db.Query(fmt.Sprintf("DROP KEYSPACE IF EXISTS %s", walker.Config.Cassandra.Keyspace)).Exec()
		if err != nil {
			panic(fmt.Errorf("Failed to drop %s keyspace: %v", walker.Config.Cassandra.Keyspace, err))
		}
		err = walker.CreateCassandraSchema()
		if err != nil {
			t.Fatalf(err.Error())
		}

		db.Close()
	})

	ds, err := console.NewCqlModel()
	if err != nil {
		panic(err)
	}
	db := ds.Db

	//
	// Clear out the tables first
	//
	tables := []string{"links", "segments", "domain_info"}
	for _, table := range tables {
		err := db.Query(fmt.Sprintf(`TRUNCATE %v`, table)).Exec()
		if err != nil {
			t.Fatalf("Failed to truncate table %v: %v", table, err)
		}
	}

	//
	// Insert some data
	//
	insertDomainInfo := `INSERT INTO domain_info (dom) VALUES (?)`
	insertDomainToCrawl := `INSERT INTO domain_info (dom, claim_tok, claim_time, dispatched) VALUES (?, ?, ?, true)`
	insertSegment := `INSERT INTO segments (dom, subdom, path, proto) VALUES (?, ?, ?, ?)`
	insertLink := `INSERT INTO links (dom, subdom, path, proto, time, stat, err, robot_ex) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`

	queries := []*gocql.Query{
		db.Query(insertDomainToCrawl, "test.com", gocql.UUID{}, testTime),
		db.Query(insertLink, "test.com", "", "/page1.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "", "/page2.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "", "/page3.html", "http", walker.NotYetCrawled, 404, "", false),
		db.Query(insertLink, "test.com", "", "/page4.html", "http", walker.NotYetCrawled, 200, "An Error", false),
		db.Query(insertLink, "test.com", "", "/page5.html", "http", walker.NotYetCrawled, 200, "", true),

		db.Query(insertLink, "test.com", "sub", "/page6.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "sub", "/page7.html", "https", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "sub", "/page8.html", "https", walker.NotYetCrawled, 200, "", false),

		db.Query(insertSegment, "test.com", "", "/page1.html", "http"),
		db.Query(insertSegment, "test.com", "", "/page2.html", "http"),

		db.Query(insertDomainInfo, "foo.com"),
		db.Query(insertLink, "foo.com", "sub", "/page1.html", "http", fooTime, 200, "", false),
		db.Query(insertLink, "foo.com", "sub", "/page2.html", "http", fooTime, 200, "", false),

		db.Query(insertDomainInfo, "bar.com"),

		db.Query(insertDomainToCrawl, "baz.com", bazUuid, testTime),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[0].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[1].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[2].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[3].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[4].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[5].CrawlTime, 200, "", false),

		db.Query(insertSegment, "baz.com", "sub", "page1.html", "http"),
	}
	for _, q := range queries {
		err := q.Exec()
		if err != nil {
			t.Fatalf("Failed to insert test data: %v\nQuery: %v", err, q)
		}
	}

	//
	// Need to record the order that the test.com urls come off on
	//
	itr := db.Query("SELECT dom, subdom, path, proto FROM links WHERE dom = 'test.com'").Iter()
	var domain, subdomain, path, protocol string
	testComLinkOrder = nil
	for itr.Scan(&domain, &subdomain, &path, &protocol) {
		u, _ := walker.CreateURL(domain, subdomain, path, protocol, walker.NotYetCrawled)
		urlString := u.String()
		linfo, gotLinfo := testComLinkHash[urlString]
		if !gotLinfo {
			panic(fmt.Errorf("testComLinkOrder can't find url: %v", urlString))
		}
		testComLinkOrder = append(testComLinkOrder, linfo)
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("testComLinkOrder iterator error: %v", err))
	}

	//
	// Need to record order for baz
	//
	itr = db.Query("SELECT time FROM links WHERE dom = 'baz.com'").Iter()
	var crawlTime time.Time
	bazLinkHistoryOrder = nil
	for itr.Scan(&crawlTime) {
		bestIndex := -1
		var bestDiff int64 = 99999999
		for i := range bazLinkHistoryInit {
			e := &bazLinkHistoryInit[i]
			delta := crawlTime.Unix() - e.CrawlTime.Unix()
			if delta < 0 {
				delta = -delta
			}
			if delta < bestDiff {
				bestIndex = i
				bestDiff = delta
			}
		}
		if bestIndex < 0 {
			panic("UNEXPECTED ERROR")
		}
		bazLinkHistoryOrder = append(bazLinkHistoryOrder, bazLinkHistoryInit[bestIndex])
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("bazLinkHistoryOrder iterator error: %v", err))
	}

	itr = db.Query("SELECT dom, subdom, path, proto FROM links").Iter()
	var foundBaz = false
	var beforeBazComLink *walker.URL = nil
	for itr.Scan(&domain, &subdomain, &path, &protocol) {
		url, err := walker.CreateURL(domain, subdomain, path, protocol, walker.NotYetCrawled)
		if err != nil {
			panic(err)
		}

		if domain == "baz.com" {
			foundBaz = true
			break
		}

		beforeBazComLink = url
	}
	if !foundBaz {
		panic("Unable to find domain before baz.com")
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("beforeBazCom link iterator error: %v", err))
	}
	if beforeBazComLink == nil {
		bazSeed = ""
	} else {
		bazSeed = beforeBazComLink.String()
	}

	return ds
}