Пример #1
0
// ListLinkHistorical returns one page of the rows stored in the links table
// for linkUrl.
//
// seedIndex is the number of leading rows to skip and limit is the maximum
// number of rows to return; limit must be positive. The second return value
// is seedIndex plus the number of rows returned, i.e. the value to pass as
// seedIndex to continue after this page. On any failure before or during row
// conversion the incoming seedIndex is returned unchanged along with the
// error.
func (ds *CqlModel) ListLinkHistorical(linkUrl string, seedIndex int, limit int) ([]LinkInfo, int, error) {
	if limit <= 0 {
		return nil, seedIndex, fmt.Errorf("Bad value for limit parameter %d", limit)
	}

	u, err := walker.ParseURL(linkUrl)
	if err != nil {
		return nil, seedIndex, err
	}
	tld1, err := u.ToplevelDomainPlusOne()
	if err != nil {
		return nil, seedIndex, err
	}
	subtld1, err := u.Subdomain()
	if err != nil {
		return nil, seedIndex, err
	}

	query := `SELECT dom, subdom, path, proto, time, stat, err, robot_ex
              FROM links
              WHERE dom = ? AND subdom = ? AND path = ? AND proto = ?`
	itr := ds.Db.Query(query, tld1, subtld1, u.RequestURI(), u.Scheme).Iter()

	var linfos []LinkInfo
	var dom, sub, path, prot, getError string
	var crawlTime time.Time
	var status int
	var robotsExcluded bool
	skipped := 0
	for itr.Scan(&dom, &sub, &path, &prot, &crawlTime, &status, &getError, &robotsExcluded) {
		// Discard the first seedIndex rows; they belong to earlier pages.
		if skipped < seedIndex {
			skipped++
			continue
		}

		link, err := walker.CreateURL(dom, sub, path, prot, crawlTime)
		if err != nil {
			// Release the iterator before bailing out. Its own close error is
			// dropped on purpose: the conversion failure is what gets reported.
			_ = itr.Close()
			return nil, seedIndex, err
		}

		linfos = append(linfos, LinkInfo{
			Url:            link.String(),
			Status:         status,
			Error:          getError,
			RobotsExcluded: robotsExcluded,
			CrawlTime:      crawlTime,
		})
		if len(linfos) >= limit {
			break
		}
	}
	err = itr.Close()

	return linfos, seedIndex + len(linfos), err
}
Пример #2
0
// collectLinkInfos reads rows from the cassandra iterator itr and merges them
// into linfos, returning the updated slice.
//
// rtimes maps a URL string to the crawl time and linfos index of the entry
// currently held for that URL. A row for a URL that is already present
// overwrites the held entry unless the held crawl time is after the row's; a
// row for a new URL is appended. Scanning stops once a new URL would have to
// be appended while linfos already holds limit entries. If a row cannot be
// turned into a URL, the slice built so far is returned with the error.
func (ds *CqlModel) collectLinkInfos(linfos []LinkInfo, rtimes map[string]rememberTimes, itr *gocql.Iter, limit int) ([]LinkInfo, error) {
	var (
		dom, subdom, path, proto, fetchErr string
		crawled                            time.Time
		excluded                           bool
		httpStatus                         int
	)

	for itr.Scan(&dom, &subdom, &path, &proto, &crawled, &httpStatus, &fetchErr, &excluded) {
		parsed, err := walker.CreateURL(dom, subdom, path, proto, crawled)
		if err != nil {
			return linfos, err
		}
		key := parsed.String()

		prev, seen := rtimes[key]
		if seen && prev.ctm.After(crawled) {
			// The entry already held for this URL is more recent than this row.
			continue
		}
		if !seen && len(linfos) >= limit {
			// No room left for a URL we have not seen before: all done.
			break
		}

		entry := LinkInfo{
			Url:            key,
			Status:         httpStatus,
			Error:          fetchErr,
			RobotsExcluded: excluded,
			CrawlTime:      crawled,
		}

		slot := prev.ind
		if seen {
			linfos[slot] = entry
		} else {
			linfos = append(linfos, entry)
			slot = len(linfos) - 1
		}
		rtimes[key] = rememberTimes{ctm: crawled, ind: slot}
	}

	return linfos, nil
}
Пример #3
0
// TestURLCreation checks that walker.ParseURL renders the same string as
// net/url's Parse for the same input, and that walker.CreateURL rebuilds an
// equal URL from its domain, subdomain, path and protocol parts.
func TestURLCreation(t *testing.T) {
	const rawURL = "http://sub1.test.com/thepath?query=blah"

	url1, err := url.Parse(rawURL)
	if err != nil {
		t.Fatal(err)
	}
	wurl1, err := walker.ParseURL(rawURL)
	if err != nil {
		t.Fatal(err)
	}
	if url1.String() != wurl1.String() {
		// The original call supplied no arguments for the two %v verbs.
		t.Errorf("URLs should be the same: %v\nAnd: %v", url1, wurl1)
	}

	created, err := walker.CreateURL("test.com", "sub1", "thepath?query=blah", "http",
		walker.NotYetCrawled)
	if err != nil {
		t.Fatal(err)
	}
	if created.String() != wurl1.String() {
		t.Errorf("Expected CreateURL to return %v\nBut got: %v", wurl1, created)
	}
}
Пример #4
0
// TestDatastoreBasic walks the datastore through one claim/fetch/unclaim
// cycle against rows inserted directly into cassandra: it claims test.com,
// reads its links, stores fetch results and parsed links, unclaims the host,
// and verifies the table contents after each step.
func TestDatastoreBasic(t *testing.T) {
	db := getDB(t)
	ds := getDS(t)

	insertDomainInfo := `INSERT INTO domain_info (dom, claim_tok, priority, dispatched)
								VALUES (?, ?, ?, ?)`
	insertSegment := `INSERT INTO segments (dom, subdom, path, proto)
						VALUES (?, ?, ?, ?)`
	insertLink := `INSERT INTO links (dom, subdom, path, proto, time)
						VALUES (?, ?, ?, ?, ?)`

	queries := []*gocql.Query{
		db.Query(insertDomainInfo, "test.com", gocql.UUID{}, 0, true),
		db.Query(insertSegment, "test.com", "", "page1.html", "http"),
		db.Query(insertSegment, "test.com", "", "page2.html", "http"),
		db.Query(insertLink, "test.com", "", "page1.html", "http", walker.NotYetCrawled),
		db.Query(insertLink, "test.com", "", "page2.html", "http", walker.NotYetCrawled),
	}
	for _, q := range queries {
		if err := q.Exec(); err != nil {
			t.Fatalf("Failed to insert test data: %v\nQuery: %v", err, q)
		}
	}

	host := ds.ClaimNewHost()
	if host != "test.com" {
		t.Errorf("Expected test.com but got %v", host)
	}

	links := map[url.URL]bool{}
	expectedLinks := map[url.URL]bool{
		*page1URL.URL: true,
		*page2URL.URL: true,
	}
	for u := range ds.LinksForHost("test.com") {
		links[*u.URL] = true
	}
	if !reflect.DeepEqual(links, expectedLinks) {
		t.Errorf("Expected links from LinksForHost: %v\nBut got: %v", expectedLinks, links)
	}

	ds.StoreURLFetchResults(page1Fetch)
	ds.StoreURLFetchResults(page2Fetch)

	expectedResults := map[url.URL]int{
		*page1URL.URL: 200,
		*page2URL.URL: 200,
	}
	iter := db.Query(`SELECT dom, subdom, path, proto, time, stat
						FROM links WHERE dom = 'test.com'`).Iter()
	var linkdomain, subdomain, path, protocol string
	var status int
	var crawlTime time.Time
	results := map[url.URL]int{}
	for iter.Scan(&linkdomain, &subdomain, &path, &protocol, &crawlTime, &status) {
		// Only rows written by StoreURLFetchResults carry a real crawl time.
		if crawlTime.Equal(walker.NotYetCrawled) {
			continue
		}
		u, err := walker.CreateURL(linkdomain, subdomain, path, protocol, crawlTime)
		if err != nil {
			t.Fatalf("Failed to create URL from links row: %v", err)
		}
		results[*u.URL] = status
	}
	// Close reports any error hit while iterating; without it a failed read
	// would look like an empty result set.
	if err := iter.Close(); err != nil {
		t.Fatalf("Failed to read links for test.com: %v", err)
	}
	if !reflect.DeepEqual(results, expectedResults) {
		t.Errorf("Expected results from StoreURLFetchResults: %v\nBut got: %v",
			expectedResults, results)
	}

	ds.StoreParsedURL(parse("http://test2.com/page1-1.html"), page1Fetch)
	ds.StoreParsedURL(parse("http://test2.com/page2-1.html"), page2Fetch)

	var count int
	if err := db.Query(`SELECT COUNT(*) FROM links WHERE dom = 'test2.com'`).Scan(&count); err != nil {
		t.Fatalf("Failed to count links for test2.com: %v", err)
	}
	if count != 2 {
		t.Errorf("Expected 2 parsed links to be inserted for test2.com, found %v", count)
	}

	ds.UnclaimHost("test.com")

	// Check the error here too: a failed Scan would leave count holding the
	// value from the previous query.
	if err := db.Query(`SELECT COUNT(*) FROM segments WHERE dom = 'test.com'`).Scan(&count); err != nil {
		t.Fatalf("Failed to count segments for test.com: %v", err)
	}
	if count != 0 {
		t.Errorf("Expected links from unclaimed domain to be deleted, found %v", count)
	}

	err := db.Query(`SELECT COUNT(*) FROM domain_info
						WHERE dom = 'test.com'
						AND claim_tok = 00000000-0000-0000-0000-000000000000
						AND dispatched = false ALLOW FILTERING`).Scan(&count)
	if err != nil {
		t.Fatalf("Failed to query for test.com in domain_info: %v", err)
	}
	if count != 1 {
		t.Fatalf("test.com has incorrect values in domain_info after unclaim")
	}
}
Пример #5
0
// TestDispatcherBasic runs every case in DispatcherTests: it loads the case's
// domain_info and links rows, runs a CassandraDispatcher for one second, and
// then checks both the segments generated for test.com and that each domain
// has been flagged as dispatched.
func TestDispatcherBasic(t *testing.T) {
	// These config settings MUST be here. The results of the test
	// change if these are changed.
	origMaxLinksPerSegment := walker.Config.Dispatcher.MaxLinksPerSegment
	origRefreshPercentage := walker.Config.Dispatcher.RefreshPercentage
	defer func() {
		walker.Config.Dispatcher.MaxLinksPerSegment = origMaxLinksPerSegment
		walker.Config.Dispatcher.RefreshPercentage = origRefreshPercentage
	}()
	walker.Config.Dispatcher.MaxLinksPerSegment = 9
	walker.Config.Dispatcher.RefreshPercentage = 33

	var q *gocql.Query

	for _, dt := range DispatcherTests {
		db := getDB(t) // runs between tests to reset the db

		for _, edi := range dt.ExistingDomainInfos {
			q = db.Query(`INSERT INTO domain_info (dom, claim_tok, priority, dispatched)
							VALUES (?, ?, ?, ?)`,
				edi.Dom, edi.ClaimTok, edi.Priority, edi.Dispatched)
			if err := q.Exec(); err != nil {
				t.Fatalf("Failed to insert test domain info: %v\nQuery: %v", err, q)
			}
		}

		for _, el := range dt.ExistingLinks {
			dom, subdom, err := el.URL.TLDPlusOneAndSubdomain()
			if err != nil {
				t.Fatalf("For tag %q failed to split URL %v: %v", dt.Tag, el.URL, err)
			}
			// A status of -1 means the stat column is left out of the insert.
			if el.Status == -1 {
				q = db.Query(`INSERT INTO links (dom, subdom, path, proto, time, getnow)
								VALUES (?, ?, ?, ?, ?, ?)`,
					dom,
					subdom,
					el.URL.RequestURI(),
					el.URL.Scheme,
					el.URL.LastCrawled,
					el.GetNow)
			} else {
				q = db.Query(`INSERT INTO links (dom, subdom, path, proto, time, stat, getnow)
								VALUES (?, ?, ?, ?, ?, ?, ?)`,
					dom,
					subdom,
					el.URL.RequestURI(),
					el.URL.Scheme,
					el.URL.LastCrawled,
					el.Status,
					el.GetNow)
			}
			if err := q.Exec(); err != nil {
				t.Fatalf("Failed to insert test links: %v\nQuery: %v", err, q)
			}
		}

		d := &walker.CassandraDispatcher{}
		go d.StartDispatcher()
		time.Sleep(time.Second)
		d.StopDispatcher()

		expectedResults := map[url.URL]bool{}
		for _, esl := range dt.ExpectedSegmentLinks {
			expectedResults[*esl.URL] = true
		}

		results := map[url.URL]bool{}
		iter := db.Query(`SELECT dom, subdom, path, proto
							FROM segments WHERE dom = 'test.com'`).Iter()
		var linkdomain, subdomain, path, protocol string
		for iter.Scan(&linkdomain, &subdomain, &path, &protocol) {
			u, err := walker.CreateURL(linkdomain, subdomain, path, protocol, walker.NotYetCrawled)
			if err != nil {
				t.Fatalf("For tag %q failed to create URL from segments row: %v", dt.Tag, err)
			}
			results[*u.URL] = true
		}
		// Close reports any error hit while iterating the segments.
		if err := iter.Close(); err != nil {
			t.Fatalf("For tag %q failed to read segments: %v", dt.Tag, err)
		}
		if !reflect.DeepEqual(results, expectedResults) {
			t.Errorf("For tag %q expected results in segments: %v\nBut got: %v",
				dt.Tag, expectedResults, results)
		}

		for _, edi := range dt.ExistingDomainInfos {
			q = db.Query(`SELECT dispatched FROM domain_info WHERE dom = ?`, edi.Dom)
			var dispatched bool
			if err := q.Scan(&dispatched); err != nil {
				t.Fatalf("For tag %q failed to find domain info: %v\nQuery: %v", dt.Tag, err, q)
			}
			if !dispatched {
				t.Errorf("For tag %q `dispatched` flag not set on domain: %v", dt.Tag, edi.Dom)
			}
		}
	}
}
Пример #6
0
// getDs returns a console.CqlModel loaded with the fixture rows used by the
// console tests.
//
// On the first call (guarded by initdb) the configured keyspace is dropped
// and the cassandra schema recreated; the function panics rather than touch
// a keyspace named "walker". On every call the links, segments and
// domain_info tables are truncated and refilled, and then the package-level
// testComLinkOrder, bazLinkHistoryOrder and bazSeed are recomputed from the
// order in which cassandra returns the rows.
func getDs(t *testing.T) *console.CqlModel {
	modifyConfigDataSource()

	initdb.Do(func() {
		cluster := gocql.NewCluster(walker.Config.Cassandra.Hosts...)
		db, err := cluster.CreateSession()
		if err != nil {
			panic(err)
		}

		// Just want to make sure no one makes a mistake with this code
		if walker.Config.Cassandra.Keyspace == "walker" {
			panic("Not allowed to spoof the walker keyspace")
		}
		err = db.Query(fmt.Sprintf("DROP KEYSPACE IF EXISTS %s", walker.Config.Cassandra.Keyspace)).Exec()
		if err != nil {
			panic(fmt.Errorf("Failed to drop %s keyspace: %v", walker.Config.Cassandra.Keyspace, err))
		}
		err = walker.CreateCassandraSchema()
		if err != nil {
			// Fatal, not Fatalf: the error text must not be treated as a
			// format string.
			t.Fatal(err)
		}

		db.Close()
	})

	ds, err := console.NewCqlModel()
	if err != nil {
		panic(err)
	}
	db := ds.Db

	//
	// Clear out the tables first
	//
	tables := []string{"links", "segments", "domain_info"}
	for _, table := range tables {
		err := db.Query(fmt.Sprintf(`TRUNCATE %v`, table)).Exec()
		if err != nil {
			t.Fatalf("Failed to truncate table %v: %v", table, err)
		}
	}

	//
	// Insert some data
	//
	insertDomainInfo := `INSERT INTO domain_info (dom) VALUES (?)`
	insertDomainToCrawl := `INSERT INTO domain_info (dom, claim_tok, claim_time, dispatched) VALUES (?, ?, ?, true)`
	insertSegment := `INSERT INTO segments (dom, subdom, path, proto) VALUES (?, ?, ?, ?)`
	insertLink := `INSERT INTO links (dom, subdom, path, proto, time, stat, err, robot_ex) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`

	queries := []*gocql.Query{
		db.Query(insertDomainToCrawl, "test.com", gocql.UUID{}, testTime),
		db.Query(insertLink, "test.com", "", "/page1.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "", "/page2.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "", "/page3.html", "http", walker.NotYetCrawled, 404, "", false),
		db.Query(insertLink, "test.com", "", "/page4.html", "http", walker.NotYetCrawled, 200, "An Error", false),
		db.Query(insertLink, "test.com", "", "/page5.html", "http", walker.NotYetCrawled, 200, "", true),

		db.Query(insertLink, "test.com", "sub", "/page6.html", "http", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "sub", "/page7.html", "https", walker.NotYetCrawled, 200, "", false),
		db.Query(insertLink, "test.com", "sub", "/page8.html", "https", walker.NotYetCrawled, 200, "", false),

		db.Query(insertSegment, "test.com", "", "/page1.html", "http"),
		db.Query(insertSegment, "test.com", "", "/page2.html", "http"),

		db.Query(insertDomainInfo, "foo.com"),
		db.Query(insertLink, "foo.com", "sub", "/page1.html", "http", fooTime, 200, "", false),
		db.Query(insertLink, "foo.com", "sub", "/page2.html", "http", fooTime, 200, "", false),

		db.Query(insertDomainInfo, "bar.com"),

		db.Query(insertDomainToCrawl, "baz.com", bazUuid, testTime),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[0].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[1].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[2].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[3].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[4].CrawlTime, 200, "", false),
		db.Query(insertLink, "baz.com", "sub", "/page1.html", "http", bazLinkHistoryInit[5].CrawlTime, 200, "", false),

		db.Query(insertSegment, "baz.com", "sub", "page1.html", "http"),
	}
	for _, q := range queries {
		err := q.Exec()
		if err != nil {
			t.Fatalf("Failed to insert test data: %v\nQuery: %v", err, q)
		}
	}

	//
	// Need to record the order that the test.com urls come off on
	//
	itr := db.Query("SELECT dom, subdom, path, proto FROM links WHERE dom = 'test.com'").Iter()
	var domain, subdomain, path, protocol string
	testComLinkOrder = nil
	for itr.Scan(&domain, &subdomain, &path, &protocol) {
		u, err := walker.CreateURL(domain, subdomain, path, protocol, walker.NotYetCrawled)
		if err != nil {
			panic(fmt.Errorf("testComLinkOrder can't create url: %v", err))
		}
		urlString := u.String()
		linfo, gotLinfo := testComLinkHash[urlString]
		if !gotLinfo {
			panic(fmt.Errorf("testComLinkOrder can't find url: %v", urlString))
		}
		testComLinkOrder = append(testComLinkOrder, linfo)
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("testComLinkOrder iterator error: %v", err))
	}

	//
	// Need to record order for baz
	//
	itr = db.Query("SELECT time FROM links WHERE dom = 'baz.com'").Iter()
	var crawlTime time.Time
	bazLinkHistoryOrder = nil
	for itr.Scan(&crawlTime) {
		// Match each returned time to the bazLinkHistoryInit entry whose
		// crawl time is nearest to it, comparing at whole-second resolution.
		bestIndex := -1
		var bestDiff int64 = 99999999
		for i := range bazLinkHistoryInit {
			e := &bazLinkHistoryInit[i]
			delta := crawlTime.Unix() - e.CrawlTime.Unix()
			if delta < 0 {
				delta = -delta
			}
			if delta < bestDiff {
				bestIndex = i
				bestDiff = delta
			}
		}
		if bestIndex < 0 {
			panic("UNEXPECTED ERROR")
		}
		bazLinkHistoryOrder = append(bazLinkHistoryOrder, bazLinkHistoryInit[bestIndex])
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("bazLinkHistoryOrder iterator error: %v", err))
	}

	//
	// Find the link that comes immediately before the first baz.com row in a
	// scan of the whole links table; its string form becomes bazSeed.
	//
	itr = db.Query("SELECT dom, subdom, path, proto FROM links").Iter()
	var foundBaz = false
	var beforeBazComLink *walker.URL = nil
	for itr.Scan(&domain, &subdomain, &path, &protocol) {
		url, err := walker.CreateURL(domain, subdomain, path, protocol, walker.NotYetCrawled)
		if err != nil {
			panic(err)
		}

		if domain == "baz.com" {
			foundBaz = true
			break
		}

		beforeBazComLink = url
	}
	if !foundBaz {
		panic("Unable to find domain before baz.com")
	}
	err = itr.Close()
	if err != nil {
		panic(fmt.Errorf("beforeBazCom link iterator error: %v", err))
	}
	if beforeBazComLink == nil {
		// baz.com was the very first row, so there is nothing before it.
		bazSeed = ""
	} else {
		bazSeed = beforeBazComLink.String()
	}

	return ds
}