func TestContinueCrawling(t *testing.T) { d, _ := db.NewMapConn() c := newCrawler(d, queue.NewPoolConn(d), queue.NewMessage("test", "http://example.com", 1)) assert.False(t, c.continueCrawling()) c = newCrawler(d, queue.NewPoolConn(d), queue.NewMessage("test", "http://example.com", 0)) assert.True(t, c.continueCrawling()) }
// connectQueue attempts to connect with the cluster of Gnatsd servers. // It exits the program if the connection fails. // It falls back to a channel pool if the nats servers are not configured. func connectQueue(d db.Connection) queue.Connection { if servers, ok := ParseNatsNodes(); ok { return ConnectNatsQueue(servers, d) } return queue.NewPoolConn(d) }
func TestCrawlDocuments(t *testing.T) { testCases := []struct { id string page string images int }{ { id: "test1", page: "simple_page.html", images: 1, }, { id: "test2", page: "multi_images.html", images: 2, }, { id: "test3", page: "empty_page.html", images: 0, }, } for _, e := range testCases { d, _ := db.NewMapConn() c := newCrawler(d, queue.NewPoolConn(d), queue.NewMessage(e.id, "http://example.com", 1)) doc := loadPage(t, e.page) x := loadContext(t, "http://example.com") c.crawlDocument(x, doc) r, _ := d.Results(e.id) assert.Equal(t, e.images, len(r)) } }
func TestCrawl(t *testing.T) { d, _ := db.NewMapConn() q := queue.NewPoolConn(d) counter := 0 processor := func(q queue.Connection, d db.Connection, msg *queue.Message) { counter++ } q.Subscribe(processor) x := context.Context{d, q} s := newServer(x) r, _ := http.NewRequest("GET", "http://example.com", strings.NewReader("http://example.com")) p := make(httprouter.Params, 0) w := httptest.NewRecorder() s.crawl(w, r, p) b, err := ioutil.ReadAll(w.Body) assert.NoError(t, err) assert.NotEmpty(t, b) j := string(b) assert.Equal(t, fmt.Sprintf("/status/%s", j), w.Header().Get("Location")) assert.Equal(t, 201, w.Code) }
func TestCrawlHref(t *testing.T) { d, _ := db.NewMapConn() p := queue.NewPoolConn(d) c := newCrawler(d, p, queue.NewMessage("test", "http://example.com", 0)) x := loadContext(t, "http://example.com") done := make(chan bool) processor := func(q queue.Connection, d db.Connection, msg *queue.Message) { doc := loadPage(t, "simple_page.html") c.crawlDocument(x, doc) done <- true } p.Subscribe(processor) doc := loadPage(t, "follow_index.html") c.crawlDocument(x, doc) <-done r, _ := d.Results("test") assert.Equal(t, 1, len(r)) assert.Equal(t, "http://example.com/images/logo.jpg", string(r[0])) }