Example #1
0
// Start configures the underlying fetchbot fetcher from the crawler's
// settings, begins fetching, and returns an Enqueuer that callers can use
// to feed URLs into the crawl. If CrawlDuration is positive, the queue is
// closed automatically once that duration elapses.
func (c *Crawler) Start() Enqueuer {
	// Build a default logger only when the caller did not supply one.
	if c.Logger == nil {
		c.Logger = newLogger(c.LogFormat, c.LogLevel)
	}

	c.mux.HandleErrors(c.newErrorHandler())

	bot := fetchbot.New(c.newRequestHandler())
	bot.CrawlDelay = c.CrawlDelay
	bot.DisablePoliteness = !c.CrawlPoliteness
	bot.HttpClient = c.HTTPClient
	bot.UserAgent = c.UserAgent

	c.f = bot
	c.q = c.f.Start()

	// Schedule an automatic shutdown when a crawl duration is configured.
	if c.CrawlDuration > 0 {
		go func() {
			<-time.After(c.CrawlDuration)
			c.q.Close()
		}()
	}

	return &Queue{c.q}
}
Example #2
0
// main starts an auto-closing fetcher, enqueues three seed URLs as HEAD
// requests, and waits until the idle workers shut the queue down.
func main() {
	bot := fetchbot.New(fetchbot.HandlerFunc(handler))
	bot.AutoClose = true
	bot.WorkerIdleTTL = time.Second

	q := bot.Start()
	q.SendStringHead(
		"http://google.com",
		"http://golang.org",
		"http://golang.org/doc",
	)
	q.Block()
}
Example #3
0
// main wires up the fetchbot muxer (error, GET and HEAD handlers), seeds
// the queue with URLs selected from the database for this consumer, and
// blocks until the crawl completes.
func main() {
	runtime.GOMAXPROCS(runtime.NumCPU())

	flag.Parse()
	mux := fetchbot.NewMux()

	fmt.Printf("Register a ErrorHandler\n")
	mux.HandleErrors(fetchbot.HandlerFunc(err_Handler))

	// 1. Handle GET requests for html responses
	// 2. to parse the body and
	// 3. enqueue all links as HEAD requests.
	fmt.Printf("Register a GetHandler\n")
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(get_Handler))

	// 1. Handle HEAD requests for html responses coming from the source host.
	// 2. We don't want to crawl links from other hosts.
	fmt.Printf("Register a HeadHandler\n")
	mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").Handler(fetchbot.HandlerFunc(head_Handler))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	fmt.Printf("Register a LogHandler\n")
	h := logHandler(mux)

	fmt.Printf("New a fetchbot\n")
	f := fetchbot.New(h)

	/* Start processing */
	fmt.Printf("Start the fetchbot\n")
	q := f.Start()

	// Enqueue the seeds, which are read from the database.
	fmt.Printf("Enqueue the seed\n")
	db = setupDB()

	// FIX: renamed local sql_Cmd -> sqlCmd (Go uses mixedCaps, not underscores).
	sqlCmd := sql_Cmd_Select(consumer)
	rows, err_DB := db.Query(sqlCmd)
	if err_DB != nil {
		fmt.Printf("[ERR]DB select fail\n")
		panicIF(err_DB)
	}
	// FIX: close the result set so the underlying connection is released.
	defer rows.Close()

	for rows.Next() {
		// FIX: the Scan error was silently discarded; skip unreadable rows.
		if err := rows.Scan(&seed_URL); err != nil {
			fmt.Printf("[ERR] scan seed URL - %s\n", err)
			continue
		}
		dup_URLs[seed_URL] = true

		_, err := q.SendStringGet(seed_URL)
		if err != nil {
			fmt.Printf("[ERR] GET %s - %s\n", seed_URL, err)
		}
	}
	// FIX: report any error that terminated row iteration early.
	if err := rows.Err(); err != nil {
		fmt.Printf("[ERR] iterating seed rows - %s\n", err)
	}

	fmt.Printf("Start fetch Process\n")
	q.Block()
	fmt.Printf("End the process\n")
}
Example #4
0
// main fetches every URL stored in db1, one at a time: each URL gets its
// own queue, a single GET, and an immediate close. Politeness (robots.txt
// fetching) is disabled.
func main() {
	var url string
	f := fetchbot.New(fetchbot.HandlerFunc(handler))
	f.DisablePoliteness = true
	db1 = dbutils.NewDB1()
	defer db1.Close()
	db2 = dbutils.NewDB2()
	defer db2.Close()

	// FIX: the Query error was discarded; a failed query returns nil rows
	// and the loop below would panic on rows.Next().
	rows, err := db1.Query("select url FROM urls")
	if err != nil {
		panic(err)
	}
	defer rows.Close()
	for rows.Next() {
		// FIX: the Scan error was silently ignored; skip unreadable rows
		// rather than enqueueing a stale/empty url value.
		if err := rows.Scan(&url); err != nil {
			continue
		}
		queue := f.Start()
		queue.SendStringGet(url)
		queue.Close()
	}
	// FIX: surface an error that ended row iteration prematurely.
	if err := rows.Err(); err != nil {
		panic(err)
	}
}
Example #5
0
// ProcessMessage initializes a crawler to parse a specific url and crawls its html looking for images.
// It records the page view first and bails out when the lookup fails or the
// page was already seen, so each (job, url) pair is crawled at most once.
func ProcessMessage(q queue.Connection, d db.Connection, msg *queue.Message) {
	log.Printf("type=messageReceived msg=%v\n", msg)

	view, err := d.ViewPage(msg.JobUUID, msg.URL)
	switch {
	case err != nil:
		log.Printf("type=viewPageError jobUUID=%s url=%s err=%v\n", msg.JobUUID, msg.URL, err)
		return
	case !view:
		log.Printf("type=pageAlreadyViewed jobUUID=%s url=%s\n", msg.JobUUID, msg.URL)
		return
	}

	crawler := newCrawler(d, q, msg)
	crawler.fetcher = fetchbot.New(fetchbot.HandlerFunc(crawler.crawlResponse))
	crawler.fetcher.HttpClient = httpClient
	crawler.Crawl()
}
Example #6
0
// main crawls from the seed URL: a muxer routes fetch errors, GET html
// responses (whose links are re-enqueued as HEAD requests) and HEAD
// responses from golang.org (re-enqueued as GETs). Optional flags support
// stopping at a given URL, stopping after a duration, and periodic memory
// statistics reporting.
func main() {
	flag.Parse()

	// Create the muxer
	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
	}))

	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
	// requests.
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			// Process the body to find the links
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle HEAD requests for html responses coming from the source host - we don't want
	// to crawl links from other hosts.
	mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
			}
		}))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	h := logHandler(mux)
	if *stopAtUrl != "" {
		// FIX: wrap the handler built above instead of constructing a
		// second, redundant logHandler(mux).
		h = stopHandler(*stopAtUrl, h)
	}
	f := fetchbot.New(h)
	// First mem stat print must be right after creating the fetchbot
	if *memStats > 0 {
		// Print starting stats
		printMemStats(nil)
		// Run at regular intervals
		runMemStats(f, *memStats)
		// On exit, print ending stats after a GC
		defer func() {
			runtime.GC()
			printMemStats(nil)
		}()
	}
	// Start processing
	q := f.Start()
	if *stopAfter > 0 {
		go func() {
			c := time.After(*stopAfter)
			<-c
			q.Close()
		}()
	}
	// Enqueue the seed, which is the first entry in the dup map
	_, err := q.SendStringGet(seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", seed, err)
	}
	q.Block()
}
Example #7
0
// main enqueues three seed URLs as HEAD requests and then closes the queue.
func main() {
	bot := fetchbot.New(fetchbot.HandlerFunc(handler))
	q := bot.Start()
	seeds := []string{
		"http://google.com",
		"http://golang.org",
		"http://golang.org/doc",
	}
	q.SendStringHead(seeds...)
	q.Close()
}
// main crawls from the flag-provided seed URL: a muxer routes fetch errors,
// GET html responses (whose links are re-enqueued as HEAD requests) and
// HEAD responses from the seed's host (re-enqueued as GETs). Flags support
// stopping or cancelling at a given URL or after a duration, plus periodic
// memory statistics reporting.
func main() {
	flag.Parse()

	// Parse the provided seed
	u, err := url.Parse(*seed)
	if err != nil {
		log.Fatal(err)
	}

	// Create the muxer
	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
	}))

	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
	// requests.
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			// Process the body to find the links
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle HEAD requests for html responses coming from the source host - we don't want
	// to crawl links from other hosts.
	mux.Response().Method("HEAD").Host(u.Host).ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
			}
		}))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	h := logHandler(mux)
	if *stopAtURL != "" || *cancelAtURL != "" {
		stopURL := *stopAtURL
		if *cancelAtURL != "" {
			stopURL = *cancelAtURL
		}
		// FIX: wrap the handler built above instead of constructing a
		// second, redundant logHandler(mux).
		h = stopHandler(stopURL, *cancelAtURL != "", h)
	}
	f := fetchbot.New(h)

	// First mem stat print must be right after creating the fetchbot
	if *memStats > 0 {
		// Print starting stats
		printMemStats(nil)
		// Run at regular intervals
		runMemStats(f, *memStats)
		// On exit, print ending stats after a GC
		defer func() {
			runtime.GC()
			printMemStats(nil)
		}()
	}

	// Start processing
	q := f.Start()

	// if a stop or cancel is requested after some duration, launch the goroutine
	// that will stop or cancel.
	if *stopAfter > 0 || *cancelAfter > 0 {
		after := *stopAfter
		stopFunc := q.Close
		if *cancelAfter != 0 {
			after = *cancelAfter
			stopFunc = q.Cancel
		}

		go func() {
			c := time.After(after)
			<-c
			stopFunc()
		}()
	}

	// Enqueue the seed, which is the first entry in the dup map
	dup[*seed] = true
	_, err = q.SendStringGet(*seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", *seed, err)
	}
	q.Block()
}
// main crawls the Go wiki on GitHub, writing the crawl log to a file. GET
// responses under /golang/go/wiki have their links re-enqueued; every other
// html response is checked for a 4xx/5xx status and reported.
func main() {
	lf, err := os.OpenFile("testlogfile", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatalf("error opening file: %v", err)
	}
	defer lf.Close()

	log.SetOutput(lf)
	log.Println("This is a test log entry")
	seed := "http://github.com/golang/go/wiki"

	u, err := url.Parse(seed)
	// FIX: check the parse error BEFORE dereferencing u; the original
	// called u.String() first, which would panic if parsing failed.
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Scanning: %v\n", u.String())
	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR - HandleErrors] %s\n", err)
	}))

	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
	// requests.
	mux.Response().Method("GET").Host(u.Host).Path("/golang/go/wiki").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			log.Printf("GET: %v - %v\n", res.Status, ctx.Cmd.URL())
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[GET] %s %s - %s\n", res.Status, ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle the remaining html responses (HEAD requests for linked pages) -
	// we only want to report their status, not crawl further.
	mux.Response().ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			log.Printf("HEAD: %v -  %v\n", res.Status, ctx.Cmd.URL())
			if strings.HasPrefix(res.Status, "40") || strings.HasPrefix(res.Status, "50") {
				fmt.Printf("[ERR] - %v - %v\n", res.Status, ctx.Cmd.URL())
			}
		}))
	h := logHandler(mux)
	f := fetchbot.New(h)
	f.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
	f.DisablePoliteness = true

	queue := f.Start()
	// FIX: removed the no-argument queue.SendStringHead() call - with no
	// URLs it enqueued nothing and was dead code.

	dup[seed] = true
	_, err = queue.SendStringGet(seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", seed, err)
	}
	queue.Block()
}