func main() {
	var sql_Cmd string
	runtime.GOMAXPROCS(runtime.NumCPU())
	flag.Parse()
	mux := fetchbot.NewMux()

	fmt.Printf("Register a ErrorHandler\n")
	mux.HandleErrors(fetchbot.HandlerFunc(err_Handler))

	// 1. Handle GET requests for html responses
	// 2. to parse the body and
	// 3. enqueue all links as HEAD requests.
	fmt.Printf("Register a GetHandler\n")
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(get_Handler))

	// 1. Handle HEAD requests for html responses coming from the source host.
	// 2. We don't want to crawl links from other hosts.
	fmt.Printf("Register a HeadHandler\n")
	mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").Handler(fetchbot.HandlerFunc(head_Handler))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	fmt.Printf("Register a LogHandler\n")
	h := logHandler(mux)
	fmt.Printf("New a fetchbot\n")
	f := fetchbot.New(h)

	/* Start processing */
	fmt.Printf("Start the fetchbot\n")
	q := f.Start()

	// Enqueue the seeds, which are read from the database and recorded in the dup map
	fmt.Printf("Enqueue the seed\n")
	db = setupDB()
	sql_Cmd = sql_Cmd_Select(consumer)
	rows, err_DB := db.Query(sql_Cmd)
	if err_DB != nil {
		fmt.Printf("[ERR] DB select fail\n")
		panicIF(err_DB)
	}
	for rows.Next() {
		rows.Scan(&seed_URL)
		dup_URLs[seed_URL] = true
		_, err := q.SendStringGet(seed_URL)
		if err != nil {
			fmt.Printf("[ERR] GET %s - %s\n", seed_URL, err)
		}
	}

	fmt.Printf("Start fetch Process\n")
	q.Block()
	fmt.Printf("End the process\n")
}
// logHandler prints the fetch information and dispatches the call to the wrapped Handler.
func logHandler(wrapped fetchbot.Handler) fetchbot.Handler {
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		if err == nil {
			fmt.Printf("[%d] %s %s - %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL(), res.Header.Get("Content-Type"))
		}
		wrapped.Handle(ctx, res, err)
	})
}
func main() {
	f := fetchbot.New(fetchbot.HandlerFunc(handler))
	f.AutoClose = true
	f.WorkerIdleTTL = time.Second
	queue := f.Start()
	queue.SendStringHead("http://google.com", "http://golang.org", "http://golang.org/doc")
	queue.Block()
}
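The minimal examples above and below pass a package-level handler to fetchbot.HandlerFunc without showing it. A plausible sketch is below; the name and behavior are assumptions, not part of the original snippets.

// handler is assumed here; it simply logs each response or fetch error.
func handler(ctx *fetchbot.Context, res *http.Response, err error) {
	if err != nil {
		fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
		return
	}
	fmt.Printf("[%d] %s %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL())
}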
func (c *Crawler) newErrorHandler() fetchbot.Handler {
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		c.Logger.WithFields(logrus.Fields{
			"url":    ctx.Cmd.URL(),
			"method": ctx.Cmd.Method(),
		}).Error(err)
	})
}
// stopHandler stops the fetcher if the stopurl is reached. Otherwise it dispatches
// the call to the wrapped Handler.
func stopHandler(stopurl string, wrapped fetchbot.Handler) fetchbot.Handler {
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		if ctx.Cmd.URL().String() == stopurl {
			ctx.Q.Close()
			return
		}
		wrapped.Handle(ctx, res, err)
	})
}
func (c *Crawler) newRequestHandler() fetchbot.Handler {
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		if res != nil {
			context := &Ctx{ctx, c.Cache, c.Logger}
			c.Logger.WithFields(logrus.Fields{
				"method":       context.Method(),
				"status":       res.StatusCode,
				"content_type": res.Header.Get("Content-Type"),
				"depth":        context.Depth(),
			}).Info(context.URL())
		}
		c.mux.Handle(ctx, res, err)
	})
}
func main() {
	var url string
	f := fetchbot.New(fetchbot.HandlerFunc(handler))
	f.DisablePoliteness = true

	db1 = dbutils.NewDB1()
	defer db1.Close()
	db2 = dbutils.NewDB2()
	defer db2.Close()

	rows, _ := db1.Query("select url FROM urls")
	defer rows.Close()
	for rows.Next() {
		rows.Scan(&url)
		queue := f.Start()
		queue.SendStringGet(url)
		queue.Close()
	}
}
// stopHandler stops the fetcher if the stopurl is reached. Otherwise it dispatches
// the call to the wrapped Handler.
func stopHandler(stopurl string, cancel bool, wrapped fetchbot.Handler) fetchbot.Handler {
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		if ctx.Cmd.URL().String() == stopurl {
			fmt.Printf(">>>>> STOP URL %s\n", ctx.Cmd.URL())
			// generally not a good idea to stop/block from a handler goroutine
			// so do it in a separate goroutine
			go func() {
				if cancel {
					ctx.Q.Cancel()
				} else {
					ctx.Q.Close()
				}
			}()
			return
		}
		wrapped.Handle(ctx, res, err)
	})
}
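The cancel flag picks between the queue's two shutdown modes: roughly, Queue.Close lets commands that are already enqueued finish processing, while Queue.Cancel discards pending commands and stops sooner. Check the fetchbot documentation for the exact guarantees of each.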
// ProcessMessage initializes a crawler to parse a specific url and crawls its html looking for images.
func ProcessMessage(q queue.Connection, d db.Connection, msg *queue.Message) {
	log.Printf("type=messageReceived msg=%v\n", msg)

	view, err := d.ViewPage(msg.JobUUID, msg.URL)
	if err != nil {
		log.Printf("type=viewPageError jobUUID=%s url=%s err=%v\n", msg.JobUUID, msg.URL, err)
		return
	}
	if !view {
		log.Printf("type=pageAlreadyViewed jobUUID=%s url=%s\n", msg.JobUUID, msg.URL)
		return
	}

	c := newCrawler(d, q, msg)
	c.fetcher = fetchbot.New(fetchbot.HandlerFunc(c.crawlResponse))
	c.fetcher.HttpClient = httpClient
	c.Crawl()
}
func (c *Crawler) newHTMLHandler(procs ...Processor) fetchbot.Handler {
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		context := &Ctx{ctx, c.Cache, c.Logger}

		doc, err := goquery.NewDocumentFromResponse(res)
		if err != nil {
			c.Logger.WithFields(logrus.Fields{
				"url":    context.URL(),
				"method": context.Method(),
			}).Error(err)
			return
		}

		for _, p := range procs {
			ok := p.Process(context, doc)
			if !ok {
				return
			}
		}
	})
}
func main() {
	flag.Parse()

	// Create the muxer
	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
	}))

	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
	// requests.
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			// Process the body to find the links
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle HEAD requests for html responses coming from the source host - we don't want
	// to crawl links from other hosts.
	mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
			}
		}))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	h := logHandler(mux)
	if *stopAtUrl != "" {
		h = stopHandler(*stopAtUrl, logHandler(mux))
	}
	f := fetchbot.New(h)

	// First mem stat print must be right after creating the fetchbot
	if *memStats > 0 {
		// Print starting stats
		printMemStats(nil)
		// Run at regular intervals
		runMemStats(f, *memStats)
		// On exit, print ending stats after a GC
		defer func() {
			runtime.GC()
			printMemStats(nil)
		}()
	}

	// Start processing
	q := f.Start()
	if *stopAfter > 0 {
		go func() {
			c := time.After(*stopAfter)
			<-c
			q.Close()
		}()
	}

	// Enqueue the seed, which is the first entry in the dup map
	_, err := q.SendStringGet(seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", seed, err)
	}
	q.Block()
}
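printMemStats and runMemStats are not shown in this example or the similar one below. The following is a minimal sketch of what they might look like, reading runtime memory statistics and the latest value from the fetcher's Debug channel; the originals may report more detail.

// printMemStats dumps current runtime memory statistics; di may be nil.
func printMemStats(di *fetchbot.DebugInfo) {
	var mem runtime.MemStats
	runtime.ReadMemStats(&mem)
	fmt.Printf("Alloc: %d Kb, TotalAlloc: %d Kb, NumGC: %d, Goroutines: %d\n",
		mem.Alloc/1024, mem.TotalAlloc/1024, mem.NumGC, runtime.NumGoroutine())
	if di != nil {
		fmt.Printf("NumHosts: %d\n", di.NumHosts)
	}
}

// runMemStats prints memory stats at the given interval, keeping the most recent
// debug info received from the Fetcher.
func runMemStats(f *fetchbot.Fetcher, every time.Duration) {
	var mu sync.Mutex
	var di *fetchbot.DebugInfo
	go func() {
		for v := range f.Debug() {
			mu.Lock()
			di = v
			mu.Unlock()
		}
	}()
	go func() {
		for range time.Tick(every) {
			mu.Lock()
			printMemStats(di)
			mu.Unlock()
		}
	}()
}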
func main() {
	f := fetchbot.New(fetchbot.HandlerFunc(handler))
	queue := f.Start()
	queue.SendStringHead("http://google.com", "http://golang.org", "http://golang.org/doc")
	queue.Close()
}
func main() {
	flag.Parse()

	// Parse the provided seed
	u, err := url.Parse(*seed)
	if err != nil {
		log.Fatal(err)
	}

	// Create the muxer
	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
	}))

	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
	// requests.
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			// Process the body to find the links
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle HEAD requests for html responses coming from the source host - we don't want
	// to crawl links from other hosts.
	mux.Response().Method("HEAD").Host(u.Host).ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
			}
		}))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	h := logHandler(mux)
	if *stopAtURL != "" || *cancelAtURL != "" {
		stopURL := *stopAtURL
		if *cancelAtURL != "" {
			stopURL = *cancelAtURL
		}
		h = stopHandler(stopURL, *cancelAtURL != "", logHandler(mux))
	}
	f := fetchbot.New(h)

	// First mem stat print must be right after creating the fetchbot
	if *memStats > 0 {
		// Print starting stats
		printMemStats(nil)
		// Run at regular intervals
		runMemStats(f, *memStats)
		// On exit, print ending stats after a GC
		defer func() {
			runtime.GC()
			printMemStats(nil)
		}()
	}

	// Start processing
	q := f.Start()

	// if a stop or cancel is requested after some duration, launch the goroutine
	// that will stop or cancel.
	if *stopAfter > 0 || *cancelAfter > 0 {
		after := *stopAfter
		stopFunc := q.Close
		if *cancelAfter != 0 {
			after = *cancelAfter
			stopFunc = q.Cancel
		}
		go func() {
			c := time.After(after)
			<-c
			stopFunc()
		}()
	}

	// Enqueue the seed, which is the first entry in the dup map
	dup[*seed] = true
	_, err = q.SendStringGet(*seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", *seed, err)
	}
	q.Block()
}
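enqueueLinks and the dup map are used by several of these examples but never shown. Below is a minimal sketch, assuming a package-level dup map guarded by a mutex; the actual implementations may differ.

var (
	mu  sync.Mutex
	dup = map[string]bool{}
)

// enqueueLinks resolves every <a href> in the document against the current URL
// and enqueues links that haven't been seen yet as HEAD requests.
func enqueueLinks(ctx *fetchbot.Context, doc *goquery.Document) {
	mu.Lock()
	defer mu.Unlock()
	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		val, _ := s.Attr("href")
		// Resolve the address relative to the page it was found on
		u, err := ctx.Cmd.URL().Parse(val)
		if err != nil {
			fmt.Printf("error: resolve URL %s - %s\n", val, err)
			return
		}
		if !dup[u.String()] {
			if _, err := ctx.Q.SendStringHead(u.String()); err != nil {
				fmt.Printf("error: enqueue head %s - %s\n", u, err)
			} else {
				dup[u.String()] = true
			}
		}
	})
}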
func main() {
	lf, err := os.OpenFile("testlogfile", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatalf("error opening file: %v", err)
	}
	defer lf.Close()

	log.SetOutput(lf)
	log.Println("This is a test log entry")

	seed := "http://github.com/golang/go/wiki"
	u, err := url.Parse(seed)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Scanning: %v\n", u.String())

	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR - HandleErrors] %s\n", err)
	}))

	// Handle GET requests for html responses under the wiki path, to parse the body and
	// enqueue all links as HEAD requests.
	mux.Response().Method("GET").Host(u.Host).Path("/golang/go/wiki").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			log.Printf("GET: %v - %v\n", res.Status, ctx.Cmd.URL())
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[GET] %s %s - %s\n", res.Status, ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle the remaining html responses (the HEAD requests enqueued above) and report
	// error statuses.
	mux.Response().ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			log.Printf("HEAD: %v - %v\n", res.Status, ctx.Cmd.URL())
			if strings.HasPrefix(res.Status, "40") || strings.HasPrefix(res.Status, "50") {
				fmt.Printf("[ERR] - %v - %v\n", res.Status, ctx.Cmd.URL())
			}
		}))

	h := logHandler(mux)
	f := fetchbot.New(h)
	f.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
	f.DisablePoliteness = true

	queue := f.Start()
	dup[seed] = true
	_, err = queue.SendStringGet(seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", seed, err)
	}
	queue.Block()
}