func main() {
    var sql_Cmd string

    runtime.GOMAXPROCS(runtime.NumCPU())
    flag.Parse()

    mux := fetchbot.NewMux()

    fmt.Printf("Register an ErrorHandler\n")
    mux.HandleErrors(fetchbot.HandlerFunc(err_Handler))

    // Handle GET requests for html responses, to parse the body and
    // enqueue all links as HEAD requests.
    fmt.Printf("Register a GetHandler\n")
    mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(get_Handler))

    // Handle HEAD requests for html responses coming from the source host.
    // We don't want to crawl links from other hosts.
    fmt.Printf("Register a HeadHandler\n")
    mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").Handler(fetchbot.HandlerFunc(head_Handler))

    // Create the Fetcher, handle the logging first, then dispatch to the Muxer
    fmt.Printf("Register a LogHandler\n")
    h := logHandler(mux)

    fmt.Printf("Create a new fetchbot\n")
    f := fetchbot.New(h)

    // Start processing
    fmt.Printf("Start the fetchbot\n")
    q := f.Start()

    // Enqueue the seeds, which are loaded from the database and recorded in the dup map
    fmt.Printf("Enqueue the seeds\n")
    db = setupDB()
    sql_Cmd = sql_Cmd_Select(consumer)
    rows, err_DB := db.Query(sql_Cmd)
    if err_DB != nil {
        fmt.Printf("[ERR] DB select fail\n")
        panicIF(err_DB)
    }
    defer rows.Close()
    for rows.Next() {
        if err := rows.Scan(&seed_URL); err != nil {
            fmt.Printf("[ERR] DB scan fail - %s\n", err)
            continue
        }
        dup_URLs[seed_URL] = true
        _, err := q.SendStringGet(seed_URL)
        if err != nil {
            fmt.Printf("[ERR] GET %s - %s\n", seed_URL, err)
        }
    }

    fmt.Printf("Start fetch process\n")
    q.Block()
    fmt.Printf("End the process\n")
}
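// The listing above registers err_Handler, get_Handler and head_Handler without showing
// them. The following is a minimal sketch of what they could look like, mirroring the
// anonymous handlers used in the later listings; the exact bodies are an assumption, not
// the original implementation.
func err_Handler(ctx *fetchbot.Context, res *http.Response, err error) {
    fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
}

func get_Handler(ctx *fetchbot.Context, res *http.Response, err error) {
    // Parse the body and enqueue every link found as a HEAD request
    doc, err := goquery.NewDocumentFromResponse(res)
    if err != nil {
        fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
        return
    }
    enqueueLinks(ctx, doc)
}

func head_Handler(ctx *fetchbot.Context, res *http.Response, err error) {
    // Re-issue the same URL as a GET request so its body gets parsed in turn
    if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
        fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
    }
}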
// New returns an initialized Crawler.
func New() *Crawler {
    return &Crawler{
        CrawlDelay:      DefaultCrawlDelay,
        CrawlPoliteness: DefaultCrawlPoliteness,
        LogFormat:       DefaultLogFormat,
        LogLevel:        DefaultLogLevel,
        UserAgent:       DefaultUserAgent,
        HTTPClient:      DefaultHTTPClient,
        Cache:           DefaultCache,
        mux:             fetchbot.NewMux(),
    }
}
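// New above depends on a Crawler type and a set of Default* values that are not part of
// this listing. The sketch below shows one plausible shape for them, assuming
// time.Duration delays and the standard http.Client; the field types, the Cache
// interface and the default values are assumptions, not the original definitions.
type Cache interface {
    Get(key string) ([]byte, bool)
    Set(key string, value []byte)
}

type Crawler struct {
    CrawlDelay      time.Duration
    CrawlPoliteness bool
    LogFormat       string
    LogLevel        int
    UserAgent       string
    HTTPClient      *http.Client
    Cache           Cache
    mux             *fetchbot.Mux
}

var (
    DefaultCrawlDelay      = 5 * time.Second
    DefaultCrawlPoliteness = true
    DefaultLogFormat       = "text"
    DefaultLogLevel        = 0
    DefaultUserAgent       = "fetchbot (https://github.com/PuerkitoBio/fetchbot)"
    DefaultHTTPClient      = http.DefaultClient
    DefaultCache           Cache // nil by default; set to a concrete Cache implementation
)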
func main() {
    flag.Parse()

    // Create the muxer
    mux := fetchbot.NewMux()

    // Handle all errors the same
    mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
        fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
    }))

    // Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
    // requests.
    mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
        func(ctx *fetchbot.Context, res *http.Response, err error) {
            // Process the body to find the links
            doc, err := goquery.NewDocumentFromResponse(res)
            if err != nil {
                fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
                return
            }
            // Enqueue all links as HEAD requests
            enqueueLinks(ctx, doc)
        }))

    // Handle HEAD requests for html responses coming from the source host - we don't want
    // to crawl links from other hosts.
    mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").Handler(fetchbot.HandlerFunc(
        func(ctx *fetchbot.Context, res *http.Response, err error) {
            if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
                fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
            }
        }))

    // Create the Fetcher, handle the logging first, then dispatch to the Muxer
    h := logHandler(mux)
    if *stopAtUrl != "" {
        h = stopHandler(*stopAtUrl, logHandler(mux))
    }
    f := fetchbot.New(h)

    // First mem stat print must be right after creating the fetchbot
    if *memStats > 0 {
        // Print starting stats
        printMemStats(nil)
        // Run at regular intervals
        runMemStats(f, *memStats)
        // On exit, print ending stats after a GC
        defer func() {
            runtime.GC()
            printMemStats(nil)
        }()
    }

    // Start processing
    q := f.Start()
    if *stopAfter > 0 {
        go func() {
            c := time.After(*stopAfter)
            <-c
            q.Close()
        }()
    }

    // Enqueue the seed, which is the first entry in the dup map
    _, err := q.SendStringGet(seed)
    if err != nil {
        fmt.Printf("[ERR] GET %s - %s\n", seed, err)
    }
    q.Block()
}
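// Every listing wraps the muxer in logHandler before handing it to fetchbot.New, but the
// function itself is not shown. A minimal sketch, assuming it only prints successful
// responses and then delegates to the wrapped handler:
func logHandler(wrapped fetchbot.Handler) fetchbot.Handler {
    return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
        if err == nil {
            fmt.Printf("[%d] %s %s - %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL(),
                res.Header.Get("Content-Type"))
        }
        wrapped.Handle(ctx, res, err)
    })
}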
func main() {
    flag.Parse()

    // Parse the provided seed
    u, err := url.Parse(*seed)
    if err != nil {
        log.Fatal(err)
    }

    // Create the muxer
    mux := fetchbot.NewMux()

    // Handle all errors the same
    mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
        fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
    }))

    // Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
    // requests.
    mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
        func(ctx *fetchbot.Context, res *http.Response, err error) {
            // Process the body to find the links
            doc, err := goquery.NewDocumentFromResponse(res)
            if err != nil {
                fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
                return
            }
            // Enqueue all links as HEAD requests
            enqueueLinks(ctx, doc)
        }))

    // Handle HEAD requests for html responses coming from the source host - we don't want
    // to crawl links from other hosts.
    mux.Response().Method("HEAD").Host(u.Host).ContentType("text/html").Handler(fetchbot.HandlerFunc(
        func(ctx *fetchbot.Context, res *http.Response, err error) {
            if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
                fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
            }
        }))

    // Create the Fetcher, handle the logging first, then dispatch to the Muxer
    h := logHandler(mux)
    if *stopAtURL != "" || *cancelAtURL != "" {
        stopURL := *stopAtURL
        if *cancelAtURL != "" {
            stopURL = *cancelAtURL
        }
        h = stopHandler(stopURL, *cancelAtURL != "", logHandler(mux))
    }
    f := fetchbot.New(h)

    // First mem stat print must be right after creating the fetchbot
    if *memStats > 0 {
        // Print starting stats
        printMemStats(nil)
        // Run at regular intervals
        runMemStats(f, *memStats)
        // On exit, print ending stats after a GC
        defer func() {
            runtime.GC()
            printMemStats(nil)
        }()
    }

    // Start processing
    q := f.Start()

    // If a stop or cancel is requested after some duration, launch the goroutine
    // that will stop or cancel.
    if *stopAfter > 0 || *cancelAfter > 0 {
        after := *stopAfter
        stopFunc := q.Close
        if *cancelAfter != 0 {
            after = *cancelAfter
            stopFunc = q.Cancel
        }
        go func() {
            c := time.After(after)
            <-c
            stopFunc()
        }()
    }

    // Enqueue the seed, which is the first entry in the dup map
    dup[*seed] = true
    _, err = q.SendStringGet(*seed)
    if err != nil {
        fmt.Printf("[ERR] GET %s - %s\n", *seed, err)
    }
    q.Block()
}
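// stopHandler is called above with a stop URL, a cancel flag and the wrapped handler, but
// is not shown. The sketch below is an assumption consistent with those call sites: once
// the stop URL is reached the queue is closed (or cancelled, discarding pending commands),
// from a separate goroutine so the handler itself is not blocked while the queue drains.
func stopHandler(stopurl string, cancel bool, wrapped fetchbot.Handler) fetchbot.Handler {
    return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
        if ctx.Cmd.URL().String() == stopurl {
            fmt.Printf(">>>>> STOP URL %s\n", ctx.Cmd.URL())
            go func() {
                if cancel {
                    ctx.Q.Cancel()
                } else {
                    ctx.Q.Close()
                }
            }()
            return
        }
        wrapped.Handle(ctx, res, err)
    })
}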
func main() {
    lf, err := os.OpenFile("testlogfile", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
    if err != nil {
        log.Fatalf("error opening file: %v", err)
    }
    defer lf.Close()
    log.SetOutput(lf)
    log.Println("This is a test log entry")

    seed := "http://github.com/golang/go/wiki"
    u, err := url.Parse(seed)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("Scanning: %v\n", u.String())

    mux := fetchbot.NewMux()

    // Handle all errors the same
    mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
        fmt.Printf("[ERR - HandleErrors] %s\n", err)
    }))

    // Handle GET requests for html responses on the wiki path, to parse the body and
    // enqueue all links as HEAD requests.
    mux.Response().Method("GET").Host(u.Host).Path("/golang/go/wiki").ContentType("text/html").Handler(fetchbot.HandlerFunc(
        func(ctx *fetchbot.Context, res *http.Response, err error) {
            log.Printf("GET: %v - %v\n", res.Status, ctx.Cmd.URL())
            doc, err := goquery.NewDocumentFromResponse(res)
            if err != nil {
                fmt.Printf("[GET] %s %s - %s\n", res.Status, ctx.Cmd.URL(), err)
                return
            }
            // Enqueue all links as HEAD requests
            enqueueLinks(ctx, doc)
        }))

    // Handle the remaining html responses (the HEAD requests for the enqueued links) and
    // report broken links - we don't want to crawl links from other hosts.
    mux.Response().ContentType("text/html").Handler(fetchbot.HandlerFunc(
        func(ctx *fetchbot.Context, res *http.Response, err error) {
            log.Printf("HEAD: %v - %v\n", res.Status, ctx.Cmd.URL())
            if strings.HasPrefix(res.Status, "40") || strings.HasPrefix(res.Status, "50") {
                fmt.Printf("[ERR] - %v - %v\n", res.Status, ctx.Cmd.URL())
            }
        }))

    h := logHandler(mux)
    f := fetchbot.New(h)
    f.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
    f.DisablePoliteness = true

    queue := f.Start()

    // Enqueue the seed, which is the first entry in the dup map
    dup[seed] = true
    _, err = queue.SendStringGet(seed)
    if err != nil {
        fmt.Printf("[ERR] GET %s - %s\n", seed, err)
    }
    queue.Block()
}
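// The GET handlers call enqueueLinks and several listings consult a dup map, neither of
// which is shown. A minimal sketch, assuming a package-level map guarded by a mutex: each
// href is resolved against the page URL and enqueued as a HEAD request the first time it
// is seen. The variable names (mu, dup) are assumptions.
var (
    mu  sync.Mutex
    dup = map[string]bool{}
)

func enqueueLinks(ctx *fetchbot.Context, doc *goquery.Document) {
    mu.Lock()
    defer mu.Unlock()
    doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
        val, _ := s.Attr("href")
        // Resolve the href relative to the page it was found on
        u, err := ctx.Cmd.URL().Parse(val)
        if err != nil {
            fmt.Printf("error: resolve URL %s - %s\n", val, err)
            return
        }
        if !dup[u.String()] {
            if _, err := ctx.Q.SendStringHead(u.String()); err != nil {
                fmt.Printf("error: enqueue head %s - %s\n", u, err)
            } else {
                dup[u.String()] = true
            }
        }
    })
}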