Beispiel #1
0
func main() {
	var sql_Cmd string

	runtime.GOMAXPROCS(runtime.NumCPU())

	flag.Parse()
	mux := fetchbot.NewMux()

	fmt.Printf("Register a ErrorHandler\n")
	mux.HandleErrors(fetchbot.HandlerFunc(err_Handler))

	// 1. Handle GET requests for html responses
	// 2. to parse the body and
	// 3. enqueue all links as HEAD requests.
	fmt.Printf("Register a GetHandler\n")
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(get_Handler))

	// 1. Handle HEAD requests for html responses coming from the source host.
	// 2. We don't want to crawl links from other hosts.
	fmt.Printf("Register a HeadHandler\n")
	mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").Handler(fetchbot.HandlerFunc(head_Handler))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	fmt.Printf("Register a LogHandler\n")
	h := logHandler(mux)

	fmt.Printf("New a fetchbot\n")
	f := fetchbot.New(h)

	/* Start processing */
	fmt.Printf("Start the fetchbot\n")
	q := f.Start()

	// Enqueue the seed, which is the first entry in the dup map
	fmt.Printf("Enqueue the seed\n")
	db = setupDB()

	sql_Cmd = sql_Cmd_Select(consumer)
	rows, err_DB := db.Query(sql_Cmd)
	if err_DB != nil {
		fmt.Printf("[ERR]DB select fail\n")
		panicIF(err_DB)
	}

	for rows.Next() {
		rows.Scan(&seed_URL)
		dup_URLs[seed_URL] = true

		_, err := q.SendStringGet(seed_URL)
		if err != nil {
			fmt.Printf("[ERR] GET %s - %s\n", seed_URL, err)
		}
	}

	fmt.Printf("Start fetch Process\n")
	q.Block()
	fmt.Printf("End the process\n")
}
Beispiel #2
0
// New returns an initialized Crawler.
func New() *Crawler {
	return &Crawler{
		CrawlDelay:      DefaultCrawlDelay,
		CrawlPoliteness: DefaultCrawlPoliteness,
		LogFormat:       DefaultLogFormat,
		LogLevel:        DefaultLogLevel,
		UserAgent:       DefaultUserAgent,
		HTTPClient:      DefaultHTTPClient,
		Cache:           DefaultCache,
		mux:             fetchbot.NewMux(),
	}
}
Beispiel #3
0
func main() {
	flag.Parse()

	// Create the muxer
	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
	}))

	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
	// requests.
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			// Process the body to find the links
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle HEAD requests for html responses coming from the source host - we don't want
	// to crawl links from other hosts.
	mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
			}
		}))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	h := logHandler(mux)
	if *stopAtUrl != "" {
		h = stopHandler(*stopAtUrl, logHandler(mux))
	}
	f := fetchbot.New(h)
	// First mem stat print must be right after creating the fetchbot
	if *memStats > 0 {
		// Print starting stats
		printMemStats(nil)
		// Run at regular intervals
		runMemStats(f, *memStats)
		// On exit, print ending stats after a GC
		defer func() {
			runtime.GC()
			printMemStats(nil)
		}()
	}
	// Start processing
	q := f.Start()
	if *stopAfter > 0 {
		go func() {
			c := time.After(*stopAfter)
			<-c
			q.Close()
		}()
	}
	// Enqueue the seed, which is the first entry in the dup map
	_, err := q.SendStringGet(seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", seed, err)
	}
	q.Block()
}
func main() {
	flag.Parse()

	// Parse the provided seed
	u, err := url.Parse(*seed)
	if err != nil {
		log.Fatal(err)
	}

	// Create the muxer
	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
	}))

	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
	// requests.
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			// Process the body to find the links
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle HEAD requests for html responses coming from the source host - we don't want
	// to crawl links from other hosts.
	mux.Response().Method("HEAD").Host(u.Host).ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
			}
		}))

	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	h := logHandler(mux)
	if *stopAtURL != "" || *cancelAtURL != "" {
		stopURL := *stopAtURL
		if *cancelAtURL != "" {
			stopURL = *cancelAtURL
		}
		h = stopHandler(stopURL, *cancelAtURL != "", logHandler(mux))
	}
	f := fetchbot.New(h)

	// First mem stat print must be right after creating the fetchbot
	if *memStats > 0 {
		// Print starting stats
		printMemStats(nil)
		// Run at regular intervals
		runMemStats(f, *memStats)
		// On exit, print ending stats after a GC
		defer func() {
			runtime.GC()
			printMemStats(nil)
		}()
	}

	// Start processing
	q := f.Start()

	// if a stop or cancel is requested after some duration, launch the goroutine
	// that will stop or cancel.
	if *stopAfter > 0 || *cancelAfter > 0 {
		after := *stopAfter
		stopFunc := q.Close
		if *cancelAfter != 0 {
			after = *cancelAfter
			stopFunc = q.Cancel
		}

		go func() {
			c := time.After(after)
			<-c
			stopFunc()
		}()
	}

	// Enqueue the seed, which is the first entry in the dup map
	dup[*seed] = true
	_, err = q.SendStringGet(*seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", *seed, err)
	}
	q.Block()
}
func main() {
	lf, err := os.OpenFile("testlogfile", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatalf("error opening file: %v", err)
	}
	defer lf.Close()

	log.SetOutput(lf)
	log.Println("This is a test log entry")
	seed := "http://github.com/golang/go/wiki"

	u, err := url.Parse(seed)
	fmt.Printf("Scanning: %v\n", u.String())
	if err != nil {
		log.Fatal(err)
	}
	mux := fetchbot.NewMux()

	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		fmt.Printf("[ERR - HandleErrors] %s\n", err)
	}))

	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
	// requests.
	mux.Response().Method("GET").Host(u.Host).Path("/golang/go/wiki").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			log.Printf("GET: %v - %v\n", res.Status, ctx.Cmd.URL())
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				fmt.Printf("[GET] %s %s - %s\n", res.Status, ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))

	// Handle GET requests for html responses coming from the source host - we don't want
	// to crawl links from other hosts.
	mux.Response().ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			log.Printf("HEAD: %v -  %v\n", res.Status, ctx.Cmd.URL())
			if strings.HasPrefix(res.Status, "40") || strings.HasPrefix(res.Status, "50") {
				fmt.Printf("[ERR] - %v - %v\n", res.Status, ctx.Cmd.URL())
			}
		}))
	h := logHandler(mux)
	f := fetchbot.New(h)
	f.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
	f.DisablePoliteness = true

	queue := f.Start()
	queue.SendStringHead()

	dup[seed] = true
	_, err = queue.SendStringGet(seed)
	if err != nil {
		fmt.Printf("[ERR] GET %s - %s\n", seed, err)
	}
	queue.Block()

}