CrawlBot

CrawlBot is a simple, efficient, and flexible web crawler / spider. It is easy to use out of the box, but also provides extensive flexibility for advanced users.

package main

import (
	"fmt"
	"github.com/phayes/crawlbot"
	"log"
)

func main() {
	crawler := crawlbot.NewCrawler("http://cnn.com", myURLHandler, 4)
	crawler.Start()
	crawler.Wait()
}

func myURLHandler(resp *crawlbot.Response) {
	if resp.Err != nil {
		log.Fatal(resp.Err)
	}

	fmt.Println("Found URL at " + resp.URL)
}

CrawlBot provides extensive customizability for advanced use cases. Please see the documentation for crawlbot.Crawler and crawlbot.Response for more details.

package main

import (
	"fmt"
	"github.com/phayes/crawlbot"
	"log"
)

func main() {
	crawler := crawlbot.Crawler{
		URLs:       []string{"http://example.com", "http://cnn.com", "http://en.wikipedia.org"},
		NumWorkers: 12,
		Handler:    PrintTitle,
		CheckURL:   AllowEverything,
	}
	crawler.Start()
	crawler.Wait()
}

// Print the title of the page
func PrintTitle(resp *crawlbot.Response) {
	if resp.Err != nil {
		log.Println(resp.Err)
	}

	if resp.Doc != nil {
		title, err := resp.Doc.Search("//title")
		if err != nil {
			log.Println(err)
		} else if len(title) > 0 {
			fmt.Printf("Title of %s is %s\n", resp.URL, title[0].Content())
		}
	} else {
		fmt.Println("HTML was not parsed for " + resp.URL)
	}
}

// Crawl everything!
func AllowEverything(crawler *crawlbot.Crawler, url string) bool {
	return true
}
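
As a further illustration of the CheckURL callback, a custom check function can restrict a crawl to a single host. The sketch below assumes the same Crawler fields shown above; the OnlyWikipedia helper and its net/url host check are illustrative and not part of the crawlbot API.

package main

import (
	"fmt"
	"github.com/phayes/crawlbot"
	"log"
	"net/url"
)

func main() {
	crawler := crawlbot.Crawler{
		URLs:       []string{"http://en.wikipedia.org"},
		NumWorkers: 4,
		Handler:    PrintURL,
		CheckURL:   OnlyWikipedia,
	}
	crawler.Start()
	crawler.Wait()
}

// Print each URL as it is crawled
func PrintURL(resp *crawlbot.Response) {
	if resp.Err != nil {
		log.Println(resp.Err)
		return
	}
	fmt.Println("Crawled " + resp.URL)
}

// Only follow links that stay on en.wikipedia.org (illustrative helper, not part of crawlbot)
func OnlyWikipedia(crawler *crawlbot.Crawler, rawurl string) bool {
	parsed, err := url.Parse(rawurl)
	if err != nil {
		return false
	}
	return parsed.Host == "en.wikipedia.org"
}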
