Пример #1
func main() {
	// Spider input:
	//  PageProcesser ;
	//  Task name used in Pipeline for record;
	f, err := os.Open("./formated_bookmark")
	if err != nil {
		panic("f open error")
	}
	defer f.Close()

	sp := spider.NewSpider(NewMyPageProcesser(), "BookMarkSearch")

	// Collect url -> "tag||title" pairs from the bookmark file.
	// Each line is expected to look like "tag||url||title".
	// bufio.Scanner (unlike the previous ReadString loop) also yields a
	// final line that lacks a trailing '\n', and strips the newline itself.
	urls := make(map[string]string)
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		data := strings.Split(sc.Text(), "||")
		if len(data) < 3 {
			// Skip blank or malformed lines instead of panicking on
			// an out-of-range index.
			continue
		}
		url := data[1]
		urltag := data[0] + "||" + data[2]
		urls[url] = urltag
	}
	if err := sc.Err(); err != nil {
		// A genuine read error must not be silently treated as EOF.
		panic("f read error")
	}

	sp.AddUrls(urls, "html")

	// sp.AddPipeline(pipeline.NewPipelineConsole()).
	sp.AddPipeline(pipeline.NewPipelineFile("./crawler_result.dat")).
		SetThreadnum(3). // Crawl requests with three goroutines.
		Run()
}
Пример #2
func main() {
	const startUrl = "http://baike.baidu.com/subview/412610/19548276.htm"

	// Assemble the spider step by step: plant page processor, queue
	// scheduler, and a file pipeline that records results.
	sp := spider.NewSpider(plant.NewPlantProcesser(), "TaskName")
	sp.SetScheduler(scheduler.NewQueueScheduler(true))
	sp.AddPipeline(pipeline.NewPipelineFile("result.txt"))

	sp.AddUrl(startUrl, "html")
	sp.Run()
}
Пример #3
func main() {
	// Spider input:
	//  PageProcesser ;
	//  task name used in Pipeline for record;
	sp := spider.NewSpider(NewMyPageProcesser(), "sina_stock_news")

	// Start url; "json" is the response type ("html", "json", "jsonp" or "text").
	sp.AddUrl("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=63621&pagesize=10&dire=f", "json")

	sp.AddPipeline(pipeline.NewPipelineConsole())             // print results to std output
	sp.AddPipeline(pipeline.NewPipelineFile("/tmp/sinafile")) // print results to a file

	// Errors and other useful spider info are logged to files under the
	// given directory (default-style path like "WD/log/log.2014-9-1").
	sp.OpenFileLog("/tmp")

	// Random sleep between 1s and 3s between requests.
	sp.SetSleepTime("rand", 1000, 3000)

	sp.Run()
	//AddPipeline(pipeline.NewPipelineFile("/tmp/tmpfile")). // print result in file
}
func main() {

	pageProcess := NewMyPageProcesser()
	pageProcess.Init()

	// Build the spider: start url, console + file pipelines, and the
	// worker count.
	diandianSpider := spider.NewSpider(pageProcess, "TaskName").
		AddUrl("http://shinichr.diandian.com/", "html"). // Start url, html is the response type ("html" or "json" or "jsonp" or "text")
		AddPipeline(pipeline.NewPipelineConsole()).      // Print result on screen
		AddPipeline(pipeline.NewPipelineFile("/home/shinichr/spider.tmp")).
		SetThreadnum(10000) // NOTE(review): 10000 concurrent workers, not three as the old comment claimed — confirm this is intentional

	// Queue pages 2..9 in addition to the start url.
	for i := 2; i < 10; i++ {
		url := fmt.Sprintf("http://shinichr.diandian.com/page/%d", i)
		diandianSpider.AddUrl(url, "html")
	}

	diandianSpider.Run()

	// Report every url the processor recorded in visit_url.
	// `for url := range` is the idiomatic form; `for url, _ := range`
	// with a blank second variable is flagged by linters.
	for url := range pageProcess.visit_url {
		fmt.Println("spider:", url)
	}

}