func main() { // Spider input: // PageProcesser ; // Task name used in Pipeline for record; f, err := os.Open("./formated_bookmark") if err != nil { panic("f open error") } defer f.Close() sp := spider.NewSpider(NewMyPageProcesser(), "BookMarkSearch") br := bufio.NewReader(f) urls := make(map[string]string) line, err := br.ReadString('\n') for ; err == nil; line, err = br.ReadString('\n') { data := strings.Split(line, "||") url := data[1] urltag := data[0] + "||" + strings.Trim(data[2], "\n") urls[url] = urltag } sp.AddUrls(urls, "html") // sp.AddPipeline(pipeline.NewPipelineConsole()). sp.AddPipeline(pipeline.NewPipelineFile("./crawler_result.dat")). SetThreadnum(3). // Crawl request by three Coroutines Run() }
func main() { var startUrl = "http://baike.baidu.com/subview/412610/19548276.htm" sp := spider.NewSpider(plant.NewPlantProcesser(), "TaskName"). SetScheduler(scheduler.NewQueueScheduler(true)). AddPipeline(pipeline.NewPipelineFile("result.txt")) sp.AddUrl(startUrl, "html") sp.Run() }
func main() { // spider input: // PageProcesser ; // task name used in Pipeline for record; spider.NewSpider(NewMyPageProcesser(), "sina_stock_news"). AddUrl("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=63621&pagesize=10&dire=f", "json"). // start url, html is the responce type ("html" or "json" or "jsonp" or "text") AddPipeline(pipeline.NewPipelineConsole()). // Print result to std output AddPipeline(pipeline.NewPipelineFile("/tmp/sinafile")). // Print result in file OpenFileLog("/tmp"). // Error info or other useful info in spider will be logged in file of defalt path like "WD/log/log.2014-9-1". SetSleepTime("rand", 1000, 3000). // Sleep time between 1s and 3s. Run() //AddPipeline(pipeline.NewPipelineFile("/tmp/tmpfile")). // print result in file }
func main() { pageProcess := NewMyPageProcesser() pageProcess.Init() diandianSpider := spider.NewSpider(pageProcess, "TaskName"). AddUrl("http://shinichr.diandian.com/", "html"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text") AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen AddPipeline(pipeline.NewPipelineFile("/home/shinichr/spider.tmp")). SetThreadnum(10000) // Crawl request by three Coroutines for i := 2; i < 10; i++ { url := fmt.Sprintf("http://shinichr.diandian.com/page/%d", i) diandianSpider.AddUrl(url, "html") } diandianSpider.Run() for url, _ := range pageProcess.visit_url { fmt.Println("spider:", url) } }