// POST example: log in to a site by sending a form-encoded body.
package main

import (
	"net/http"
	"net/url"

	"github.com/hu17889/go_spider/core/common/request"
	"github.com/hu17889/go_spider/core/pipeline"
	"github.com/hu17889/go_spider/core/spider"
)

func main() {
	// POST data
	post_arg := url.Values{
		"name": {"admin"},
		"pwd":  {"admin"},
	}

	// http header
	header := make(http.Header)
	header.Set("Content-Type", "application/x-www-form-urlencoded")

	// Spider input:
	//   a PageProcesser;
	//   a task name, used by the Pipeline for record keeping.
	// AddRequest params:
	//   1. Url.
	//   2. Response type: "html", "json", "jsonp" or "text".
	//   3. The urltag names the url so different urls can be told apart in the PageProcesser and the Pipeline.
	//   4. The method: POST or GET.
	//   5. The postdata is the body string sent to the server.
	//   6. The header for the http request.
	//   7. Cookies.
	//   8. The http redirect check function.
	//   9. Meta data attached to the request (nil here).
	req := request.NewRequest("http://backadmin.hucong.net/main/user/login",
		"html", "site_login", "POST", post_arg.Encode(), header, nil, myRedirect, nil)

	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddRequest(req).
		AddPipeline(pipeline.NewPipelineConsole()). // Print results on screen
		SetThreadnum(3).                            // Crawl requests with three goroutines
		Run()
}
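// The login example above passes a redirect-check callback, myRedirect, that
// is not defined in the snippet. A minimal sketch follows, assuming the
// checkRedirect parameter uses the same signature as net/http's
// Client.CheckRedirect; the three-redirect limit is illustrative, not the
// project's own implementation.

import (
	"errors"
	"net/http"
)

// myRedirect decides whether an http redirect should be followed.
func myRedirect(req *http.Request, via []*http.Request) error {
	// Refuse long redirect chains; returning an error stops the client.
	if len(via) >= 3 {
		return errors.New("stopped after 3 redirects")
	}
	return nil
}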
// Redis scheduler example: the crawl queue lives in Redis, so a crawl can be
// stopped and resumed, or shared between processes. Pass --init to seed the
// queue with the start url; without it the spider resumes the saved state.
package main

import (
	"os"

	"github.com/hu17889/go_spider/core/common/mlog"
	"github.com/hu17889/go_spider/core/pipeline"
	"github.com/hu17889/go_spider/core/scheduler"
	"github.com/hu17889/go_spider/core/spider"
)

func main() {
	start_url := "http://www.jiexieyin.org"
	thread_num := uint(16)

	redisAddr := "127.0.0.1:6379"
	redisMaxConn := 10
	redisMaxIdle := 10

	proc := &MyProcessor{}
	sp := spider.NewSpider(proc, "redis_scheduler_example").
		//SetSleepTime("fixed", 6000, 6000).
		//SetScheduler(scheduler.NewQueueScheduler(true)).
		SetScheduler(scheduler.NewRedisScheduler(redisAddr, redisMaxConn, redisMaxIdle, true)).
		AddPipeline(pipeline.NewPipelineConsole()).
		SetThreadnum(thread_num)

	init := false
	for _, arg := range os.Args {
		if arg == "--init" {
			init = true
			break
		}
	}
	if init {
		sp.AddUrl(start_url, "html")
		mlog.LogInst().LogInfo("starting a fresh crawl")
	} else {
		mlog.LogInst().LogInfo("resuming the previous crawl")
	}
	sp.Run()
}
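// MyProcessor is referenced above but not defined. A minimal sketch of what
// it might look like, assuming the PageProcesser interface (Process and
// Finish) and the page helpers used in the project's examples (IsSucc,
// Errormsg, GetHtmlParser, AddField, AddTargetRequests); the css selectors
// are illustrative only.

import (
	"github.com/PuerkitoBio/goquery"
	"github.com/hu17889/go_spider/core/common/page"
)

type MyProcessor struct{}

// Process handles one downloaded page: it records the page title and feeds
// every link found back to the scheduler, so the Redis queue keeps filling.
func (this *MyProcessor) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}
	query := p.GetHtmlParser() // a *goquery.Document for the page body
	p.AddField("title", query.Find("title").Text())

	var urls []string
	query.Find("a").Each(func(i int, s *goquery.Selection) {
		if href, ok := s.Attr("href"); ok {
			urls = append(urls, href)
		}
	})
	p.AddTargetRequests(urls, "html") // queue the discovered links
}

// Finish runs once when the spider shuts down.
func (this *MyProcessor) Finish() {}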
// Minimal example: one start url, the console pipeline, three goroutines.
package main

import (
	"github.com/hu17889/go_spider/core/pipeline"
	"github.com/hu17889/go_spider/core/spider"
)

func main() {
	// Spider input:
	//   a PageProcesser;
	//   a task name, used by the Pipeline for record keeping.
	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrl("https://github.com/jixiuf?tab=repositories", "html"). // Start url; "html" is the response type ("html", "json", "jsonp" or "text")
		AddPipeline(pipeline.NewPipelineConsole()).                   // Print results on screen
		SetThreadnum(3).                                              // Crawl requests with three goroutines
		Run()
}
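// NewMyPageProcesser is the constructor the examples rely on but do not show.
// A minimal sketch for this GitHub repositories page, assuming the same page
// helpers as above; the css selector is a hypothetical placeholder, not the
// project's own code.

import (
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/hu17889/go_spider/core/common/page"
)

type MyPageProcesser struct{}

func NewMyPageProcesser() *MyPageProcesser {
	return &MyPageProcesser{}
}

// Process extracts each repository name on the page into a result field,
// which the console pipeline then prints.
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}
	query := p.GetHtmlParser()
	query.Find("a[itemprop='name codeRepository']").Each(func(i int, s *goquery.Selection) {
		p.AddField("repo"+strconv.Itoa(i), strings.TrimSpace(s.Text()))
	})
}

func (this *MyPageProcesser) Finish() {}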
// Header-file example: the request headers for the start url are loaded from
// a JSON file.
package main

import (
	"github.com/hu17889/go_spider/core/pipeline"
	"github.com/hu17889/go_spider/core/spider"
)

func main() {
	// Spider input:
	//   a PageProcesser;
	//   a task name, used by the Pipeline for record keeping.
	req_url := "http://weixin.sogou.com/weixin?query=%E4%BA%91%E6%B5%AE&type=1&page=1&ie=utf8"
	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrlWithHeaderFile(req_url, "html", "weixin.sogou.com.json"). // Start url; "html" is the response type ("html", "json", "jsonp" or "text")
		AddPipeline(pipeline.NewPipelineConsole()).                     // Print results on screen
		SetThreadnum(3).                                                // Crawl requests with three goroutines
		Run()
}
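// AddUrlWithHeaderFile reads the headers for the request from a JSON file
// ("weixin.sogou.com.json" above). The exact on-disk format is not shown in
// this snippet; the sketch below assumes it is simply a JSON-serialized
// http.Header (a map from header name to a list of values) and generates such
// a file. Header names and values are illustrative.
package main

import (
	"encoding/json"
	"net/http"
	"os"
)

func main() {
	header := make(http.Header)
	header.Set("User-Agent", "Mozilla/5.0")
	header.Set("Referer", "http://weixin.sogou.com/")

	f, err := os.Create("weixin.sogou.com.json")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	// Produces e.g. {"Referer":["http://weixin.sogou.com/"],"User-Agent":["Mozilla/5.0"]}
	if err := json.NewEncoder(f).Encode(header); err != nil {
		panic(err)
	}
}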
// JSON example: poll a live-news API, print results to stdout and to a file,
// and sleep a random interval between requests.
package main

import (
	"github.com/hu17889/go_spider/core/pipeline"
	"github.com/hu17889/go_spider/core/spider"
)

func main() {
	// Spider input:
	//   a PageProcesser;
	//   a task name, used by the Pipeline for record keeping.
	spider.NewSpider(NewMyPageProcesser(), "sina_stock_news").
		AddUrl("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=63621&pagesize=10&dire=f", "json"). // Start url; "json" is the response type ("html", "json", "jsonp" or "text")
		AddPipeline(pipeline.NewPipelineConsole()).             // Print results to standard output
		AddPipeline(pipeline.NewPipelineFile("/tmp/sinafile")). // Write results to a file
		OpenFileLog("/tmp").                                    // Errors and other useful spider info are logged to a file under the default path, e.g. "WD/log/log.2014-9-1"
		SetSleepTime("rand", 1000, 3000).                       // Sleep between 1s and 3s between requests
		Run()
	//AddPipeline(pipeline.NewPipelineFile("/tmp/tmpfile")). // Write results to a file
}
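// With the "json" response type the processor receives a raw JSON body rather
// than a parsed html document. Below is a standalone sketch of decoding such
// a body with the standard library; the body and its field names ("result",
// "data", "rich_text") are hypothetical stand-ins, not the documented shape
// of the Sina API.
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	body := `{"result":{"data":[{"rich_text":"example news item"}]}}`

	var doc map[string]interface{}
	if err := json.Unmarshal([]byte(body), &doc); err != nil {
		fmt.Println("decode error:", err)
		return
	}
	// Walk result.data and print each item's rich_text field.
	if result, ok := doc["result"].(map[string]interface{}); ok {
		if data, ok := result["data"].([]interface{}); ok {
			for _, item := range data {
				if m, ok := item.(map[string]interface{}); ok {
					fmt.Println(m["rich_text"])
				}
			}
		}
	}
}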