func main() { start_url := "http://www.jiexieyin.org" thread_num := uint(16) redisAddr := "127.0.0.1:6379" redisMaxConn := 10 redisMaxIdle := 10 proc := &MyProcessor{} sp := robot.NewSpider(proc, "redis_scheduler_example"). //SetSleepTime("fixed", 6000, 6000). //SetScheduler(scheduler.NewQueueScheduler(true)). SetScheduler(robot.NewRedisScheduler(redisAddr, redisMaxConn, redisMaxIdle, true)). AddPipeline(robot.NewPipelineConsole()). SetThreadnum(thread_num) init := false for _, arg := range os.Args { if arg == "--init" { init = true break } } if init { sp.AddUrl(start_url, "html") mlog.LogInst().LogInfo("重新开始爬") } else { mlog.LogInst().LogInfo("继续爬") } sp.Run() }
func main() {
	start_url := utils.BaseUrl

	// Load the configuration file.
	settings := utils.LoadConf("conf/spider.conf")
	// Read the database connection info.
	dbinfo, ok := settings["DBINFO"]
	if !ok {
		log.Fatalf("please insert dbinfo in spider.conf")
	}

	// Initialize the spider.
	options := robot.SpiderOptions{
		TaskName:      "79xs",
		PageProcesser: process.NewWww79xsComProcessor(),
		Downloader:    downloader.NewHttpDownloader("text/html; charset=gb2312"),
		Scheduler:     scheduler.NewQueueScheduler(false),
		Pipelines:     []robot.Pipeline{pipeline.NewPipelineMySQL(dbinfo)},
		// Resource manager with a pool capacity of 100.
		ResourceManage: resource.NewSpidersPool(100, nil),
	}

	sp := robot.NewSpider(options)
	// Seed the root URL.
	sp.AddRequest(utils.InitRequest(start_url, nil, nil))

	go sp.Run()
	<-utils.Stop
	sp.Close()
}
func main() {
	// Spider input:
	//   PageProcesser
	//   Task name, used in the Pipeline for record keeping
	robot.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrl("https://github.com/hu17889?tab=repositories", "html"). // Start URL; "html" is the response type ("html", "json", "jsonp" or "text")
		AddPipeline(robot.NewPipelineConsole()).                       // Print results to the screen
		SetThreadnum(3).                                               // Crawl requests with three goroutines
		Run()
}
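Each of these examples hands a page processor to robot.NewSpider. Below is a minimal sketch of what such a processor might look like; it assumes the interface shape of the upstream go_spider project (a Process method receiving the downloaded page plus a Finish hook) and a goquery-backed HTML parser, so the type name MyPageProcesser, the *robot.Page parameter, and the IsSucc/GetHtmlParser/AddField calls are illustrative assumptions, not this repo's confirmed API.

// Illustrative sketch only: method signatures assumed from go_spider's
// PageProcesser interface; adjust to the actual robot package API.
type MyPageProcesser struct{}

func NewMyPageProcesser() *MyPageProcesser {
	return &MyPageProcesser{}
}

// Process is invoked once per downloaded page: extract fields here and,
// if needed, queue follow-up URLs for the scheduler.
func (m *MyPageProcesser) Process(p *robot.Page) {
	if !p.IsSucc() { // skip pages that failed to download
		return
	}
	query := p.GetHtmlParser() // goquery document for "html" responses
	p.AddField("title", query.Find("title").Text())
}

// Finish runs once after the crawl ends (flush buffers, close handles, etc.).
func (m *MyPageProcesser) Finish() {}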
func main() {
	// Spider input:
	//   PageProcesser
	//   Task name, used in the Pipeline for record keeping
	req_url := "http://weixin.sogou.com/weixin?query=%E4%BA%91%E6%B5%AE&type=1&page=1&ie=utf8"
	robot.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrlWithHeaderFile(req_url, "html", "weixin.sogou.com.json"). // Start URL with request headers loaded from a JSON file; "html" is the response type ("html", "json", "jsonp" or "text")
		AddPipeline(robot.NewPipelineConsole()).                        // Print results to the screen
		SetThreadnum(3).                                                // Crawl requests with three goroutines
		Run()
}
func main() {
	// Spider input:
	//   PageProcesser
	//   Task name, used in the Pipeline for record keeping
	robot.NewSpider(NewMyPageProcesser(), "sina_stock_news").
		AddUrl("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=63621&pagesize=10&dire=f", "json"). // Start URL; "json" is the response type ("html", "json", "jsonp" or "text")
		AddPipeline(robot.NewPipelineConsole()).          // Print results to standard output
		AddPipeline(robot.NewPipelineFile("./sinafile")). // Write results to a file
		OpenFileLog("./").                                // Errors and other useful spider info are logged to a file under the default path, e.g. "WD/log/log.2014-9-1"
		SetSleepTime("rand", 1000, 3000).                 // Sleep between 1s and 3s between requests
		Run()
	//AddPipeline(pipeline.NewPipelineFile("/tmp/tmpfile")). // Write results to a file
}
func main() {
	req := robot.NewRequest(wkSohuUrl, "html", "index", "GET", "", nil, nil, nil, nil)
	sohuSpider := robot.NewSpider(NewMyPageProcesser(), "Sohu").
		AddRequest(req).
		SetSleepTime("rand", 500, 1000).
		SetThreadnum(2)

	for i := 1; i < maxWKSouhuLayer; i++ {
		url := fmt.Sprintf("http://yule.sohu.com/gossip/index_%d.shtml", 5301-i) // magic number
		req := robot.NewRequest(url, "html", "index", "GET", "", nil, nil, nil, nil)
		sohuSpider.AddRequest(req)
	}

	sohuSpider.Run()
}
func main() { start_url := "http://www.jiexieyin.org" redisAddr := "127.0.0.1:6379" redisMaxConn := 10 redisMaxIdle := 10 mongoUrl := "localhost:27017" mongoDB := "test" mongoCollection := "test" scheduleroptions := scheduler.RedisSchedulerOptions{ RequestList: "mgospider_requests", UrlList: "mgospider_urls", RedisAddr: redisAddr, MaxConn: redisMaxConn, MaxIdle: redisMaxIdle, ForbiddenDuplicateUrl: false, } //爬虫初始化 options := robot.SpiderOptions{ TaskName: "mgospider", PageProcesser: NewMyProcesser(), Downloader: downloader.NewHttpDownloader("text/html; charset=gb2312"), Scheduler: scheduler.NewRedisScheduler(scheduleroptions), Pipelines: []robot.Pipeline{NewPipelineMongo(mongoUrl, mongoDB, mongoCollection)}, //设置资源管理器,资源池容量为10 ResourceManage: resource.NewSpidersPool(10, nil), } sp := robot.NewSpider(options) init := false for _, arg := range os.Args { if arg == "--init" { init = true break } } if init { sp.AddRequest(initrequest(start_url)) mlog.LogInst().LogInfo("重新开始爬") } else { mlog.LogInst().LogInfo("继续爬") } sp.Run() }
func main() { spidername := "79xs" start_url := utils.BaseUrl //加载配置文件 settings := utils.LoadConf("conf/spider.conf") //获取数据库连接信息 dbinfo, ok := settings["DBINFO"] if !ok { log.Fatalf("please insert dbinfo in spider.conf") } //爬虫初始化 options := robot.SpiderOptions{ TaskName: spidername, PageProcesser: process.NewWww79xsComProcessor(), Downloader: downloader.NewHttpDownloader("text/html; charset=gb2312"), Scheduler: scheduler.NewMysqlScheduler(spidername, dbinfo), //Scheduler: scheduler.NewQueueScheduler(false), Pipelines: []robot.Pipeline{pipeline.NewPipelineMySQL(dbinfo)}, //设置资源管理器,资源池容量为100 ResourceManage: resource.NewSpidersPool(100, nil), } sp := robot.NewSpider(options) init := false for _, arg := range os.Args { if arg == "--init" { init = true break } } if init { //增加根url sp.AddRequest(utils.InitRequest(start_url, map[string]string{ "handler": "mainParse", })) log.Println("重新开始爬") } else { log.Println("继续爬") } go sp.Run() <-utils.Stop sp.Close() }