func main() { start_url := utils.BaseUrl //加载配置文件 settings := utils.LoadConf("conf/spider.conf") //获取数据库连接信息 dbinfo, ok := settings["DBINFO"] if !ok { log.Fatalf("please insert dbinfo in spider.conf") } //爬虫初始化 options := robot.SpiderOptions{ TaskName: "79xs", PageProcesser: process.NewWww79xsComProcessor(), Downloader: downloader.NewHttpDownloader("text/html; charset=gb2312"), Scheduler: scheduler.NewQueueScheduler(false), Pipelines: []robot.Pipeline{pipeline.NewPipelineMySQL(dbinfo)}, //设置资源管理器,资源池容量为100 ResourceManage: resource.NewSpidersPool(100, nil), } sp := robot.NewSpider(options) //增加根url sp.AddRequest(utils.InitRequest(start_url, nil, nil)) go sp.Run() <-utils.Stop sp.Close() }
func main() { spidername := "79xs" start_url := utils.BaseUrl //加载配置文件 settings := utils.LoadConf("conf/spider.conf") //获取数据库连接信息 dbinfo, ok := settings["DBINFO"] if !ok { log.Fatalf("please insert dbinfo in spider.conf") } //爬虫初始化 options := robot.SpiderOptions{ TaskName: spidername, PageProcesser: process.NewWww79xsComProcessor(), Downloader: downloader.NewHttpDownloader("text/html; charset=gb2312"), Scheduler: scheduler.NewMysqlScheduler(spidername, dbinfo), //Scheduler: scheduler.NewQueueScheduler(false), Pipelines: []robot.Pipeline{pipeline.NewPipelineMySQL(dbinfo)}, //设置资源管理器,资源池容量为100 ResourceManage: resource.NewSpidersPool(100, nil), } sp := robot.NewSpider(options) init := false for _, arg := range os.Args { if arg == "--init" { init = true break } } if init { //增加根url sp.AddRequest(utils.InitRequest(start_url, map[string]string{ "handler": "mainParse", })) log.Println("重新开始爬") } else { log.Println("继续爬") } go sp.Run() <-utils.Stop sp.Close() }