Esempio n. 1
0
// main wires up the "79xs" spider with a queue scheduler and a MySQL
// pipeline, seeds it with the root URL, and keeps it running until a value
// arrives on utils.Stop.
func main() {
	// Capture the root URL up front, before the configuration is loaded.
	rootURL := utils.BaseUrl

	// Load the configuration file and look up the database connection info.
	conf := utils.LoadConf("conf/spider.conf")
	dbInfo, found := conf["DBINFO"]
	if !found {
		log.Fatal("please insert dbinfo in spider.conf")
	}

	// Build the spider; components are constructed in field order.
	spider := robot.NewSpider(robot.SpiderOptions{
		TaskName:      "79xs",
		PageProcesser: process.NewWww79xsComProcessor(),
		Downloader:    downloader.NewHttpDownloader("text/html; charset=gb2312"),
		Scheduler:     scheduler.NewQueueScheduler(false),
		Pipelines:     []robot.Pipeline{pipeline.NewPipelineMySQL(dbInfo)},
		// Resource manager backed by a pool with capacity 100.
		ResourceManage: resource.NewSpidersPool(100, nil),
	})

	// Seed the crawl with the root URL.
	seed := utils.InitRequest(rootURL, nil, nil)
	spider.AddRequest(seed)

	// Crawl in the background; block here until utils.Stop yields, then
	// close the spider.
	go spider.Run()
	<-utils.Stop
	spider.Close()
}
Esempio n. 2
0
// main wires up the "79xs" spider with a MySQL-backed scheduler and a MySQL
// pipeline. When "--init" appears anywhere in os.Args the root URL is queued
// again; otherwise no request is added here and the spider is simply started.
// It then runs until a value arrives on utils.Stop.
func main() {
	const taskName = "79xs"

	// Capture the root URL up front, before the configuration is loaded.
	rootURL := utils.BaseUrl

	// Load the configuration file and look up the database connection info.
	conf := utils.LoadConf("conf/spider.conf")
	dbInfo, found := conf["DBINFO"]
	if !found {
		log.Fatal("please insert dbinfo in spider.conf")
	}

	// Build the spider; components are constructed in field order.
	spider := robot.NewSpider(robot.SpiderOptions{
		TaskName:      taskName,
		PageProcesser: process.NewWww79xsComProcessor(),
		Downloader:    downloader.NewHttpDownloader("text/html; charset=gb2312"),
		// Alternative scheduler: scheduler.NewQueueScheduler(false)
		Scheduler: scheduler.NewMysqlScheduler(taskName, dbInfo),
		Pipelines: []robot.Pipeline{pipeline.NewPipelineMySQL(dbInfo)},
		// Resource manager backed by a pool with capacity 100.
		ResourceManage: resource.NewSpidersPool(100, nil),
	})

	// Scan every command-line element (index 0 included) and stop at the
	// first "--init".
	reinit := false
	for i := 0; i < len(os.Args) && !reinit; i++ {
		reinit = os.Args[i] == "--init"
	}

	if reinit {
		// Queue the root URL, tagged with the handler that should parse it.
		meta := map[string]string{
			"handler": "mainParse",
		}
		spider.AddRequest(utils.InitRequest(rootURL, meta))
		log.Println("重新开始爬") // "restarting the crawl"
	} else {
		log.Println("继续爬") // "continuing the crawl"
	}

	// Crawl in the background; block here until utils.Stop yields, then
	// close the spider.
	go spider.Run()
	<-utils.Stop
	spider.Close()
}