Exemple #1
0
func main() {
	start_url := "http://www.jiexieyin.org"
	thread_num := uint(16)

	redisAddr := "127.0.0.1:6379"
	redisMaxConn := 10
	redisMaxIdle := 10

	proc := &MyProcessor{}

	sp := robot.NewSpider(proc, "redis_scheduler_example").
		//SetSleepTime("fixed", 6000, 6000).
		//SetScheduler(scheduler.NewQueueScheduler(true)).
		SetScheduler(robot.NewRedisScheduler(redisAddr, redisMaxConn, redisMaxIdle, true)).
		AddPipeline(robot.NewPipelineConsole()).
		SetThreadnum(thread_num)

	init := false
	for _, arg := range os.Args {
		if arg == "--init" {
			init = true
			break
		}
	}
	if init {
		sp.AddUrl(start_url, "html")
		mlog.LogInst().LogInfo("重新开始爬")
	} else {
		mlog.LogInst().LogInfo("继续爬")
	}
	sp.Run()
}
Exemple #2
0
func main() {
	start_url := utils.BaseUrl
	//加载配置文件
	settings := utils.LoadConf("conf/spider.conf")
	//获取数据库连接信息
	dbinfo, ok := settings["DBINFO"]
	if !ok {
		log.Fatalf("please insert dbinfo in spider.conf")
	}

	//爬虫初始化
	options := robot.SpiderOptions{
		TaskName:      "79xs",
		PageProcesser: process.NewWww79xsComProcessor(),
		Downloader:    downloader.NewHttpDownloader("text/html; charset=gb2312"),
		Scheduler:     scheduler.NewQueueScheduler(false),
		Pipelines:     []robot.Pipeline{pipeline.NewPipelineMySQL(dbinfo)},
		//设置资源管理器,资源池容量为100
		ResourceManage: resource.NewSpidersPool(100, nil),
	}

	sp := robot.NewSpider(options)
	//增加根url
	sp.AddRequest(utils.InitRequest(start_url, nil, nil))
	go sp.Run()
	<-utils.Stop
	sp.Close()
}
Exemple #3
0
func main() {
	// Spider input:
	//  PageProcesser ;
	//  Task name used in Pipeline for record;
	robot.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrl("https://github.com/hu17889?tab=repositories", "html"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text")
		AddPipeline(robot.NewPipelineConsole()).                       // Print result on screen
		SetThreadnum(3).                                               // Crawl request by three Coroutines
		Run()
}
Exemple #4
0
func main() {
	// Spider input:
	//  PageProcesser ;
	//  Task name used in Pipeline for record;
	req_url := "http://weixin.sogou.com/weixin?query=%E4%BA%91%E6%B5%AE&type=1&page=1&ie=utf8"
	robot.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrlWithHeaderFile(req_url, "html", "weixin.sogou.com.json"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text")
		AddPipeline(robot.NewPipelineConsole()).                        // Print result on screen
		SetThreadnum(3).                                                // Crawl request by three Coroutines
		Run()
}
Exemple #5
0
func main() {
	// spider input:
	//  PageProcesser ;
	//  task name used in Pipeline for record;
	robot.NewSpider(NewMyPageProcesser(), "sina_stock_news").
		AddUrl("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=63621&pagesize=10&dire=f", "json"). // start url, html is the responce type ("html" or "json" or "jsonp" or "text")
		AddPipeline(robot.NewPipelineConsole()).                                                                                      // Print result to std output
		AddPipeline(robot.NewPipelineFile("./sinafile")).                                                                             // Print result in file
		OpenFileLog("./").                                                                                                            // Error info or other useful info in spider will be logged in file of defalt path like "WD/log/log.2014-9-1".
		SetSleepTime("rand", 1000, 3000).                                                                                             // Sleep time between 1s and 3s.
		Run()
	//AddPipeline(pipeline.NewPipelineFile("/tmp/tmpfile")). // print result in file
}
Exemple #6
0
func main() {
	req := robot.NewRequest(wkSohuUrl, "html", "index", "GET", "", nil, nil, nil, nil)
	sohuSpider := robot.NewSpider(NewMyPageProcesser(), "Sohu").
		AddRequest(req).
		SetSleepTime("rand", 500, 1000).
		SetThreadnum(2)

	for i := 1; i < maxWKSouhuLayer; i++ {
		url := fmt.Sprintf("http://yule.sohu.com/gossip/index_%d.shtml", 5301-i) // magic num
		req := robot.NewRequest(url, "html", "index", "GET", "", nil, nil, nil, nil)
		sohuSpider.AddRequest(req)
	}

	sohuSpider.Run()
}
Exemple #7
0
func main() {
	start_url := "http://www.jiexieyin.org"
	redisAddr := "127.0.0.1:6379"
	redisMaxConn := 10
	redisMaxIdle := 10

	mongoUrl := "localhost:27017"
	mongoDB := "test"
	mongoCollection := "test"

	scheduleroptions := scheduler.RedisSchedulerOptions{
		RequestList:           "mgospider_requests",
		UrlList:               "mgospider_urls",
		RedisAddr:             redisAddr,
		MaxConn:               redisMaxConn,
		MaxIdle:               redisMaxIdle,
		ForbiddenDuplicateUrl: false,
	}

	//爬虫初始化
	options := robot.SpiderOptions{
		TaskName:      "mgospider",
		PageProcesser: NewMyProcesser(),
		Downloader:    downloader.NewHttpDownloader("text/html; charset=gb2312"),
		Scheduler:     scheduler.NewRedisScheduler(scheduleroptions),
		Pipelines:     []robot.Pipeline{NewPipelineMongo(mongoUrl, mongoDB, mongoCollection)},
		//设置资源管理器,资源池容量为10
		ResourceManage: resource.NewSpidersPool(10, nil),
	}

	sp := robot.NewSpider(options)

	init := false
	for _, arg := range os.Args {
		if arg == "--init" {
			init = true
			break
		}
	}
	if init {
		sp.AddRequest(initrequest(start_url))
		mlog.LogInst().LogInfo("重新开始爬")
	} else {
		mlog.LogInst().LogInfo("继续爬")
	}
	sp.Run()
}
Exemple #8
0
func main() {
	spidername := "79xs"
	start_url := utils.BaseUrl
	//加载配置文件
	settings := utils.LoadConf("conf/spider.conf")
	//获取数据库连接信息
	dbinfo, ok := settings["DBINFO"]
	if !ok {
		log.Fatalf("please insert dbinfo in spider.conf")
	}

	//爬虫初始化
	options := robot.SpiderOptions{
		TaskName:      spidername,
		PageProcesser: process.NewWww79xsComProcessor(),
		Downloader:    downloader.NewHttpDownloader("text/html; charset=gb2312"),
		Scheduler:     scheduler.NewMysqlScheduler(spidername, dbinfo),
		//Scheduler: scheduler.NewQueueScheduler(false),
		Pipelines: []robot.Pipeline{pipeline.NewPipelineMySQL(dbinfo)},
		//设置资源管理器,资源池容量为100
		ResourceManage: resource.NewSpidersPool(100, nil),
	}

	sp := robot.NewSpider(options)

	init := false
	for _, arg := range os.Args {
		if arg == "--init" {
			init = true
			break
		}
	}

	if init {
		//增加根url
		sp.AddRequest(utils.InitRequest(start_url, map[string]string{
			"handler": "mainParse",
		}))
		log.Println("重新开始爬")
	} else {
		log.Println("继续爬")
	}
	go sp.Run()
	<-utils.Stop
	sp.Close()
}