Example #1
func main() {

	// POST data
	post_arg := url.Values{
		"name": {"admin"},
		"pwd":  {"admin"},
	}

	// http header
	header := make(http.Header)
	header.Set("Content-Type", "application/x-www-form-urlencoded")

	// Spider input:
	//  PageProcesser;
	//  task name, used by the Pipeline for record keeping.
	// AddRequest params:
	//  1. Url.
	//  2. Response type: "html", "json", "jsonp" or "text".
	//  3. The urltag names the url and distinguishes different urls in PageProcesser and Pipeline.
	//  4. The method: POST or GET.
	//  5. The postdata is the body string sent to the server.
	//  6. The header for the http request.
	//  7. Cookies.
	//  8. Http redirect function.
	//  9. Meta: arbitrary user data attached to the request.
	req := request.NewRequest("http://backadmin.hucong.net/main/user/login", "html", "site_login", "POST", post_arg.Encode(), header, nil, myRedirect, nil)

	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddRequest(req).
		AddPipeline(pipeline.NewPipelineConsole()). // Print results on screen
		SetThreadnum(3).                            // Crawl requests with three goroutines
		Run()
}
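A minimal sketch of the myRedirect hook referenced above; this assumes the eighth parameter of request.NewRequest takes the same signature as http.Client's CheckRedirect callback (anything beyond that, such as the 10-hop cap, is illustrative):

import (
	"errors"
	"net/http"
)

// myRedirect caps the redirect chain at 10 hops, mirroring http.Client's
// default behaviour; returning an error aborts further redirects.
func myRedirect(req *http.Request, via []*http.Request) error {
	if len(via) >= 10 {
		return errors.New("stopped after 10 redirects")
	}
	return nil
}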
Example #2
func main() {
	// Spider input:
	//  PageProcesser;
	//  task name, used by the Pipeline for record keeping.
	if len(os.Args) < 2 {
		fmt.Println("need a URL as a parameter, e.g. ./main 'http://baidu.com/'")
		return
	}
	myPageProcess := NewMyPageProcesser()
	sp := spider.NewSpider(myPageProcess, "mailSpider")
	go func() {
		for {
			<-time.After(time.Second)
			loadWhiteAndBlackList(sp)
		}
	}()
	go doSendMail(myPageProcess.MailHandle)
	sp.SetScheduler(scheduler.NewQueueScheduler(true)).
		SetExitWhenComplete(false).
		AddUrl(os.Args[1], "html").
		// AddPipeline(pipeline.NewPipelineConsole()).
		SetThreadnum(3).
		Run()
}
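The helpers loadWhiteAndBlackList and doSendMail are user code not shown here; because SetExitWhenComplete(false) keeps the spider running, the ticker goroutine can keep feeding it work. A hypothetical sketch of the loader, assuming it simply re-reads a newline-separated URL list and queues each entry (the file name and format are illustrative):

import (
	"io/ioutil"
	"strings"

	"github.com/hu17889/go_spider/core/spider"
)

// Hypothetical loader: re-reads a URL list from disk and queues every entry.
func loadWhiteAndBlackList(sp *spider.Spider) {
	data, err := ioutil.ReadFile("whitelist.txt") // assumed location
	if err != nil {
		return
	}
	for _, line := range strings.Split(string(data), "\n") {
		if u := strings.TrimSpace(line); u != "" {
			sp.AddUrl(u, "html")
		}
	}
}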
Example #3
func main() {
	start_url := "http://www.jiexieyin.org"
	thread_num := uint(16)

	redisAddr := "127.0.0.1:6379"
	redisMaxConn := 10
	redisMaxIdle := 10

	proc := &MyProcessor{}

	sp := spider.NewSpider(proc, "redis_scheduler_example").
		//SetSleepTime("fixed", 6000, 6000).
		//SetScheduler(scheduler.NewQueueScheduler(true)).
		SetScheduler(scheduler.NewRedisScheduler(redisAddr, redisMaxConn, redisMaxIdle, true)).
		AddPipeline(pipeline.NewPipelineConsole()).
		SetThreadnum(thread_num)

	init := false
	for _, arg := range os.Args {
		if arg == "--init" {
			init = true
			break
		}
	}
	if init {
		sp.AddUrl(start_url, "html")
		mlog.LogInst().LogInfo("重新开始爬")
	} else {
		mlog.LogInst().LogInfo("继续爬")
	}
	sp.Run()
}
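With the Redis scheduler the crawl queue lives in Redis and survives restarts: pass --init to seed start_url and begin a fresh crawl, or omit it to resume whatever is still queued.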
Example #4
func main() {
	// Spider input:
	//  PageProcesser;
	//  task name, used by the Pipeline for record keeping.
	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrl("https://github.com/jixiuf?tab=repositories", "html"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text")
		AddPipeline(pipeline.NewPipelineConsole()).                   // Print result on screen
		SetThreadnum(3).                                              // Crawl request by three Coroutines
		Run()
}
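Every example here assumes a user-defined processor created by NewMyPageProcesser. A minimal sketch against go_spider's PageProcesser interface (Process plus Finish), assuming the standard core/common/page import path; the "title" field and selector are illustrative:

import "github.com/hu17889/go_spider/core/common/page"

type MyPageProcesser struct{}

func NewMyPageProcesser() *MyPageProcesser {
	return &MyPageProcesser{}
}

// Process parses one downloaded page and stores fields for the Pipelines.
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() { // download or parse failure
		println(p.Errormsg())
		return
	}
	query := p.GetHtmlParser() // goquery document for "html" responses
	p.AddField("title", query.Find("title").Text())
}

// Finish runs once after the spider stops.
func (this *MyPageProcesser) Finish() {
}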
Example #5
func main() {
	// Spider input:
	//  PageProcesser;
	//  task name, used by the Pipeline for record keeping.
	req_url := "http://weixin.sogou.com/weixin?query=%E4%BA%91%E6%B5%AE&type=1&page=1&ie=utf8"
	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrlWithHeaderFile(req_url, "html", "weixin.sogou.com.json"). // Start URL; "html" is the response type ("html", "json", "jsonp" or "text")
		AddPipeline(pipeline.NewPipelineConsole()).                     // Print results on screen
		SetThreadnum(3).                                                // Crawl requests with three goroutines
		Run()
}
Example #6
func main() {
	// Spider input:
	//  PageProcesser;
	//  task name, used by the Pipeline for record keeping.
	spider.NewSpider(NewMyPageProcesser(), "sina_stock_news").
		AddUrl("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=63621&pagesize=10&dire=f", "json"). // start url, html is the responce type ("html" or "json" or "jsonp" or "text")
		AddPipeline(pipeline.NewPipelineConsole()).                                                                                   // Print result to std output
		AddPipeline(pipeline.NewPipelineFile("/tmp/sinafile")).                                                                       // Print result in file
		OpenFileLog("/tmp").                                                                                                          // Error info or other useful info in spider will be logged in file of defalt path like "WD/log/log.2014-9-1".
		SetSleepTime("rand", 1000, 3000).                                                                                             // Sleep time between 1s and 3s.
		Run()
	//AddPipeline(pipeline.NewPipelineFile("/tmp/tmpfile")). // print result in file
}
Example #7
func main() {
	req := request.NewRequest(wkSohuUrl, "html", "index", "GET", "", nil, nil, nil, nil)
	sohuSpider := spider.NewSpider(NewMyPageProcesser(), "Sohu").
		AddRequest(req).
		SetSleepTime("rand", 500, 1000).
		SetThreadnum(2)

	for i := 1; i < maxWKSouhuLayer; i++ {
		url := fmt.Sprintf("http://yule.sohu.com/gossip/index_%d.shtml", 5301-i) // magic number: the list pages count down from 5300
		req := request.NewRequest(url, "html", "index", "GET", "", nil, nil, nil, nil)
		sohuSpider.AddRequest(req)
	}

	sohuSpider.Run()
}
Example #8
func main() {
	// Spider input:
	//  PageProcesser;
	//  task name, used by the Pipeline for record keeping.
	sp := spider.NewSpider(NewMyPageProcesser(), "TaskName")
	// NewRequest params:
	//  1. Url.
	//  2. Response type: "html", "json", "jsonp" or "text".
	//  3. The urltag names the url and distinguishes different urls in PageProcesser and Pipeline.
	//  4. The method: POST or GET.
	//  5. The postdata is the body string sent to the server.
	//  6. The header for the http request.
	//  7. Cookies.
	//  8. Http redirect function.
	//  9. Meta: arbitrary user data attached to the request.
	req := request.NewRequest("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html", "", "GET", "", nil, nil, nil, nil)
	pageItems := sp.GetByRequest(req)
	//pageItems := sp.Get("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html")

	url := pageItems.GetRequest().GetUrl()
	println("-----------------------------------spider.Get---------------------------------")
	println("url\t:\t" + url)
	for name, value := range pageItems.GetAll() {
		println(name + "\t:\t" + value)
	}

	println("\n--------------------------------spider.GetAll---------------------------------")
	urls := []string{
		"http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn",
		"http://baike.baidu.com/view/383720.htm?fromtitle=html&fromid=97049&type=syn",
	}
	var reqs []*request.Request
	for _, url := range urls {
		req := request.NewRequest(url, "html", "", "GET", "", nil, nil, nil, nil)
		reqs = append(reqs, req)
	}
	pageItemsArr := sp.SetThreadnum(2).GetAllByRequest(reqs)
	//pageItemsArr := sp.SetThreadnum(2).GetAll(urls, "html")
	for _, item := range pageItemsArr {
		url = item.GetRequest().GetUrl()
		println("url\t:\t" + url)
		fmt.Printf("item\t:\t%s\n", item.GetAll())
	}
}
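Unlike Run(), GetByRequest and GetAllByRequest fetch synchronously and return the collected PageItems directly, which is why this example prints the results itself instead of attaching a Pipeline.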