Golang NewSpider示例，github.com/hu17889/go_spider/core/spider.NewSpider Golang示例

示例#1

0

显示文件

文件： main.go 项目： CrocdileChan/go_spider

// main submits a form login to the back-admin site as a POST request and runs
// a three-worker spider over the response, printing results to the console.
func main() {
	// Form fields that make up the POST body.
	credentials := url.Values{}
	credentials.Set("name", "admin")
	credentials.Set("pwd", "admin")

	// Tell the server the body is URL-encoded form data.
	headers := http.Header{}
	headers.Set("Content-Type", "application/x-www-form-urlencoded")

	// request.NewRequest arguments, in order:
	//  1. URL.
	//  2. Response type: "html", "json", "jsonp" or "text".
	//  3. URL tag, used to tell requests apart in the PageProcesser and Pipeline.
	//  4. HTTP method, POST or GET.
	//  5. POST data sent as the request body.
	//  6. HTTP request header.
	//  7. Cookies (none here).
	//  8. HTTP redirect function.
	//  9. Left nil here; NOTE(review): meaning not visible in this snippet.
	loginReq := request.NewRequest(
		"http://backadmin.hucong.net/main/user/login",
		"html",
		"site_login",
		"POST",
		credentials.Encode(),
		headers,
		nil,
		myRedirect,
		nil,
	)

	// spider.NewSpider takes the PageProcesser and the task name that the
	// Pipeline records results under.
	crawler := spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddRequest(loginReq).
		// Print results on screen.
		AddPipeline(pipeline.NewPipelineConsole()).
		// Crawl with three workers.
		SetThreadnum(3)
	crawler.Run()
}

示例#2

0

显示文件

文件： spider.go 项目： luzh0422/spider-docker

// SpiderMain crawls crawlUrl.Url with a single-worker spider whose page
// processor is bound to w, printing the results to the console.
func (w *SocialWorker) SpiderMain() {
	processor := NewMyPageProcesser(w)

	crawler := spider.NewSpider(processor, "TaskName").
		// Start URL; "html" is the response type ("html", "json", "jsonp" or "text").
		// NOTE(review): ./header_1.json presumably supplies the request headers — confirm.
		AddUrlWithHeaderFile(crawlUrl.Url, "html", "./header_1.json").
		// Print results on screen.
		AddPipeline(pipeline.NewPipelineConsole()).
		// Crawl with a single worker.
		SetThreadnum(1)
	crawler.Run()
}

示例#3

0

显示文件

文件： main.go 项目： w3hacker/go_spider

// main demonstrates the synchronous helpers of the spider: Get fetches one
// page and GetAll fetches several, and the extracted items are printed.
func main() {
	const (
		httpArticle = "http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn"
		htmlArticle = "http://baike.baidu.com/view/383720.htm?fromtitle=html&fromid=97049&type=syn"
	)

	// spider.NewSpider arguments: the PageProcesser, the config path
	// (default: WD/etc/main.conf) and the task name used by the Pipeline.
	sp := spider.NewSpider(NewMyPageProcesser(), "", "TaskName")

	// Single fetch; "html" is the response type ("html" or "json").
	single := sp.Get(httpArticle, "html")
	pageURL := single.GetRequest().GetUrl()
	println("-----------------------------------spider.Get---------------------------------")
	println("url\t:\t" + pageURL)
	fields := single.GetAll()
	for key, val := range fields {
		println(key + "\t:\t" + val)
	}

	// Batch fetch with two workers.
	println("\n--------------------------------spider.GetAll---------------------------------")
	targets := []string{httpArticle, htmlArticle}
	batch := sp.SetThreadnum(2).GetAll(targets, "html")
	for _, result := range batch {
		pageURL = result.GetRequest().GetUrl()
		println("url\t:\t" + pageURL)
		fmt.Printf("item\t:\t%s\n", result.GetAll())
	}
}

示例#4

0

显示文件

文件： website_crawler.go 项目： wadee/go_proj

// main loads bookmark records from ./formated_bookmark, queues every bookmark
// URL on the spider and writes the crawl results to ./crawler_result.dat.
//
// Each input line is expected to hold at least three "||"-separated fields:
// the second field is the URL to crawl, and the first and third fields are
// joined with "||" to form the URL tag. Blank lines are ignored and lines with
// fewer than three fields are reported on stderr and skipped.
func main() {
	f, err := os.Open("./formated_bookmark")
	if err != nil {
		// Keep the underlying cause (missing file, permissions, ...) visible.
		panic("f open error: " + err.Error())
	}
	defer f.Close()

	// spider.NewSpider takes the PageProcesser and the task name that the
	// Pipeline records results under.
	sp := spider.NewSpider(NewMyPageProcesser(), "BookMarkSearch")

	// urls maps each bookmark URL to its URL tag.
	urls := make(map[string]string)

	// A Scanner also yields the final line when the file does not end with a
	// newline, and strips the "\n" (or "\r\n") terminator for us.
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		line := sc.Text()
		if strings.TrimSpace(line) == "" {
			continue
		}
		data := strings.Split(line, "||")
		if len(data) < 3 {
			// Do not index past the end of a short record.
			os.Stderr.WriteString("skipping malformed bookmark line: " + line + "\n")
			continue
		}
		urls[data[1]] = data[0] + "||" + data[2]
	}
	if err := sc.Err(); err != nil {
		// Covers real read failures as well as lines exceeding the Scanner's
		// token size limit.
		panic("f read error: " + err.Error())
	}

	sp.AddUrls(urls, "html")

	// Results go to a file; pipeline.NewPipelineConsole() would print them instead.
	sp.AddPipeline(pipeline.NewPipelineFile("./crawler_result.dat")).
		SetThreadnum(3). // Crawl with three workers.
		Run()
}

示例#5

0

显示文件

文件： main.go 项目： wadee/go_proj

// main runs a spider backed by a Redis scheduler. When the program is started
// with "--init" the start URL is queued again; otherwise no URL is added and
// the spider is run as-is.
func main() {
	var (
		startURL        = "http://www.jiexieyin.org"
		threadNum       = uint(16)
		redisAddr       = "127.0.0.1:6379"
		redisMaxConn    = 10
		redisMaxIdle    = 10
		pageProc        = &MyProcessor{}
		restartFromSeed = false
	)

	// Alternatives left disabled by the original author:
	//   SetSleepTime("fixed", 6000, 6000)
	//   SetScheduler(scheduler.NewQueueScheduler(true))
	sp := spider.NewSpider(pageProc, "redis_scheduler_example").
		SetScheduler(scheduler.NewRedisScheduler(redisAddr, redisMaxConn, redisMaxIdle, true)).
		AddPipeline(pipeline.NewPipelineConsole()).
		SetThreadnum(threadNum)

	// Scan the command line until "--init" is found.
	for i := 0; i < len(os.Args) && !restartFromSeed; i++ {
		restartFromSeed = os.Args[i] == "--init"
	}

	status := "继续爬"
	if restartFromSeed {
		sp.AddUrl(startURL, "html")
		status = "重新开始爬"
	}
	mlog.LogInst().LogInfo(status)

	sp.Run()
}

示例#6

0

显示文件

文件： main.go 项目： liulnn/plant-spider

// main crawls a Baidu Baike page with the plant processor, using a queue
// scheduler and writing the results to result.txt.
func main() {
	crawler := spider.NewSpider(plant.NewPlantProcesser(), "TaskName").
		SetScheduler(scheduler.NewQueueScheduler(true)).
		AddPipeline(pipeline.NewPipelineFile("result.txt")).
		// Start URL; "html" is the response type.
		AddUrl("http://baike.baidu.com/subview/412610/19548276.htm", "html")
	crawler.Run()
}

示例#7

0

显示文件

文件： main.go 项目： xujb/go_spider

// main crawls the repository list of a GitHub profile with three workers and
// prints the results to the console.
func main() {
	const startPage = "https://github.com/hu17889?tab=repositories"

	// spider.NewSpider takes the PageProcesser and the task name that the
	// Pipeline records results under.
	crawler := spider.NewSpider(NewMyPageProcesser(), "TaskName").
		// Start URL; "html" is the response type ("html", "json", "jsonp" or "text").
		AddUrl(startPage, "html").
		// Print results on screen.
		AddPipeline(pipeline.NewPipelineConsole()).
		// Crawl with three workers.
		SetThreadnum(3)
	crawler.Run()
}

示例#8

0

显示文件

文件： crawler.go 项目： plutoshe/webCrawler

// main wires up the crawler: it parses flags, creates the coordination
// channels, initializes the Redis-backed visited set and URL store, seeds the
// store with the root site when it has not been visited yet, connects to the
// database, starts URL distribution and finally runs the spider on the first
// URL obtained from GetOneURL.
func main() {
	flag.Parse()

	// Channel setup; the buffered channels are sized by the thread count.
	exitChan = make(chan struct{})
	getURL = make(chan string, *threadNum)
	releaseSlot = make(chan int, *threadNum)

	// Visited-set and URL-store initialization share one Redis client.
	c := redis.NewClient(&redis.Options{
		Addr:     *redisURL,
		Password: *redisPWD,
		DB:       *redisDB,
	})

	rep = repetition.RepetitionJudgement{}
	err := rep.InitializeVisited(c, "repetition")
	if err != nil {
		log.Fatal("redis error, ", err)
	}

	urlstr = urlstore.URLCrawlerStore{}
	// Check the initialization result before the store is used for seeding.
	if _, err = urlstr.InitialURLsStore(c, "colNeedCrawl", "colNeedCommit", "colNeedCrawl", "colNeedCommit"); err != nil {
		log.Fatal("urlstore initialzation error, ", err)
	}

	// Seed the store with the root site on the first run only.
	const seedURL = "http://www.dianping.com/"
	visited, err := rep.CheckIfVisited(seedURL)
	if err != nil {
		log.Fatal("repetition check error, ", err)
	}
	if !visited {
		rep.VisitedNewNode(seedURL)
		urlstr.UploadURL(seedURL)
	}

	// Database initialization; only register Close once the session is known
	// to be valid.
	dbSession, err := storage.Link2Db(*databaseURL)
	if err != nil {
		log.Fatal("mongodb error, ", err)
	}
	defer dbSession.Close()
	collection = storage.Link2Collection(dbSession, *databaseName, *databaseUser, *databasePwd, *collectionName, *databaseAuth)

	go distributeURL(*threadNum, urlstr)

	// Put one token per worker into releaseSlot before asking for the first URL.
	for i := 0; i < *threadNum; i++ {
		releaseSlot <- 1
	}
	rootURL := GetOneURL()

	// spider.NewSpider takes the PageProcesser and the task name that the
	// Pipeline records results under.
	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrl(rootURL, "html").                    // Start URL; "html" is the response type ("html", "json", "jsonp" or "text").
		AddPipeline(pipeline.NewPipelineConsole()). // Print results on screen.
		SetThreadnum((uint)(*threadNum)).           // Crawl with the configured number of workers.
		Run()
}

示例#9

0

显示文件

文件： github_repo_page_processor.go 项目： w3hacker/go_spider

// main crawls the repository list of a GitHub profile with three workers and
// prints the results to the console.
func main() {
	const startPage = "https://github.com/hu17889?tab=repositories"

	// spider.NewSpider arguments: the PageProcesser, the config path
	// (default: WD/etc/main.conf) and the task name used by the Pipeline.
	crawler := spider.NewSpider(NewMyPageProcesser(), "", "TaskName").
		// Start URL; "html" is the response type ("html" or "json").
		AddUrl(startPage, "html").
		// Print results on screen.
		AddPipeline(pipeline.NewPipelineConsole()).
		// Crawl with three workers.
		SetThreadnum(3)
	crawler.Run()
}

示例#10

0

显示文件

文件： main.go 项目： xujb/go_spider

// main crawls a Sogou WeChat search result page with three workers, using a
// header file for the request, and prints the results to the console.
func main() {
	const searchPage = "http://weixin.sogou.com/weixin?query=%E4%BA%91%E6%B5%AE&type=1&page=1&ie=utf8"

	// spider.NewSpider takes the PageProcesser and the task name that the
	// Pipeline records results under.
	crawler := spider.NewSpider(NewMyPageProcesser(), "TaskName").
		// Start URL; "html" is the response type ("html", "json", "jsonp" or "text").
		// NOTE(review): weixin.sogou.com.json presumably supplies the request headers — confirm.
		AddUrlWithHeaderFile(searchPage, "html", "weixin.sogou.com.json").
		// Print results on screen.
		AddPipeline(pipeline.NewPipelineConsole()).
		// Crawl with three workers.
		SetThreadnum(3)
	crawler.Run()
}

示例#11

0

显示文件

文件： main.go 项目： tuyuwei/test

// main crawls the URLs returned by getUrls with 1000 workers, prints results
// to the console and writes the spider log below a fixed directory.
func main() {
	const (
		workers = 1000
		logDir  = "/Users/tywei/workspace/devLanguage/go/crawl/logs"
	)

	// spider.NewSpider takes the PageProcesser and the task name that the
	// Pipeline records results under.
	crawler := spider.NewSpider(NewMyPageProcesser(), "TaskName").
		// Start URLs; "html" is the response type ("html" or "json").
		AddUrls(getUrls(), "html").
		// Print results on screen.
		AddPipeline(pipeline.NewPipelineConsole()).
		// Crawl with 1000 workers.
		SetThreadnum(workers).
		CloseStrace().
		OpenFileLog(logDir)
	crawler.Run()
}

示例#12

0

显示文件

文件： main.go 项目： CrocdileChan/go_spider

// main polls a Sina finance news JSON feed and sends the results both to
// standard output and to /tmp/sinafile, sleeping a random interval between
// requests.
func main() {
	const (
		feedURL    = "http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=63621&pagesize=10&dire=f"
		resultFile = "/tmp/sinafile"
		logDir     = "/tmp"
	)

	// spider.NewSpider takes the PageProcesser and the task name that the
	// Pipeline records results under.
	crawler := spider.NewSpider(NewMyPageProcesser(), "sina_stock_news").
		// Start URL; "json" is the response type ("html", "json", "jsonp" or "text").
		AddUrl(feedURL, "json").
		// Print results to standard output.
		AddPipeline(pipeline.NewPipelineConsole()).
		// Also write results to a file.
		AddPipeline(pipeline.NewPipelineFile(resultFile)).
		// Log errors and other spider information to a file; per the original
		// note the default location looks like "WD/log/log.2014-9-1".
		OpenFileLog(logDir).
		// Sleep between 1s and 3s.
		SetSleepTime("rand", 1000, 3000)
	crawler.Run()
	// Disabled alternative: AddPipeline(pipeline.NewPipelineFile("/tmp/tmpfile")) to print results in a file.
}

示例#13

0

显示文件

文件： main.go 项目： CrocdileChan/go_spider

// main crawls the Sohu gossip index: the page at wkSohuUrl plus the numbered
// index pages counting down from 5300, one page for each i in
// [1, maxWKSouhuLayer).
func main() {
	// indexRequest builds a body-less GET request for an HTML page tagged "index".
	indexRequest := func(pageURL string) *request.Request {
		return request.NewRequest(pageURL, "html", "index", "GET", "", nil, nil, nil, nil)
	}

	entry := indexRequest(wkSohuUrl)
	sohuSpider := spider.NewSpider(NewMyPageProcesser(), "Sohu").
		AddRequest(entry).
		SetSleepTime("rand", 500, 1000).
		SetThreadnum(2)

	// 5301 is the magic number the numbered index pages are counted down from.
	for i := 1; i < maxWKSouhuLayer; i++ {
		sohuSpider.AddRequest(indexRequest(fmt.Sprintf("http://yule.sohu.com/gossip/index_%d.shtml", 5301-i)))
	}

	sohuSpider.Run()
}

示例#14

0

显示文件

文件： main.go 项目： CrocdileChan/go_spider

func main() {
	// spider input:
	//  PageProcesser ;
	//  task name used in Pipeline for record;
	sp := spider.NewSpider(NewMyPageProcesser(), "TaskName")
	// GetWithParams Params:
	//  1. Url.
	//  2. Responce type is "html" or "json" or "jsonp" or "text".
	//  3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline.
	//  4. The method is POST or GET.
	//  5. The postdata is body string sent to sever.
	//  6. The header is header for http request.
	//  7. Cookies
	req := request.NewRequest("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html", "", "GET", "", nil, nil, nil, nil)
	pageItems := sp.GetByRequest(req)
	//pageItems := sp.Get("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html")

	url := pageItems.GetRequest().GetUrl()
	println("-----------------------------------spider.Get---------------------------------")
	println("url\t:\t" + url)
	for name, value := range pageItems.GetAll() {
		println(name + "\t:\t" + value)
	}

	println("\n--------------------------------spider.GetAll---------------------------------")
	urls := []string{
		"http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn",
		"http://baike.baidu.com/view/383720.htm?fromtitle=html&fromid=97049&type=syn",
	}
	var reqs []*request.Request
	for _, url := range urls {
		req := request.NewRequest(url, "html", "", "GET", "", nil, nil, nil, nil)
		reqs = append(reqs, req)
	}
	pageItemsArr := sp.SetThreadnum(2).GetAllByRequest(reqs)
	//pageItemsArr := sp.SetThreadnum(2).GetAll(urls, "html")
	for _, item := range pageItemsArr {
		url = item.GetRequest().GetUrl()
		println("url\t:\t" + url)
		fmt.Printf("item\t:\t%s\n", item.GetAll())
	}
}

示例#15

0

显示文件

文件： diandianPostSpider.go 项目： ShinichR/diandianPostSpider

// main crawls the front page and pages 2 through 9 of a diandian blog, sends
// the results to the console and to a file, and finally lists every URL the
// page processor recorded in visit_url.
func main() {
	pageProcess := NewMyPageProcesser()
	pageProcess.Init()

	diandianSpider := spider.NewSpider(pageProcess, "TaskName").
		// Start URL; "html" is the response type ("html", "json", "jsonp" or "text").
		AddUrl("http://shinichr.diandian.com/", "html").
		// Print results on screen.
		AddPipeline(pipeline.NewPipelineConsole()).
		AddPipeline(pipeline.NewPipelineFile("/home/shinichr/spider.tmp")).
		// Thread number is set to 10000.
		SetThreadnum(10000)

	// Queue the remaining listing pages.
	for page := 2; page <= 9; page++ {
		diandianSpider.AddUrl(fmt.Sprintf("http://shinichr.diandian.com/page/%d", page), "html")
	}

	diandianSpider.Run()

	for visited := range pageProcess.visit_url {
		fmt.Println("spider:", visited)
	}
}

示例#16

0

显示文件

文件： main.go 项目： rpoverflow/LiZhiFMCrawler

// main crawls twenty listing pages of a lizhi.fm channel one after another,
// running a fresh two-worker spider for each page with a browser-like
// User-Agent header.
func main() {
	header := http.Header{
		"User-Agent": {"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36"},
	}

	for i := 0; i < 20; i++ {
		// Index 0 is the channel's landing page; index i > 0 maps to p/<i+1>.html.
		url := "http://www.lizhi.fm/1560327/"
		if i != 0 {
			url = fmt.Sprintf("http://www.lizhi.fm/1560327/p/%d.html", i+1)
		}

		fmt.Printf("Page:%d, Url: %s\n", i, url)

		pageReq := request.NewRequest(url, "html", "index", "GET", "", header, nil, nil, nil)
		siteSpider := spider.NewSpider(NewSitePageProcesser("Tonghuashu"), "Tonghuashu").
			AddRequest(pageReq).
			SetSleepTime("rand", 1000, 2000).
			SetThreadnum(2)

		// Each page is crawled to completion before the next one starts.
		siteSpider.Run()
	}
}