func main() { // POST data post_arg := url.Values{ "name": {"admin"}, "pwd": {"admin"}, } // http header header := make(http.Header) header.Set("Content-Type", "application/x-www-form-urlencoded") // Spider input: // PageProcesser ; // Task name used in Pipeline for record; // AddRequest Params: // 1. Url. // 2. Responce type is "html" or "json" or "jsonp" or "text". // 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline. // 4. The method is POST or GET. // 5. The postdata is body string sent to sever. // 6. The header is header for http request. // 7. Cookies // 8. Http redirect function req := request.NewRequest("http://backadmin.hucong.net/main/user/login", "html", "site_login", "POST", post_arg.Encode(), header, nil, myRedirect, nil) spider.NewSpider(NewMyPageProcesser(), "TaskName"). AddRequest(req). AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen SetThreadnum(3). // Crawl request by three Coroutines Run() }
func (w *SocialWorker) SpiderMain() { spider.NewSpider(NewMyPageProcesser(w), "TaskName"). AddUrlWithHeaderFile(crawlUrl.Url, "html", "./header_1.json"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text") AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen SetThreadnum(1). // Crawl request by three Coroutines Run() }
func main() { // spider input: // PageProcesser ; // config path(default: WD/etc/main.conf); // task name used in Pipeline for record; sp := spider.NewSpider(NewMyPageProcesser(), "", "TaskName") pageItems := sp.Get("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html") // url, html is the responce type ("html" or "json") url := pageItems.GetRequest().GetUrl() println("-----------------------------------spider.Get---------------------------------") println("url\t:\t" + url) for name, value := range pageItems.GetAll() { println(name + "\t:\t" + value) } println("\n--------------------------------spider.GetAll---------------------------------") urls := []string{ "http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "http://baike.baidu.com/view/383720.htm?fromtitle=html&fromid=97049&type=syn", } pageItemsArr := sp.SetThreadnum(2).GetAll(urls, "html") for _, item := range pageItemsArr { url = item.GetRequest().GetUrl() println("url\t:\t" + url) fmt.Printf("item\t:\t%s\n", item.GetAll()) } }
func main() { // Spider input: // PageProcesser ; // Task name used in Pipeline for record; f, err := os.Open("./formated_bookmark") if err != nil { panic("f open error") } defer f.Close() sp := spider.NewSpider(NewMyPageProcesser(), "BookMarkSearch") br := bufio.NewReader(f) urls := make(map[string]string) line, err := br.ReadString('\n') for ; err == nil; line, err = br.ReadString('\n') { data := strings.Split(line, "||") url := data[1] urltag := data[0] + "||" + strings.Trim(data[2], "\n") urls[url] = urltag } sp.AddUrls(urls, "html") // sp.AddPipeline(pipeline.NewPipelineConsole()). sp.AddPipeline(pipeline.NewPipelineFile("./crawler_result.dat")). SetThreadnum(3). // Crawl request by three Coroutines Run() }
func main() { start_url := "http://www.jiexieyin.org" thread_num := uint(16) redisAddr := "127.0.0.1:6379" redisMaxConn := 10 redisMaxIdle := 10 proc := &MyProcessor{} sp := spider.NewSpider(proc, "redis_scheduler_example"). //SetSleepTime("fixed", 6000, 6000). //SetScheduler(scheduler.NewQueueScheduler(true)). SetScheduler(scheduler.NewRedisScheduler(redisAddr, redisMaxConn, redisMaxIdle, true)). AddPipeline(pipeline.NewPipelineConsole()). SetThreadnum(thread_num) init := false for _, arg := range os.Args { if arg == "--init" { init = true break } } if init { sp.AddUrl(start_url, "html") mlog.LogInst().LogInfo("重新开始爬") } else { mlog.LogInst().LogInfo("继续爬") } sp.Run() }
func main() { var startUrl = "http://baike.baidu.com/subview/412610/19548276.htm" sp := spider.NewSpider(plant.NewPlantProcesser(), "TaskName"). SetScheduler(scheduler.NewQueueScheduler(true)). AddPipeline(pipeline.NewPipelineFile("result.txt")) sp.AddUrl(startUrl, "html") sp.Run() }
func main() { // Spider input: // PageProcesser ; // Task name used in Pipeline for record; spider.NewSpider(NewMyPageProcesser(), "TaskName"). AddUrl("https://github.com/hu17889?tab=repositories", "html"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text") AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen SetThreadnum(3). // Crawl request by three Coroutines Run() }
func main() { flag.Parse() // chan init exitChan = make(chan struct{}) getURL = make(chan string, *threadNum) releaseSlot = make(chan int, *threadNum) // repetition and urlstor initialization c := redis.NewClient(&redis.Options{ Addr: *redisURL, Password: *redisPWD, DB: *redisDB, }) rep = repetition.RepetitionJudgement{} err := rep.InitializeVisited(c, "repetition") if err != nil { log.Fatal("redis error, ", err) } urlstr = urlstore.URLCrawlerStore{} _, err = urlstr.InitialURLsStore(c, "colNeedCrawl", "colNeedCommit", "colNeedCrawl", "colNeedCommit") visited, _ := rep.CheckIfVisited("http://www.dianping.com/") if !visited { rep.VisitedNewNode("http://www.dianping.com/") urlstr.UploadURL("http://www.dianping.com/") } if err != nil { log.Fatal("urlstore initialzation error, ", err) } // db initilization dbSession, err := storage.Link2Db(*databaseURL) defer dbSession.Close() if err != nil { log.Fatal("mongodb error, ", err) } collection = storage.Link2Collection(dbSession, *databaseName, *databaseUser, *databasePwd, *collectionName, *databaseAuth) go distributeURL(*threadNum, urlstr) // url initilziation for i := 0; i < *threadNum; i++ { releaseSlot <- 1 } rootURL := GetOneURL() // Spider input: // PageProcesser ; // Task name used in Pipeline for record; spider.NewSpider(NewMyPageProcesser(), "TaskName"). AddUrl(rootURL, "html"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text") AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen SetThreadnum((uint)(*threadNum)). // Crawl request by three Coroutines Run() }
func main() { // spider input: // PageProcesser ; // config path(default: WD/etc/main.conf); // task name used in Pipeline for record; spider.NewSpider(NewMyPageProcesser(), "", "TaskName"). AddUrl("https://github.com/hu17889?tab=repositories", "html"). // start url, html is the responce type ("html" or "json") AddPipeline(pipeline.NewPipelineConsole()). // print result on screen SetThreadnum(3). // crawl request by three Coroutines Run() }
func main() { // Spider input: // PageProcesser ; // Task name used in Pipeline for record; req_url := "http://weixin.sogou.com/weixin?query=%E4%BA%91%E6%B5%AE&type=1&page=1&ie=utf8" spider.NewSpider(NewMyPageProcesser(), "TaskName"). AddUrlWithHeaderFile(req_url, "html", "weixin.sogou.com.json"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text") AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen SetThreadnum(3). // Crawl request by three Coroutines Run() }
func main() { // spider input: // PageProcesser ; // task name used in Pipeline for record; spider.NewSpider(NewMyPageProcesser(), "TaskName"). AddUrls(getUrls(), "html"). // start url, html is the responce type ("html" or "json") AddPipeline(pipeline.NewPipelineConsole()). // print result on screen SetThreadnum(1000). // crawl request by three Coroutines CloseStrace(). OpenFileLog("/Users/tywei/workspace/devLanguage/go/crawl/logs"). Run() }
func main() { // spider input: // PageProcesser ; // task name used in Pipeline for record; spider.NewSpider(NewMyPageProcesser(), "sina_stock_news"). AddUrl("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=63621&pagesize=10&dire=f", "json"). // start url, html is the responce type ("html" or "json" or "jsonp" or "text") AddPipeline(pipeline.NewPipelineConsole()). // Print result to std output AddPipeline(pipeline.NewPipelineFile("/tmp/sinafile")). // Print result in file OpenFileLog("/tmp"). // Error info or other useful info in spider will be logged in file of defalt path like "WD/log/log.2014-9-1". SetSleepTime("rand", 1000, 3000). // Sleep time between 1s and 3s. Run() //AddPipeline(pipeline.NewPipelineFile("/tmp/tmpfile")). // print result in file }
func main() { req := request.NewRequest(wkSohuUrl, "html", "index", "GET", "", nil, nil, nil, nil) sohuSpider := spider.NewSpider(NewMyPageProcesser(), "Sohu"). AddRequest(req). SetSleepTime("rand", 500, 1000). SetThreadnum(2) for i := 1; i < maxWKSouhuLayer; i++ { url := fmt.Sprintf("http://yule.sohu.com/gossip/index_%d.shtml", 5301-i) // magic num req := request.NewRequest(url, "html", "index", "GET", "", nil, nil, nil, nil) sohuSpider.AddRequest(req) } sohuSpider.Run() }
func main() { // spider input: // PageProcesser ; // task name used in Pipeline for record; sp := spider.NewSpider(NewMyPageProcesser(), "TaskName") // GetWithParams Params: // 1. Url. // 2. Responce type is "html" or "json" or "jsonp" or "text". // 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline. // 4. The method is POST or GET. // 5. The postdata is body string sent to sever. // 6. The header is header for http request. // 7. Cookies req := request.NewRequest("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html", "", "GET", "", nil, nil, nil, nil) pageItems := sp.GetByRequest(req) //pageItems := sp.Get("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html") url := pageItems.GetRequest().GetUrl() println("-----------------------------------spider.Get---------------------------------") println("url\t:\t" + url) for name, value := range pageItems.GetAll() { println(name + "\t:\t" + value) } println("\n--------------------------------spider.GetAll---------------------------------") urls := []string{ "http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "http://baike.baidu.com/view/383720.htm?fromtitle=html&fromid=97049&type=syn", } var reqs []*request.Request for _, url := range urls { req := request.NewRequest(url, "html", "", "GET", "", nil, nil, nil, nil) reqs = append(reqs, req) } pageItemsArr := sp.SetThreadnum(2).GetAllByRequest(reqs) //pageItemsArr := sp.SetThreadnum(2).GetAll(urls, "html") for _, item := range pageItemsArr { url = item.GetRequest().GetUrl() println("url\t:\t" + url) fmt.Printf("item\t:\t%s\n", item.GetAll()) } }
func main() { pageProcess := NewMyPageProcesser() pageProcess.Init() diandianSpider := spider.NewSpider(pageProcess, "TaskName"). AddUrl("http://shinichr.diandian.com/", "html"). // Start url, html is the responce type ("html" or "json" or "jsonp" or "text") AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen AddPipeline(pipeline.NewPipelineFile("/home/shinichr/spider.tmp")). SetThreadnum(10000) // Crawl request by three Coroutines for i := 2; i < 10; i++ { url := fmt.Sprintf("http://shinichr.diandian.com/page/%d", i) diandianSpider.AddUrl(url, "html") } diandianSpider.Run() for url, _ := range pageProcess.visit_url { fmt.Println("spider:", url) } }
/* * Main */ func main() { var header http.Header = make(http.Header) header.Add("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36") for i := 0; i < 20; i++ { url := "" if i == 0 { url = "http://www.lizhi.fm/1560327/" } else { url = fmt.Sprintf("http://www.lizhi.fm/1560327/p/%d.html", i+1) } fmt.Printf("Page:%d, Url: %s\n", i, url) req := request.NewRequest(url, "html", "index", "GET", "", header, nil, nil, nil) siteSpider := spider.NewSpider(NewSitePageProcesser("Tonghuashu"), "Tonghuashu"). AddRequest(req). SetSleepTime("rand", 1000, 2000). SetThreadnum(2) siteSpider.Run() } }