func main() { req := request.NewRequest(wkSohuUrl, "html", "index", "GET", "", nil, nil, nil, nil) sohuSpider := spider.NewSpider(NewMyPageProcesser(), "Sohu"). AddRequest(req). SetSleepTime("rand", 500, 1000). SetThreadnum(2) for i := 1; i < maxWKSouhuLayer; i++ { url := fmt.Sprintf("http://yule.sohu.com/gossip/index_%d.shtml", 5301-i) // magic num req := request.NewRequest(url, "html", "index", "GET", "", nil, nil, nil, nil) sohuSpider.AddRequest(req) } sohuSpider.Run() }
func (this *Spider) AddUrlsEx(urls []string, respType string, headerFile string, proxyHost string) *Spider { for _, url := range urls { req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil) this.AddRequest(req.AddHeaderFile(headerFile).AddProxyHost(proxyHost)) } return this }
func main() { // POST data post_arg := url.Values{ "name": {"admin"}, "pwd": {"admin"}, } // http header header := make(http.Header) header.Set("Content-Type", "application/x-www-form-urlencoded") // Spider input: // PageProcesser ; // Task name used in Pipeline for record; // AddRequest Params: // 1. Url. // 2. Responce type is "html" or "json" or "jsonp" or "text". // 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline. // 4. The method is POST or GET. // 5. The postdata is body string sent to sever. // 6. The header is header for http request. // 7. Cookies // 8. Http redirect function req := request.NewRequest("http://backadmin.hucong.net/main/user/login", "html", "site_login", "POST", post_arg.Encode(), header, nil, myRedirect, nil) spider.NewSpider(NewMyPageProcesser(), "TaskName"). AddRequest(req). AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen SetThreadnum(3). // Crawl request by three Coroutines Run() }
func (this *Spider) AddUrls(urls []string, respType string) *Spider { for _, url := range urls { req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil) this.AddRequest(req) } return this }
func TestQueueScheduler(t *testing.T) { var r *request.Request r = request.NewRequest("http://baidu.com", "html", "", "GET", "", nil, nil, nil, nil) fmt.Printf("%v\n", r) var s *scheduler.QueueScheduler s = scheduler.NewQueueScheduler(false) s.Push(r) var count int = s.Count() if count != 1 { t.Error("count error") } fmt.Println(count) var r1 *request.Request r1 = s.Poll() if r1 == nil { t.Error("poll error") } fmt.Printf("%v\n", r1) // remove duplicate s = scheduler.NewQueueScheduler(true) r2 := request.NewRequest("http://qq.com", "html", "", "GET", "", nil, nil, nil, nil) s.Push(r) s.Push(r2) s.Push(r) count = s.Count() if count != 2 { t.Error("count error") } fmt.Println(count) r1 = s.Poll() if r1 == nil { t.Error("poll error") } fmt.Printf("%v\n", r1) r1 = s.Poll() if r1 == nil { t.Error("poll error") } fmt.Printf("%v\n", r1) }
func main() { // spider input: // PageProcesser ; // task name used in Pipeline for record; sp := spider.NewSpider(NewMyPageProcesser(), "TaskName") // GetWithParams Params: // 1. Url. // 2. Responce type is "html" or "json" or "jsonp" or "text". // 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline. // 4. The method is POST or GET. // 5. The postdata is body string sent to sever. // 6. The header is header for http request. // 7. Cookies req := request.NewRequest("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html", "", "GET", "", nil, nil, nil, nil) pageItems := sp.GetByRequest(req) //pageItems := sp.Get("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html") url := pageItems.GetRequest().GetUrl() println("-----------------------------------spider.Get---------------------------------") println("url\t:\t" + url) for name, value := range pageItems.GetAll() { println(name + "\t:\t" + value) } println("\n--------------------------------spider.GetAll---------------------------------") urls := []string{ "http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "http://baike.baidu.com/view/383720.htm?fromtitle=html&fromid=97049&type=syn", } var reqs []*request.Request for _, url := range urls { req := request.NewRequest(url, "html", "", "GET", "", nil, nil, nil, nil) reqs = append(reqs, req) } pageItemsArr := sp.SetThreadnum(2).GetAllByRequest(reqs) //pageItemsArr := sp.SetThreadnum(2).GetAll(urls, "html") for _, item := range pageItemsArr { url = item.GetRequest().GetUrl() println("url\t:\t" + url) fmt.Printf("item\t:\t%s\n", item.GetAll()) } }
// Deal with several urls and return the PageItems slice. func (this *Spider) GetAll(urls []string, respType string) []*page_items.PageItems { for _, u := range urls { req := request.NewRequest(u, respType, "", "GET", "", nil, nil, nil, nil) this.AddRequest(req) } pip := pipeline.NewCollectPipelinePageItems() this.AddPipeline(pip) this.Run() return pip.GetCollected() }
func TestQueueScheduler(t *testing.T) { var r *request.Request r = request.NewRequest("http://baidu.com", "html") var s *scheduler.QueueScheduler s = scheduler.NewQueueScheduler() s.Push(r) var count int = s.Count() fmt.Println(count) var r1 *request.Request r1 = s.Poll() fmt.Println(r1) }
func TestDownloadJson(t *testing.T) { var req *request.Request req = request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=23521&pagesize=4&dire=f&dpc=1", "json") var dl downloader.Downloader dl = downloader.NewHttpDownloader() var p *page.Page p = dl.Download(req) var jsonMap interface{} jsonMap = p.GetJsonMap() fmt.Printf("%v", jsonMap) //fmt.Println(doc) //body := p.GetBodyStr() //fmt.Println(body) }
func TestDownloadHtml(t *testing.T) { //return //request := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1") var req *request.Request req = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil, nil) var dl downloader.Downloader dl = downloader.NewHttpDownloader() var p *page.Page p = dl.Download(req) var doc *goquery.Document doc = p.GetHtmlParser() //fmt.Println(doc) //body := p.GetBodyStr() //fmt.Println(body) var s *goquery.Selection s = doc.Find("body") if s.Length() < 1 { t.Error("html parse failed!") } /* doc, err := goquery.NewDocument("http://live.sina.com.cn/zt/l/v/finance/globalnews1/") if err != nil { fmt.Printf("%v",err) } s := doc.Find("meta"); fmt.Println(s.Length()) resp, err := http.Get("http://live.sina.com.cn/zt/l/v/finance/globalnews1/") if err != nil { fmt.Printf("%v",err) } defer resp.Body.Close() doc, err = goquery.NewDocumentFromReader(resp.Body) s = doc.Find("meta"); fmt.Println(s.Length()) */ }
func TestCharSetChange(t *testing.T) { var req *request.Request //req = request.NewRequest("http://stock.finance.sina.com.cn/usstock/api/jsonp.php/t/US_CategoryService.getList?page=1&num=60", "jsonp") req = request.NewRequest("http://soft.chinabyte.com/416/13164916.shtml", "html", "", "GET", "", nil, nil, nil, nil) var dl downloader.Downloader dl = downloader.NewHttpDownloader() var p *page.Page p = dl.Download(req) //hp := p.GetHtmlParser() //fmt.Printf("%v", jsonMap) //fmt.Println(doc) p.GetBodyStr() body := p.GetBodyStr() fmt.Println(body) }
/* * Main */ func main() { var header http.Header = make(http.Header) header.Add("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36") for i := 0; i < 20; i++ { url := "" if i == 0 { url = "http://www.lizhi.fm/1560327/" } else { url = fmt.Sprintf("http://www.lizhi.fm/1560327/p/%d.html", i+1) } fmt.Printf("Page:%d, Url: %s\n", i, url) req := request.NewRequest(url, "html", "index", "GET", "", header, nil, nil, nil) siteSpider := spider.NewSpider(NewSitePageProcesser("Tonghuashu"), "Tonghuashu"). AddRequest(req). SetSleepTime("rand", 1000, 2000). SetThreadnum(2) siteSpider.Run() } }
// Process parses a downloaded page and records the parse result into Page.
// Pages tagged "site_login" yield the session cookies and, on success,
// queue the admin index page; every other page is checked for a page
// title to decide whether the login worked.
// Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html.
func (this *MyPageProcesser) Process(p *page.Page) {
	if p.GetUrlTag() == "site_login" {
		//fmt.Printf("%v\n", p.GetCookies())
		this.cookies = p.GetCookies()
		// AddTargetRequestWithParams Params:
		// 1. Url.
		// 2. Responce type is "html" or "json" or "jsonp" or "text".
		// 3. The urltag is name for marking url and distinguish different urls in PageProcesser and Pipeline.
		// 4. The method is POST or GET.
		// 5. The postdata is body string sent to sever.
		// 6. The header is header for http request.
		// 7. Cookies
		// 8. Http redirect function
		if len(this.cookies) != 0 {
			p.AddField("info", "get cookies success")
			// Re-use the captured login cookies for the authenticated request.
			req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil, nil)
			p.AddTargetRequestWithParams(req)
		} else {
			p.AddField("info", "get cookies failed")
		}
	} else {
		//fmt.Printf("%v\n", p.GetBodyStr())
		query := p.GetHtmlParser()
		pageTitle := query.Find(".page-content .page-title").Text()
		if len(pageTitle) != 0 {
			p.AddField("page_title", pageTitle)
			p.AddField("info", "login success")
		} else {
			p.AddField("info", "login failed")
		}
	}
	return
	// NOTE(review): everything below this unconditional return is
	// unreachable dead code — GitHub repo-list scraping apparently left
	// over from another example. It never executes. Kept verbatim because
	// deleting it would leave the file's goquery/strings imports unused
	// (a compile error in Go); remove it together with those imports.
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	query := p.GetHtmlParser()
	var urls []string
	query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		urls = append(urls, "http://github.com/"+href)
	})
	// these urls will be saved and crawed by other coroutines.
	p.AddTargetRequests(urls, "html")

	name := query.Find(".entry-title .author").Text()
	name = strings.Trim(name, " \t\n")
	repository := query.Find(".entry-title .js-current-repository").Text()
	repository = strings.Trim(repository, " \t\n")
	//readme, _ := query.Find("#readme").Html()
	if name == "" {
		p.SetSkip(true)
	}

	// the entity we want to save by Pipeline
	p.AddField("author", name)
	p.AddField("project", repository)
	//p.AddField("readme", readme)
}
// AddTargetRequest adds one new Request waitting for crawl. func (this *Page) AddTargetRequest(url string, respType string) *Page { this.targetRequests = append(this.targetRequests, request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil)) return this }
func addRequest(p *page.Page, tag, url, cookie, content string) { req := request.NewRequest(url, "json", tag, "GET", "", nil, nil, nil, content) p.AddTargetRequestWithParams(req) }
// Deal with one url and return the PageItems. func (this *Spider) Get(url string, respType string) *page_items.PageItems { req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil) return this.GetByRequest(req) }
func (this *Spider) AddUrl(url string, respType string) *Spider { req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil) this.AddRequest(req) return this }
func (this *Spider) AddUrl(url string, respType string) *Spider { req := request.NewRequest(url, respType) this.addRequest(req) return this }