Example 1
func main() {
	req := request.NewRequest(wkSohuUrl, "html", "index", "GET", "", nil, nil, nil, nil)
	sohuSpider := spider.NewSpider(NewMyPageProcesser(), "Sohu").
		AddRequest(req).
		SetSleepTime("rand", 500, 1000).
		SetThreadnum(2)

	for i := 1; i < maxWKSouhuLayer; i++ {
		url := fmt.Sprintf("http://yule.sohu.com/gossip/index_%d.shtml", 5301-i) // magic number: page indexes count down from 5300
		req := request.NewRequest(url, "html", "index", "GET", "", nil, nil, nil, nil)
		sohuSpider.AddRequest(req)
	}

	sohuSpider.Run()
}
Example 2
func (this *Spider) AddUrlsEx(urls []string, respType string, headerFile string, proxyHost string) *Spider {
	for _, url := range urls {
		req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil)
		this.AddRequest(req.AddHeaderFile(headerFile).AddProxyHost(proxyHost))
	}
	return this
}
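For context, a minimal usage sketch of AddUrlsEx; the header file name, proxy host, and urls below are hypothetical placeholders, and NewMyPageProcesser stands in for a user-defined PageProcesser as in the other examples:

func main() {
	urls := []string{
		"http://example.com/list/1",
		"http://example.com/list/2",
	}
	// "headers.json" and the proxy host are placeholder values.
	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddUrlsEx(urls, "html", "headers.json", "127.0.0.1:8080").
		SetThreadnum(2).
		Run()
}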
Example 3
func main() {

	// POST data
	post_arg := url.Values{
		"name": {"admin"},
		"pwd":  {"admin"},
	}

	// http header
	header := make(http.Header)
	header.Set("Content-Type", "application/x-www-form-urlencoded")

	// Spider input:
	//  PageProcesser;
	//  task name, used in the Pipeline for record keeping.
	// AddRequest params:
	//  1. Url.
	//  2. Response type: "html", "json", "jsonp" or "text".
	//  3. The urltag is a name that marks the url and distinguishes different urls in the PageProcesser and Pipeline.
	//  4. The method is POST or GET.
	//  5. The postdata is the body string sent to the server.
	//  6. The header is the header for the http request.
	//  7. Cookies.
	//  8. Http redirect function.
	req := request.NewRequest("http://backadmin.hucong.net/main/user/login", "html", "site_login", "POST", post_arg.Encode(), header, nil, myRedirect, nil)

	spider.NewSpider(NewMyPageProcesser(), "TaskName").
		AddRequest(req).
		AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen
		SetThreadnum(3).                            // Crawl requests with three coroutines
		Run()
}
Example 4
func (this *Spider) AddUrls(urls []string, respType string) *Spider {
	for _, url := range urls {
		req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil)
		this.AddRequest(req)
	}
	return this
}
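AddUrls is the bulk counterpart of AddUrl (Example 17); a minimal chained sketch with placeholder urls:

spider.NewSpider(NewMyPageProcesser(), "TaskName").
	AddUrls([]string{"http://example.com/a", "http://example.com/b"}, "html").
	Run()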
Example 5
func TestQueueScheduler(t *testing.T) {
	var r *request.Request
	r = request.NewRequest("http://baidu.com", "html", "", "GET", "", nil, nil, nil, nil)
	fmt.Printf("%v\n", r)

	var s *scheduler.QueueScheduler
	s = scheduler.NewQueueScheduler(false)

	s.Push(r)
	var count int = s.Count()
	if count != 1 {
		t.Error("count error")
	}
	fmt.Println(count)

	var r1 *request.Request
	r1 = s.Poll()
	if r1 == nil {
		t.Error("poll error")
	}
	fmt.Printf("%v\n", r1)

	// remove duplicate
	s = scheduler.NewQueueScheduler(true)

	r2 := request.NewRequest("http://qq.com", "html", "", "GET", "", nil, nil, nil, nil)
	s.Push(r)
	s.Push(r2)
	s.Push(r)
	count = s.Count()
	if count != 2 {
		t.Error("count error")
	}
	fmt.Println(count)

	r1 = s.Poll()
	if r1 == nil {
		t.Error("poll error")
	}
	fmt.Printf("%v\n", r1)
	r1 = s.Poll()
	if r1 == nil {
		t.Error("poll error")
	}
	fmt.Printf("%v\n", r1)
}
Example 6
func main() {
	// Spider input:
	//  PageProcesser;
	//  task name, used in the Pipeline for record keeping.
	sp := spider.NewSpider(NewMyPageProcesser(), "TaskName")
	// GetWithParams params:
	//  1. Url.
	//  2. Response type: "html", "json", "jsonp" or "text".
	//  3. The urltag is a name that marks the url and distinguishes different urls in the PageProcesser and Pipeline.
	//  4. The method is POST or GET.
	//  5. The postdata is the body string sent to the server.
	//  6. The header is the header for the http request.
	//  7. Cookies.
	req := request.NewRequest("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html", "", "GET", "", nil, nil, nil, nil)
	pageItems := sp.GetByRequest(req)
	//pageItems := sp.Get("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html")

	url := pageItems.GetRequest().GetUrl()
	println("-----------------------------------spider.Get---------------------------------")
	println("url\t:\t" + url)
	for name, value := range pageItems.GetAll() {
		println(name + "\t:\t" + value)
	}

	println("\n--------------------------------spider.GetAll---------------------------------")
	urls := []string{
		"http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn",
		"http://baike.baidu.com/view/383720.htm?fromtitle=html&fromid=97049&type=syn",
	}
	var reqs []*request.Request
	for _, url := range urls {
		req := request.NewRequest(url, "html", "", "GET", "", nil, nil, nil, nil)
		reqs = append(reqs, req)
	}
	pageItemsArr := sp.SetThreadnum(2).GetAllByRequest(reqs)
	//pageItemsArr := sp.SetThreadnum(2).GetAll(urls, "html")
	for _, item := range pageItemsArr {
		url = item.GetRequest().GetUrl()
		println("url\t:\t" + url)
		fmt.Printf("item\t:\t%s\n", item.GetAll())
	}
}
Example 7
// Deal with several urls and return the PageItems slice.
func (this *Spider) GetAll(urls []string, respType string) []*page_items.PageItems {
	for _, u := range urls {
		req := request.NewRequest(u, respType, "", "GET", "", nil, nil, nil, nil)
		this.AddRequest(req)
	}

	pip := pipeline.NewCollectPipelinePageItems()
	this.AddPipeline(pip)

	this.Run()

	return pip.GetCollected()
}
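A short usage sketch of GetAll, mirroring the commented-out call in Example 6 (the urls are placeholders); GetAll runs the crawl synchronously and returns the collected items:

sp := spider.NewSpider(NewMyPageProcesser(), "TaskName")
pageItemsArr := sp.SetThreadnum(2).GetAll([]string{
	"http://example.com/a",
	"http://example.com/b",
}, "html")
for _, item := range pageItemsArr {
	println("url\t:\t" + item.GetRequest().GetUrl())
}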
Example 8
func TestQueueScheduler(t *testing.T) {
	var r *request.Request
	r = request.NewRequest("http://baidu.com", "html")

	var s *scheduler.QueueScheduler
	s = scheduler.NewQueueScheduler()

	s.Push(r)
	var count int = s.Count()
	fmt.Println(count)

	var r1 *request.Request
	r1 = s.Poll()
	fmt.Println(r1)
}
Example 9
func TestDownloadJson(t *testing.T) {
	var req *request.Request
	req = request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=23521&pagesize=4&dire=f&dpc=1", "json")

	var dl downloader.Downloader
	dl = downloader.NewHttpDownloader()

	var p *page.Page
	p = dl.Download(req)

	var jsonMap interface{}
	jsonMap = p.GetJsonMap()
	fmt.Printf("%v", jsonMap)

	//fmt.Println(doc)
	//body := p.GetBodyStr()
	//fmt.Println(body)

}
Example 10
func TestDownloadHtml(t *testing.T) {
	//return
	//request := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1")
	var req *request.Request
	req = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil, nil)

	var dl downloader.Downloader
	dl = downloader.NewHttpDownloader()

	var p *page.Page
	p = dl.Download(req)

	var doc *goquery.Document
	doc = p.GetHtmlParser()
	//fmt.Println(doc)
	//body := p.GetBodyStr()
	//fmt.Println(body)

	var s *goquery.Selection
	s = doc.Find("body")
	if s.Length() < 1 {
		t.Error("html parse failed!")
	}

	/*
	   doc, err := goquery.NewDocument("http://live.sina.com.cn/zt/l/v/finance/globalnews1/")
	   if err != nil {
	       fmt.Printf("%v",err)
	   }
	   s := doc.Find("meta");
	   fmt.Println(s.Length())

	   resp, err := http.Get("http://live.sina.com.cn/zt/l/v/finance/globalnews1/")
	   if err != nil {
	       fmt.Printf("%v",err)
	   }
	   defer resp.Body.Close()
	   doc, err = goquery.NewDocumentFromReader(resp.Body)
	   s = doc.Find("meta");
	   fmt.Println(s.Length())
	*/
}
Example 11
func TestCharSetChange(t *testing.T) {
	var req *request.Request
	//req = request.NewRequest("http://stock.finance.sina.com.cn/usstock/api/jsonp.php/t/US_CategoryService.getList?page=1&num=60", "jsonp")
	req = request.NewRequest("http://soft.chinabyte.com/416/13164916.shtml", "html", "", "GET", "", nil, nil, nil, nil)

	var dl downloader.Downloader
	dl = downloader.NewHttpDownloader()

	var p *page.Page
	p = dl.Download(req)

	//hp := p.GetHtmlParser()
	//fmt.Printf("%v", jsonMap)

	//fmt.Println(doc)
	body := p.GetBodyStr()
	fmt.Println(body)

}
Example 12
/*
 * Main
 */
func main() {
	var header http.Header = make(http.Header)
	header.Add("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36")

	for i := 0; i < 20; i++ {
		url := ""
		if i == 0 {
			url = "http://www.lizhi.fm/1560327/"
		} else {
			url = fmt.Sprintf("http://www.lizhi.fm/1560327/p/%d.html", i+1)
		}

		fmt.Printf("Page:%d, Url: %s\n", i, url)

		req := request.NewRequest(url, "html", "index", "GET", "", header, nil, nil, nil)
		siteSpider := spider.NewSpider(NewSitePageProcesser("Tonghuashu"), "Tonghuashu").
			AddRequest(req).
			SetSleepTime("rand", 1000, 2000).
			SetThreadnum(2)

		siteSpider.Run()
	}
}
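Note that this example constructs and runs a fresh spider for every page. The loop could instead queue all twenty pages on a single spider, as in Example 1; a sketch of that variant, reusing the header variable from the main above:

	siteSpider := spider.NewSpider(NewSitePageProcesser("Tonghuashu"), "Tonghuashu").
		SetSleepTime("rand", 1000, 2000).
		SetThreadnum(2)
	for i := 0; i < 20; i++ {
		url := "http://www.lizhi.fm/1560327/"
		if i > 0 {
			url = fmt.Sprintf("http://www.lizhi.fm/1560327/p/%d.html", i+1)
		}
		siteSpider.AddRequest(request.NewRequest(url, "html", "index", "GET", "", header, nil, nil, nil))
	}
	siteSpider.Run()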
Example 13
// Parse the html DOM here and record the parse results that we want into the Page.
// Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html.
func (this *MyPageProcesser) Process(p *page.Page) {

	if p.GetUrlTag() == "site_login" {
		//fmt.Printf("%v\n", p.GetCookies())
		this.cookies = p.GetCookies()
		// AddTargetRequestWithParams params:
		//  1. Url.
		//  2. Response type: "html", "json", "jsonp" or "text".
		//  3. The urltag is a name that marks the url and distinguishes different urls in the PageProcesser and Pipeline.
		//  4. The method is POST or GET.
		//  5. The postdata is the body string sent to the server.
		//  6. The header is the header for the http request.
		//  7. Cookies.
		//  8. Http redirect function.
		if len(this.cookies) != 0 {
			p.AddField("info", "get cookies success")
			req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil, nil)
			p.AddTargetRequestWithParams(req)
		} else {
			p.AddField("info", "get cookies failed")
		}
	} else {
		//fmt.Printf("%v\n", p.GetBodyStr())
		query := p.GetHtmlParser()
		pageTitle := query.Find(".page-content .page-title").Text()

		if len(pageTitle) != 0 {
			p.AddField("page_title", pageTitle)
			p.AddField("info", "login success")
		} else {
			p.AddField("info", "login failed")
		}

	}

}
Example 14
// AddTargetRequest adds one new Request waiting to be crawled.
func (this *Page) AddTargetRequest(url string, respType string) *Page {
	this.targetRequests = append(this.targetRequests, request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil))
	return this
}
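A sketch of how AddTargetRequest is typically called from inside a PageProcesser's Process method, in the style of Example 13; the "a" link selector is a placeholder for whatever the target site needs:

func (this *MyPageProcesser) Process(p *page.Page) {
	query := p.GetHtmlParser()
	query.Find("a").Each(func(i int, s *goquery.Selection) {
		if href, ok := s.Attr("href"); ok {
			// queue each discovered link for a later crawl
			p.AddTargetRequest(href, "html")
		}
	})
}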
Example 15
func addRequest(p *page.Page, tag, url, cookie, content string) {
	// Note: the cookie parameter is accepted but never used; content is
	// forwarded as the final NewRequest argument.
	req := request.NewRequest(url, "json", tag, "GET", "", nil, nil, nil, content)
	p.AddTargetRequestWithParams(req)
}
Example 16
// Deal with one url and return the PageItems.
func (this *Spider) Get(url string, respType string) *page_items.PageItems {
	req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil)
	return this.GetByRequest(req)
}
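Get is the single-url counterpart of GetAll (Example 7); a minimal sketch matching the commented-out call in Example 6, with a placeholder url:

pageItems := spider.NewSpider(NewMyPageProcesser(), "TaskName").
	Get("http://example.com/", "html")
println("url\t:\t" + pageItems.GetRequest().GetUrl())
for name, value := range pageItems.GetAll() {
	println(name + "\t:\t" + value)
}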
Example 17
func (this *Spider) AddUrl(url string, respType string) *Spider {
	req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil, nil)
	this.AddRequest(req)
	return this
}
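AddUrl chains like the other builder methods; a minimal sketch with a console pipeline, following the pattern of Example 3:

spider.NewSpider(NewMyPageProcesser(), "TaskName").
	AddUrl("http://example.com/", "html").
	AddPipeline(pipeline.NewPipelineConsole()).
	SetThreadnum(2).
	Run()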
Example 18
func (this *Spider) AddUrl(url string, respType string) *Spider {
	req := request.NewRequest(url, respType)
	this.addRequest(req)
	return this
}