Example #1
func DownloadPage(path string) (doc *html.HtmlDocument, err error) {
	req, err := MakeRequest("GET", path)
	if err != nil {
		return
	}

	res, err := client.Do(req)
	if err != nil {
		return
	}
	defer res.Body.Close()

	if res.StatusCode != 200 {
		err = fmt.Errorf("DownloadPage: %d %s", res.StatusCode, path)
		return
	}

	data, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return
	}

	doc, err = gokogiri.ParseHtml(data)
	return
}
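
DownloadPage relies on a package-level client and a MakeRequest helper that are not shown in the snippet. A minimal sketch of what those might look like (uses net/http and time), purely an assumption to make the example self-contained; the real project may configure them differently:

// Hypothetical package-level pieces assumed by DownloadPage above; the
// timeout and header values are illustrative, not from the original project.
var client = &http.Client{Timeout: 30 * time.Second}

func MakeRequest(method, path string) (*http.Request, error) {
	req, err := http.NewRequest(method, path, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "example-scraper/0.1")
	return req, nil
}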
Example #2
func loadForms(baseUrl string, b []byte) ([]*Form, error) {
	u, e := url.Parse(baseUrl)
	if e != nil {
		return nil, e
	}
	doc, e := gokogiri.ParseHtml(b)
	if e != nil {
		return nil, e
	}
	formTags, e := doc.Search("//form")
	if e != nil {
		return nil, e
	}
	out := []*Form{}
	for _, node := range formTags {
		action := node.Attr("action")
		if !strings.HasPrefix(action, "http") {
			base := u.Scheme + "://" + u.Host
			if strings.HasPrefix(action, "/") {
				action = base + action
			} else {
				action = base + u.Path + "/" + strings.TrimPrefix(action, "/")
			}
		}
		f := &Form{Method: node.Attr("method"), Action: action}
		f.Inputs, e = loadInputs(node)
		if e != nil {
			return nil, e
		}
		out = append(out, f)
	}
	return out, nil
}
Example #3
func (s *Service) Check(id string) (status string, err error) {
	body, err := s.Downloader(s.URL, id)
	if err != nil {
		return
	}

	if s.Extractor != nil {
		parts := s.Extractor.FindSubmatch(body)
		if parts == nil {
			return "", nil
		}

		status = string(parts[1])
	} else {
		doc, err := gokogiri.ParseHtml(body)
		if err != nil {
			return "", err
		}
		defer doc.Free()

		res, err := doc.Search(s.XPath)
		if err != nil {
			return "", err
		}
		if len(res) < 1 {
			return "", nil
		}

		status = sanitize.HTML(res[0].String())
		status = replacer.ReplaceAllString(status, " ")
		status = strings.TrimSpace(status)
	}

	return
}
Example #4
func NewRecord(content []byte) (record *Record) {
	doc, err := gokogiri.ParseHtml(content)
	if err != nil {
		panic(err)
	}

	displayText := cleanUpContent(doc.String())
	record = &Record{RawText: content, DisplayText: displayText}
	dateStr := getInterp(doc.Root().NodePtr(), "date", doc)
	date, err := time.Parse("20060102", dateStr)
	if err != nil {
		record.Date = nil
	} else {
		record.Date = &date
	}

	xPath := xpath.NewXPath(doc.DocPtr())
	nodePtrs := xPath.Evaluate(doc.Root().NodePtr(),
		xpath.Compile("//div1"))

	node := xml.NewNode(nodePtrs[0], doc)
	record.Id = node.Attr("id")
	record.Type = node.Attr("type")

	record.processPersons(doc)
	record.processOffences(doc)
	record.processVerdicts(doc)
	record.processOffJoins(doc)
	return
}
Example #5
func AllTrainNumbers() (numbers []string, err error) {

	content, err := kcjHttpRequest(nil)

	if err != nil {
		return nil, err
	}

	doc, err := gokogiri.ParseHtml(content)

	if err != nil {
		return nil, err
	}

	defer doc.Free()

	const trainNumXPath = "/html/body/form/table/tr[2]/td[2]/table/tr[4]/td[2]/select/option/text()"

	html := doc.Root().FirstChild()

	numResults, err := html.Search(trainNumXPath)

	if err != nil {
		return nil, err
	}

	if len(numResults) == 0 {
		return nil, errors.New("no train numbers found")
	}

	numbers = make([]string, len(numResults)-1)

	for i, num := range numResults[1:] {
		numbers[i] = num.String()
	}

	return
}
Example #6
func (c *Client) recentHandler(resp *http.Response, err error) {
	if err != nil {
		c.Fail(err.Error())
		return
	}
	if resp.StatusCode != 200 {
		c.Fail("status %d != 200 %s", resp.StatusCode, resp.Request.URL.String())
		return
	}
	html, _ := ioutil.ReadAll(resp.Body)
	defer resp.Body.Close()
	doc, docerr := gokogiri.ParseHtml(html)
	if docerr != nil {
		c.Fail("html parse error")
		return
	}
	nodes, _ := doc.Search("//ul[@id='memos']//li")
	if len(nodes) == 0 {
		c.Fail("memos too few")
		return
	}
	nodes, _ = doc.Search("//p[@id='pager']/span[@id='total']/text()")
	if len(nodes) != 1 {
		c.Fail("no pager")
		return
	}
	c.TotalMemos, _ = strconv.Atoi(nodes[0].String())
	c.Success(1.0)
}
Example #7
func (c *Client) signinHandler(resp *http.Response, err error) {
	if err != nil {
		c.Fail(err.Error())
		return
	}
	if resp.StatusCode != 200 {
		c.Fail("status %d != 200 %s", resp.StatusCode, resp.Request.URL.String())
		return
	}
	html, _ := ioutil.ReadAll(resp.Body)
	defer resp.Body.Close()
	if !c.IsChecker {
		c.Success(1.0)
		return
	}

	doc, docerr := gokogiri.ParseHtml(html)
	if docerr != nil {
		c.Fail("html parse error")
		return
	}
	defer doc.Free()
	nodes, nodeerr := doc.Search("//form//input[@name='username']")
	if nodeerr != nil {
		c.Fail("input element search error")
		return
	}
	if len(nodes) != 1 {
		c.Fail("input element not found")
		return
	}
	c.Success(1.0)
}
Example #8
func main() {
	if len(os.Args) < 3 {
		fmt.Println("Usage:", os.Args[0], "filename iterations")
		os.Exit(1)
	}

	filename := os.Args[1]
	n, _ := strconv.Atoi(os.Args[2])

	file, err := ioutil.ReadFile(filename)
	if err != nil {
		panic(err)
	}

	start := time.Now()
	for i := 0; i < n; i++ {
		doc, err := gokogiri.ParseHtml(file)
		if err != nil {
			panic(err)
		}
		doc.Root()
		doc.Free()
	}
	end := time.Now()

	fmt.Printf("%f s\n", end.Sub(start).Seconds())
}
Example #9
func main() {
	xpathString := ""
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "Missing second argument, XPATH!")
		os.Exit(2)
	} else {
		xpathString = os.Args[1]
	}

	page, _ := ioutil.ReadAll(os.Stdin)
	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Problem parsing document.")
		os.Exit(1)
	}
	defer doc.Free()

	xps := xpath.Compile(xpathString)
	defer xps.Free()

	search, err := doc.Search(xps)
	if err == nil {
		for _, s := range search {
			fmt.Println(s.Content())
		}
	} else {
		fmt.Fprintln(os.Stderr, "Sorry. Got error.")
	}
}
Example #10
func parseResult(html []byte) (results []ImageResult, err error) {
	doc, err := gokogiri.ParseHtml(html)
	if err != nil {
		return nil, err
	}

	// Get image tags
	imagesTags, err := doc.Search(`//div[@class='dg_u']/a`)
	if err != nil {
		return nil, err
	}

	// Filter and parse
	var images []ImageResult
	for _, tag := range imagesTags {
		if meta, err := ParseMetadata(tag.Attr("m")); err == nil {
			images = append(images, metaToResult(meta))
		}
	}

	// No results
	if images == nil {
		return nil, fmt.Errorf("No results")
	}

	return images, nil
}
Example #11
func (p *Page) scrap(url string, o *parse.Object) error {
	resp, err := p.client.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	page, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		return err
	}
	defer doc.Free()

	for _, field := range p.fields {
		value, err := field.selector(url, doc)
		if err != nil {
			return err
		}
		o.SetNested(field.path, value)
	}

	for _, processor := range p.processors {
		processor(o)
	}
	return nil
}
Example #12
func FiveThousandBest() (titles []string, err error) {
	res, err := http.Get("http://5000best.com/movies/1")
	if err != nil {
		return
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return
	}

	doc, err := gokogiri.ParseHtml(body)
	if err != nil {
		return
	}

	exp := xpath.Compile("//a[@class='n']")

	nodes := doc.XPathCtx.Evaluate(doc.NodePtr(), exp)

	for _, np := range nodes {
		node := xml.NewNode(np, doc)
		title := node.InnerHtml()
		title = title[:len(title)-8]
		titles = append(titles, title)
	}

	return
}
Example #13
func parseDivs(body []byte) ([]xml.Node, error) {

	root, err := gokogiri.ParseHtml(body)
	if err != nil {
		return nil, err
	}

	return root.Root().Search(divsPath)
}
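
parseDivs references a package-level divsPath that is not part of the snippet. A hypothetical definition, shown only to make the example readable; the real project likely selects a more specific set of div elements:

// Hypothetical XPath assumed by parseDivs above.
const divsPath = "//div"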
Example #14
func (this *Yingxiaoqun) GetDetailContent(content, url string) (map[string]interface{}, error) {

	//result:=make(map[string]interface{})
	num_pat, err := regexp.Compile("(\\d+)$")
	if err != nil {
		return nil, err
	}
	matchs := num_pat.FindStringSubmatch(url)
	if matchs == nil {
		err = errors.New("Wrong URL")
		return nil, err
	}

	//this.Logger.Info("[INFO] matchs[1] %v",matchs[1])

	doc, err := gokogiri.ParseHtml([]byte(content))
	if err != nil {
		return nil, err
	}
	defer doc.Free()
	title_xpath, err := doc.Search(fmt.Sprintf("//*[@id=\"post-%v\"]/div/div[2]/header/h1", matchs[1]))
	if len(title_xpath) == 0 || err != nil {
		return nil, err
	}
	title := title_xpath[0].Content()

	editdate_xpath, err := doc.Search(fmt.Sprintf("//*[@id=\"post-%s\"]/div/div[1]/div/time/@datetime", matchs[1]))
	if len(editdate_xpath) == 0 || err != nil {
		return nil, err
	}
	editdate := editdate_xpath[0].String()

	content_xpath, err := doc.Search(fmt.Sprintf("//*[@id=\"post-%s\"]/div/div[2]/div", matchs[1]))
	if len(content_xpath) == 0 || err != nil {

		return nil, err
	}
	contents := content_xpath[0].Content()

	editor_xpath, err := doc.Search(fmt.Sprintf("//*[@id=\"post-%s\"]/div/div[2]/header/span/span/a", matchs[1]))
	if len(editor_xpath) == 0 || err != nil {
		return nil, err
	}
	editor := editor_xpath[0].Content()

	this.Logger.Info("[INFO] Title %v", title)
	this.Logger.Info("[INFO] EditDate %v", editdate)
	this.Logger.Info("[INFO] Content %v", contents)
	this.Logger.Info("[INFO] Editor %v", editor)
	/*
		err = this.DbAdaptor.ExecFormat(SQL_YXQ,"111",matchs[1],"2","",title,"",contents,editor,editdate[:10],url,"0")
		if err != nil {
			this.Logger.Error(" MYSQL Error : %v ",err)
			return nil,err
		}
	*/
	return nil, nil
}
Example #15
func ScrapeMatches(url string) {
	pageSource := retrievePageSource(url)

	doc, err := gokogiri.ParseHtml(pageSource)
	errorHandler(err)
	defer doc.Free()
	matches, err := doc.Search(".//*[@class='item-container clearfix match collapsed']")
	errorHandler(err)

	fmt.Println(parseMatches(matches))
}
Example #16
func main() {
	// Parse even this bad bit of HTML and make it valid
	html := "<h2>I am so malformatted</h2>"
	doc, _ := gokogiri.ParseHtml([]byte(html))
	defer doc.Free()

	header := doc.Root().FirstChild().FirstChild()
	header.SetName("h1")

	fmt.Println(doc.String())
}
Example #17
func processUrl(url string) *html.HtmlDocument {
	respon, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer respon.Body.Close()

	page, _ := ioutil.ReadAll(respon.Body)
	document, _ := gokogiri.ParseHtml(page)

	return document
}
Example #18
func ScrapePlayers(url string) {
	pageSource := retrievePageSource(url)

	doc, err := gokogiri.ParseHtml(pageSource)
	errorHandler(err)
	defer doc.Free()
	playerContainers, err := doc.Search("//*[contains(@class, 'item-container')]")
	errorHandler(err)
	players := parsePlayers(playerContainers)
	json := marshalSlice(players)
	buffer := bytes.NewBuffer(json)
	http.Post("http://our_dumb_url_bro", "application/json", buffer)
}
Example #19
func (c *Client) mypageHandler(resp *http.Response, err error) {
	if err != nil {
		c.Fail(err.Error())
		return
	}
	if resp.StatusCode != 200 {
		c.Fail("status %d != 200 %s", resp.StatusCode, resp.Request.URL.String())
		return
	}
	if resp.Header.Get("Cache-Control") != "private" {
		c.Fail("invalid Cache-Control header")
		return
	}
	html, _ := ioutil.ReadAll(resp.Body)
	defer resp.Body.Close()
	doc, docerr := gokogiri.ParseHtml(html)
	if docerr != nil {
		c.Fail("html parse error")
		return
	}
	defer doc.Free()
	nodes, _ := doc.Search("//input[@name='sid' and @type='hidden']")
	if len(nodes) == 0 {
		c.Fail("not found <input type='hidden' name='sid'>")
		return
	}
	c.Token = nodes[0].Attribute("value").String()

	if !c.IsChecker {
		c.Success(1.0)
		return
	}
	c.matchDocNode(doc, "//h2/text()", "Hello\\s+"+*c.Username+"\\!")
	nodes, nodeerr := doc.Search("//div[contains(concat(' ', @class, ' '), ' container ')]/ul/li/a")
	if nodeerr != nil {
		c.Fail("li element search error")
		return
	}
	c.Success(1.0)
	nfetches := rand.Intn(10) + 1
	for i := 0; i < nfetches; i++ {
		node := nodes[rand.Intn(len(nodes))]
		if !c.Running {
			break
		}
		href := node.Attribute("href").String()
		if strings.Index(href, "/") == 0 {
			href = c.Endpoint + href
		}
		req, _ := http.NewRequest("GET", href, nil)
		c.requestWithTimeout(req, c.memoHandler)
	}
}
Example #20
// Gets the results for a single page
func getResults(resultsurl string) {
	body := getPage(resultsurl)

	// Get a list of all profile links on page
	doc, _ := gokogiri.ParseHtml(body)
	results, _ := doc.NodeById("results").Search("//li[@itemtype='http://schema.org/Person']")
	names, _ := results[0].Search("//a[@class='app_link']")

	// Send link of each profile on page to parser
	for _, profile := range names {
		parseProfile("http://indeed.com" + profile.Attr("href"))
	}
}
Example #21
// Returns a gokogiri html.Document from a url
func getGokogiriDoc(c *http.Client, url string) (*ghtml.HtmlDocument, error) {
	resp, err := c.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	page, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	return gokogiri.ParseHtml(page)
}
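
A hedged usage sketch for the helper above: the caller owns the returned document and should free it. The function name and XPath below are illustrative, not from the original project.

// Illustrative caller for getGokogiriDoc; prints the page title if present.
func printTitle(c *http.Client, url string) error {
	doc, err := getGokogiriDoc(c, url)
	if err != nil {
		return err
	}
	defer doc.Free()

	nodes, err := doc.Search("//title/text()")
	if err != nil {
		return err
	}
	if len(nodes) > 0 {
		fmt.Println(strings.TrimSpace(nodes[0].String()))
	}
	return nil
}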
Example #22
func search(c *http.Client, name string) (foundName, href string, err error) {
	req, err := http.NewRequest("GET", "https://play.google.com/store/search", nil)
	if err != nil {
		err = fmt.Errorf("failed creating request, %s", err)
		return
	}
	values := req.URL.Query()
	values.Add("q", name)
	values.Add("c", "apps")
	req.URL.RawQuery = values.Encode()
	resp, err := c.Do(req)
	if err != nil {
		err = fmt.Errorf("failed searching, %s", err)
		return
	}
	defer resp.Body.Close()
	page, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		err = fmt.Errorf("failed to read response body, %s", err)
		return
	}
	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		err = fmt.Errorf("failed to parse search HTML, %s", err)
		return
	}
	defer doc.Free()
	nodes, err := doc.Search("//div[@id='body-content']//a[@title]")
	if err != nil {
		err = fmt.Errorf("failed to search for app links, %s", err)
		return
	}
	if len(nodes) == 0 {
		err = fmt.Errorf("could not find any app links")
		return
	}
	bestMatchScore := 0
	lowerName := strings.ToLower(name)
	for _, n := range nodes {
		nName := strings.TrimSpace(n.Content())
		score := len(lcs(strings.ToLower(nName), lowerName))
		if score > bestMatchScore {
			foundName = nName
			href = n.Attr("href")
			bestMatchScore = score
		}
	}
	return
}
Example #23
func (c *Client) topHandler(resp *http.Response, err error, matchTitle string) {
	if err != nil {
		c.Fail(err.Error())
		return
	}
	if resp.StatusCode != 200 {
		c.Fail("status %d != 200 %s", resp.StatusCode, resp.Request.URL.String())
		return
	}
	html, _ := ioutil.ReadAll(resp.Body)
	defer resp.Body.Close()
	if !c.IsChecker {
		c.Success(1.0)
		return
	}

	doc, docerr := gokogiri.ParseHtml(html)
	if docerr != nil {
		c.Fail("html parse error")
		return
	}

	nodes, _ := doc.Search("//p[@id='pager']/span[@id='total']/text()")
	if len(nodes) != 1 {
		c.Fail("no pager")
		return
	}
	c.TotalMemos, _ = strconv.Atoi(nodes[0].String())

	nodes, _ = doc.Search("//ul[@id='memos']//li")
	if len(nodes) != 100 {
		c.Fail("invalid memos list")
		return
	}
	if matchTitle == "" {
		c.Success(1.0)
		return
	}

	for _, node := range nodes {
		matched, _ := regexp.MatchString(matchTitle, node.String())
		if matched {
			c.Success(1.0)
			return
		}
	}

	c.Fail("no match title: %s", matchTitle)
}
Example #24
func parsePage(httpresp *http.Response, pageName string) *ghtml.HtmlDocument {
	page, err := ioutil.ReadAll(httpresp.Body)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error reading %s html body: %v\n", pageName, err)
		os.Exit(1)
	}
	httpresp.Body.Close()

	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error parsing %s html body: %v\n", pageName, err)
		os.Exit(1)
	}
	return doc
}
Example #25
// ProcessURL fetches the contents of a website and processes its data
func (s *Scrape) ProcessURL(url string) map[string]interface{} {
	client := &http.Client{}

	req, _ := http.NewRequest("GET", url, nil)
	for k, v := range s.Headers {
		req.Header.Add(k, v)
	}

	resp, _ := client.Do(req)
	page, _ := ioutil.ReadAll(resp.Body)
	doc, _ := gokogiri.ParseHtml(page)
	defer doc.Free()

	return s.ProcessDocument(doc)
}
Example #26
func (s *scrape) GetNodeFromPage(scrape string) (xml.Node, error) {
	node, err := gokogiri.ParseHtml(s.GetPage())
	if err != nil {
		return nil, err
	}
	if node == nil {
		return nil, errors.New("Node not found on string: " + scrape)
	}
	defer node.Free()
	res, err := node.Root().FirstChild().Search(scrape)
	if err != nil {
		return nil, err
	}
	if len(res) == 0 {
		return nil, errors.New("Node not found on: " + scrape)
	}
	return res[0].Duplicate(1), nil
}
Example #27
/*
  Not working because:
  http://www.sc2ratings.com/players.php?realname=Yang,%20Hee-Soo
  is parsed as:
  http://www.sc2ratings.com/players.php?realname=Yang, Hee-Soo
*/
func parseLeagues(player xml.Node) []string {
	out := []string{}
	partialUrl, err := player.Search(".//a/@href")
	errorHandler(err)
	if len(partialUrl) == 1 {
		playerPageUrl := "http://www.sc2ratings.com/" + partialUrl[0].String()
		playerPageSource := retrievePageSource(playerPageUrl)

		playerPage, err := gokogiri.ParseHtml(playerPageSource)
		errorHandler(err)
		defer playerPage.Free()
		fmt.Println(playerPage)
	}
	return out
}
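
One way around the problem described in the comment is to re-encode the query before fetching, so the literal space becomes a valid escape again. A minimal sketch using net/url; normalizePlayerURL is a hypothetical helper, not part of the original code:

// Hypothetical fix-up for the unescaped href described above: parse the URL
// and re-encode its query so "Yang, Hee-Soo" is escaped before the request.
func normalizePlayerURL(raw string) (string, error) {
	u, err := url.Parse(raw) // url.Parse tolerates the literal space in the query
	if err != nil {
		return "", err
	}
	u.RawQuery = u.Query().Encode() // e.g. realname=Yang%2C+Hee-Soo
	return u.String(), nil
}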
Example #28
func fetchRating(c *http.Client, path string) (rating float64, ratings int, err error) {
	resp, err := c.Get("https://play.google.com" + path)
	if err != nil {
		err = fmt.Errorf("failed requesting, %s", err)
		return
	}
	defer resp.Body.Close()
	page, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		err = fmt.Errorf("failed to read response body, %s", err)
		return
	}
	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		err = fmt.Errorf("failed to parse search HTML, %s", err)
		return
	}
	defer doc.Free()
	nodes, err := doc.Search("//div[@class='score-container']//div[@class='score']")
	if err != nil {
		err = fmt.Errorf("failed to find score, %s", err)
		return
	}
	if len(nodes) == 0 {
		err = fmt.Errorf("could not find any score")
		return
	}
	if rating, err = strconv.ParseFloat(nodes[0].Content(), 64); err != nil {
		err = fmt.Errorf("could not parse score, %s", err)
		return
	}
	nodes, err = doc.Search("//div[@class='score-container']//span[@class='reviews-num']")
	if err != nil {
		err = fmt.Errorf("failed to find review count, %s", err)
		return
	}
	if len(nodes) == 0 {
		err = fmt.Errorf("could not find review count")
		return
	}
	if ratings, err = strconv.Atoi(
		numRegexp.ReplaceAllString(nodes[0].Content(), ""),
	); err != nil {
		err = fmt.Errorf("could not parse review count, %s", err)
		return
	}
	return
}
Example #29
func request() error {
	res, err := http.Get("http://www.fifa.com/fifa-tournaments/statistics-and-records/clubworldcup/index.html")
	if err != nil {
		return err
	}
	raw, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return err
	}
	doc, err := gokogiri.ParseHtml(raw)
	if err != nil {
		return err
	}
	defer doc.Free()

	return nil
}
Example #30
func (this *Yingxiaoqun) GetNextUrls(content, url string) ([]u.CrawlData, error) {

	crawls := make([]u.CrawlData, 0)
	doc, err := gokogiri.ParseHtml([]byte(content))
	if err != nil {
		return nil, err
	}
	defer doc.Free()
	res, err := doc.Search("//*[@class=\"read-more\"]/@href")
	if err != nil {
		return nil, err
	}
	for i := range res {
		//this.Logger.Info("[INFO]  %v",res[i])
		crawls = append(crawls, u.CrawlData{Url: res[i].String(), Type: u.DETAIL_URL, HandlerName: this.Name})
	}

	return crawls, nil
}