func DownloadPage(path string) (doc *html.HtmlDocument, err error) {
	req, err := MakeRequest("GET", path)
	if err != nil {
		return
	}
	res, err := client.Do(req)
	if err != nil {
		return
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		err = fmt.Errorf("DownloadPage: %d %s", res.StatusCode, path)
		return
	}
	data, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return
	}
	doc, err = gokogiri.ParseHtml(data)
	return
}
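// DownloadPage relies on a package-level client and a MakeRequest helper
// that this snippet does not define. The sketch below is an assumption
// about their likely shape, not the original project's code:
var client = &http.Client{Timeout: 30 * time.Second}

func MakeRequest(method, path string) (*http.Request, error) {
	req, err := http.NewRequest(method, path, nil)
	if err != nil {
		return nil, err
	}
	// A default header is one plausible reason for wrapping http.NewRequest.
	req.Header.Set("User-Agent", "gokogiri-example/1.0")
	return req, nil
}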
func loadForms(baseUrl string, b []byte) ([]*Form, error) {
	u, e := url.Parse(baseUrl)
	if e != nil {
		return nil, e
	}
	doc, e := gokogiri.ParseHtml(b)
	if e != nil {
		return nil, e
	}
	formTags, e := doc.Search("//form")
	if e != nil {
		return nil, e
	}
	out := []*Form{}
	for _, node := range formTags {
		action := node.Attr("action")
		// Resolve relative form actions against the page URL by hand.
		if !strings.HasPrefix(action, "http") {
			base := u.Scheme + "://" + u.Host
			if strings.HasPrefix(action, "/") {
				action = base + action
			} else {
				action = base + u.Path + "/" + strings.TrimPrefix(action, "/")
			}
		}
		f := &Form{Method: node.Attr("method"), Action: action}
		f.Inputs, e = loadInputs(node)
		if e != nil {
			return nil, e
		}
		out = append(out, f)
	}
	return out, nil
}
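// The manual action-resolution above can also be done with the standard
// library's URL resolution, which additionally handles "../" segments and
// query strings. A minimal sketch, not part of the original project:
func resolveAction(baseUrl, action string) (string, error) {
	base, err := url.Parse(baseUrl)
	if err != nil {
		return "", err
	}
	ref, err := url.Parse(action)
	if err != nil {
		return "", err
	}
	// ResolveReference returns ref unchanged if it is already absolute.
	return base.ResolveReference(ref).String(), nil
}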
func (s *Service) Check(id string) (status string, err error) {
	body, err := s.Downloader(s.URL, id)
	if err != nil {
		return
	}
	if s.Extractor != nil {
		// Regex path: take the first capture group as the status.
		parts := s.Extractor.FindSubmatch(body)
		if parts == nil {
			return "", nil
		}
		status = string(parts[1])
	} else {
		// XPath path: parse the HTML and extract the matching node's text.
		doc, err := gokogiri.ParseHtml(body)
		if err != nil {
			return "", err
		}
		defer doc.Free()
		res, err := doc.Search(s.XPath)
		if err != nil {
			return "", err
		}
		if len(res) < 1 {
			return "", nil
		}
		status = sanitize.HTML(res[0].String())
		status = replacer.ReplaceAllString(status, " ")
		status = strings.TrimSpace(status)
	}
	return
}
func NewRecord(content []byte) (record *Record) {
	doc, err := gokogiri.ParseHtml(content)
	if err != nil {
		panic(err)
	}
	displayText := cleanUpContent(doc.String())
	record = &Record{RawText: content, DisplayText: displayText}
	dateStr := getInterp(doc.Root().NodePtr(), "date", doc)
	date, err := time.Parse("20060102", dateStr)
	if err != nil {
		record.Date = nil
	} else {
		record.Date = &date
	}
	xPath := xpath.NewXPath(doc.DocPtr())
	nodePtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//div1"))
	node := xml.NewNode(nodePtrs[0], doc)
	record.Id = node.Attr("id")
	record.Type = node.Attr("type")
	record.processPersons(doc)
	record.processOffences(doc)
	record.processVerdicts(doc)
	record.processOffJoins(doc)
	return
}
func AllTrainNumbers() (numbers []string, err error) {
	content, err := kcjHttpRequest(nil)
	if err != nil {
		return nil, err
	}
	doc, err := gokogiri.ParseHtml(content)
	if err != nil {
		return nil, err
	}
	defer doc.Free()
	const trainNumXPath = "/html/body/form/table/tr[2]/td[2]/table/tr[4]/td[2]/select/option/text()"
	html := doc.Root().FirstChild()
	numResults, err := html.Search(trainNumXPath)
	if err != nil {
		return nil, err
	}
	// Skip the first <option>, which appears to be a placeholder rather
	// than a train number.
	numbers = make([]string, len(numResults)-1)
	for i, num := range numResults[1:] {
		numbers[i] = num.String()
	}
	return
}
func (c *Client) recentHandler(resp *http.Response, err error) {
	if err != nil {
		c.Fail(err.Error())
		return
	}
	if resp.StatusCode != 200 {
		c.Fail("status %d != 200 %s", resp.StatusCode, resp.Request.URL.String())
		return
	}
	html, _ := ioutil.ReadAll(resp.Body)
	defer resp.Body.Close()
	doc, docerr := gokogiri.ParseHtml(html)
	if docerr != nil {
		c.Fail("html parse error")
		return
	}
	defer doc.Free()
	nodes, _ := doc.Search("//ul[@id='memos']//li")
	if len(nodes) == 0 {
		c.Fail("memos too few")
		return
	}
	nodes, _ = doc.Search("//p[@id='pager']/span[@id='total']/text()")
	if len(nodes) != 1 {
		c.Fail("no pager")
		return
	}
	c.TotalMemos, _ = strconv.Atoi(nodes[0].String())
	c.Success(1.0)
}
func (c *Client) signinHandler(resp *http.Response, err error) {
	if err != nil {
		c.Fail(err.Error())
		return
	}
	if resp.StatusCode != 200 {
		c.Fail("status %d != 200 %s", resp.StatusCode, resp.Request.URL.String())
		return
	}
	html, _ := ioutil.ReadAll(resp.Body)
	defer resp.Body.Close()
	if !c.IsChecker {
		c.Success(1.0)
		return
	}
	doc, docerr := gokogiri.ParseHtml(html)
	if docerr != nil {
		c.Fail("html parse error")
		return
	}
	// Free only after the parse error check: a failed parse leaves doc nil
	// and the deferred Free would panic.
	defer doc.Free()
	nodes, nodeerr := doc.Search("//form//input[@name='username']")
	if nodeerr != nil {
		c.Fail("input element search error")
		return
	}
	if len(nodes) != 1 {
		c.Fail("input element not found")
		return
	}
	c.Success(1.0)
}
func main() {
	if len(os.Args) < 3 {
		fmt.Println("Usage:", os.Args[0], "filename iterations")
		os.Exit(1)
	}
	filename := os.Args[1]
	n, err := strconv.Atoi(os.Args[2])
	if err != nil {
		panic(err)
	}
	file, err := ioutil.ReadFile(filename)
	if err != nil {
		panic(err)
	}
	// Benchmark: parse the same document n times and report wall time.
	start := time.Now()
	for i := 0; i < n; i++ {
		doc, err := gokogiri.ParseHtml(file)
		if err != nil {
			panic(err)
		}
		doc.Root()
		doc.Free()
	}
	end := time.Now()
	fmt.Printf("%f s\n", end.Sub(start).Seconds())
}
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "Missing XPath argument!")
		os.Exit(2)
	}
	xpathString := os.Args[1]
	page, _ := ioutil.ReadAll(os.Stdin)
	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		// Bail out here: continuing with a nil doc would panic below.
		fmt.Fprintln(os.Stderr, "Problem parsing document.")
		os.Exit(1)
	}
	defer doc.Free()
	xps := xpath.Compile(xpathString)
	defer xps.Free()
	search, err := doc.Search(xps)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Sorry. Got error.")
		return
	}
	for _, s := range search {
		fmt.Println(s.Content())
	}
}
func parseResult(html []byte) (results []ImageResult, err error) {
	doc, err := gokogiri.ParseHtml(html)
	if err != nil {
		return nil, err
	}
	// Get image tags
	imagesTags, err := doc.Search(`//div[@class='dg_u']/a`)
	if err != nil {
		return nil, err
	}
	// Filter and parse
	var images []ImageResult
	for _, tag := range imagesTags {
		if meta, err := ParseMetadata(tag.Attr("m")); err == nil {
			images = append(images, metaToResult(meta))
		}
	}
	// No results
	if images == nil {
		return nil, fmt.Errorf("no results")
	}
	return images, nil
}
func (p *Page) scrap(url string, o *parse.Object) error {
	resp, err := p.client.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	page, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		return err
	}
	defer doc.Free()
	// Run each field selector against the document and store its value.
	for _, field := range p.fields {
		value, err := field.selector(url, doc)
		if err != nil {
			return err
		}
		o.SetNested(field.path, value)
	}
	// Post-process the assembled object.
	for _, processor := range p.processors {
		processor(o)
	}
	return nil
}
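// scrap above iterates over p.fields, whose type is not shown in this
// snippet. A plausible shape (an assumption, not the project's actual
// type) pairs an output path with an extractor run against the parsed
// document (*html.HtmlDocument from gokogiri/html):
type field struct {
	path     string
	selector func(url string, doc *html.HtmlDocument) (interface{}, error)
}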
func FiveThousandBest() (titles []string, err error) {
	res, err := http.Get("http://5000best.com/movies/1")
	if err != nil {
		return
	}
	defer res.Body.Close()
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return
	}
	doc, err := gokogiri.ParseHtml(body)
	if err != nil {
		return
	}
	exp := xpath.Compile("//a[@class='n']")
	nodes := doc.XPathCtx.Evaluate(doc.NodePtr(), exp)
	for _, np := range nodes {
		node := xml.NewNode(np, doc)
		title := node.InnerHtml()
		// Trim the trailing 8 characters (a site-specific suffix).
		title = title[:len(title)-8]
		titles = append(titles, title)
	}
	return
}
func parseDivs(body []byte) ([]xml.Node, error) {
	root, err := gokogiri.ParseHtml(body)
	if err != nil {
		return nil, err
	}
	return root.Root().Search(divsPath)
}
func (this *Yingxiaoqun) GetDetailContent(content, url string) (map[string]interface{}, error) {
	//result := make(map[string]interface{})
	num_pat, err := regexp.Compile(`(\d+)$`)
	if err != nil {
		return nil, err
	}
	matchs := num_pat.FindStringSubmatch(url)
	if matchs == nil {
		return nil, errors.New("Wrong URL")
	}
	//this.Logger.Info("[INFO] matchs[1] %v", matchs[1])
	doc, err := gokogiri.ParseHtml([]byte(content))
	if err != nil {
		return nil, err
	}
	defer doc.Free()
	title_xpath, err := doc.Search(fmt.Sprintf("//*[@id=\"post-%s\"]/div/div[2]/header/h1", matchs[1]))
	if err != nil {
		return nil, err
	}
	if len(title_xpath) == 0 {
		return nil, errors.New("title not found")
	}
	title := title_xpath[0].Content()
	editdate_xpath, err := doc.Search(fmt.Sprintf("//*[@id=\"post-%s\"]/div/div[1]/div/time/@datetime", matchs[1]))
	if err != nil {
		return nil, err
	}
	if len(editdate_xpath) == 0 {
		return nil, errors.New("edit date not found")
	}
	editdate := editdate_xpath[0].String()
	content_xpath, err := doc.Search(fmt.Sprintf("//*[@id=\"post-%s\"]/div/div[2]/div", matchs[1]))
	if err != nil {
		return nil, err
	}
	if len(content_xpath) == 0 {
		return nil, errors.New("content not found")
	}
	contents := content_xpath[0].Content()
	editor_xpath, err := doc.Search(fmt.Sprintf("//*[@id=\"post-%s\"]/div/div[2]/header/span/span/a", matchs[1]))
	if err != nil {
		return nil, err
	}
	if len(editor_xpath) == 0 {
		return nil, errors.New("editor not found")
	}
	editor := editor_xpath[0].Content()
	this.Logger.Info("[INFO] Title %v", title)
	this.Logger.Info("[INFO] EditDate %v", editdate)
	this.Logger.Info("[INFO] Content %v", contents)
	this.Logger.Info("[INFO] Editor %v", editor)
	/*
		err = this.DbAdaptor.ExecFormat(SQL_YXQ, "111", matchs[1], "2", "", title, "", contents, editor, editdate[:10], url, "0")
		if err != nil {
			this.Logger.Error(" MYSQL Error : %v ", err)
			return nil, err
		}
	*/
	return nil, nil
}
func ScrapeMatches(url string) {
	pageSource := retrievePageSource(url)
	doc, err := gokogiri.ParseHtml(pageSource)
	errorHandler(err)
	defer doc.Free()
	matches, err := doc.Search(".//*[@class='item-container clearfix match collapsed']")
	errorHandler(err)
	fmt.Println(parseMatches(matches))
}
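// ScrapeMatches, ScrapePlayers, and parseLeagues call an errorHandler
// helper that is not shown here. A minimal plausible version (an
// assumption, not the original) simply logs and aborts on the first error:
func errorHandler(err error) {
	if err != nil {
		log.Fatalln(err) // needs the standard "log" import
	}
}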
func main() {
	// Parse even this bad bit of HTML and make it valid
	html := "<h2>I am so malformatted</h2>"
	doc, _ := gokogiri.ParseHtml([]byte(html))
	defer doc.Free()
	// The fragment gets wrapped in <html><body>, so the <h2> sits two levels down.
	header := doc.Root().FirstChild().FirstChild()
	header.SetName("h1")
	fmt.Println(doc.String())
}
func processUrl(url string) *html.HtmlDocument {
	respon, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer respon.Body.Close()
	page, _ := ioutil.ReadAll(respon.Body)
	document, _ := gokogiri.ParseHtml(page)
	return document
}
func ScrapePlayers(url string) {
	pageSource := retrievePageSource(url)
	doc, err := gokogiri.ParseHtml(pageSource)
	errorHandler(err)
	defer doc.Free()
	playerContainers, err := doc.Search("//*[contains(@class, 'item-container')]")
	errorHandler(err)
	players := parsePlayers(playerContainers)
	json := marshalSlice(players)
	buffer := bytes.NewBuffer(json)
	http.Post("http://our_dumb_url_bro", "application/json", buffer)
}
func (c *Client) mypageHandler(resp *http.Response, err error) {
	if err != nil {
		c.Fail(err.Error())
		return
	}
	if resp.StatusCode != 200 {
		c.Fail("status %d != 200 %s", resp.StatusCode, resp.Request.URL.String())
		return
	}
	if resp.Header.Get("Cache-Control") != "private" {
		c.Fail("invalid Cache-Control header")
		return
	}
	html, _ := ioutil.ReadAll(resp.Body)
	defer resp.Body.Close()
	doc, docerr := gokogiri.ParseHtml(html)
	if docerr != nil {
		c.Fail("html parse error")
		return
	}
	// Free only after the parse error check so a nil doc cannot panic.
	defer doc.Free()
	nodes, _ := doc.Search("//input[@name='sid' and @type='hidden']")
	if len(nodes) == 0 {
		c.Fail("not found <input type='hidden' name='sid'>")
		return
	}
	c.Token = nodes[0].Attribute("value").String()
	if !c.IsChecker {
		c.Success(1.0)
		return
	}
	c.matchDocNode(doc, "//h2/text()", "Hello\\s+"+*c.Username+"\\!")
	nodes, nodeerr := doc.Search("//div[contains(concat(' ', @class, ' '), ' container ')]/ul/li/a")
	if nodeerr != nil {
		c.Fail("li element search error")
		return
	}
	c.Success(1.0)
	// Fetch a random handful of the linked memo pages.
	nfetches := rand.Intn(10) + 1
	for i := 0; i < nfetches; i++ {
		node := nodes[rand.Intn(len(nodes))]
		if !c.Running {
			break
		}
		href := node.Attribute("href").String()
		if strings.Index(href, "/") == 0 {
			href = c.Endpoint + href
		}
		req, _ := http.NewRequest("GET", href, nil)
		c.requestWithTimeout(req, c.memoHandler)
	}
}
// Gets the results for a single page
func getResults(resultsurl string) {
	body := getPage(resultsurl)
	// Get a list of all profile links on page
	doc, _ := gokogiri.ParseHtml(body)
	results, _ := doc.NodeById("results").Search("//li[@itemtype='http://schema.org/Person']")
	names, _ := results[0].Search("//a[@class='app_link']")
	// Send link of each profile on page to parser
	for _, profile := range names {
		parseProfile("http://indeed.com" + profile.Attr("href"))
	}
}
// Returns a gokogiri html.Document from a url
func getGokogiriDoc(c *http.Client, url string) (*ghtml.HtmlDocument, error) {
	resp, err := c.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	page, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return gokogiri.ParseHtml(page)
}
func search(c *http.Client, name string) (foundName, href string, err error) {
	req, err := http.NewRequest("GET", "https://play.google.com/store/search", nil)
	if err != nil {
		err = fmt.Errorf("failed creating request, %s", err)
		return
	}
	values := req.URL.Query()
	values.Add("q", name)
	values.Add("c", "apps")
	req.URL.RawQuery = values.Encode()
	resp, err := c.Do(req)
	if err != nil {
		err = fmt.Errorf("failed searching, %s", err)
		return
	}
	defer resp.Body.Close()
	page, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		err = fmt.Errorf("failed to read response body, %s", err)
		return
	}
	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		err = fmt.Errorf("failed to parse search HTML, %s", err)
		return
	}
	defer doc.Free()
	nodes, err := doc.Search("//div[@id='body-content']//a[@title]")
	if err != nil {
		err = fmt.Errorf("failed to search for app links, %s", err)
		return
	}
	if len(nodes) == 0 {
		err = fmt.Errorf("could not find any app links")
		return
	}
	// Pick the result whose name shares the longest common subsequence
	// with the query.
	bestMatchScore := 0
	lowerName := strings.ToLower(name)
	for _, n := range nodes {
		nName := strings.TrimSpace(n.Content())
		score := len(lcs(strings.ToLower(nName), lowerName))
		if score > bestMatchScore {
			foundName = nName
			href = n.Attr("href")
			bestMatchScore = score
		}
	}
	return
}
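// search scores candidates with an lcs helper that is not shown in this
// snippet. The sketch below is an assumption: a classic dynamic-programming
// longest common subsequence, byte-wise (fine for ASCII app names), whose
// length serves as the match score.
func lcs(a, b string) string {
	// dp[i][j] holds the LCS of a[:i] and b[:j].
	dp := make([][]string, len(a)+1)
	for i := range dp {
		dp[i] = make([]string, len(b)+1)
	}
	for i := 1; i <= len(a); i++ {
		for j := 1; j <= len(b); j++ {
			if a[i-1] == b[j-1] {
				dp[i][j] = dp[i-1][j-1] + string(a[i-1])
			} else if len(dp[i-1][j]) >= len(dp[i][j-1]) {
				dp[i][j] = dp[i-1][j]
			} else {
				dp[i][j] = dp[i][j-1]
			}
		}
	}
	return dp[len(a)][len(b)]
}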
func (c *Client) topHandler(resp *http.Response, err error, matchTitle string) {
	if err != nil {
		c.Fail(err.Error())
		return
	}
	if resp.StatusCode != 200 {
		c.Fail("status %d != 200 %s", resp.StatusCode, resp.Request.URL.String())
		return
	}
	html, _ := ioutil.ReadAll(resp.Body)
	defer resp.Body.Close()
	if !c.IsChecker {
		c.Success(1.0)
		return
	}
	doc, docerr := gokogiri.ParseHtml(html)
	if docerr != nil {
		c.Fail("html parse error")
		return
	}
	defer doc.Free()
	nodes, _ := doc.Search("//p[@id='pager']/span[@id='total']/text()")
	if len(nodes) != 1 {
		c.Fail("no pager")
		return
	}
	c.TotalMemos, _ = strconv.Atoi(nodes[0].String())
	nodes, _ = doc.Search("//ul[@id='memos']//li")
	if len(nodes) != 100 {
		c.Fail("invalid memos list")
		return
	}
	if matchTitle == "" {
		c.Success(1.0)
		return
	}
	for _, node := range nodes {
		matched, _ := regexp.MatchString(matchTitle, node.String())
		if matched {
			c.Success(1.0)
			return
		}
	}
	c.Fail("no match title: %s", matchTitle)
}
func parsePage(httpresp *http.Response, pageName string) *ghtml.HtmlDocument {
	page, err := ioutil.ReadAll(httpresp.Body)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error reading %s html body: %v\n", pageName, err)
		os.Exit(1)
	}
	httpresp.Body.Close()
	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error parsing %s html body: %v\n", pageName, err)
		os.Exit(1)
	}
	return doc
}
// ProcessURL method is used to fetch website contents and process its data
func (s *Scrape) ProcessURL(url string) map[string]interface{} {
	client := &http.Client{}
	req, _ := http.NewRequest("GET", url, nil)
	for k, v := range s.Headers {
		req.Header.Add(k, v)
	}
	// NOTE: errors from Do, ReadAll, and ParseHtml are ignored here; a nil
	// resp or doc will panic.
	resp, _ := client.Do(req)
	defer resp.Body.Close()
	page, _ := ioutil.ReadAll(resp.Body)
	doc, _ := gokogiri.ParseHtml(page)
	defer doc.Free()
	return s.ProcessDocument(doc)
}
func (s *scrape) GetNodeFromPage(scrape string) (xml.Node, error) {
	node, err := gokogiri.ParseHtml(s.GetPage())
	if err != nil || node == nil {
		return nil, errors.New("Node not found on string: " + scrape)
	}
	// Free only after the nil/error check, once parsing has succeeded.
	defer node.Free()
	res, err := node.Root().FirstChild().Search(scrape)
	if err != nil {
		return nil, err
	}
	if len(res) == 0 {
		return nil, errors.New("Node not found on: " + scrape)
	}
	// Duplicate so the returned node outlives the freed document.
	return res[0].Duplicate(1), nil
}
/*
	Not working because:
	http://www.sc2ratings.com/players.php?realname=Yang,%20Hee-Soo
	is parsed as:
	http://www.sc2ratings.com/players.php?realname=Yang, Hee-Soo
*/
func parseLeagues(player xml.Node) []string {
	out := []string{}
	partialUrl, err := player.Search(".//a/@href")
	errorHandler(err)
	if len(partialUrl) == 1 {
		playerPageUrl := "http://www.sc2ratings.com/" + partialUrl[0].String()
		playerPageSource := retrievePageSource(playerPageUrl)
		playerPage, err := gokogiri.ParseHtml(playerPageSource)
		errorHandler(err)
		defer playerPage.Free()
		fmt.Println(playerPage)
	}
	return out
}
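// A possible fix for the escaping problem described above (a sketch under
// the assumption that net/url's lenient parsing accepts the unescaped
// href): re-encode the query so the space becomes percent/plus-escaped
// again before issuing the request. Requires the "net/url" import.
func reescapeQuery(rawUrl string) (string, error) {
	u, err := url.Parse(rawUrl)
	if err != nil {
		return "", err
	}
	// Re-encoding the parsed query restores the escaping that the node's
	// String() method stripped.
	u.RawQuery = u.Query().Encode()
	return u.String(), nil
}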
func fetchRating(c *http.Client, path string) (rating float64, ratings int, err error) {
	resp, err := c.Get("https://play.google.com" + path)
	if err != nil {
		err = fmt.Errorf("failed requesting, %s", err)
		return
	}
	defer resp.Body.Close()
	page, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		err = fmt.Errorf("failed to read response body, %s", err)
		return
	}
	doc, err := gokogiri.ParseHtml(page)
	if err != nil {
		err = fmt.Errorf("failed to parse search HTML, %s", err)
		return
	}
	defer doc.Free()
	nodes, err := doc.Search("//div[@class='score-container']//div[@class='score']")
	if err != nil {
		err = fmt.Errorf("failed to find score, %s", err)
		return
	}
	if len(nodes) == 0 {
		err = fmt.Errorf("could not find any score")
		return
	}
	if rating, err = strconv.ParseFloat(nodes[0].Content(), 64); err != nil {
		err = fmt.Errorf("could not parse score, %s", err)
		return
	}
	nodes, err = doc.Search("//div[@class='score-container']//span[@class='reviews-num']")
	if err != nil {
		err = fmt.Errorf("failed to find review count, %s", err)
		return
	}
	if len(nodes) == 0 {
		err = fmt.Errorf("could not find review count")
		return
	}
	if ratings, err = strconv.Atoi(
		numRegexp.ReplaceAllString(nodes[0].Content(), ""),
	); err != nil {
		err = fmt.Errorf("could not parse review count, %s", err)
		return
	}
	return
}
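// fetchRating strips non-digits with a numRegexp that is not defined in
// this snippet; a plausible definition (an assumption) keeps digits only,
// so a count like "1,234,567" becomes "1234567" before strconv.Atoi:
var numRegexp = regexp.MustCompile(`[^0-9]+`)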
func request() error {
	res, err := http.Get("http://www.fifa.com/fifa-tournaments/statistics-and-records/clubworldcup/index.html")
	if err != nil {
		return err
	}
	defer res.Body.Close()
	raw, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return err
	}
	doc, err := gokogiri.ParseHtml(raw)
	if err != nil {
		return err
	}
	defer doc.Free()
	return nil
}
func (this *Yingxiaoqun) GetNextUrls(content, url string) ([]u.CrawlData, error) {
	crawls := make([]u.CrawlData, 0)
	doc, err := gokogiri.ParseHtml([]byte(content))
	if err != nil {
		return nil, err
	}
	defer doc.Free()
	res, err := doc.Search("//*[@class=\"read-more\"]/@href")
	if err != nil {
		return nil, err
	}
	for i := range res {
		//this.Logger.Info("[INFO] %v", res[i])
		crawls = append(crawls, u.CrawlData{Url: res[i].String(), Type: u.DETAIL_URL, HandlerName: this.Name})
	}
	return crawls, nil
}