func titleSeparation(newDatas []xml.Node) ([]string, error) { var ret []string a := xpath.Compile("./a/span") for _, newData := range newDatas { titles, err := newData.Search(a) if err != nil { return nil, err } for _, title := range titles { newsAndTime := title.Content() timePath := xpath.Compile("./span") time, err := title.Search(timePath) if err != nil { return nil, err } if len(time) != 0 { cutstr := time[0].Content() ret = append(ret, strings.Trim(newsAndTime, cutstr)) } } } return ret, nil }
func main() { data, err := ioutil.ReadFile("xpath.xml") if err != nil { log.Fatal(err) } doc, err := xml.Parse(data, nil, nil, 0, xml.DefaultEncodingBytes) if err != nil { log.Fatal(err) } defer doc.Free() xp := doc.DocXPathCtx() xp.RegisterNamespace("folia", "http://ilk.uvt.nl/folia") fmt.Println("\nAll sentences with all words:\n") xps := xpath.Compile("//folia:s") xpw := xpath.Compile("folia:w/folia:t") ss, err := doc.Root().Search(xps) if err != nil { log.Fatal(err) } for _, s := range ss { fmt.Println(s.Attr("id")) ww, err := s.Search(xpw) if err != nil { log.Fatal(err) } for _, w := range ww { fmt.Println("\t" + w.Parent().Attr("id") + " \t" + w.Content()) } } fmt.Println("\nSearch for specific sentence:\n") n, err := doc.Root().Search(`//folia:s[@xml:id="WR-P-E-E-0000000020.head.4.s.2"]`) if err != nil { log.Fatal(err) } fmt.Println(n) fmt.Println("\nSearch for sentence with specific word:\n") n, err = doc.Root().Search(`//folia:w[@xml:id="WR-P-E-E-0000000020.head.4.s.2.w.2"]`) if err != nil { log.Fatal(err) } fmt.Println(n[0].Parent()) }
func main() { xpathString := "" if len(os.Args) < 2 { fmt.Fprintln(os.Stderr, "Missing second argument, XPATH!") os.Exit(2) } else { xpathString = os.Args[1] } page, _ := ioutil.ReadAll(os.Stdin) doc, err := gokogiri.ParseHtml(page) if err != nil { fmt.Fprintln(os.Stderr, "Problem parsing document.") } defer doc.Free() xps := xpath.Compile(xpathString) defer xps.Free() search, err := doc.Search(xps) if err == nil { for _, s := range search { fmt.Println(s.Content()) } } else { fmt.Fprintln(os.Stderr, "Sorry. Got error.") } }
// GetBlock GETs and parses practitioner's appointments on date // along with any information needed to book available appointments. func (s *Session) GetBlock(date time.Time, practitioner Practitioner) (Block, error) { root, err := s.loadBlock(date) if err != nil { return nil, err } var result Block for status, path := range paths { divs, err := root.Search(xpath.Compile(fmt.Sprintf(path, practitioner))) if err != nil { return nil, err } for _, div := range divs { timestamp, blockIndex, err := parseAppDiv(div) if err != nil { return nil, err } result = append(result, Appointment{ session: s, Timestamp: time.Unix(timestamp, 0), Practitioner: practitioner, Status: status, blockIndex: blockIndex, }) } } return result, nil }
func (record *Record) processOffJoins(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) // join the offence with the defendants and verdict joinPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//join[@result='criminalCharge']")) for _, nodePtr := range joinPtrs { node := xml.NewNode(nodePtr, doc) targets := strings.Split(node.Attr("targets"), " ") var personId, offId, verdictId string for _, targetId := range targets { if strings.Contains(targetId, "defend") { personId = targetId } if strings.Contains(targetId, "off") { offId = targetId } if strings.Contains(targetId, "verdict") { verdictId = targetId } } offence := record.findOffence(offId) if offence == nil { panic("couldn't find offence " + offId) } person := record.findPerson(personId) if person != nil { offence.Defendants = append(offence.Defendants, person) } verdict := record.findVerdict(verdictId) if verdict != nil { offence.Verdict = verdict } } }
func NewRecord(content []byte) (record *Record) { doc, err := gokogiri.ParseHtml([]byte(content)) if err != nil { panic(err) } displayText := cleanUpContent(doc.String()) record = &Record{RawText: content, DisplayText: displayText} dateStr := getInterp(doc.Root().NodePtr(), "date", doc) date, err := time.Parse("20060102", dateStr) if err != nil { record.Date = nil } else { record.Date = &date } xPath := xpath.NewXPath(doc.DocPtr()) nodePtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//div1")) node := xml.NewNode(nodePtrs[0], doc) record.Id = node.Attr("id") record.Type = node.Attr("type") record.processPersons(doc) record.processOffences(doc) record.processVerdicts(doc) record.processOffJoins(doc) return }
// As the Search function, but passing a VariableScope that can be used to reolve variable // names or registered function references in the XPath being evaluated. func (xmlNode *XmlNode) SearchWithVariables(data interface{}, v xpath.VariableScope) (result []Node, err error) { switch data := data.(type) { default: err = ERR_UNDEFINED_SEARCH_PARAM case string: if xpathExpr := xpath.Compile(data); xpathExpr != nil { defer xpathExpr.Free() result, err = xmlNode.SearchWithVariables(xpathExpr, v) } else { err = errors.New("cannot compile xpath: " + data) } case []byte: result, err = xmlNode.SearchWithVariables(string(data), v) case *xpath.Expression: xpathCtx := xmlNode.Document.DocXPathCtx() xpathCtx.SetResolver(v) nodePtrs, err := xpathCtx.EvaluateAsNodeset(unsafe.Pointer(xmlNode.Ptr), data) if nodePtrs == nil || err != nil { return nil, err } for _, nodePtr := range nodePtrs { result = append(result, NewNode(nodePtr, xmlNode.Document)) } } return }
func FiveThousandBest() (titles []string, err error) { res, err := http.Get("http://5000best.com/movies/1") if err != nil { return } body, err := ioutil.ReadAll(res.Body) if err != nil { return } doc, err := gokogiri.ParseHtml(body) if err != nil { return } exp := xpath.Compile("//a[@class='n']") nodes := doc.XPathCtx.Evaluate(doc.NodePtr(), exp) for _, np := range nodes { node := xml.NewNode(np, doc) title := node.InnerHtml() title = title[:len(title)-8] titles = append(titles, title) } return }
func getNewsInfo(doc *html.HtmlDocument) ([]xml.Node, error) { xp := "//body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/ul/li" xps := xpath.Compile(xp) newDatas, err := doc.Root().Search(xps) if err != nil { return nil, err } return newDatas, nil }
// get the value out of an <interp> tag func getInterp(basePtr unsafe.Pointer, interpType string, doc *html.HtmlDocument) (value string) { xPath := xpath.NewXPath(doc.DocPtr()) nodePtrs := xPath.Evaluate(basePtr, xpath.Compile(".//interp[@type='"+ interpType+"']")) if len(nodePtrs) == 1 { node := xml.NewNode(nodePtrs[0], doc) value = node.Attr("value") } return }
func urlSeparation(newDatas []xml.Node) ([]string, error) { var ret []string a := xpath.Compile("./a/@href") for _, newData := range newDatas { urls, err := newData.Search(a) if err != nil { return nil, err } ret = append(ret, urls[0].Content()) } return ret, nil }
func main() { doc, _ := gokogiri.ParseXml([]byte(a)) defer doc.Free() xp := doc.DocXPathCtx() xp.RegisterNamespace("ns", "http://example.com/this") x := xpath.Compile("/ns:NodeA/ns:NodeB") groups, err := doc.Search(x) if err != nil { fmt.Println(err) } for i, group := range groups { fmt.Println(i, group.Content()) } }
func parseHtml(page []byte) (*[]DubizzleResult, error) { // parse the web page doc, err := gokogiri.ParseHtml(page) if err != nil { return nil, err } println("parsed the doc: \n") // perform operations on the parsed page xp := xpath.Compile("//*[@id='results-list']/div") result_list, err := doc.Root().Search(xp) if err != nil { return nil, err } ads := []DubizzleResult{} for _, rslt := range result_list { xptitle := xpath.Compile(".//h3[@id='title']/span[@class='title']/a") xpprice := xpath.Compile(".//div[@class='price']") title_info, title_err := rslt.Search(xptitle) price_info, price_err := rslt.Search(xpprice) if title_err == nil || price_err == nil { if len(title_info) > 0 && len(price_info) > 0 { title := title_info[0].InnerHtml() price := price_info[0].InnerHtml() price = strings.TrimSpace(price) price = strings.Trim(price, "<br>") price = strings.TrimSpace(price) url := title_info[0].Attribute("href").String() ads = append(ads, DubizzleResult{Title: title, Price: price, Url: url}) } } } doc.Free() return &ads, err }
func (record *Record) processVerdicts(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) verdictPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//rs[@type='verdictDescription']")) verdicts := make([]Verdict, len(verdictPtrs)) for i, nodePtr := range verdictPtrs { node := xml.NewNode(nodePtr, doc) verdict := Verdict{} verdict.Id = node.Attr("id") verdict.Desc = cleanUpContent(node.Content()) verdict.SetType(getInterp(nodePtr, "verdictCategory", doc)) verdicts[i] = verdict } record.Verdicts = verdicts }
func photoSeparation(newDatas []xml.Node) ([]bool, error) { var ret []bool a := xpath.Compile("./a/span[@class='icon-photo']") for _, newData := range newDatas { icons, err := newData.Search(a) if err != nil { return nil, err } if len(icons) == 0 { ret = append(ret, false) } else { ret = append(ret, true) } } return ret, nil }
func (record *Record) processOffences(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) offencePtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//rs[@type='offenceDescription']")) offences := make([]Offence, len(offencePtrs)) for i, nodePtr := range offencePtrs { node := xml.NewNode(nodePtr, doc) offence := Offence{} offence.Id = node.Attr("id") offence.Category = getInterp(nodePtr, "offenceCategory", doc) offence.SubCategory = getInterp(nodePtr, "offenceSubcategory", doc) offence.Desc = cleanUpContent(node.Content()) offences[i] = offence } record.Offences = offences }
func GetData(body []byte, xpathList map[string]interface{}, xpathChild map[string]interface{}) (result map[string]interface{}) { doc, _ := gokogiri.ParseHtml(body) defer doc.Free() result = map[string]interface{}{} for nodeName, v := range xpathList { xps := xpath.Compile(v.(string)) ss, _ := doc.Root().Search(xps) for k, s := range ss { innerHtml := s.InnerHtml() item := make(map[string]interface{}) item[nodeName] = innerHtml list := ppDownloader.GetData([]byte(innerHtml), xpathChild) result[strconv.Itoa(k)] = list } } return }
func (record *Record) processPersons(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) personPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//persname")) persons := make([]Person, len(personPtrs)) for i, nodePtr := range personPtrs { node := xml.NewNode(nodePtr, doc) person := Person{} person.Id = node.Attr("id") person.GivenName = getInterp(nodePtr, "given", doc) person.Surname = getInterp(nodePtr, "surname", doc) person.SetType(node.Attr("type")) person.SetGender(getInterp(nodePtr, "gender", doc)) persons[i] = person } record.Persons = persons }
func (t *TvFourFiveSixSeven) GetResultByXpath(body []byte, x string) (list string, err error) { doc, err := gokogiri.ParseHtml(body) if err != nil { return } defer doc.Free() xps := xpath.Compile(x) ss, err := doc.Root().Search(xps) if err != nil { return } for _, s := range ss { return s.InnerHtml(), nil } return }
func getEposNodes(doc *xml.XmlDocument) (retnodes []xml.Node, err error) { // grab the 'Body' element path := xpath.Compile("*[local-name()='Body']") nodes, e := doc.Root().Search(path) if e != nil { err = e return } // check that the data is present if len(nodes) < 1 || nodes[0].CountChildren() < 1 { err = errors.New("bad data") return } // get epos data return nodes[0].FirstChild().Search("./*") }
func GetTemperature() string { resp, _ := http.Get("http://www.jma.go.jp/jp/amedas_h/today-44132.html?areaCode=000&groupCode=30") page, _ := ioutil.ReadAll(resp.Body) doc, _ := gokogiri.ParseHtml(page) defer doc.Free() xps := xpath.Compile("//*[@id=\"tbl_list\"]/tr/td[2]") ss, _ := doc.Root().Search(xps) var temperature string for _, s := range ss { if len(s.InnerHtml()) > 2 { temperature = s.InnerHtml() } } return temperature }
func GetHumidity() string { resp, _ := http.Get("http://www.jma.go.jp/jp/amedas_h/today-44132.html?areaCode=000&groupCode=30") page, _ := ioutil.ReadAll(resp.Body) doc, _ := gokogiri.ParseHtml(page) defer doc.Free() xps := xpath.Compile("//*[@id=\"tbl_list\"]/tr/td[7]") ss, _ := doc.Root().Search(xps) var humidity string for _, s := range ss { if _, err := strconv.Atoi(s.InnerHtml()); err == nil { humidity = s.InnerHtml() } } return humidity }
// In any other cases, the result will be coerced to a string. func (xmlNode *XmlNode) EvalXPath(data interface{}, v xpath.VariableScope) (result interface{}, err error) { switch data := data.(type) { case string: if xpathExpr := xpath.Compile(data); xpathExpr != nil { defer xpathExpr.Free() result, err = xmlNode.EvalXPath(xpathExpr, v) } else { err = errors.New("cannot compile xpath: " + data) } case []byte: result, err = xmlNode.EvalXPath(string(data), v) case *xpath.Expression: xpathCtx := xmlNode.Document.DocXPathCtx() xpathCtx.SetResolver(v) err := xpathCtx.Evaluate(unsafe.Pointer(xmlNode.Ptr), data) if err != nil { return nil, err } rt := xpathCtx.ReturnType() switch rt { case xpath.XPATH_NODESET, xpath.XPATH_XSLT_TREE: nodePtrs, err := xpathCtx.ResultAsNodeset() if err != nil { return nil, err } var output []Node for _, nodePtr := range nodePtrs { output = append(output, NewNode(nodePtr, xmlNode.Document)) } result = output case xpath.XPATH_NUMBER: result, _ = xpathCtx.ResultAsNumber() case xpath.XPATH_BOOLEAN: result, _ = xpathCtx.ResultAsBoolean() default: result, _ = xpathCtx.ResultAsString() } default: err = ERR_UNDEFINED_SEARCH_PARAM } return }
// ProcessField method fetches data from passed document func (f *Field) ProcessField(d *html.HtmlDocument) interface{} { var value interface{} var node xml.Node selector := xpath.Compile(f.Selector) result, _ := d.Root().Search(selector) if len(result) > 0 { node = result[0] } else { return "" } if f.Callback != nil { value = f.Callback(&node) } else { value = node.Content() } return value }
func GetSummary(query string) (string, error) { resp, _ := http.Get(GenerateJaWikipediaURL(query)) if resp.StatusCode != 200 { return "", errors.New("page not found") } page, _ := ioutil.ReadAll(resp.Body) doc, _ := gokogiri.ParseHtml(page) defer doc.Free() xps := xpath.Compile("//*[@id=\"mw-content-text\"]/p[1]") ss, _ := doc.Root().Search(xps) content := "" for _, s := range ss { content += s.Content() } return content, nil }
func (t *TvFourFiveSixSeven) GetUrlsByXpath(body []byte, x string) (list []string, err error) { doc, err := gokogiri.ParseHtml(body) if err != nil { return } defer doc.Free() xps := xpath.Compile(x) ss, err := doc.Root().Search(xps) if err != nil { return } for _, s := range ss { ww, _ := s.Search("./li/a") for _, w := range ww { list = append(list, w.Attr("href")) } } return }
func GetData(body []byte, dxpath map[string]interface{}) (result map[string]interface{}) { doc, _ := gokogiri.ParseHtml(body) defer doc.Free() result = map[string]interface{}{} for mapKey, v := range dxpath { xps := xpath.Compile(v.(string)) ss, _ := doc.Root().Search(xps) for _, s := range ss { if mapKey == "author" { result[mapKey] = s.Attr("title") } else if mapKey == "wx_time" { result["time"] = s.Attr("t") } else { result[mapKey] = s.InnerHtml() } } } return }
func parse(c *cli.Context, data []byte) { doc, err := xml.Parse(data, nil, nil, 0, xml.DefaultEncodingBytes) util.CheckErr(err) defer doc.Free() xp := doc.DocXPathCtx() for _, xmlns := range c.StringSlice("xmlns") { ns := strings.SplitN(xmlns, ":", 2) if c.Bool("verbose") { fmt.Println("NS " + ns[0] + "==" + ns[1]) } xp.RegisterNamespace(ns[0], ns[1]) } xps := xpath.Compile(c.String("xpath")) s, err := doc.Root().Search(xps) util.CheckErr(err) for _, s := range s { util.CheckErr(err) fmt.Println(s.Content()) } }
func xpathSelector(xs []string, apply xpathSelectorApply) Selector { exprs := []*xpath.Expression{} for _, x := range xs { exprs = append(exprs, xpath.Compile(x)) } return func(url string, doc *html.HtmlDocument) (interface{}, error) { var value interface{} for _, expr := range exprs { matches, err := doc.EvalXPath(expr, nil) if err != nil { return nil, err } if nodeset, ok := matches.([]xml.Node); ok == true { for _, node := range nodeset { value = apply(node.Content(), value) } } else { switch match := matches.(type) { case float64: value = apply(strconv.FormatFloat(match, 'f', 10, 64), value) case bool: if match { value = apply("true", value) } else { value = apply("false", value) } case string: value = apply(match, value) } } } return value, nil } }
// If a non-nil VariableScope is provided, any variables or registered functions present // in the xpath will be resolved. func (xmlNode *XmlNode) EvalXPathAsBoolean(data interface{}, v xpath.VariableScope) (result bool) { switch data := data.(type) { case string: if xpathExpr := xpath.Compile(data); xpathExpr != nil { defer xpathExpr.Free() result = xmlNode.EvalXPathAsBoolean(xpathExpr, v) } else { //err = errors.New("cannot compile xpath: " + data) } case []byte: result = xmlNode.EvalXPathAsBoolean(string(data), v) case *xpath.Expression: xpathCtx := xmlNode.Document.DocXPathCtx() xpathCtx.SetResolver(v) err := xpathCtx.Evaluate(unsafe.Pointer(xmlNode.Ptr), data) if err != nil { return false } result, _ = xpathCtx.ResultAsBoolean() default: //err = ERR_UNDEFINED_SEARCH_PARAM } return }