func FiveThousandBest() (titles []string, err error) { res, err := http.Get("http://5000best.com/movies/1") if err != nil { return } body, err := ioutil.ReadAll(res.Body) if err != nil { return } doc, err := gokogiri.ParseHtml(body) if err != nil { return } exp := xpath.Compile("//a[@class='n']") nodes := doc.XPathCtx.Evaluate(doc.NodePtr(), exp) for _, np := range nodes { node := xml.NewNode(np, doc) title := node.InnerHtml() title = title[:len(title)-8] titles = append(titles, title) } return }
func NewRecord(content []byte) (record *Record) { doc, err := gokogiri.ParseHtml([]byte(content)) if err != nil { panic(err) } displayText := cleanUpContent(doc.String()) record = &Record{RawText: content, DisplayText: displayText} dateStr := getInterp(doc.Root().NodePtr(), "date", doc) date, err := time.Parse("20060102", dateStr) if err != nil { record.Date = nil } else { record.Date = &date } xPath := xpath.NewXPath(doc.DocPtr()) nodePtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//div1")) node := xml.NewNode(nodePtrs[0], doc) record.Id = node.Attr("id") record.Type = node.Attr("type") record.processPersons(doc) record.processOffences(doc) record.processVerdicts(doc) record.processOffJoins(doc) return }
func (record *Record) processOffJoins(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) // join the offence with the defendants and verdict joinPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//join[@result='criminalCharge']")) for _, nodePtr := range joinPtrs { node := xml.NewNode(nodePtr, doc) targets := strings.Split(node.Attr("targets"), " ") var personId, offId, verdictId string for _, targetId := range targets { if strings.Contains(targetId, "defend") { personId = targetId } if strings.Contains(targetId, "off") { offId = targetId } if strings.Contains(targetId, "verdict") { verdictId = targetId } } offence := record.findOffence(offId) if offence == nil { panic("couldn't find offence " + offId) } person := record.findPerson(personId) if person != nil { offence.Defendants = append(offence.Defendants, person) } verdict := record.findVerdict(verdictId) if verdict != nil { offence.Verdict = verdict } } }
// get the value out of an <interp> tag func getInterp(basePtr unsafe.Pointer, interpType string, doc *html.HtmlDocument) (value string) { xPath := xpath.NewXPath(doc.DocPtr()) nodePtrs := xPath.Evaluate(basePtr, xpath.Compile(".//interp[@type='"+ interpType+"']")) if len(nodePtrs) == 1 { node := xml.NewNode(nodePtrs[0], doc) value = node.Attr("value") } return }
func (record *Record) processVerdicts(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) verdictPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//rs[@type='verdictDescription']")) verdicts := make([]Verdict, len(verdictPtrs)) for i, nodePtr := range verdictPtrs { node := xml.NewNode(nodePtr, doc) verdict := Verdict{} verdict.Id = node.Attr("id") verdict.Desc = cleanUpContent(node.Content()) verdict.SetType(getInterp(nodePtr, "verdictCategory", doc)) verdicts[i] = verdict } record.Verdicts = verdicts }
func (record *Record) processOffences(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) offencePtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//rs[@type='offenceDescription']")) offences := make([]Offence, len(offencePtrs)) for i, nodePtr := range offencePtrs { node := xml.NewNode(nodePtr, doc) offence := Offence{} offence.Id = node.Attr("id") offence.Category = getInterp(nodePtr, "offenceCategory", doc) offence.SubCategory = getInterp(nodePtr, "offenceSubcategory", doc) offence.Desc = cleanUpContent(node.Content()) offences[i] = offence } record.Offences = offences }
func (record *Record) processPersons(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) personPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//persname")) persons := make([]Person, len(personPtrs)) for i, nodePtr := range personPtrs { node := xml.NewNode(nodePtr, doc) person := Person{} person.Id = node.Attr("id") person.GivenName = getInterp(nodePtr, "given", doc) person.Surname = getInterp(nodePtr, "surname", doc) person.SetType(node.Attr("type")) person.SetGender(getInterp(nodePtr, "gender", doc)) persons[i] = person } record.Persons = persons }
func (store *Store) LoadPrice(url string) (price float64, err error) { resp, err := http.Get(url) if err != nil { return } defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { return } doc, err := gokogiri.ParseHtml(body) if err != nil { return } nxpath := xpath.NewXPath(doc.DocPtr()) nodes, err := nxpath.Evaluate(doc.DocPtr(), store.compiledXPath) if err != nil { return } if len(nodes) == 0 { fmt.Printf("Check XPath correctness (not found) for domain: %s\n", store.Domain) return } price_raw := xml.NewNode(nodes[0], doc).InnerHtml() price_raw = strings.Trim(price_raw, "$ \n\r") price, err = strconv.ParseFloat(price_raw, 64) if err != nil { fmt.Printf("Check XPath correctness (not monetary) for domain: %s\n", store.Domain) return } return }
func parsefragment(document xml.Document, node *xml.XmlNode, content, url []byte, options xml.ParseOption) (fragment *xml.DocumentFragment, err error) { //set up pointers before calling the C function var contentPtr, urlPtr unsafe.Pointer if len(url) > 0 { urlPtr = unsafe.Pointer(&url[0]) } var root xml.Node if node == nil { containBody := (bytes.Index(content, bodySigBytes) >= 0) content = append(fragmentWrapper, content...) contentPtr = unsafe.Pointer(&content[0]) contentLen := len(content) inEncoding := document.InputEncoding() var encodingPtr unsafe.Pointer if len(inEncoding) > 0 { encodingPtr = unsafe.Pointer(&inEncoding[0]) } htmlPtr := C.htmlParseFragmentAsDoc(document.DocPtr(), contentPtr, C.int(contentLen), urlPtr, encodingPtr, C.int(options), nil, 0) //Note we've parsed the fragment within the given document //the root is not the root of the document; rather it's the root of the subtree from the fragment html := xml.NewNode(unsafe.Pointer(htmlPtr), document) if html == nil { err = ErrFailParseFragment return } root = html if !containBody { root = html.FirstChild() html.AddPreviousSibling(root) html.Remove() //remove html otherwise it's leaked } } else { //wrap the content newContent := append(fragmentWrapperStart, content...) newContent = append(newContent, fragmentWrapperEnd...) contentPtr = unsafe.Pointer(&newContent[0]) contentLen := len(newContent) rootElementPtr := C.htmlParseFragment(node.NodePtr(), contentPtr, C.int(contentLen), urlPtr, C.int(options), nil, 0) if rootElementPtr == nil { //try to parse it as a doc fragment, err = parsefragment(document, nil, content, url, options) return } if rootElementPtr == nil { err = ErrFailParseFragment return } root = xml.NewNode(unsafe.Pointer(rootElementPtr), document) } fragment = &xml.DocumentFragment{} fragment.Node = root fragment.InEncoding = document.InputEncoding() fragment.OutEncoding = document.OutputEncoding() document.BookkeepFragment(fragment) return }