func (record *Record) processOffJoins(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) // join the offence with the defendants and verdict joinPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//join[@result='criminalCharge']")) for _, nodePtr := range joinPtrs { node := xml.NewNode(nodePtr, doc) targets := strings.Split(node.Attr("targets"), " ") var personId, offId, verdictId string for _, targetId := range targets { if strings.Contains(targetId, "defend") { personId = targetId } if strings.Contains(targetId, "off") { offId = targetId } if strings.Contains(targetId, "verdict") { verdictId = targetId } } offence := record.findOffence(offId) if offence == nil { panic("couldn't find offence " + offId) } person := record.findPerson(personId) if person != nil { offence.Defendants = append(offence.Defendants, person) } verdict := record.findVerdict(verdictId) if verdict != nil { offence.Verdict = verdict } } }
/* * Invokes a MIME type plugin based on current node's type attribute, passing src attribute's value * as argument. Subcommand's output is piped to Gokogiri through a buffer. */ func (gen *Generator) handleMIMETypePlugin(e xml.Node, doc *html.HtmlDocument) (err error) { src := e.Attribute("src").Value() typ := e.Attribute("type").Value() cmd := exec.Command(fmt.Sprintf("m%s%s", ZAS_PREFIX, gen.resolveMIMETypePlugin(typ)), src) stdout, err := cmd.StdoutPipe() if err != nil { return } cmd.Stderr = os.Stderr c := make(chan bufErr) go func() { data, err := ioutil.ReadAll(stdout) c <- bufErr{data, err} }() if err = cmd.Start(); err != nil { return } be := <-c if err = cmd.Wait(); err != nil { return } if be.err != nil { return be.err } parent := e.Parent() child, err := doc.Coerce(be.buffer) if err != nil { return } parent.AddChild(child) e.Remove() return }
func parseMandiriHtml(document *html.HtmlDocument) (map[string]Currency, error) { nilai := "1.00" var matauang string var kursjual string var kursbeli string var kurs = make(map[string]Currency) doc, err := document.Search("//table[@class='tbl-view']/tr") for i, tr := range doc { t := 0 for td := tr.FirstChild(); td != nil; td = td.NextSibling() { teks := strings.TrimSpace(td.Content()) if i > 0 && i <= 15 && len(teks) > 0 { if t == 1 { matauang = teks } else if t == 2 { kursbeli = teks } else if t == 3 { kursjual = teks } t += 1 if kursjual != "" && kursbeli != "" { kurs[matauang] = Currency{Nilai: nilai, KursJual: kursjual, KursBeli: kursbeli} } } } } return kurs, err }
/* * Extracts first HTML commend as map. It expects it as a valid YAML map. */ func (gen *Generator) extractPageConfig(doc *html.HtmlDocument) (config map[interface{}]interface{}, err error) { result, _ := doc.Search("//comment()") if len(result) > 0 { err = yaml.Unmarshal([]byte(result[0].Content()), &config) } return }
/* * Handles <embed> tags. * * They can be handled with MIME type plugins or internal exported methods like Markdown. */ func (gen *Generator) handleEmbedTags(doc *html.HtmlDocument) (err error) { result, err := doc.Search("//embed") if err != nil { return } for _, e := range result { plugin := gen.resolveMIMETypePlugin(e.Attribute("type").Value()) method := reflect.ValueOf(gen).MethodByName(strings.Title(plugin)) if method == reflect.ValueOf(nil) { err = gen.handleMIMETypePlugin(e, doc) } else { args := make([]reflect.Value, 2) args[0] = reflect.ValueOf(e) args[1] = reflect.ValueOf(doc) r := method.Call(args) rerr := r[0].Interface() if ierr, ok := rerr.(error); ok { err = ierr } } if err != nil { return } } return }
/* * Removes unnecessary paragraph HTML tags generated during Markdown processing by * deleting any <p> without child text nodes (just to avoid deletion if semantic tags * are inside). */ func (gen *Generator) cleanUnnecessaryPTags(doc *html.HtmlDocument) (err error) { ps, err := doc.Search("//p") if err != nil { return } for _, p := range ps { hasText := false child := p.FirstChild() for child != nil { typ := child.NodeType() if typ == xml.XML_TEXT_NODE { // Little heuristic to remove nodes with visually empty content. content := strings.TrimSpace(child.Content()) if content != "" { hasText = true break } } child = child.NextSibling() } // If current <p> tag doesn't have any child text node, extract children and add to its parent. if !hasText { parent := p.Parent() child = p.FirstChild() for child != nil { parent.AddChild(child) child = child.NextSibling() } p.Remove() } } return }
/* * Returns first H1 tag as page title. */ func (gen *Generator) getTitle(doc *html.HtmlDocument) (title string) { result, _ := doc.Search("//h1") if len(result) > 0 { title = result[0].FirstChild().Content() } return }
func parseBiHtml(document *html.HtmlDocument) (map[string]Currency, error) { var matauang string var nilai string var kursjual string var kursbeli string kurs := make(map[string]Currency) doc, err := document.Search("//table[@id='ctl00_PlaceHolderMain_biWebKursTransaksiBI_GridView1']/tr") for i, tr := range doc { t := 0 for td := tr.FirstChild(); td != nil; td = td.NextSibling() { teks := strings.TrimSpace(td.Content()) if i > 0 && len(teks) > 0 { if t == 0 { matauang = teks } else if t == 1 { nilai = teks } else if t == 2 { kursjual = teks } else if t == 3 { kursbeli = teks } t += 1 kurs[matauang] = Currency{Nilai: nilai, KursJual: kursjual, KursBeli: kursbeli} } } } return kurs, err }
func getLastUpdatedMandiri(document *html.HtmlDocument) string { str, _ := document.Search("//p[@class='catatan']") firstData := strings.Split(strings.Split(str[0].InnerHtml(), "<br>")[0], " ") lastUpdated := firstData[2] + " " + firstData[3] + " " + firstData[4] + " " + firstData[5] + " " + firstData[6] return lastUpdated }
func getNewsInfo(doc *html.HtmlDocument) ([]xml.Node, error) { xp := "//body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/ul/li" xps := xpath.Compile(xp) newDatas, err := doc.Root().Search(xps) if err != nil { return nil, err } return newDatas, nil }
// get the value out of an <interp> tag func getInterp(basePtr unsafe.Pointer, interpType string, doc *html.HtmlDocument) (value string) { xPath := xpath.NewXPath(doc.DocPtr()) nodePtrs := xPath.Evaluate(basePtr, xpath.Compile(".//interp[@type='"+ interpType+"']")) if len(nodePtrs) == 1 { node := xml.NewNode(nodePtrs[0], doc) value = node.Attr("value") } return }
func (record *Record) processVerdicts(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) verdictPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//rs[@type='verdictDescription']")) verdicts := make([]Verdict, len(verdictPtrs)) for i, nodePtr := range verdictPtrs { node := xml.NewNode(nodePtr, doc) verdict := Verdict{} verdict.Id = node.Attr("id") verdict.Desc = cleanUpContent(node.Content()) verdict.SetType(getInterp(nodePtr, "verdictCategory", doc)) verdicts[i] = verdict } record.Verdicts = verdicts }
func (record *Record) processOffences(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) offencePtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//rs[@type='offenceDescription']")) offences := make([]Offence, len(offencePtrs)) for i, nodePtr := range offencePtrs { node := xml.NewNode(nodePtr, doc) offence := Offence{} offence.Id = node.Attr("id") offence.Category = getInterp(nodePtr, "offenceCategory", doc) offence.SubCategory = getInterp(nodePtr, "offenceSubcategory", doc) offence.Desc = cleanUpContent(node.Content()) offences[i] = offence } record.Offences = offences }
func docSearch(doc *ghtml.HtmlDocument, elementName string, pageName string, xpath string, mustFind bool) []gxml.Node { elementArray, err := doc.Root().Search(xpath) if (err != nil || len(elementArray) == 0) && mustFind == false { return nil } if err != nil { fmt.Fprintf(os.Stderr, "Error locating element \"%s\" in page %s (incorrect xpath?): %v\n", elementName, pageName, err) //fmt.Fprintf(os.Stderr, " doc=%+v\n", doc) os.Exit(1) } if len(elementArray) == 0 { fmt.Fprintf(os.Stderr, "Error locating element \"%s\" in page %s (incorrect xpath?): len() == 0\n", elementName, pageName) //fmt.Fprintf(os.Stderr, " doc=%+v\n", doc) os.Exit(1) } return elementArray }
func (record *Record) processPersons(doc *html.HtmlDocument) { xPath := xpath.NewXPath(doc.DocPtr()) personPtrs := xPath.Evaluate(doc.Root().NodePtr(), xpath.Compile("//persname")) persons := make([]Person, len(personPtrs)) for i, nodePtr := range personPtrs { node := xml.NewNode(nodePtr, doc) person := Person{} person.Id = node.Attr("id") person.GivenName = getInterp(nodePtr, "given", doc) person.Surname = getInterp(nodePtr, "surname", doc) person.SetType(node.Attr("type")) person.SetGender(getInterp(nodePtr, "gender", doc)) persons[i] = person } record.Persons = persons }
func (c *Client) matchDocNode(doc *gokogirihtml.HtmlDocument, xpath string, str string) *gokogirihtml.HtmlDocument { nodes, nodeerr := doc.Search(xpath) if nodeerr != nil { c.Fail("element search error") return doc } if len(nodes) == 0 { c.Fail("element is not found: %s", xpath) return doc } matched, _ := regexp.MatchString(str, nodes[0].String()) if matched { c.Success(1.0) return doc } c.Fail("%s match %s", xpath, str) return doc }
func getCandidates(doc *html.HtmlDocument, minLen int) (map[string]*Candidate, error) { candidates := make(map[string]*Candidate) paragraphs, err := doc.Search(`//p|//td`) if err != nil { return nil, err } for _, elem := range paragraphs { text := elem.Content() if len(text) < minLen { continue } sc := 1.0 sc += float64(len(strings.Split(text, ","))) sc += math.Min(float64(len(text)/100.0), 3.0) parent := elem.Parent() grandParent := parent.Parent() if _, found := candidates[parent.String()]; !found { candidates[parent.String()] = newCadidate(parent) } candidates[parent.String()].score += sc if grandParent != nil && grandParent.IsValid() { if _, found := candidates[grandParent.String()]; !found { candidates[grandParent.String()] = newCadidate(grandParent) } candidates[grandParent.String()].score += (sc / 2.0) } for _, candidate := range candidates { candidate.score = (candidate.score * (1 - linkDensity(candidate.node))) } } return candidates, nil }
// ProcessField method fetches data from passed document func (f *Field) ProcessField(d *html.HtmlDocument) interface{} { var value interface{} var node xml.Node selector := xpath.Compile(f.Selector) result, _ := d.Root().Search(selector) if len(result) > 0 { node = result[0] } else { return "" } if f.Callback != nil { value = f.Callback(&node) } else { value = node.Content() } return value }
func (s *Scenario) CheckAssets(w *Worker, doc *html.HtmlDocument) { var wg sync.WaitGroup base, err := url.Parse(s.Path) if err != nil { return } // <link> links, err := doc.Search("//link") if err == nil { for _, link := range links { if link.Attr("href") != "" { wg.Add(1) go func(link xml.Node) { s.GetAsset(w, base, link, "href") wg.Done() }(link) } } } // <script> scripts, err := doc.Search("//script") if err == nil { for _, script := range scripts { if script.Attr("src") != "" { wg.Add(1) go func(script xml.Node) { s.GetAsset(w, base, script, "src") wg.Done() }(script) } } } // img imgs, err := doc.Search("//img") if err == nil { for _, img := range imgs { if img.Attr("src") != "" { wg.Add(1) go func(img xml.Node) { s.GetAsset(w, base, img, "src") wg.Done() }(img) } } } wg.Wait() }
func getLastUpdatedBi(document *html.HtmlDocument) string { span, _ := document.Search("//span[@id='ctl00_PlaceHolderMain_biWebKursTransaksiBI_lblUpdate']") lastUpdated := span[0].InnerHtml() return lastUpdated }