func (s *Scenario) GetAsset(w *Worker, base *url.URL, node xml.Node, attr string) error { path, err := url.Parse(node.Attr(attr)) if err != nil { return w.Fail(nil, err) } requestURI := base.ResolveReference(path) req, res, err := w.SimpleGet(requestURI.String()) if err != nil { return w.Fail(req, err) } if res.StatusCode != 200 { return w.Fail(res.Request, fmt.Errorf("Response code should be %d, got %d", 200, res.StatusCode)) } md5sum := calcMD5(res.Body) defer res.Body.Close() if expectedMD5, ok := s.ExpectedAssets[requestURI.RequestURI()]; ok { if md5sum == expectedMD5 { w.Success(StaticFileScore) } else { return w.Fail(res.Request, fmt.Errorf("Expected MD5 checksum is miss match %s, got %s", expectedMD5, md5sum)) } } return nil }
func trNodeToSchedule(scheduleNode xml.Node) (item ScheduleItem, err error) { results, err := scheduleNode.Search("./td/text()") if err != nil { return ScheduleItem{}, err } item = ScheduleItem{ TrainNumber: strings.TrimSpace(results[1].String()), Misc: strings.TrimSpace(results[2].String()), Class: strings.TrimSpace(results[3].String()), Relation: strings.TrimSpace(results[4].String()), StartingStation: strings.TrimSpace(results[5].String()), CurrentStation: strings.TrimSpace(results[6].String()), ArrivingTime: strings.TrimSpace(results[7].String()), DepartingTime: strings.TrimSpace(results[8].String()), Ls: strings.TrimSpace(results[9].String()), } if len(results) > 10 { item.Status = strings.TrimSpace(results[10].String()) } stationParts := strings.FieldsFunc(item.Relation, func(r rune) bool { return r == '-' }) item.EndStation = stationParts[1] // [ANGKE BOGOR] BOGOR is end station return }
func getContent(node gxtml.Node, cssQuery string) string { result, err := node.Search(toXpath(cssQuery)) if err != nil { panic(fmt.Errorf("Failed to find %v node", cssQuery)) } return result[0].Content() }
func Link(performance xml.Node) string { anchor, err := performance.Search(".//a") if err != nil { fmt.Println(err) } return "http://www.bso.org" + anchor[0].Attr("href") }
func extend(doc *xml.ElementNode, node xml.Node) { dup := node.Duplicate(1).(*xml.ElementNode) doc.AddChild(dup) if strings.ToLower(dup.Name()) != "p" || strings.ToLower(dup.Name()) != "div" { dup.SetName("div") } }
func (s *scrape) MapRegex(node xml.Node) (map[string]string, error) { if node.IsValid() == false { return nil, errors.New("Invalid node") } m := make(map[string]string, 1) inner := node.String() for k, v := range ScrapeRegex { // remove new line chars reg, _ := regexp.CompilePOSIX("\r\n|\r|\n") inner = reg.ReplaceAllString(inner, "") // get the real data reg, _ = regexp.CompilePOSIX(v[0]) scraped := reg.FindString(inner) scraped = reg.ReplaceAllString(scraped, "$1") if scraped != "" { m[k] = scraped } } // Skip empty and unwanted if len(m) > 0 { if m[ScrapeMeta[IGNOREEMPTY]] != "" { return m, nil } return nil, nil } return nil, nil }
func parseSource(m xml.Node) string { res, _ := m.Search("source") if len(res) > 0 { return res[0].Content() } return "" }
// Walks through the documents elements and populates the buffer. func (self *Formatter) walk(node xml.Node) { for c := node.FirstChild(); c != nil; c = c.NextSibling() { self.walk(c) } if node.NodeType() == xml.XML_ELEMENT_NODE { self.handleNode(node) } }
func parseRights(m xml.Node) string { res, _ := m.Search("rights") if len(res) > 0 { return res[0].Content() } return "" }
// Writes code blocks to the buffer. func (self *Formatter) writeCodeBlock(node xml.Node) { block := []byte(strings.Trim(node.Content(), "\n\r\v")) node.SetContent("") if len(block) == 0 { return } self.buf.Write(block) self.buf.Write([]byte{'\n', '\n'}) }
func (this *Document) walkElements(node xml.Node, f func(xml.Node) error) error { f(node) for child := node.FirstChild(); child != nil; child = child.NextSibling() { err := this.walkElements(child, f) if err != nil { return err } } return nil }
func parseSubjects(m xml.Node) []string { subjects := []string{} res, _ := m.Search("subject") for _, n := range res { subjects = append(subjects, n.Content()) } return subjects }
func parseDescription(m xml.Node) string { description := "" res, _ := m.Search("description") if len(res) > 0 { description = res[0].Content() } return description }
func parsePublisher(m xml.Node) string { publisher := "" res, _ := m.Search("publisher") if len(res) > 0 { publisher = res[0].Content() } return publisher }
func parseTitles(m xml.Node) []string { titles := []string{} res, _ := m.Search("title") for _, n := range res { titles = append(titles, n.Content()) } return titles }
func parseLanguages(m xml.Node) []string { languages := []string{} res, _ := m.Search("language") for _, n := range res { languages = append(languages, n.Content()) } return languages }
func parseIdentifiers(m xml.Node) []*Identifier { identifiers := []*Identifier{} res, _ := m.Search("identifier") for _, n := range res { identifier := Identifier{Identifier: n.Content(), Scheme: n.Attr("scheme")} identifiers = append(identifiers, &identifier) } return identifiers }
func Pieces(details xml.Node) string { pieces, _ := details.Search(".//div[@class='program-media-collapse']/h3") var piecesString string piecesString = "<ul class=\"works\">" for _, piece := range pieces { piecesString += "<li>" piecesString += piece.Content() piecesString += "</li>" } piecesString += "</ul>" return piecesString }
// unmarshalPath walks down an XML structure looking for wanted
// paths, and calls unmarshal on them.
//
// parents is the element path matched so far and start is the node being
// examined. Each element field in tinfo.fields whose parent path begins with
// parents is considered:
//   - if its parent path equals parents and its name equals start's name, the
//     field is unmarshalled from start and that result is returned;
//   - if start's name is the next component of its parent path, the search
//     descends into start's element children with the extended path.
//
// It returns nil when start is uninteresting for sv's fields, and otherwise
// the first error produced by unmarshal or by a recursive call.
func (p *Decoder) unmarshalPath(tinfo *typeInfo, sv reflect.Value, parents []string, start gokoxml.Node) (err error) {
	recurse := false
	name := start.Name() // For speed
Loop:
	for i := range tinfo.fields {
		finfo := &tinfo.fields[i]
		// Skip fields without the fElement flag and fields whose parent path
		// is shorter than the path already matched.
		if finfo.flags&fElement == 0 || len(finfo.parents) < len(parents) {
			continue
		}
		// The field's parent path must start with parents; move on to the
		// next field at the first mismatch.
		for j := range parents {
			if parents[j] != finfo.parents[j] {
				continue Loop
			}
		}
		if len(finfo.parents) == len(parents) && finfo.name == name {
			// It's a perfect match, unmarshal the field.
			return p.unmarshal(sv.FieldByIndex(finfo.idx), start)
		}
		if len(finfo.parents) > len(parents) && finfo.parents[len(parents)] == name {
			// It's a prefix for the field. Break and recurse
			// since it's not ok for one field path to be itself
			// the prefix for another field path.
			recurse = true

			// We can reuse the same slice as long as we
			// don't try to append to it.
			parents = finfo.parents[:len(parents)+1]
			break
		}
	}
	if !recurse {
		// We have no business with this element.
		return nil
	}
	// The element is not a perfect match for any field, but one
	// or more fields have the path to this element as a parent
	// prefix. Recurse and attempt to match these.
	for cur_node := start.FirstChild(); cur_node != nil; cur_node = cur_node.NextSibling() {
		// Only element children can match a field path.
		if cur_node.NodeType() != gokoxml.XML_ELEMENT_NODE {
			continue
		}
		if err := p.unmarshalPath(tinfo, sv, parents, cur_node); err != nil {
			return err
		}
	}
	// No more XML Nodes.
	return nil
}
func linkDensity(node xml.Node) float64 { links, err := node.Search("a") if err != nil { return 0.0 } llength := 0.0 for _, link := range links { llength += float64(len(link.Content())) } tlength := float64(len(node.Content())) return llength / tlength }
// writeBlock writes node's text content to the buffer as a wrapped block and
// clears the node's content.
//
// The content is whitespace-trimmed first; if nothing remains, nothing is
// written. prefix is written once, before the first line, and every wrapped
// line is indented with len(prefix) spaces instead. Lines are broken at
// whitespace once they grow past 79-len(prefix) bytes. Runs of whitespace
// between words collapse to a single space, and the block ends with two
// newlines.
//
// The text is processed byte by byte (not rune by rune); whitespace is a
// lookup defined elsewhere in the package that is indexed by byte.
func (self *Formatter) writeBlock(node xml.Node, prefix string) {
	block := []byte(strings.TrimSpace(node.Content()))
	node.SetContent("")
	if len(block) == 0 {
		return
	}
	// Position of last space, line break and max length.
	// sp is the index where the word not yet written starts; br is the value
	// sp had when the last line break was emitted (0 before any break).
	sp, br, max := 0, 0, 79-len(prefix)
	self.buf.WriteString(prefix)
	for i, c := range block {
		// Break line if exceeded max length and the position of the last space
		// is greater than the position of the last line break. Don't break very
		// long words.
		if i-br > max && sp > br {
			self.buf.WriteByte('\n')
			// With br == sp the next word is written without a leading space.
			br = sp
			// Only the first line is prefixed.
			for j := 0; j < len(prefix); j++ {
				self.buf.WriteByte(' ')
			}
		}
		if whitespace[c] {
			// The last character was a space, so ignore this one.
			// Both markers advance together so their relative order is kept.
			if sp == i {
				sp++
				br++
				continue
			}
			// Write the last word to the buffer, append a space and update
			// the position of the last space.
			// The separating space is emitted before the word, and only when
			// the word does not start a line.
			if sp > br {
				self.buf.WriteByte(' ')
			}
			self.buf.Write(block[sp:i])
			sp = i + 1
		}
	}
	// Write the last word to the buffer.
	if sp < len(block) {
		if sp > br {
			self.buf.WriteByte(' ')
		}
		self.buf.Write(block[sp:])
	}
	// Close block with 2 breaks.
	self.buf.Write([]byte{'\n', '\n'})
}
/* * Invokes a MIME type plugin based on current node's type attribute, passing src attribute's value * as argument. Subcommand's output is piped to Gokogiri through a buffer. */ func (gen *Generator) handleMIMETypePlugin(e xml.Node, doc *html.HtmlDocument) (err error) { src := e.Attribute("src").Value() typ := e.Attribute("type").Value() cmd := exec.Command(fmt.Sprintf("m%s%s", ZAS_PREFIX, gen.resolveMIMETypePlugin(typ)), src) stdout, err := cmd.StdoutPipe() if err != nil { return } cmd.Stderr = os.Stderr c := make(chan bufErr) go func() { data, err := ioutil.ReadAll(stdout) c <- bufErr{data, err} }() if err = cmd.Start(); err != nil { return } be := <-c if err = cmd.Wait(); err != nil { return } if be.err != nil { return be.err } parent := e.Parent() child, err := doc.Coerce(be.buffer) if err != nil { return } parent.AddChild(child) e.Remove() return }
// parseAppDiv extracts timestamp and blockindex from an appointment div func parseAppDiv(div xml.Node) (timestamp int64, blockIndex string, err error) { idValues := idBlockPattern.FindStringSubmatch(div.Attr("id")) timestamp, err = strconv.ParseInt(idValues[1], 10, 64) if err != nil { return } blockIndexValues := blockIndexPattern.FindStringSubmatch(div.Content()) if len(blockIndexValues) == 1 { blockIndex = blockIndexValues[0] } return }
func processNode(node xml.Node, row string) { row = row + node.Attr("TEXT") + "|" kids, err := node.Search("node") if err != nil { log.Println("Error searching for node:", err) return } if len(kids) > 0 { // has children, not a leaf node for i := range kids { processNode(kids[i], row) } } else { fmt.Println(row) // print leaf node } }
/* Not working because: http://www.sc2ratings.com/players.php?realname=Yang,%20Hee-Soo is parsed as: http://www.sc2ratings.com/players.php?realname=Yang, Hee-Soo */ func parseLeagues(player xml.Node) []string { out := []string{} partialUrl, err := player.Search(".//a/@href") errorHandler(err) if len(partialUrl) == 1 { playerPageUrl := "http://www.sc2ratings.com/" + partialUrl[0].String() playerPageSource := retrievePageSource(playerPageUrl) playerPage, err := gokogiri.ParseHtml(playerPageSource) errorHandler(err) defer playerPage.Free() fmt.Println(playerPage) } return out }
func parseDates(m xml.Node) []*Date { dates := []*Date{} res, _ := m.Search("date") for _, n := range res { date := Date{Date: n.Content(), Event: n.Attr("event")} dates = append(dates, &date) } res, _ = m.Search("meta[@property='dcterms:modified']") if len(res) > 0 { date := Date{Date: res[0].Content(), Event: "modified"} dates = append(dates, &date) } return dates }
func loadInputs(doc xml.Node) ([]*Input, error) { nodes, e := doc.Search(".//input") if e != nil { return nil, e } out := []*Input{} for _, n := range nodes { i := &Input{ Type: n.Attr("type"), Name: n.Attr("name"), Value: n.Attr("value"), Checked: n.Attr("checked") == "checked", } out = append(out, i) } return out, nil }
// ProcessField method fetches data from passed document func (f *Field) ProcessField(d *html.HtmlDocument) interface{} { var value interface{} var node xml.Node selector := xpath.Compile(f.Selector) result, _ := d.Root().Search(selector) if len(result) > 0 { node = result[0] } else { return "" } if f.Callback != nil { value = f.Callback(&node) } else { value = node.Content() } return value }
// removeNonAnchorElements rewrites the subtree below node so that every
// element descendant whose name is not "a" (compared case-insensitively) is
// removed and its children take its place. Non-element nodes are left
// untouched, and node itself is never removed.
func removeNonAnchorElements(node xml.Node) {
	var next xml.Node
	for n := node.FirstChild(); n != nil; n = next {
		// Capture the sibling first: n may be removed from the tree below.
		next = n.NextSibling()
		if n.NodeType() == xml.XML_ELEMENT_NODE {
			// Clean the descendants first, so that whatever is hoisted out of
			// n below has already been processed.
			removeNonAnchorElements(n)
			if strings.ToLower(n.Name()) != "a" {
				var chNext xml.Node
				for ch := n.FirstChild(); ch != nil; ch = chNext {
					// Same pattern: remember the sibling before ch is moved.
					chNext = ch.NextSibling()
					ch.Unlink()
					// Presumably re-attaches ch as the sibling immediately
					// preceding n, keeping the children in their original
					// order — confirm against the xml.Node documentation.
					n.InsertBefore(ch)
				}
				n.Remove()
			}
		}
	}
}
func parseListApp(div xml.Node) (*Appointment, error) { values := idListPattern.FindStringSubmatch(div.Attr("id")) timestamp, err := strconv.ParseInt(values[1], 10, 64) if err != nil { return nil, err } practitioner, err := strconv.ParseInt(values[2], 10, 64) if err != nil { return nil, err } return &Appointment{ session: nil, Timestamp: time.Unix(timestamp, 0), Practitioner: Practitioner(practitioner), Status: Booked, }, nil }