func init() { attrs := []string{ "id", "class", "name", } for _, attr := range attrs { for _, s := range badNames { sel := fmt.Sprintf("[%s*=%s]", attr, s) remove = append(remove, cascadia.MustCompile(sel)) } for _, s := range badNamesExact { sel := fmt.Sprintf("[%s=%s]", attr, s) remove = append(remove, cascadia.MustCompile(sel)) } for _, s := range badNamesStartsWith { sel := fmt.Sprintf("[%s^=%s]", attr, s) remove = append(remove, cascadia.MustCompile(sel)) } for _, s := range badNamesEndsWith { sel := fmt.Sprintf("[%s$=%s]", attr, s) remove = append(remove, cascadia.MustCompile(sel)) } } }
func TestInterveningElements(t *testing.T) { cases := []struct { e1Sel string e2Sel string expectedSels []string }{ {"#a", "#e", []string{"#b", "#c", "#d"}}, {"html", "body", []string{"head"}}, } doc := parseDoc(walkHTML) for _, dat := range cases { e1 := cascadia.MustCompile(dat.e1Sel).MatchFirst(doc) e2 := cascadia.MustCompile(dat.e2Sel).MatchFirst(doc) expected := []*html.Node{} for _, sel := range dat.expectedSels { expected = append(expected, cascadia.MustCompile(sel).MatchFirst(doc)) } got, err := interveningElements(e1, e2) if err != nil { t.Errorf("interveningElements(%s,%s) failed: %s", dat.e1Sel, dat.e2Sel, err) break } if len(got) != len(expected) { t.Errorf("interveningElements(%s,%s) got: %v expected: %v", dat.e1Sel, dat.e2Sel, got, expected) break } // TODO: elementwise compare } }
func TestNextElement(t *testing.T) { cases := []struct { start string expected string }{ {"html", "head"}, {"head", "body"}, {"#c", "#d"}, {"#d", "#e"}, } doc := parseDoc(walkHTML) for _, dat := range cases { e := cascadia.MustCompile(dat.start).MatchFirst(doc) expect := cascadia.MustCompile(dat.expected).MatchFirst(doc) got := nextElement(e) //fmt.Printf("%s => %s\n", describeNode(e), describeNode(got)) if got != expect { t.Errorf("nextElement('%s') got %s (expected %s)", dat.start, describeNode(got), dat.expected) } } }
func parseHTML(path string, source_depth int, dest string, dashing Dashing) ([]*reference, error) { refs := []*reference{} r, err := os.Open(path) if err != nil { return refs, err } defer r.Close() top, err := html.Parse(r) root := css.MustCompile("*[href],*[src]") roots := root.MatchAll(top) for _, node := range roots { for i, attribute := range node.Attr { if "href" == attribute.Key || "src" == attribute.Key { if strings.HasPrefix(attribute.Val, "/") { // parts of the path - the file name - the source depth path_depth := len(strings.Split(attribute.Val[1:], "/")) - 1 - source_depth relative := "" if path_depth > 0 { strings.Repeat("../", path_depth) } node.Attr[i].Val = relative + attribute.Val[1:] } break } } } for pattern, sel := range dashing.selectors { // Skip this selector if file path doesn't match if sel.MatchPath != nil && !sel.MatchPath.MatchString(path) { continue } m := css.MustCompile(pattern) found := m.MatchAll(top) for _, n := range found { name := text(n) // Skip things explicitly ignored. if ignored(name) { fmt.Printf("Skipping entry for %s (Ignored by dashing JSON)\n", name) continue } // If we have a regexp, run it. if sel.Regexp != nil { name = sel.Regexp.ReplaceAllString(name, sel.Replacement) } // References we want to track. refs = append(refs, &reference{name, sel.Type, path + "#" + anchor(n)}) // We need to modify the DOM with a special link to support TOC. n.Parent.InsertBefore(newA(name, sel.Type), n) } } return refs, writeHTML(path, dest, top) }
func init() { for _, n := range knownImgNames { knownImgIds = append(knownImgIds, cascadia.MustCompile("#"+n)) knownImgClasses = append(knownImgClasses, cascadia.MustCompile("."+n)) } }
func parseDownstreamTable(n *html.Node) (map[modem.Channel]*modem.Downstream, error) { m := map[modem.Channel]*modem.Downstream{} rows := cascadia.MustCompile("tr").MatchAll(n) if len(rows) <= 2 { return nil, fmt.Errorf("Expected more than 2 row in table, got %d", len(rows)) } for _, row := range rows[2:] { d := &modem.Downstream{} var ch modem.Channel for i, col := range cascadia.MustCompile("td").MatchAll(row) { v := htmlutil.GetText(col) fv := v if idx := strings.Index(v, " "); idx != -1 { fv = fv[:idx] } f, _ := strconv.ParseFloat(fv, 64) switch i { case 0: // Channel ch = modem.Channel(v) case 1: // Lock Status case 2: // Modulation d.Modulation = v case 3: // Channel ID case 4: // Frequency (Hz) d.Frequency = v case 5: // Power (dBmV) d.PowerLevel = f case 6: // SNR (dB) d.SNR = f case 7: // Corrected d.Correctable = f case 8: // Uncorrectables d.Uncorrectable = f default: glog.Errorf("Unexpected %dth column in downstream table", i) } } m[ch] = d } return m, nil }
// Remove all extraneous crap in the content - related articles, share buttons etc... // (equivalent to prepArticle() in readbility.js) func removeCruft(contentNodes []*html.Node, candidates candidateMap) { dbug := Debug.ContentLogger dbug.Printf("Cruft removal\n") zapConditionally(contentNodes, "form", candidates) zap(contentNodes, "object") zap(contentNodes, "h1") // If there is only one h2, they are probably using it // as a header and not a subheader, so remove it since we already have a header. h2Count := 0 h2Sel := cascadia.MustCompile("h2") for _, node := range contentNodes { h2Count += len(h2Sel.MatchAll(node)) } if h2Count == 1 { zap(contentNodes, "h2") } zap(contentNodes, "iframe") //cleanHeaders() /* Do these last as the previous stuff may have removed junk that will affect these */ zapConditionally(contentNodes, "table", candidates) zapConditionally(contentNodes, "ul", candidates) zapConditionally(contentNodes, "div", candidates) }
func fetchList(url, prefix string) ([]string, error) { res, err := http.Get(url) if err != nil { return nil, err } defer res.Body.Close() if res.StatusCode != 200 { return nil, fmt.Errorf("failed to fetch %s - %s", url, res.Status) } list := []string{} selector := cascadia.MustCompile("a") webdevdata.ProcessMatchingTagsReader(res.Body, "table tbody tr > td:first-of-type", func(node *html.Node) { pkg := "" link := selector.MatchFirst(node) if link != nil { pkg = webdevdata.GetAttr("href", link.Attr) } else if node.FirstChild != nil && node.FirstChild.Type == html.TextNode { pkg = node.FirstChild.Data } else if node.FirstChild != nil && node.FirstChild.Data == "b" { return } if pkg == "" { log.Fatal("markup from godoc.org changed") } p := strings.TrimLeft(pkg, "/") if !strings.HasPrefix(p, prefix) { return } list = append(list, p) }) return list, nil }
func parseHTML(path, dest string, dashing Dashing) ([]*reference, error) { refs := []*reference{} r, err := os.Open(path) if err != nil { return refs, err } defer r.Close() top, err := html.Parse(r) for pattern, etype := range dashing.Selectors { m := css.MustCompile(pattern) found := m.MatchAll(top) for _, n := range found { name := text(n) // Skip things explicitly ignored. if ignored(name) { fmt.Printf("Skipping entry for %s (Ignored by dashing JSON)\n", name) continue } // References we want to track. refs = append(refs, &reference{name, etype, path + "#" + anchor(n)}) // We need to modify the DOM with a special link to support TOC. n.Parent.InsertBefore(newA(name, etype), n) } } return refs, writeHTML(path, dest, top) }
// Is checks the current matched set of elements against a selector and // returns true if at least one of these elements matches. func (s *Selection) Is(selector string) bool { if len(s.Nodes) > 0 { return s.IsMatcher(cascadia.MustCompile(selector)) } return false }
func updateUpstream(n *html.Node) map[modem.Channel]*upstreamStat { glog.V(2).Infoln("Updating upstream table") stats := map[modem.Channel]*upstreamStat{} var ids []modem.Channel for row, tr := range cascadia.MustCompile("tr").MatchAll(n)[1:] { switch row { case 0: // ID for _, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { id := modem.Channel(htmlutil.GetText(td)) ids = append(ids, id) stats[id] = &upstreamStat{} } case 1: // Frequency for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { stats[ids[i]].frequency = strings.Fields(htmlutil.GetText(td))[0] } case 2: // Ranging Service ID for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { stats[ids[i]].rangingService = htmlutil.GetText(td) } case 3: // Symbol Rate for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { f, err := strconv.ParseFloat(strings.Fields(htmlutil.GetText(td))[0], 64) if err != nil { continue } stats[ids[i]].symbolRate = f * 1000000 } case 4: // Power level for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { f, err := strconv.ParseFloat(strings.Fields(htmlutil.GetText(td))[0], 64) if err != nil { continue } stats[ids[i]].powerLevel = f } case 5: // Modulation for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { stats[ids[i]].modulation = strings.Replace(htmlutil.GetText(td), "\n", " ", -1) } case 6: // Ranging Status for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { stats[ids[i]].rangingStatus = htmlutil.GetText(td) } default: glog.Fatalf("Unhandled %d row in upstream table", row) } } return stats }
func updateSignalStats(n *html.Node) map[modem.Channel]*downstreamErrorStat { glog.V(2).Infoln("Updating signal stats table") stats := map[modem.Channel]*downstreamErrorStat{} var ids []modem.Channel for row, tr := range cascadia.MustCompile("tr").MatchAll(n)[1:] { switch row { case 0: // ID for _, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { id := modem.Channel(htmlutil.GetText(td)) ids = append(ids, id) stats[id] = &downstreamErrorStat{} } case 1: // Total Unerrored Codewords for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { f, err := strconv.ParseFloat(strings.Fields(htmlutil.GetText(td))[0], 64) if err != nil { continue } stats[ids[i]].unerrored = f } case 2: // Total Correctable Codewords for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { f, err := strconv.ParseFloat(strings.Fields(htmlutil.GetText(td))[0], 64) if err != nil { continue } stats[ids[i]].correctable = f } case 3: // Total Uncorrectable Codewords for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { f, err := strconv.ParseFloat(strings.Fields(htmlutil.GetText(td))[0], 64) if err != nil { continue } stats[ids[i]].uncorrectable = f } default: glog.Fatalf("Unhandled %d row in signal stats table", row) } } return stats }
// remove all <script> elements func removeScripts(root *html.Node) []*html.Node { out := []*html.Node{} sel := cascadia.MustCompile("script") for _, script := range sel.MatchAll(root) { script.Parent.RemoveChild(script) out = append(out, script) } return out }
func getProblem(num int) (p problem) { url := "https://projecteuler.net/problem=" + fmt.Sprintf("%v", num) resp, _ := http.Get(url) defer resp.Body.Close() dom, err := html.Parse(resp.Body) if err != nil { panic(err) } content := cascadia.MustCompile("#content").MatchFirst(dom) p.Title = getText(cascadia.MustCompile("h2").MatchFirst(content)) p.Description = getText(cascadia.MustCompile(".problem_content").MatchFirst(content)) p.URL = url p.Id = fmt.Sprintf("Euler%03d", num) return p }
// getLinkDensity calculates the ratio of link text to overall text in a node. // 0 means no link text, 1 means everything is link text func getLinkDensity(n *html.Node) float64 { textLength := len(getTextContent(n)) linkLength := 0 linkSel := cascadia.MustCompile("a") for _, a := range linkSel.MatchAll(n) { linkLength += len(getTextContent(a)) } return float64(linkLength) / float64(textLength) }
func ProcessMatchingTagsReader(r io.Reader, cssSel string, run func(*html.Node)) error { selector := cascadia.MustCompile(cssSel) node, err := html.Parse(r) if err != nil { return err } matchedNodes := selector.MatchAll(node) for _, node := range matchedNodes { run(node) } return nil }
func updateDownstream(n *html.Node) map[modem.Channel]*downstreamStat { glog.V(2).Infoln("Updating downstream table") stats := map[modem.Channel]*downstreamStat{} var ids []modem.Channel // Remove nested tables for _, t := range cascadia.MustCompile("table table").MatchAll(n) { t.Parent.RemoveChild(t) } for row, tr := range cascadia.MustCompile("tr").MatchAll(n)[1:] { switch row { case 0: // ID for _, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { id := modem.Channel(htmlutil.GetText(td)) ids = append(ids, id) stats[id] = &downstreamStat{} } case 1: // Frequency for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { stats[ids[i]].frequency = strings.Fields(htmlutil.GetText(td))[0] } case 2: // SNR for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { f, err := strconv.ParseFloat(strings.Fields(htmlutil.GetText(td))[0], 64) if err != nil { continue } stats[ids[i]].snr = f } case 3: // Modulation for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { stats[ids[i]].modulation = htmlutil.GetText(td) } case 4: for i, td := range cascadia.MustCompile("td").MatchAll(tr)[1:] { // Power level f, err := strconv.ParseFloat(strings.Fields(htmlutil.GetText(td))[0], 64) if err != nil { continue } stats[ids[i]].powerLevel = f } default: glog.Fatalf("Unhandled %d row in downstream table", row) } } return stats }
func zap(contentNodes []*html.Node, tagSel string) { doomed := make([]*html.Node, 0, 32) sel := cascadia.MustCompile(tagSel) for _, contentNode := range contentNodes { for _, node := range sel.MatchAll(contentNode) { // XYZZY TODO: preserve videos? doomed = append(doomed, node) } } for _, n := range doomed { if n.Parent != nil { n.Parent.RemoveChild(n) } } }
func PackFreeBook() string { resp, err := http.Get("https://www.packtpub.com/packt/offers/free-learning") if err != nil { return "Error fetching url" } defer resp.Body.Close() doc, err := html.Parse(resp.Body) if err != nil { return "Error parsing" } book := cascadia.MustCompile(`div#deal-of-the-day div.dotd-main-book div.section-inner div.dotd-main-book-summary div.dotd-title h2 *`).MatchFirst(doc) header := strings.TrimSpace(book.Data) return header }
func parseStatus(r io.Reader) (*modem.Signal, error) { n, err := html.Parse(r) if err != nil { return nil, err } signal := &modem.Signal{ Downstream: map[modem.Channel]*modem.Downstream{}, Upstream: map[modem.Channel]*modem.Upstream{}, } // All top-level tables are immediate descendants of center. One table has // a nested table in a td, which this filter excludes. sel := cascadia.MustCompile("center > table") for i, t := range sel.MatchAll(n) { switch i { case 0: for ch, s := range updateDownstream(t) { signal.Downstream[ch] = &modem.Downstream{ Frequency: s.frequency, SNR: s.snr, Modulation: s.modulation, PowerLevel: s.powerLevel, } } case 1: for ch, s := range updateUpstream(t) { signal.Upstream[ch] = &modem.Upstream{ Frequency: s.frequency, Status: s.rangingStatus, SymbolRate: s.symbolRate, Modulation: s.modulation, PowerLevel: s.powerLevel, } } case 2: for ch, s := range updateSignalStats(t) { d := signal.Downstream[ch] d.Unerrored = s.unerrored d.Correctable = s.correctable d.Unerrored = s.uncorrectable } } } return signal, nil }
func main() { file, err := os.Open("example.html") if err != nil { panic(err) } doc, err := html.Parse(file) if err != nil { panic(err) } selector := cascadia.MustCompile("p") nodes := selector.MatchAll(doc) for _, node := range nodes { fmt.Println(node.FirstChild.Data) } }
func parseStatus(r io.Reader) (*modem.Signal, error) { n, err := html.Parse(r) if err != nil { return nil, err } tables := cascadia.MustCompile(".simpleTable").MatchAll(n) if len(tables) != 3 { return nil, fmt.Errorf("Found %d simpleTables, expected 3", len(tables)) } d, err := parseDownstreamTable(tables[1]) if err != nil { return nil, err } u, err := parseUpstreamTable(tables[2]) if err != nil { return nil, err } return &modem.Signal{ Downstream: d, Upstream: u, }, nil }
// PrevAllFiltered gets all the preceding siblings of each element in the // Selection filtered by a selector. It returns a new Selection object // containing the matched elements. func (s *Selection) PrevAllFiltered(selector string) *Selection { return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevAll, nil, nil), cascadia.MustCompile(selector)) }
// PrevUntil gets all preceding siblings of each element up to but not // including the element matched by the selector. It returns a new Selection // object containing the matched elements. func (s *Selection) PrevUntil(selector string) *Selection { return pushStack(s, getSiblingNodes(s.Nodes, siblingPrevUntil, cascadia.MustCompile(selector), nil)) }
// Add adds the selector string's matching nodes to those in the current // selection and returns a new Selection object. // The selector string is run in the context of the document of the current // Selection object. func (s *Selection) Add(selector string) *Selection { return s.AddNodes(findWithMatcher([]*html.Node{s.document.rootNode}, cascadia.MustCompile(selector))...) }
// PrevFilteredUntil is like PrevUntil, with the option to filter // the results based on a selector string. // It returns a new Selection object containing the matched elements. func (s *Selection) PrevFilteredUntil(filterSelector, untilSelector string) *Selection { return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevUntil, cascadia.MustCompile(untilSelector), nil), cascadia.MustCompile(filterSelector)) }
// PrevFilteredUntilSelection is like PrevUntilSelection, with the // option to filter the results based on a selector string. It returns a new // Selection object containing the matched elements. func (s *Selection) PrevFilteredUntilSelection(filterSelector string, sel *Selection) *Selection { return s.PrevMatcherUntilSelection(cascadia.MustCompile(filterSelector), sel) }
// PrevFilteredUntilNodes is like PrevUntilNodes, with the // option to filter the results based on a selector string. It returns a new // Selection object containing the matched elements. func (s *Selection) PrevFilteredUntilNodes(filterSelector string, nodes ...*html.Node) *Selection { return filterAndPush(s, getSiblingNodes(s.Nodes, siblingPrevUntil, nil, nodes), cascadia.MustCompile(filterSelector)) }
// ChildrenFiltered gets the child elements of each element in the Selection, // filtered by the specified selector. It returns a new // Selection object containing these elements. func (s *Selection) ChildrenFiltered(selector string) *Selection { return filterAndPush(s, getChildrenNodes(s.Nodes, siblingAll), cascadia.MustCompile(selector)) }
return ip4[1] == 168 } return false } if ip[0]&0xfe == 0xfc { return true } if ip[0] == 0xfe && (ip[1]&0xfc) == 0x80 { return true } return false } var titleSelector = cascadia.MustCompile("title") func (h ProxyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { activeConnections.Add(1) defer activeConnections.Done() conf := GetConfig() if !conf.ACLsLoaded { http.Error(w, "Redwood proxy configuration needs to be updated for this version of Redwood.\n(Use ACLs)", 500) return } if len(r.URL.String()) > 10000 { http.Error(w, "URL too long", http.StatusRequestURITooLong) return