示例#1
0
文件: cleanup.go 项目: oudommeas/swan
func init() {
	attrs := []string{
		"id",
		"class",
		"name",
	}

	for _, attr := range attrs {
		for _, s := range badNames {
			sel := fmt.Sprintf("[%s*=%s]", attr, s)
			remove = append(remove, cascadia.MustCompile(sel))
		}

		for _, s := range badNamesExact {
			sel := fmt.Sprintf("[%s=%s]", attr, s)
			remove = append(remove, cascadia.MustCompile(sel))
		}

		for _, s := range badNamesStartsWith {
			sel := fmt.Sprintf("[%s^=%s]", attr, s)
			remove = append(remove, cascadia.MustCompile(sel))
		}

		for _, s := range badNamesEndsWith {
			sel := fmt.Sprintf("[%s$=%s]", attr, s)
			remove = append(remove, cascadia.MustCompile(sel))
		}
	}
}
示例#2
0
// TestInterveningElements checks that interveningElements, called with the
// first nodes matching e1Sel and e2Sel, returns exactly the first nodes
// matching expectedSels, element by element.
func TestInterveningElements(t *testing.T) {

	cases := []struct {
		e1Sel        string
		e2Sel        string
		expectedSels []string
	}{
		{"#a", "#e", []string{"#b", "#c", "#d"}},
		{"html", "body", []string{"head"}},
	}

	doc := parseDoc(walkHTML)

	for _, dat := range cases {
		e1 := cascadia.MustCompile(dat.e1Sel).MatchFirst(doc)
		e2 := cascadia.MustCompile(dat.e2Sel).MatchFirst(doc)
		expected := make([]*html.Node, 0, len(dat.expectedSels))
		for _, sel := range dat.expectedSels {
			expected = append(expected, cascadia.MustCompile(sel).MatchFirst(doc))
		}

		got, err := interveningElements(e1, e2)
		if err != nil {
			t.Errorf("interveningElements(%s,%s) failed: %s", dat.e1Sel, dat.e2Sel, err)
			// Keep going so that every failing case is reported.
			continue
		}

		if len(got) != len(expected) {
			t.Errorf("interveningElements(%s,%s) got: %v  expected: %v", dat.e1Sel, dat.e2Sel, got, expected)
			continue
		}

		// Lengths match, so compare node identity position by position.
		for i := range expected {
			if got[i] != expected[i] {
				t.Errorf("interveningElements(%s,%s) element %d: got %p  expected %p (%s)",
					dat.e1Sel, dat.e2Sel, i, got[i], expected[i], dat.expectedSels[i])
			}
		}
	}
}
示例#3
0
// TestNextElement verifies that nextElement, started from the first node
// matching "from", returns the first node matching "want".
func TestNextElement(t *testing.T) {
	type step struct {
		from string
		want string
	}
	steps := []step{
		{from: "html", want: "head"},
		{from: "head", want: "body"},
		{from: "#c", want: "#d"},
		{from: "#d", want: "#e"},
	}

	doc := parseDoc(walkHTML)

	for _, tc := range steps {
		startNode := cascadia.MustCompile(tc.from).MatchFirst(doc)
		wantNode := cascadia.MustCompile(tc.want).MatchFirst(doc)

		if gotNode := nextElement(startNode); gotNode != wantNode {
			t.Errorf("nextElement('%s') got %s (expected %s)", tc.from, describeNode(gotNode), tc.want)
		}
	}
}
示例#4
0
// parseHTML parses the HTML file at path, rewrites root-relative href/src
// attributes into relative ones, records a reference for every node matched
// by the selectors in dashing, and writes the modified document through
// writeHTML(path, dest, top).
//
// source_depth is subtracted from the number of directory components of each
// root-relative link; when the result is positive, that many "../" segments
// are prepended to the rewritten link.
//
// It returns the collected references together with the first error from
// opening, parsing or writing the file.
func parseHTML(path string, source_depth int, dest string, dashing Dashing) ([]*reference, error) {
	refs := []*reference{}

	r, err := os.Open(path)
	if err != nil {
		return refs, err
	}
	defer r.Close()
	top, err := html.Parse(r)
	if err != nil {
		return refs, err
	}

	root := css.MustCompile("*[href],*[src]")
	for _, node := range root.MatchAll(top) {
		for i, attribute := range node.Attr {
			if attribute.Key != "href" && attribute.Key != "src" {
				continue
			}
			if strings.HasPrefix(attribute.Val, "/") {
				// parts of the path - the file name - the source depth
				pathDepth := len(strings.Split(attribute.Val[1:], "/")) - 1 - source_depth
				relative := ""
				if pathDepth > 0 {
					// The result must be assigned: strings.Repeat returns
					// a new string and has no side effect.
					relative = strings.Repeat("../", pathDepth)
				}
				node.Attr[i].Val = relative + attribute.Val[1:]
			}
			// Only the first href/src attribute of a node is rewritten.
			break
		}
	}

	for pattern, sel := range dashing.selectors {
		// Skip this selector if file path doesn't match
		if sel.MatchPath != nil && !sel.MatchPath.MatchString(path) {
			continue
		}

		m := css.MustCompile(pattern)
		found := m.MatchAll(top)
		for _, n := range found {
			name := text(n)

			// Skip things explicitly ignored.
			if ignored(name) {
				fmt.Printf("Skipping entry for %s (Ignored by dashing JSON)\n", name)
				continue
			}

			// If we have a regexp, run it.
			if sel.Regexp != nil {
				name = sel.Regexp.ReplaceAllString(name, sel.Replacement)
			}

			// References we want to track.
			refs = append(refs, &reference{name, sel.Type, path + "#" + anchor(n)})
			// We need to modify the DOM with a special link to support TOC.
			n.Parent.InsertBefore(newA(name, sel.Type), n)
		}
	}
	return refs, writeHTML(path, dest, top)
}
示例#5
0
// init derives two selectors from every entry of knownImgNames: an id
// selector ("#name") appended to knownImgIds and a class selector (".name")
// appended to knownImgClasses.
func init() {
	for _, name := range knownImgNames {
		byID := cascadia.MustCompile("#" + name)
		knownImgIds = append(knownImgIds, byID)

		byClass := cascadia.MustCompile("." + name)
		knownImgClasses = append(knownImgClasses, byClass)
	}
}
示例#6
0
文件: sb6183.go 项目: wathiede/surfer
// parseDownstreamTable builds one modem.Downstream per <tr> under n, skipping
// the first two rows, and keys the result by the text of each row's first
// cell. It returns an error when the table has two rows or fewer.
func parseDownstreamTable(n *html.Node) (map[modem.Channel]*modem.Downstream, error) {
	rows := cascadia.MustCompile("tr").MatchAll(n)
	if len(rows) <= 2 {
		return nil, fmt.Errorf("Expected more than 2 row in table, got %d", len(rows))
	}

	cellSel := cascadia.MustCompile("td")
	result := map[modem.Channel]*modem.Downstream{}
	for _, row := range rows[2:] {
		var channel modem.Channel
		entry := &modem.Downstream{}
		for col, cell := range cellSel.MatchAll(row) {
			text := htmlutil.GetText(cell)

			// Only the part before the first space is parsed as a number
			// (presumably to drop a trailing unit; confirm against the
			// device's page).
			numText := text
			if sp := strings.IndexByte(text, ' '); sp >= 0 {
				numText = text[:sp]
			}
			// The parse error is deliberately dropped: num is computed for
			// every column but only read by the numeric ones.
			num, _ := strconv.ParseFloat(numText, 64)

			switch col {
			case 0:
				// Channel
				channel = modem.Channel(text)
			case 1, 3:
				// Lock Status and Channel ID are not recorded.
			case 2:
				// Modulation
				entry.Modulation = text
			case 4:
				// Frequency (Hz)
				entry.Frequency = text
			case 5:
				// Power (dBmV)
				entry.PowerLevel = num
			case 6:
				// SNR (dB)
				entry.SNR = num
			case 7:
				// Corrected
				entry.Correctable = num
			case 8:
				// Uncorrectables
				entry.Uncorrectable = num
			default:
				glog.Errorf("Unexpected %dth column in downstream table", col)
			}
		}
		result[channel] = entry
	}
	return result, nil
}
示例#7
0
文件: content.go 项目: bcampbell/arts
// removeCruft strips extraneous elements (related articles, share buttons
// etc.) out of contentNodes.
// (equivalent to prepArticle() in readability.js)
func removeCruft(contentNodes []*html.Node, candidates candidateMap) {
	Debug.ContentLogger.Printf("Cruft removal\n")

	zapConditionally(contentNodes, "form", candidates)
	for _, tag := range []string{"object", "h1"} {
		zap(contentNodes, tag)
	}

	// A single h2 is probably being used as the header rather than as a
	// subheader; since we already have a header, drop it.
	h2Total := 0
	h2Matcher := cascadia.MustCompile("h2")
	for i := range contentNodes {
		h2Total += len(h2Matcher.MatchAll(contentNodes[i]))
	}
	if h2Total == 1 {
		zap(contentNodes, "h2")
	}

	zap(contentNodes, "iframe")

	// These run last because the removals above may have taken out junk
	// that would affect them.
	for _, tag := range []string{"table", "ul", "div"} {
		zapConditionally(contentNodes, tag, candidates)
	}
}
示例#8
0
// fetchList downloads the page at url and returns the package paths found in
// the first cell of each table body row, keeping only those that start with
// prefix once leading slashes are trimmed.
//
// For each cell the package path is the href of the first link in the cell,
// or the cell's leading text when there is no link. Cells whose first child
// is a "b" element are skipped. Any other cell that yields no package path
// terminates the program via log.Fatal.
//
// It returns an error when the request fails, the response status is not
// 200 OK, or the response body cannot be parsed.
func fetchList(url, prefix string) ([]string, error) {
	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to fetch %s - %s", url, res.Status)
	}
	list := []string{}
	selector := cascadia.MustCompile("a")
	err = webdevdata.ProcessMatchingTagsReader(res.Body, "table tbody tr > td:first-of-type", func(node *html.Node) {
		pkg := ""
		link := selector.MatchFirst(node)
		if link != nil {
			pkg = webdevdata.GetAttr("href", link.Attr)
		} else if node.FirstChild != nil && node.FirstChild.Type == html.TextNode {
			pkg = node.FirstChild.Data
		} else if node.FirstChild != nil && node.FirstChild.Data == "b" {
			return
		}
		if pkg == "" {
			log.Fatal("markup from godoc.org changed")
		}
		p := strings.TrimLeft(pkg, "/")
		if !strings.HasPrefix(p, prefix) {
			return
		}
		list = append(list, p)
	})
	// A parse failure must not be reported as a successful, empty listing.
	if err != nil {
		return nil, err
	}
	return list, nil
}
示例#9
0
// parseHTML parses the HTML file at path, records a reference for every node
// matched by the selectors in dashing, inserts a table-of-contents anchor
// before each such node, and writes the modified document through
// writeHTML(path, dest, top).
//
// It returns the collected references together with the first error from
// opening, parsing or writing the file.
func parseHTML(path, dest string, dashing Dashing) ([]*reference, error) {
	refs := []*reference{}

	r, err := os.Open(path)
	if err != nil {
		return refs, err
	}
	defer r.Close()
	top, err := html.Parse(r)
	if err != nil {
		return refs, err
	}

	for pattern, etype := range dashing.Selectors {
		m := css.MustCompile(pattern)
		found := m.MatchAll(top)
		for _, n := range found {
			name := text(n)

			// Skip things explicitly ignored.
			if ignored(name) {
				fmt.Printf("Skipping entry for %s (Ignored by dashing JSON)\n", name)
				continue
			}
			// References we want to track.
			refs = append(refs, &reference{name, etype, path + "#" + anchor(n)})
			// We need to modify the DOM with a special link to support TOC.
			n.Parent.InsertBefore(newA(name, etype), n)
		}
	}
	return refs, writeHTML(path, dest, top)
}
示例#10
0
// Is reports whether at least one element of the current matched set matches
// the given selector. An empty Selection never matches, and in that case the
// selector string is not compiled.
func (s *Selection) Is(selector string) bool {
	if len(s.Nodes) == 0 {
		return false
	}

	matcher := cascadia.MustCompile(selector)
	return s.IsMatcher(matcher)
}
示例#11
0
文件: sb6121.go 项目: wathiede/surfer
// updateUpstream reads the upstream table rooted at n and returns one
// upstreamStat per channel ID.
//
// The first <tr> is skipped. Of the remaining rows, row 0 supplies the
// channel IDs and rows 1-6 supply, in order, frequency, ranging service ID,
// symbol rate, power level, modulation and ranging status. The first <td> of
// every row is skipped and the i-th remaining cell is attributed to the i-th
// channel ID. Any further row is reported with glog.Fatalf.
//
// Malformed input is skipped instead of causing an index-out-of-range panic:
// a table without rows yields an empty map, and rows without cells, empty
// cells, unparsable numbers and cells beyond the known channel IDs are
// ignored.
func updateUpstream(n *html.Node) map[modem.Channel]*upstreamStat {
	glog.V(2).Infoln("Updating upstream table")
	stats := map[modem.Channel]*upstreamStat{}
	var ids []modem.Channel

	// Both selectors are loop-invariant, so compile them once.
	trSel := cascadia.MustCompile("tr")
	tdSel := cascadia.MustCompile("td")

	// dataCells returns the cells of tr after its first one.
	dataCells := func(tr *html.Node) []*html.Node {
		tds := tdSel.MatchAll(tr)
		if len(tds) > 0 {
			tds = tds[1:]
		}
		return tds
	}
	// forEachStat calls fn with each data cell of tr and the stat of the
	// channel in the same column, stopping once the known IDs run out.
	forEachStat := func(tr *html.Node, fn func(st *upstreamStat, td *html.Node)) {
		for i, td := range dataCells(tr) {
			if i >= len(ids) {
				return
			}
			fn(stats[ids[i]], td)
		}
	}
	// firstField returns the first whitespace-separated token of td's text;
	// ok is false when the cell holds no token.
	firstField := func(td *html.Node) (field string, ok bool) {
		fields := strings.Fields(htmlutil.GetText(td))
		if len(fields) == 0 {
			return "", false
		}
		return fields[0], true
	}
	// firstFloat parses the first token of td's text as a float64; ok is
	// false when there is no token or it is not a number.
	firstFloat := func(td *html.Node) (f float64, ok bool) {
		field, ok := firstField(td)
		if !ok {
			return 0, false
		}
		f, err := strconv.ParseFloat(field, 64)
		return f, err == nil
	}

	rows := trSel.MatchAll(n)
	if len(rows) == 0 {
		return stats
	}
	for row, tr := range rows[1:] {
		switch row {
		case 0:
			// ID
			for _, td := range dataCells(tr) {
				id := modem.Channel(htmlutil.GetText(td))
				ids = append(ids, id)
				stats[id] = &upstreamStat{}
			}
		case 1:
			// Frequency
			forEachStat(tr, func(st *upstreamStat, td *html.Node) {
				if field, ok := firstField(td); ok {
					st.frequency = field
				}
			})
		case 2:
			// Ranging Service ID
			forEachStat(tr, func(st *upstreamStat, td *html.Node) {
				st.rangingService = htmlutil.GetText(td)
			})
		case 3:
			// Symbol Rate
			forEachStat(tr, func(st *upstreamStat, td *html.Node) {
				if f, ok := firstFloat(td); ok {
					st.symbolRate = f * 1000000
				}
			})
		case 4:
			// Power level
			forEachStat(tr, func(st *upstreamStat, td *html.Node) {
				if f, ok := firstFloat(td); ok {
					st.powerLevel = f
				}
			})
		case 5:
			// Modulation
			forEachStat(tr, func(st *upstreamStat, td *html.Node) {
				st.modulation = strings.Replace(htmlutil.GetText(td), "\n", " ", -1)
			})
		case 6:
			// Ranging Status
			forEachStat(tr, func(st *upstreamStat, td *html.Node) {
				st.rangingStatus = htmlutil.GetText(td)
			})
		default:
			glog.Fatalf("Unhandled %d row in upstream table", row)
		}
	}
	return stats
}
示例#12
0
文件: sb6121.go 项目: wathiede/surfer
// updateSignalStats reads the signal stats table rooted at n and returns one
// downstreamErrorStat per channel ID.
//
// The first <tr> is skipped. Of the remaining rows, row 0 supplies the
// channel IDs and rows 1-3 supply, in order, the total unerrored, correctable
// and uncorrectable codewords. The first <td> of every row is skipped and the
// i-th remaining cell is attributed to the i-th channel ID. Any further row
// is reported with glog.Fatalf.
//
// Malformed input is skipped instead of causing an index-out-of-range panic:
// a table without rows yields an empty map, and rows without cells, empty
// cells, unparsable numbers and cells beyond the known channel IDs are
// ignored.
func updateSignalStats(n *html.Node) map[modem.Channel]*downstreamErrorStat {
	glog.V(2).Infoln("Updating signal stats table")
	stats := map[modem.Channel]*downstreamErrorStat{}
	var ids []modem.Channel

	// Both selectors are loop-invariant, so compile them once.
	trSel := cascadia.MustCompile("tr")
	tdSel := cascadia.MustCompile("td")

	// dataCells returns the cells of tr after its first one.
	dataCells := func(tr *html.Node) []*html.Node {
		tds := tdSel.MatchAll(tr)
		if len(tds) > 0 {
			tds = tds[1:]
		}
		return tds
	}
	// forEachFloat parses the first whitespace-separated token of every data
	// cell of tr as a float64 and passes it to fn together with the stat of
	// the channel in the same column. Cells with no token or a non-numeric
	// token are skipped; iteration stops once the known IDs run out.
	forEachFloat := func(tr *html.Node, fn func(st *downstreamErrorStat, f float64)) {
		for i, td := range dataCells(tr) {
			if i >= len(ids) {
				return
			}
			fields := strings.Fields(htmlutil.GetText(td))
			if len(fields) == 0 {
				continue
			}
			f, err := strconv.ParseFloat(fields[0], 64)
			if err != nil {
				continue
			}
			fn(stats[ids[i]], f)
		}
	}

	rows := trSel.MatchAll(n)
	if len(rows) == 0 {
		return stats
	}
	for row, tr := range rows[1:] {
		switch row {
		case 0:
			// ID
			for _, td := range dataCells(tr) {
				id := modem.Channel(htmlutil.GetText(td))
				ids = append(ids, id)
				stats[id] = &downstreamErrorStat{}
			}
		case 1:
			// Total Unerrored Codewords
			forEachFloat(tr, func(st *downstreamErrorStat, f float64) {
				st.unerrored = f
			})
		case 2:
			// Total Correctable Codewords
			forEachFloat(tr, func(st *downstreamErrorStat, f float64) {
				st.correctable = f
			})
		case 3:
			// Total Uncorrectable Codewords
			forEachFloat(tr, func(st *downstreamErrorStat, f float64) {
				st.uncorrectable = f
			})
		default:
			glog.Fatalf("Unhandled %d row in signal stats table", row)
		}
	}
	return stats
}
示例#13
0
文件: content.go 项目: bcampbell/arts
// removeScripts detaches every <script> element matched within root from its
// parent and returns the detached nodes in match order.
func removeScripts(root *html.Node) []*html.Node {
	scripts := cascadia.MustCompile("script").MatchAll(root)
	removed := make([]*html.Node, 0, len(scripts))
	for i := range scripts {
		node := scripts[i]
		node.Parent.RemoveChild(node)
		removed = append(removed, node)
	}
	return removed
}
示例#14
0
文件: scrap.go 项目: emetko/goeuler
// getProblem downloads Project Euler problem number num and returns it with
// Title taken from the first h2 and Description from the first
// .problem_content element inside #content. URL is the page address and Id
// is "Euler" followed by num zero-padded to three digits.
//
// It panics when the page cannot be fetched or parsed.
func getProblem(num int) (p problem) {
	url := fmt.Sprintf("https://projecteuler.net/problem=%d", num)
	resp, err := http.Get(url)
	if err != nil {
		// Without this check resp is nil and resp.Body would dereference it.
		panic(err)
	}
	defer resp.Body.Close()

	dom, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	content := cascadia.MustCompile("#content").MatchFirst(dom)

	p.Title = getText(cascadia.MustCompile("h2").MatchFirst(content))
	p.Description = getText(cascadia.MustCompile(".problem_content").MatchFirst(content))
	p.URL = url
	p.Id = fmt.Sprintf("Euler%03d", num)

	return p

}
示例#15
0
文件: util.go 项目: bcampbell/arts
// getLinkDensity calculates the ratio of link text to overall text in a node.
// 0 means no link text, 1 means everything is link text.
// A node with no text at all has a density of 0.
func getLinkDensity(n *html.Node) float64 {
	textLength := len(getTextContent(n))
	if textLength == 0 {
		// Avoid 0/0, which would produce NaN rather than a usable ratio.
		return 0
	}

	linkLength := 0
	linkSel := cascadia.MustCompile("a")
	for _, a := range linkSel.MatchAll(n) {
		linkLength += len(getTextContent(a))
	}

	return float64(linkLength) / float64(textLength)
}
示例#16
0
// ProcessMatchingTagsReader parses r as HTML and calls run once for every
// node matching the CSS selector cssSel. The selector is compiled before r
// is read. It returns the parse error, if any, in which case run is never
// called.
func ProcessMatchingTagsReader(r io.Reader, cssSel string, run func(*html.Node)) error {
	matcher := cascadia.MustCompile(cssSel)

	doc, err := html.Parse(r)
	if err != nil {
		return err
	}

	matches := matcher.MatchAll(doc)
	for i := range matches {
		run(matches[i])
	}
	return nil
}
示例#17
0
文件: sb6121.go 项目: wathiede/surfer
// updateDownstream reads the downstream table rooted at n and returns one
// downstreamStat per channel ID. Tables nested inside another table are
// removed from the tree first, so n is modified.
//
// The first <tr> is skipped. Of the remaining rows, row 0 supplies the
// channel IDs and rows 1-4 supply, in order, frequency, SNR, modulation and
// power level. The first <td> of every row is skipped and the i-th remaining
// cell is attributed to the i-th channel ID. Any further row is reported
// with glog.Fatalf.
//
// Malformed input is skipped instead of causing an index-out-of-range panic:
// a table without rows yields an empty map, and rows without cells, empty
// cells, unparsable numbers and cells beyond the known channel IDs are
// ignored.
func updateDownstream(n *html.Node) map[modem.Channel]*downstreamStat {
	glog.V(2).Infoln("Updating downstream table")
	stats := map[modem.Channel]*downstreamStat{}
	var ids []modem.Channel

	// Remove nested tables
	for _, t := range cascadia.MustCompile("table table").MatchAll(n) {
		t.Parent.RemoveChild(t)
	}

	// Both selectors are loop-invariant, so compile them once.
	trSel := cascadia.MustCompile("tr")
	tdSel := cascadia.MustCompile("td")

	// dataCells returns the cells of tr after its first one.
	dataCells := func(tr *html.Node) []*html.Node {
		tds := tdSel.MatchAll(tr)
		if len(tds) > 0 {
			tds = tds[1:]
		}
		return tds
	}
	// forEachStat calls fn with each data cell of tr and the stat of the
	// channel in the same column, stopping once the known IDs run out.
	forEachStat := func(tr *html.Node, fn func(st *downstreamStat, td *html.Node)) {
		for i, td := range dataCells(tr) {
			if i >= len(ids) {
				return
			}
			fn(stats[ids[i]], td)
		}
	}
	// firstField returns the first whitespace-separated token of td's text;
	// ok is false when the cell holds no token.
	firstField := func(td *html.Node) (field string, ok bool) {
		fields := strings.Fields(htmlutil.GetText(td))
		if len(fields) == 0 {
			return "", false
		}
		return fields[0], true
	}
	// firstFloat parses the first token of td's text as a float64; ok is
	// false when there is no token or it is not a number.
	firstFloat := func(td *html.Node) (f float64, ok bool) {
		field, ok := firstField(td)
		if !ok {
			return 0, false
		}
		f, err := strconv.ParseFloat(field, 64)
		return f, err == nil
	}

	rows := trSel.MatchAll(n)
	if len(rows) == 0 {
		return stats
	}
	for row, tr := range rows[1:] {
		switch row {
		case 0:
			// ID
			for _, td := range dataCells(tr) {
				id := modem.Channel(htmlutil.GetText(td))
				ids = append(ids, id)
				stats[id] = &downstreamStat{}
			}
		case 1:
			// Frequency
			forEachStat(tr, func(st *downstreamStat, td *html.Node) {
				if field, ok := firstField(td); ok {
					st.frequency = field
				}
			})
		case 2:
			// SNR
			forEachStat(tr, func(st *downstreamStat, td *html.Node) {
				if f, ok := firstFloat(td); ok {
					st.snr = f
				}
			})
		case 3:
			// Modulation
			forEachStat(tr, func(st *downstreamStat, td *html.Node) {
				st.modulation = htmlutil.GetText(td)
			})
		case 4:
			// Power level
			forEachStat(tr, func(st *downstreamStat, td *html.Node) {
				if f, ok := firstFloat(td); ok {
					st.powerLevel = f
				}
			})
		default:
			glog.Fatalf("Unhandled %d row in downstream table", row)
		}
	}
	return stats
}
示例#18
0
文件: content.go 项目: bcampbell/arts
// zap removes from the tree every node under contentNodes that matches the
// selector tagSel. All matches are collected before anything is removed;
// matches that have no parent are left alone.
func zap(contentNodes []*html.Node, tagSel string) {
	matcher := cascadia.MustCompile(tagSel)

	// XYZZY TODO: preserve videos?
	victims := make([]*html.Node, 0, 32)
	for i := range contentNodes {
		victims = append(victims, matcher.MatchAll(contentNodes[i])...)
	}

	for _, victim := range victims {
		if victim.Parent == nil {
			continue
		}
		victim.Parent.RemoveChild(victim)
	}
}
示例#19
0
// PackFreeBook fetches the Packt free-learning offer page and returns the
// whitespace-trimmed Data of the first node matched inside the deal-of-the-day
// title heading.
//
// Failures are reported through the returned string rather than an error:
// "Error fetching url", "Error parsing", or "Error finding book title" when
// the page contains no matching node.
func PackFreeBook() string {
	resp, err := http.Get("https://www.packtpub.com/packt/offers/free-learning")
	if err != nil {
		return "Error fetching url"
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return "Error parsing"
	}

	book := cascadia.MustCompile(`div#deal-of-the-day div.dotd-main-book div.section-inner div.dotd-main-book-summary div.dotd-title h2 *`).MatchFirst(doc)
	if book == nil {
		// MatchFirst found nothing; dereferencing book would panic.
		return "Error finding book title"
	}
	return strings.TrimSpace(book.Data)
}
示例#20
0
文件: sb6121.go 项目: wathiede/surfer
// parseStatus parses the status page read from r into a modem.Signal.
//
// The tables matched by "center > table" are interpreted by position: the
// first as the downstream table, the second as the upstream table and the
// third as the signal (codeword error) stats, which are merged into the
// downstream entries by channel. Further tables are ignored.
//
// It returns an error only when r cannot be parsed as HTML.
func parseStatus(r io.Reader) (*modem.Signal, error) {
	n, err := html.Parse(r)
	if err != nil {
		return nil, err
	}
	signal := &modem.Signal{
		Downstream: map[modem.Channel]*modem.Downstream{},
		Upstream:   map[modem.Channel]*modem.Upstream{},
	}
	// All top-level tables are immediate descendants of center.  One table has
	// a nested table in a td, which this filter excludes.
	sel := cascadia.MustCompile("center > table")
	for i, t := range sel.MatchAll(n) {
		switch i {
		case 0:
			for ch, s := range updateDownstream(t) {
				signal.Downstream[ch] = &modem.Downstream{
					Frequency:  s.frequency,
					SNR:        s.snr,
					Modulation: s.modulation,
					PowerLevel: s.powerLevel,
				}
			}
		case 1:
			for ch, s := range updateUpstream(t) {
				signal.Upstream[ch] = &modem.Upstream{
					Frequency:  s.frequency,
					Status:     s.rangingStatus,
					SymbolRate: s.symbolRate,
					Modulation: s.modulation,
					PowerLevel: s.powerLevel,
				}
			}
		case 2:
			for ch, s := range updateSignalStats(t) {
				d, ok := signal.Downstream[ch]
				if !ok {
					// The channel was absent from the downstream table;
					// create an entry so its error counters are kept
					// instead of dereferencing a nil pointer.
					d = &modem.Downstream{}
					signal.Downstream[ch] = d
				}
				d.Unerrored = s.unerrored
				d.Correctable = s.correctable
				d.Uncorrectable = s.uncorrectable
			}
		}
	}
	return signal, nil
}
示例#21
0
// main parses example.html and prints the Data of the first child of every
// <p> element, one per line. Paragraphs without children are skipped. It
// panics when the file cannot be opened or parsed.
func main() {
	file, err := os.Open("example.html")
	if err != nil {
		panic(err)
	}
	defer file.Close()

	doc, err := html.Parse(file)
	if err != nil {
		panic(err)
	}

	selector := cascadia.MustCompile("p")

	for _, node := range selector.MatchAll(doc) {
		if node.FirstChild == nil {
			// An empty <p></p> has no child to print.
			continue
		}
		fmt.Println(node.FirstChild.Data)
	}
}
示例#22
0
文件: sb6183.go 项目: wathiede/surfer
// parseStatus parses the status page read from r into a modem.Signal. The
// page must contain exactly three elements of class simpleTable; the second
// is handed to parseDownstreamTable and the third to parseUpstreamTable. The
// first error from parsing the page or either table is returned.
func parseStatus(r io.Reader) (*modem.Signal, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, err
	}

	tables := cascadia.MustCompile(".simpleTable").MatchAll(doc)
	if found := len(tables); found != 3 {
		return nil, fmt.Errorf("Found %d simpleTables, expected 3", found)
	}

	down, err := parseDownstreamTable(tables[1])
	if err != nil {
		return nil, err
	}

	up, err := parseUpstreamTable(tables[2])
	if err != nil {
		return nil, err
	}

	signal := &modem.Signal{
		Downstream: down,
		Upstream:   up,
	}
	return signal, nil
}
示例#23
0
// PrevAllFiltered collects every preceding sibling of each element in the
// Selection and keeps those matching the selector. The result is returned
// as a new Selection object.
func (s *Selection) PrevAllFiltered(selector string) *Selection {
	siblings := getSiblingNodes(s.Nodes, siblingPrevAll, nil, nil)
	filter := cascadia.MustCompile(selector)
	return filterAndPush(s, siblings, filter)
}
示例#24
0
// PrevUntil collects the preceding siblings of each element, stopping before
// (and excluding) the element matched by the selector. The result is
// returned as a new Selection object.
func (s *Selection) PrevUntil(selector string) *Selection {
	until := cascadia.MustCompile(selector)
	siblings := getSiblingNodes(s.Nodes, siblingPrevUntil, until, nil)
	return pushStack(s, siblings)
}
示例#25
0
// Add returns a new Selection object holding the nodes of the current
// selection plus the nodes matching the selector string. The selector is
// evaluated against the root node of the current Selection's document.
func (s *Selection) Add(selector string) *Selection {
	roots := []*html.Node{s.document.rootNode}
	matcher := cascadia.MustCompile(selector)
	found := findWithMatcher(roots, matcher)
	return s.AddNodes(found...)
}
示例#26
0
// PrevFilteredUntil behaves like PrevUntil and additionally keeps only the
// siblings that match filterSelector. The result is returned as a new
// Selection object.
func (s *Selection) PrevFilteredUntil(filterSelector, untilSelector string) *Selection {
	until := cascadia.MustCompile(untilSelector)
	siblings := getSiblingNodes(s.Nodes, siblingPrevUntil, until, nil)
	filter := cascadia.MustCompile(filterSelector)
	return filterAndPush(s, siblings, filter)
}
示例#27
0
// PrevFilteredUntilSelection behaves like PrevUntilSelection and additionally
// filters the results with a selector string. It compiles filterSelector and
// delegates to PrevMatcherUntilSelection, returning a new Selection object.
func (s *Selection) PrevFilteredUntilSelection(filterSelector string, sel *Selection) *Selection {
	filter := cascadia.MustCompile(filterSelector)
	return s.PrevMatcherUntilSelection(filter, sel)
}
示例#28
0
// PrevFilteredUntilNodes behaves like PrevUntilNodes and additionally keeps
// only the siblings that match filterSelector. The result is returned as a
// new Selection object.
func (s *Selection) PrevFilteredUntilNodes(filterSelector string, nodes ...*html.Node) *Selection {
	siblings := getSiblingNodes(s.Nodes, siblingPrevUntil, nil, nodes)
	filter := cascadia.MustCompile(filterSelector)
	return filterAndPush(s, siblings, filter)
}
示例#29
0
// ChildrenFiltered collects the child elements of each element in the
// Selection and keeps those matching the selector. The result is returned
// as a new Selection object.
func (s *Selection) ChildrenFiltered(selector string) *Selection {
	children := getChildrenNodes(s.Nodes, siblingAll)
	filter := cascadia.MustCompile(selector)
	return filterAndPush(s, children, filter)
}
示例#30
0
			return ip4[1] == 168
		}
		return false
	}

	if ip[0]&0xfe == 0xfc {
		return true
	}
	if ip[0] == 0xfe && (ip[1]&0xfc) == 0x80 {
		return true
	}

	return false
}

// titleSelector is the selector for "title" elements, compiled once when the
// package is initialized.
var titleSelector = cascadia.MustCompile("title")

func (h ProxyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	activeConnections.Add(1)
	defer activeConnections.Done()

	conf := GetConfig()

	if !conf.ACLsLoaded {
		http.Error(w, "Redwood proxy configuration needs to be updated for this version of Redwood.\n(Use ACLs)", 500)
		return
	}

	if len(r.URL.String()) > 10000 {
		http.Error(w, "URL too long", http.StatusRequestURITooLong)
		return