Example #1
func (p Cleanup) Process(f parser.Feed) parser.Feed {
	p.logger.Infof("Cleaning up feed '%s'\n", f.Title)

	for i := range f.Articles {
		f.Articles[i].Description = strings.TrimSpace(f.Articles[i].Description)

		if nodes, err := html.ParseFragment(strings.NewReader(f.Articles[i].Description), nil); err == nil {
			if nodesCleanup(nodes) {
				if len(nodes) == 0 {
					// Nothing left after cleanup; move on to the next article.
					continue
				}

				// Note: these defers run when Process returns, not per loop
				// iteration, so buffers are held until the whole feed is done.
				buf := util.BufferPool.GetBuffer()
				defer util.BufferPool.Put(buf)

				for _, n := range nodes {
					err = html.Render(buf, n)
					if err != nil {
						break
					}
				}

				content := buf.String()

				// x/net/html builds a full document around the fragment, adding
				// html, head and body tags; slice the markup back out of <body>.
				content = content[strings.Index(content, "<body>")+6 : strings.LastIndex(content, "</body>")]
				f.Articles[i].Description = content
			}
		}
	}

	return f
}
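The body-slicing at the end of Process works because a nil ParseFragment context makes x/net/html parse the fragment as a complete document. A minimal standalone sketch of that behavior (not part of the original code):

package main

import (
	"bytes"
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	// With a nil context, ParseFragment wraps the fragment in
	// <html>, <head> and <body>, and returns the <html> node.
	nodes, err := html.ParseFragment(strings.NewReader("<p>hi</p>"), nil)
	if err != nil {
		panic(err)
	}
	var buf bytes.Buffer
	for _, n := range nodes {
		if err := html.Render(&buf, n); err != nil {
			panic(err)
		}
	}
	fmt.Println(buf.String())
	// Output: <html><head></head><body><p>hi</p></body></html>
}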
Example #2
func SpoonerizeHTML(r io.Reader, extraHTML string) io.ReadCloser {
	// html.Parse only fails when the reader fails, so the error is ignored.
	doc, _ := html.Parse(r)
	var f func(*html.Node)
	f = func(n *html.Node) {
		switch n.Type {
		case html.TextNode:
			n.Data = string(Spoonerize([]byte(n.Data)))
		case html.ElementNode:
			switch n.DataAtom {
			case atom.Style, atom.Script:
				// Leave the text inside style and script elements untouched.
				return
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}

		if n.DataAtom == atom.Body {
			if extraHTML != "" {
				nodes, _ := html.ParseFragment(bytes.NewBufferString(extraHTML), n)
				for _, node := range nodes {
					n.AppendChild(node)
				}
			}
		}
	}
	f(doc)

	d := &bufferCloser{}
	html.Render(d, doc)
	return d
}
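SpoonerizeHTML returns a bufferCloser, a type that is not shown in the source. Presumably it is a bytes.Buffer that satisfies io.ReadCloser; a minimal sketch of that assumption:

// bufferCloser is an assumption, not the original type: a bytes.Buffer
// with a no-op Close, so the rendered document can be returned as an
// io.ReadCloser.
type bufferCloser struct {
	bytes.Buffer
}

func (b *bufferCloser) Close() error { return nil }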
Example #3
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
func (s *Session) Auth(serviceURL, email, password string) error {
	resp, err := s.Get(serviceURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	parsed, err := html.ParseFragment(resp.Body, nil)
	if err != nil {
		return err
	}
	if len(parsed) == 0 {
		return errors.New("login page has no root element")
	}
	root := parsed[0]
	form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
	if !ok {
		return errors.New("failed to process login page")
	}
	submission := url.Values{}
	for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
		submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
	}
	submission["Email"] = []string{email}
	submission["Passwd"] = []string{password}

	postResp, err := s.PostForm(resp.Request.URL.String(), submission)
	if err != nil {
		return err
	}
	postResp.Body.Close()

	if postResp.Request.Method == "POST" {
		return errors.New("login incorrect")
	}

	return nil
}
Example #4
func Partial(r io.Reader) ([]*html.Node, error) {
	body := &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
	return html.ParseFragment(r, body)
}
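A short usage sketch (assuming imports of log, os, strings and golang.org/x/net/html): the synthetic body context lets markup that is only valid inside <body>, such as bare list items, parse cleanly.

nodes, err := Partial(strings.NewReader("<li>a</li><li>b</li>"))
if err != nil {
	log.Fatal(err)
}
for _, n := range nodes {
	// Renders <li>a</li>, then <li>b</li>.
	if err := html.Render(os.Stdout, n); err != nil {
		log.Fatal(err)
	}
}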
Example #5
// goodAsText applies some heuristics to make the data look good when
// displayed as simple text. For example, if the data is escaped HTML, the
// markup is stripped; if the data contains an HTML image tag, the image's
// alt text is used instead. If nothing good is found, an empty slice is
// returned.
func goodAsText(d []byte) []byte {
	unesc := html.UnescapeString(string(d))
	nodes, err := html.ParseFragment(strings.NewReader(unesc), bodyNode)
	if err != nil {
		log.Printf("failed to parse [%s] as HTML: %v", unesc, err)
		return d
	}

	var buf bytes.Buffer
	for _, root := range nodes {
		walk(root, func(n *html.Node) {
			if n.Type == html.TextNode {
				buf.WriteString(strings.TrimSpace(n.Data))
				return
			}

			if l := buf.Len(); l > 0 && buf.Bytes()[l-1] != ' ' {
				buf.WriteString(" ")
			}
			if n.DataAtom == atom.Img {
				if alt := altTextOrEmpty(n); alt != "" {
					buf.WriteString(alt)
				}
			}
		})
	}
	return buf.Bytes()
}
Example #6
// parseGenericLoginForm takes a login page and parses the first form it finds, treating it as the
// login form.
func parseGenericLoginForm(res *http.Response) (result *loginFormInfo, err error) {
	parsed, err := html.ParseFragment(res.Body, nil)
	if err != nil {
		return
	} else if len(parsed) != 1 {
		return nil, errors.New("wrong number of root elements")
	}

	root := parsed[0]

	var form loginFormInfo

	htmlForm, ok := scrape.Find(root, scrape.ByTag(atom.Form))
	if !ok {
		return nil, errors.New("no form element found")
	}

	if actionStr := getNodeAttribute(htmlForm, "action"); actionStr == "" {
		form.action = res.Request.URL.String()
	} else {
		actionURL, err := url.Parse(actionStr)
		if err != nil {
			return nil, err
		}
		if actionURL.Host == "" {
			actionURL.Host = res.Request.URL.Host
		}
		if actionURL.Scheme == "" {
			actionURL.Scheme = res.Request.URL.Scheme
		}
		if !path.IsAbs(actionURL.Path) {
			actionURL.Path = path.Join(res.Request.URL.Path, actionURL.Path)
		}
		form.action = actionURL.String()
	}

	inputs := scrape.FindAll(root, scrape.ByTag(atom.Input))
	form.otherFields = url.Values{}
	for _, input := range inputs {
		inputName := getNodeAttribute(input, "name")
		switch getNodeAttribute(input, "type") {
		case "text":
			form.usernameField = inputName
		case "password":
			form.passwordField = inputName
		default:
			form.otherFields.Add(inputName, getNodeAttribute(input, "value"))
		}
	}

	if form.usernameField == "" {
		return nil, errors.New("no username field found")
	} else if form.passwordField == "" {
		return nil, errors.New("no password field found")
	}

	return &form, nil
}
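This example's getNodeAttribute, like Example #3's getAttribute, is a small helper that is not shown in the listing. A plausible sketch, assuming a plain attribute lookup:

// getNodeAttribute is an assumption, not the original helper: it returns
// the value of the named attribute, or "" when the node lacks it.
func getNodeAttribute(n *html.Node, name string) string {
	for _, attr := range n.Attr {
		if attr.Key == name {
			return attr.Val
		}
	}
	return ""
}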
Example #7
func parseHtml(h string) []*html.Node {
	// ParseFragment only returns an error when the io.Reader fails with
	// something other than io.EOF, which strings.Reader never does.
	nodes, err := html.ParseFragment(strings.NewReader(h), &html.Node{Type: html.ElementNode})
	if err != nil {
		panic("goquery: failed to parse HTML: " + err.Error())
	}
	return nodes
}
Example #8
func ToNode(input string) *html.Node {
	n, err := html.ParseFragment(strings.NewReader(input), &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	})
	if err != nil {
		panic(err)
	}
	return n[0]
}
Example #9
// getBodyNode returns a BODY node nested within an HTML node.
func getBodyNode() *html.Node {
	ns, err := html.ParseFragment(strings.NewReader("<html><body></body></html>"), nil)
	if err != nil {
		panic("error generating context")
	}
	if len(ns) == 0 {
		panic("no nodes generating context")
	}
	h := ns[0]
	b := h.LastChild
	return b
}
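The "context" wording in the panic messages suggests the returned node is used as a ParseFragment context; a hedged usage sketch:

// Parse a fragment as though it appeared inside <body>.
nodes, err := html.ParseFragment(strings.NewReader("<p>hi</p>"), getBodyNode())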
Example #10
// Fuzz is a go-fuzz entry point: inputs that fail to parse return 0 and
// are discarded; inputs that parse return 1, and a render failure panics.
func Fuzz(data []byte) int {
	nodes, err := html.ParseFragment(bytes.NewReader(data), nil)
	if err != nil {
		return 0
	}
	for _, n := range nodes {
		if err := html.Render(ioutil.Discard, n); err != nil {
			panic(err)
		}
	}
	return 1
}
Example #11
// SetHtml parses s against each selected node and appends the resulting
// fragment to it, in the spirit of jQuery's html() setter (existing
// children are not removed first).
func (this *Selection) SetHtml(s string) *Selection {
	result := newEmptySelection(this.document)
	for _, n := range this.Nodes {
		newNodes, e := html.ParseFragment(strings.NewReader(s), n)
		if e == nil {
			for _, child := range newNodes {
				n.AppendChild(child)
			}
			result.AddNodes(newNodes...)
		}
	}
	return result
}
Example #12
// Leaf converts an HTML fragment into a parse tree (without
// html/head/body ElementNodes or a DoctypeNode), and then from the root
// of this tree repeatedly follows FirstChild until it finds a leaf
// node, which it returns. To parse the fragment, Leaf calls
// html.ParseFragment with a context of html.Node{Type: html.ElementNode}.
// If there is an error parsing the fragment, or no nodes are returned,
// Leaf returns a node of type html.ErrorNode. The return value of Leaf
// is intended to be passed to Match as its second argument.
func Leaf(fragment string) *html.Node {
	ns, err := html.ParseFragment(
		strings.NewReader(fragment), &html.Node{Type: html.ElementNode})
	if err != nil || len(ns) == 0 {
		return &html.Node{Type: html.ErrorNode}
	}
	n := ns[0]
	if n == nil {
		return nil
	}
	for n.FirstChild != nil {
		n = n.FirstChild
	}
	return n
}
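A quick usage sketch (assuming the usual fmt import): for nested markup, Leaf descends to the innermost first child, here a text node.

n := Leaf("<b><i>hello</i></b>")
fmt.Println(n.Type == html.TextNode, n.Data) // true hello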
Example #13
// ParseHTMLText2 parses HTML text into a slice of html.Nodes, using a
// body element as the fragment context.
func ParseHTMLText2(text string) ([]*html.Node, error) {
	context := html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
	nodes, err := html.ParseFragment(strings.NewReader(text), &context)
	if err != nil {
		return nil, err
	}
	return nodes, nil
}
Example #14
// ParseDepth is a convenience function that wraps html.ParseFragment but
// takes a string instead of an io.Reader and truncates trees deeper than
// maxDepth.
func ParseDepth(fragment string, maxDepth int) []*html.Node {
	nodes, err := html.ParseFragment(strings.NewReader(fragment), &html.Node{
		Type:     html.ElementNode,
		Data:     "div",
		DataAtom: atom.Div,
	})
	expectError(err, nil)

	if maxDepth > 0 {
		for _, n := range nodes {
			forceMaxDepth(n, maxDepth)
		}
	}

	return nodes
}
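forceMaxDepth and expectError are helpers that are not shown in the source. A plausible sketch of the pruning step, assuming it simply severs children at the depth limit:

// forceMaxDepth is an assumption, not the original helper: nodes at the
// depth limit lose all of their children.
func forceMaxDepth(n *html.Node, depth int) {
	if depth == 0 {
		n.FirstChild = nil
		n.LastChild = nil
		return
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		forceMaxDepth(c, depth-1)
	}
}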
Example #15
func createIndexEntry(title string, date string, path string) []*html.Node {
	b, err := ioutil.ReadFile(filepath.Join(baseLocation, "generator/index-entry-template.html"))
	exitOnErr(err)
	entryHTML, err := html.ParseFragment(bytes.NewReader(b), fakeBodyNode())
	exitOnErr(err)

	for _, entry := range entryHTML {
		titleNode := queryHTML(entry, hasType(atom.A))
		if titleNode != nil {
			titleNode.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: title,
			})

			var found bool
			for i, attr := range titleNode.Attr {
				if attr.Key == "href" {
					attr.Val = path
					titleNode.Attr[i] = attr
					found = true
					break
				}
			}
			if !found {
				titleNode.Attr = append(titleNode.Attr, html.Attribute{
					Key: "href",
					Val: path,
				})
			}
		}
		dateNode := queryHTML(entry, hasClass("date"))
		if dateNode != nil {
			dateNode.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: date,
			})
		}
	}
	return entryHTML
}
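This example and the next both rely on a fakeBodyNode helper that is not shown; presumably it builds the same synthetic body context seen in Examples #4 and #8:

// fakeBodyNode is an assumption based on the pattern in earlier examples:
// a detached <body> element to use as a ParseFragment context.
func fakeBodyNode() *html.Node {
	return &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
}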
Example #16
func generateEntries(location string, indexEntries []indexEntry, postProc postProcFunc) []indexEntry {
	files, err := ioutil.ReadDir(location)
	exitOnErr(err)
	for _, f := range files {
		if f.IsDir() {
			continue
		}

		// read the blog entry
		var titleText, dateText string
		var bodyText bytes.Buffer
		{
			p := filepath.Join(location, f.Name())
			srcf, err := os.Open(p)
			exitOnErr(err)

			scan := bufio.NewScanner(srcf)

			// get the title
			scan.Scan()
			exitOnErr(scan.Err())
			titleText = scan.Text()

			// get the date
			scan.Scan()
			exitOnErr(scan.Err())
			dateText = scan.Text()

			// read the rest of the body
			for scan.Scan() {
				bodyText.Write(scan.Bytes())
				bodyText.WriteByte('\n')
			}

			exitOnErr(scan.Err())
			srcf.Close()

			bodyText = postProc(bodyText)
		}

		// get the blog entry page template
		template, err := loadTemplate()
		exitOnErr(err)

		// set the blog entry data in the blog entry page template
		{
			title := queryHTML(template, hasType(atom.Title))
			heading := queryHTML(template, hasType(atom.H1))
			date := queryHTML(template, hasClass("date"))
			entrye := queryHTML(template, hasClass("entry"))

			title.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: titleText,
			})
			heading.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: titleText,
			})
			date.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: dateText,
			})

			// read the blog entry body as HTML
			entryHTML, err := html.ParseFragment(&bodyText, fakeBodyNode())
			exitOnErr(err)

			for _, eh := range entryHTML {
				entrye.AppendChild(eh)
			}
		}

		t, err := time.Parse("January 2, 2006", dateText)
		exitOnErr(err)

		var targetPath, targetDir string
		{
			targetPathStart := t.Format("2006/01/02/")
			fileNameWithoutExt := strings.TrimSuffix(f.Name(), filepath.Ext(f.Name()))
			targetPath = filepath.Join(targetPathStart, fileNameWithoutExt)
			targetDir = filepath.Join(baseLocation, targetPath)
		}

		exitOnErr(os.RemoveAll(targetDir))
		exitOnErr(os.MkdirAll(targetDir, 0755))

		// write to file
		{
			targetFile := filepath.Join(targetDir, "index.html")
			fmt.Printf("generating: %s\n", targetFile)
			target, err := os.Create(targetFile)
			exitOnErr(err)
			exitOnErr(html.Render(target, template))
			target.Close()
		}

		indexEntries = append(indexEntries, indexEntry{
			html: createIndexEntry(titleText, dateText, targetPath+"/"),
			time: t.Unix(),
		})
	}
	return indexEntries
}
Example #17
// ParseHTML reads an HTML document from r, parses it using a proper HTML
// parser, and returns its root node.
//
// The document will be processed as a properly structured HTML document,
// emulating the behavior of a browser when processing it. This includes
// putting the content inside proper <html> and <body> tags, if the
// provided text lacks them.
func ParseHTML(r io.Reader) (*Node, error) {
	ns, err := html.ParseFragment(r, nil)
	if err != nil {
		return nil, err
	}

	var nodes []Node
	var text []byte

	n := ns[0]

	// The root node.
	nodes = append(nodes, Node{kind: startNode})

	for n != nil {
		switch n.Type {
		case html.DocumentNode:
		case html.ElementNode:
			nodes = append(nodes, Node{
				kind: startNode,
				name: xml.Name{Local: n.Data, Space: n.Namespace},
			})
			for _, attr := range n.Attr {
				nodes = append(nodes, Node{
					kind: attrNode,
					name: xml.Name{Local: attr.Key, Space: attr.Namespace},
					attr: attr.Val,
				})
			}
		case html.TextNode:
			texti := len(text)
			text = append(text, n.Data...)
			nodes = append(nodes, Node{
				kind: textNode,
				text: text[texti : texti+len(n.Data)],
			})
		case html.CommentNode:
			texti := len(text)
			text = append(text, n.Data...)
			nodes = append(nodes, Node{
				kind: commentNode,
				text: text[texti : texti+len(n.Data)],
			})
		}

		if n.FirstChild != nil {
			n = n.FirstChild
			continue
		}

		for n != nil {
			if n.Type == html.ElementNode {
				nodes = append(nodes, Node{kind: endNode})
			}
			if n.NextSibling != nil {
				n = n.NextSibling
				break
			}
			n = n.Parent
		}
	}

	// Close the root node.
	nodes = append(nodes, Node{kind: endNode})

	stack := make([]*Node, 0, len(nodes))
	downs := make([]*Node, len(nodes))
	downCount := 0

	for pos := range nodes {

		switch nodes[pos].kind {

		case startNode, attrNode, textNode, commentNode, procInstNode:
			node := &nodes[pos]
			node.nodes = nodes
			node.pos = pos
			if len(stack) > 0 {
				node.up = stack[len(stack)-1]
			}
			if node.kind == startNode {
				stack = append(stack, node)
			} else {
				node.end = pos + 1
			}

		case endNode:
			node := stack[len(stack)-1]
			node.end = pos
			stack = stack[:len(stack)-1]

			// Compute downs. Doing that here is what enables the
			// use of a slice of a contiguous pre-allocated block.
			node.down = downs[downCount:downCount]
			for i := node.pos + 1; i < node.end; i++ {
				if nodes[i].up == node {
					switch nodes[i].kind {
					case startNode, textNode, commentNode, procInstNode:
						node.down = append(node.down, &nodes[i])
						downCount++
					}
				}
			}
			if len(stack) == 0 {
				return node, nil
			}
		}
	}
	return nil, io.EOF
}
Example #18
// parseExtraComponentInfo parses the "Class Detail" page for a component.
func parseExtraComponentInfo(body io.Reader, component *Component) (courseOpen bool, err error) {
	nodes, err := html.ParseFragment(body, nil)
	if err != nil {
		return
	}
	if len(nodes) != 1 {
		return false, errors.New("invalid number of root elements")
	}

	openStatus, ok := scrape.Find(nodes[0], scrape.ById("SSR_CLS_DTL_WRK_SSR_DESCRSHORT"))
	if !ok {
		return false, errors.New("open status not found")
	}
	courseOpen = (nodeInnerText(openStatus) == "Open")

	availTable, ok := scrape.Find(nodes[0], scrape.ById("ACE_SSR_CLS_DTL_WRK_GROUP3"))
	if !ok {
		return courseOpen, errors.New("could not find availability info")
	}

	rows := scrape.FindAll(availTable, scrape.ByTag(atom.Tr))
	if len(rows) != 7 {
		return courseOpen, errors.New("invalid number of rows in availability table")
	}

	var availability ClassAvailability

	cols := nodesWithAlignAttribute(scrape.FindAll(rows[2], scrape.ByTag(atom.Td)))
	if len(cols) != 2 {
		return courseOpen, errors.New("expected 2 aligned columns in row 2")
	}
	availability.Capacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}
	availability.WaitListCapacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
	if err != nil {
		return
	}

	cols = nodesWithAlignAttribute(scrape.FindAll(rows[4], scrape.ByTag(atom.Td)))
	if len(cols) != 2 {
		return courseOpen, errors.New("expected 2 aligned columns in row 4")
	}
	availability.EnrollmentTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}
	availability.WaitListTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
	if err != nil {
		return
	}

	cols = nodesWithAlignAttribute(scrape.FindAll(rows[6], scrape.ByTag(atom.Td)))
	if len(cols) != 1 {
		return courseOpen, errors.New("expected 1 aligned column in row 6")
	}
	availability.AvailableSeats, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}

	component.ClassAvailability = &availability

	return
}
Example #19
func markdown(path string, info os.FileInfo) error {
	input, err := ioutil.ReadFile(path)
	if err != nil {
		return err
	}

	// extract the header out of the markdown, so we can control the
	// layout better; blackfriday would put the toc above the h1, and
	// include the singular h1 in the toc, causing stutter.
	idx := bytes.IndexByte(input, '\n')
	if idx == -1 {
		return errors.New("markdown has no content")
	}
	titleMD, input := input[:idx], input[idx+1:]

	htmlFlags := (0 |
		blackfriday.HTML_USE_SMARTYPANTS |
		blackfriday.HTML_SMARTYPANTS_FRACTIONS |
		blackfriday.HTML_SMARTYPANTS_LATEX_DASHES |
		blackfriday.HTML_USE_XHTML |
		blackfriday.HTML_FOOTNOTE_RETURN_LINKS |
		0)
	// HtmlRenderer demands a title and a css path here, but we only
	// render a fragment so those are not used
	renderer := blackfriday.HtmlRenderer(htmlFlags, "", "")
	extensions := (0 |
		blackfriday.EXTENSION_NO_INTRA_EMPHASIS |
		blackfriday.EXTENSION_TABLES |
		blackfriday.EXTENSION_FENCED_CODE |
		blackfriday.EXTENSION_AUTOLINK |
		blackfriday.EXTENSION_STRIKETHROUGH |
		blackfriday.EXTENSION_SPACE_HEADERS |
		blackfriday.EXTENSION_FOOTNOTES |
		blackfriday.EXTENSION_HEADER_IDS |
		blackfriday.EXTENSION_AUTO_HEADER_IDS |
		0)
	titleHTML := blackfriday.Markdown(titleMD, renderer, extensions)
	contentHTML := blackfriday.Markdown(input, renderer, extensions)

	tocFlags := htmlFlags | blackfriday.HTML_TOC | blackfriday.HTML_OMIT_CONTENTS
	tocRenderer := blackfriday.HtmlRenderer(tocFlags, "", "")
	tocHTML := blackfriday.Markdown(input, tocRenderer, extensions)
	body := &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
	nodes, err := html.ParseFragment(bytes.NewReader(titleHTML), body)
	if err != nil {
		return fmt.Errorf("cannot parse generated html: %v", err)
	}
	if len(nodes) == 0 ||
		nodes[0].Type != html.ElementNode ||
		nodes[0].DataAtom != atom.H1 {
		return errors.New("markdown does not start with a header")
	}
	title := childText(nodes[0])

	var buf bytes.Buffer
	prettyPath := "/" + strings.TrimSuffix(path, ".md")
	if dir, file := filepath.Split(prettyPath); file == "index" {
		prettyPath = dir
	}
	data := struct {
		Path    string
		Title   string
		H1      template.HTML
		TOC     template.HTML
		Content template.HTML
	}{
		Path:    prettyPath,
		Title:   title,
		H1:      template.HTML(titleHTML),
		TOC:     template.HTML(tocHTML),
		Content: template.HTML(contentHTML),
	}
	if err := layout.Execute(&buf, data); err != nil {
		return fmt.Errorf("executing template: %v", err)
	}

	min, err := minify.Bytes(minifier, "text/html", buf.Bytes())
	if err != nil {
		return fmt.Errorf("cannot minify html: %v", err)
	}

	dst := filepath.Join(outputDir, strings.TrimSuffix(path, ".md")+".html")
	if err := writeFile(dst, min); err != nil {
		return err
	}
	return nil
}
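childText, used above to extract the page title, is not shown either; a plausible sketch, assuming it concatenates every text node beneath the given node:

// childText is an assumption, not the original helper.
func childText(n *html.Node) string {
	var b strings.Builder
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.TextNode {
			b.WriteString(c.Data)
		} else {
			b.WriteString(childText(c))
		}
	}
	return b.String()
}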