func (p Cleanup) Process(f parser.Feed) parser.Feed {
	p.logger.Infof("Cleaning up feed '%s'\n", f.Title)

	for i := range f.Articles {
		f.Articles[i].Description = strings.TrimSpace(f.Articles[i].Description)

		if nodes, err := html.ParseFragment(strings.NewReader(f.Articles[i].Description), nil); err == nil {
			if nodesCleanup(nodes) {
				if len(nodes) == 0 {
					break
				}

				buf := util.BufferPool.GetBuffer()
				defer util.BufferPool.Put(buf)

				for _, n := range nodes {
					err = html.Render(buf, n)
					if err != nil {
						break
					}
				}

				content := buf.String()

				// With a nil context, x/net/html parses the fragment as a full
				// document, adding html, head and body tags
				content = content[strings.Index(content, "<body>")+6 : strings.LastIndex(content, "</body>")]

				f.Articles[i].Description = content
			}
		}
	}

	return f
}
func SpoonerizeHTML(r io.Reader, extraHTML string) io.ReadCloser {
	doc, _ := html.Parse(r)

	var f func(*html.Node)
	f = func(n *html.Node) {
		switch n.Type {
		case html.TextNode:
			n.Data = string(Spoonerize([]byte(n.Data)))
		case html.ElementNode:
			switch n.DataAtom {
			case atom.Style, atom.Script:
				return
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
		if n.DataAtom == atom.Body {
			if extraHTML != "" {
				nodes, _ := html.ParseFragment(bytes.NewBufferString(extraHTML), n)
				for _, node := range nodes {
					n.AppendChild(node)
				}
			}
		}
	}
	f(doc)

	d := &bufferCloser{}
	html.Render(d, doc)

	return d
}
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
func (s *Session) Auth(serviceURL, email, password string) error {
	resp, err := s.Get(serviceURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	parsed, err := html.ParseFragment(resp.Body, nil)
	if err != nil || len(parsed) == 0 {
		return err
	}
	root := parsed[0]

	form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
	if !ok {
		return errors.New("failed to process login page")
	}

	submission := url.Values{}
	for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
		submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
	}
	submission["Email"] = []string{email}
	submission["Passwd"] = []string{password}

	postResp, err := s.PostForm(resp.Request.URL.String(), submission)
	if err != nil {
		return err
	}
	postResp.Body.Close()

	if postResp.Request.Method == "POST" {
		return errors.New("login incorrect")
	}

	return nil
}
func Partial(r io.Reader) ([]*html.Node, error) {
	b := &html.Node{}
	b.Data = "body"
	b.DataAtom = atom.Body
	b.Type = html.ElementNode
	return html.ParseFragment(r, b)
}
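// A hypothetical usage sketch for Partial above (not part of the original
// source): parse a fragment in a <body> context and serialize each top-level
// node back to HTML. Assumes bytes, fmt, log, strings and
// golang.org/x/net/html are imported alongside Partial.
func examplePartial() {
	nodes, err := Partial(strings.NewReader(`<p>first</p><p>second</p>`))
	if err != nil {
		log.Fatal(err)
	}

	var buf bytes.Buffer
	for _, n := range nodes {
		// Render re-serializes each parsed top-level node.
		if err := html.Render(&buf, n); err != nil {
			log.Fatal(err)
		}
	}
	fmt.Println(buf.String()) // <p>first</p><p>second</p>
}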
// goodAsText applies some heuristics to make the data look good when displayed
// as simple text. For example, if the data is escaped HTML, the HTML is removed;
// if the data contains an HTML image tag, goodAsText returns the alt text. If
// nothing good is found then an empty slice is returned.
func goodAsText(d []byte) []byte {
	unesc := html.UnescapeString(string(d))
	nodes, err := html.ParseFragment(strings.NewReader(unesc), bodyNode)
	if err != nil {
		log.Printf("failed to parse [%s] as HTML: %v", unesc, err)
		return d
	}

	var buf bytes.Buffer
	for _, root := range nodes {
		walk(root, func(n *html.Node) {
			if n.Type == html.TextNode {
				buf.WriteString(strings.TrimSpace(n.Data))
				return
			}
			if n := buf.Len(); n > 0 && buf.Bytes()[n-1] != ' ' {
				buf.WriteString(" ")
			}
			if n.DataAtom == atom.Img {
				if alt := altTextOrEmpty(n); alt != "" {
					buf.WriteString(alt)
				}
			}
		})
	}
	return buf.Bytes()
}
// parseGenericLoginForm takes a login page and parses the first form it finds,
// treating it as the login form.
func parseGenericLoginForm(res *http.Response) (result *loginFormInfo, err error) {
	parsed, err := html.ParseFragment(res.Body, nil)
	if err != nil {
		return
	} else if len(parsed) != 1 {
		return nil, errors.New("wrong number of root elements")
	}

	root := parsed[0]

	var form loginFormInfo

	htmlForm, ok := scrape.Find(root, scrape.ByTag(atom.Form))
	if !ok {
		return nil, errors.New("no form element found")
	}

	if actionStr := getNodeAttribute(htmlForm, "action"); actionStr == "" {
		form.action = res.Request.URL.String()
	} else {
		actionURL, err := url.Parse(actionStr)
		if err != nil {
			return nil, err
		}
		if actionURL.Host == "" {
			actionURL.Host = res.Request.URL.Host
		}
		if actionURL.Scheme == "" {
			actionURL.Scheme = res.Request.URL.Scheme
		}
		if !path.IsAbs(actionURL.Path) {
			actionURL.Path = path.Join(res.Request.URL.Path, actionURL.Path)
		}
		form.action = actionURL.String()
	}

	inputs := scrape.FindAll(root, scrape.ByTag(atom.Input))
	form.otherFields = url.Values{}
	for _, input := range inputs {
		inputName := getNodeAttribute(input, "name")
		switch getNodeAttribute(input, "type") {
		case "text":
			form.usernameField = inputName
		case "password":
			form.passwordField = inputName
		default:
			form.otherFields.Add(inputName, getNodeAttribute(input, "value"))
		}
	}

	if form.usernameField == "" {
		return nil, errors.New("no username field found")
	} else if form.passwordField == "" {
		return nil, errors.New("no password field found")
	}

	return &form, nil
}
func parseHtml(h string) []*html.Node {
	// Errors are only returned when the io.Reader returns any error besides
	// EOF, but strings.Reader never will
	nodes, err := html.ParseFragment(strings.NewReader(h), &html.Node{Type: html.ElementNode})
	if err != nil {
		panic("goquery: failed to parse HTML: " + err.Error())
	}
	return nodes
}
func ToNode(input string) *html.Node {
	n, err := html.ParseFragment(strings.NewReader(input), &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	})
	if err != nil {
		panic(err)
	}
	return n[0]
}
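// A hypothetical usage sketch for ToNode above (not part of the original
// source): convert a fragment to a node and collect its text content by
// walking the tree. Assumes fmt, strings and golang.org/x/net/html are
// imported alongside ToNode.
func exampleToNode() {
	n := ToNode(`<p>Hello, <b>world</b>!</p>`) // returns the <p> element

	var sb strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			sb.WriteString(n.Data)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(n)

	fmt.Println(sb.String()) // Hello, world!
}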
// getBodyNode returns a BODY node nested within an HTML node.
func getBodyNode() *html.Node {
	ns, err := html.ParseFragment(strings.NewReader("<html><body></body></html>"), nil)
	if err != nil {
		panic("error generating context")
	}
	if len(ns) == 0 {
		panic("no nodes generating context")
	}
	h := ns[0]
	b := h.LastChild
	return b
}
func Fuzz(data []byte) int {
	nodes, err := html.ParseFragment(bytes.NewReader(data), nil)
	if err != nil {
		return 0
	}
	for _, n := range nodes {
		if err := html.Render(ioutil.Discard, n); err != nil {
			panic(err)
		}
	}
	return 1
}
// Just like jQuery's html() setter.
func (this *Selection) SetHtml(s string) *Selection {
	result := newEmptySelection(this.document)
	for _, n := range this.Nodes {
		newNodes, e := html.ParseFragment(strings.NewReader(s), n)
		if e == nil {
			for _, child := range newNodes {
				n.AppendChild(child)
			}
			result.AddNodes(newNodes...)
		}
	}
	return result
}
// Leaf converts an HTML fragment into a parse tree (without
// html/head/body ElementNodes or DoctypeNode), and then from the root
// of this tree repeatedly follows FirstChild until it finds a leaf
// node. This leaf node is returned as its result. In order to parse
// fragment, Leaf calls html.ParseFragment with a context of
// html.Node{Type: html.ElementNode}. If there is an error parsing
// fragment or no nodes are returned then Leaf returns a node
// of type html.ErrorNode. The return value of Leaf is intended to be
// passed to Match as its second argument.
func Leaf(fragment string) *html.Node {
	ns, err := html.ParseFragment(
		strings.NewReader(fragment),
		&html.Node{Type: html.ElementNode})
	if err != nil || len(ns) == 0 {
		return &html.Node{Type: html.ErrorNode}
	}
	n := ns[0]
	if n == nil {
		return nil
	}
	for n.FirstChild != nil {
		n = n.FirstChild
	}
	return n
}
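// A hypothetical usage sketch for Leaf above (not part of the original
// source): parse a small fragment and inspect the leaf node Leaf walks down
// to. Assumes fmt and golang.org/x/net/html are imported alongside Leaf.
func exampleLeaf() {
	n := Leaf(`<p><b>hello</b> world</p>`)
	switch n.Type {
	case html.ErrorNode:
		fmt.Println("fragment could not be parsed")
	case html.TextNode:
		// For this fragment the first-child chain is p -> b -> "hello",
		// so the leaf is the text node "hello".
		fmt.Printf("leaf text: %q\n", n.Data)
	default:
		fmt.Printf("leaf node: %s\n", n.Data)
	}
}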
// ParseHTMLText2 parses HTML text into a slice of html.Node values.
func ParseHTMLText2(text string) ([]*html.Node, error) {
	context := html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
	nodes, err := html.ParseFragment(strings.NewReader(text), &context)
	if err != nil {
		return nil, err
	}
	return nodes, nil
}
// ParseDepth is a convenience function that wraps html.ParseFragment but takes
// a string instead of an io.Reader and omits deep trees.
func ParseDepth(fragment string, maxDepth int) []*html.Node {
	nodes, err := html.ParseFragment(strings.NewReader(fragment), &html.Node{
		Type:     html.ElementNode,
		Data:     "div",
		DataAtom: atom.Div,
	})
	expectError(err, nil)
	if maxDepth > 0 {
		for _, n := range nodes {
			forceMaxDepth(n, maxDepth)
		}
	}
	return nodes
}
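// A hypothetical usage sketch for ParseDepth above (not part of the original
// source): parse a nested fragment with a depth limit and report how deep the
// resulting trees go. Only ParseDepth itself is assumed from the code above,
// plus fmt and golang.org/x/net/html; the depth helper is local to the sketch.
func exampleParseDepth() {
	nodes := ParseDepth(`<div><ul><li><a href="#">deep</a></li></ul></div>`, 2)

	var depth func(*html.Node) int
	depth = func(n *html.Node) int {
		deepest := 0
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if d := depth(c); d > deepest {
				deepest = d
			}
		}
		return deepest + 1
	}

	for _, n := range nodes {
		// With maxDepth == 2, anything nested deeper than two levels is
		// expected to have been pruned by forceMaxDepth.
		fmt.Printf("<%s> tree depth: %d\n", n.Data, depth(n))
	}
}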
func createIndexEntry(title string, date string, path string) []*html.Node {
	b, err := ioutil.ReadFile(filepath.Join(baseLocation, "generator/index-entry-template.html"))
	exitOnErr(err)

	entryHTML, err := html.ParseFragment(bytes.NewReader(b), fakeBodyNode())
	exitOnErr(err)

	for _, entry := range entryHTML {
		titleNode := queryHTML(entry, hasType(atom.A))
		if titleNode != nil {
			titleNode.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: title,
			})

			var found bool
			for i, attr := range titleNode.Attr {
				if attr.Key == "href" {
					attr.Val = path
					titleNode.Attr[i] = attr
					found = true
					break
				}
			}
			if !found {
				titleNode.Attr = append(titleNode.Attr, html.Attribute{
					Key: "href",
					Val: path,
				})
			}
		}

		dateNode := queryHTML(entry, hasClass("date"))
		if dateNode != nil {
			dateNode.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: date,
			})
		}
	}

	return entryHTML
}
func generateEntries(location string, indexEntries []indexEntry, postProc postProcFunc) []indexEntry {
	files, err := ioutil.ReadDir(location)
	exitOnErr(err)

	for _, f := range files {
		if f.IsDir() {
			continue
		}

		// read the blog entry
		var titleText, dateText string
		var bodyText bytes.Buffer
		{
			p := filepath.Join(location, f.Name())
			srcf, err := os.Open(p)
			exitOnErr(err)

			scan := bufio.NewScanner(srcf)

			// get the title
			scan.Scan()
			exitOnErr(scan.Err())
			titleText = scan.Text()

			// get the date
			scan.Scan()
			exitOnErr(scan.Err())
			dateText = scan.Text()

			// read the rest of the body
			for scan.Scan() {
				bodyText.Write(scan.Bytes())
				bodyText.WriteByte('\n')
			}
			exitOnErr(scan.Err())
			srcf.Close()

			bodyText = postProc(bodyText)
		}

		// get the blog entry page template
		template, err := loadTemplate()
		exitOnErr(err)

		// set the blog entry data in the blog entry page template
		{
			title := queryHTML(template, hasType(atom.Title))
			heading := queryHTML(template, hasType(atom.H1))
			date := queryHTML(template, hasClass("date"))
			entrye := queryHTML(template, hasClass("entry"))

			title.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: titleText,
			})
			heading.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: titleText,
			})
			date.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: dateText,
			})

			// read the blog entry body as HTML
			entryHTML, err := html.ParseFragment(&bodyText, fakeBodyNode())
			exitOnErr(err)
			for _, eh := range entryHTML {
				entrye.AppendChild(eh)
			}
		}

		t, err := time.Parse("January 2, 2006", dateText)
		exitOnErr(err)

		var targetPath, targetDir string
		{
			targetPathStart := t.Format("2006/01/02/")
			fileNameWithoutExt := f.Name()[0 : len(f.Name())-len(filepath.Ext(f.Name()))]
			targetPath = filepath.Join(targetPathStart, fileNameWithoutExt)
			targetDir = filepath.Join(baseLocation, targetPath)
		}

		exitOnErr(os.RemoveAll(targetDir))
		exitOnErr(os.MkdirAll(targetDir, 0755))

		// write to file
		{
			targetFile := filepath.Join(targetDir, "index.html")
			fmt.Printf("generating: %s\n", targetFile)
			target, err := os.Create(targetFile)
			exitOnErr(err)
			exitOnErr(html.Render(target, template))
			target.Close()
		}

		indexEntries = append(indexEntries, indexEntry{
			html: createIndexEntry(titleText, dateText, targetPath+"/"),
			time: t.Unix(),
		})
	}

	return indexEntries
}
// ParseHTML reads an HTML document from r, parses it using a proper HTML
// parser, and returns its root node.
//
// The document will be processed as a properly structured HTML document,
// emulating the behavior of a browser when processing it. This includes
// putting the content inside proper <html> and <body> tags, if the
// provided text lacks them.
func ParseHTML(r io.Reader) (*Node, error) {
	ns, err := html.ParseFragment(r, nil)
	if err != nil {
		return nil, err
	}

	var nodes []Node
	var text []byte

	n := ns[0] // The root node.

	nodes = append(nodes, Node{kind: startNode})
	for n != nil {
		switch n.Type {
		case html.DocumentNode:
		case html.ElementNode:
			nodes = append(nodes, Node{
				kind: startNode,
				name: xml.Name{Local: n.Data, Space: n.Namespace},
			})
			for _, attr := range n.Attr {
				nodes = append(nodes, Node{
					kind: attrNode,
					name: xml.Name{Local: attr.Key, Space: attr.Namespace},
					attr: attr.Val,
				})
			}
		case html.TextNode:
			texti := len(text)
			text = append(text, n.Data...)
			nodes = append(nodes, Node{
				kind: textNode,
				text: text[texti : texti+len(n.Data)],
			})
		case html.CommentNode:
			texti := len(text)
			text = append(text, n.Data...)
			nodes = append(nodes, Node{
				kind: commentNode,
				text: text[texti : texti+len(n.Data)],
			})
		}

		if n.FirstChild != nil {
			n = n.FirstChild
			continue
		}

		for n != nil {
			if n.Type == html.ElementNode {
				nodes = append(nodes, Node{kind: endNode})
			}
			if n.NextSibling != nil {
				n = n.NextSibling
				break
			}
			n = n.Parent
		}
	}

	// Close the root node.
	nodes = append(nodes, Node{kind: endNode})

	stack := make([]*Node, 0, len(nodes))
	downs := make([]*Node, len(nodes))
	downCount := 0

	for pos := range nodes {
		switch nodes[pos].kind {
		case startNode, attrNode, textNode, commentNode, procInstNode:
			node := &nodes[pos]
			node.nodes = nodes
			node.pos = pos
			if len(stack) > 0 {
				node.up = stack[len(stack)-1]
			}
			if node.kind == startNode {
				stack = append(stack, node)
			} else {
				node.end = pos + 1
			}

		case endNode:
			node := stack[len(stack)-1]
			node.end = pos
			stack = stack[:len(stack)-1]

			// Compute downs. Doing that here is what enables the
			// use of a slice of a contiguous pre-allocated block.
			node.down = downs[downCount:downCount]
			for i := node.pos + 1; i < node.end; i++ {
				if nodes[i].up == node {
					switch nodes[i].kind {
					case startNode, textNode, commentNode, procInstNode:
						node.down = append(node.down, &nodes[i])
						downCount++
					}
				}
			}

			if len(stack) == 0 {
				return node, nil
			}
		}
	}

	return nil, io.EOF
}
// parseExtraComponentInfo parses the "Class Detail" page for a component.
func parseExtraComponentInfo(body io.Reader, component *Component) (courseOpen bool, err error) {
	nodes, err := html.ParseFragment(body, nil)
	if err != nil {
		return
	}
	if len(nodes) != 1 {
		return false, errors.New("invalid number of root elements")
	}

	openStatus, ok := scrape.Find(nodes[0], scrape.ById("SSR_CLS_DTL_WRK_SSR_DESCRSHORT"))
	if !ok {
		return false, errors.New("open status not found")
	}
	courseOpen = (nodeInnerText(openStatus) == "Open")

	availTable, ok := scrape.Find(nodes[0], scrape.ById("ACE_SSR_CLS_DTL_WRK_GROUP3"))
	if !ok {
		return courseOpen, errors.New("could not find availability info")
	}

	rows := scrape.FindAll(availTable, scrape.ByTag(atom.Tr))
	if len(rows) != 7 {
		return courseOpen, errors.New("invalid number of rows in availability table")
	}

	var availability ClassAvailability

	cols := nodesWithAlignAttribute(scrape.FindAll(rows[2], scrape.ByTag(atom.Td)))
	if len(cols) != 2 {
		return courseOpen, errors.New("expected 2 aligned columns in row 2")
	}
	availability.Capacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}
	availability.WaitListCapacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
	if err != nil {
		return
	}

	cols = nodesWithAlignAttribute(scrape.FindAll(rows[4], scrape.ByTag(atom.Td)))
	if len(cols) != 2 {
		return courseOpen, errors.New("expected 2 aligned columns in row 4")
	}
	availability.EnrollmentTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}
	availability.WaitListTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
	if err != nil {
		return
	}

	cols = nodesWithAlignAttribute(scrape.FindAll(rows[6], scrape.ByTag(atom.Td)))
	if len(cols) != 1 {
		return courseOpen, errors.New("expected 1 aligned column in row 6")
	}
	availability.AvailableSeats, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}

	component.ClassAvailability = &availability
	return
}
func markdown(path string, info os.FileInfo) error {
	input, err := ioutil.ReadFile(path)
	if err != nil {
		return err
	}

	// extract the header out of the markdown, so we can control the
	// layout better; blackfriday would put the toc above the h1, and
	// include the singular h1 in the toc, causing stutter.
	idx := bytes.IndexByte(input, '\n')
	if idx == -1 {
		return errors.New("markdown has no content")
	}
	titleMD, input := input[:idx], input[idx+1:]

	htmlFlags := (0 |
		blackfriday.HTML_USE_SMARTYPANTS |
		blackfriday.HTML_SMARTYPANTS_FRACTIONS |
		blackfriday.HTML_SMARTYPANTS_LATEX_DASHES |
		blackfriday.HTML_USE_XHTML |
		blackfriday.HTML_FOOTNOTE_RETURN_LINKS |
		0)
	// HtmlRenderer demands a title and a css path here, but we only
	// render a fragment so those are not used
	renderer := blackfriday.HtmlRenderer(htmlFlags, "", "")
	extensions := (0 |
		blackfriday.EXTENSION_NO_INTRA_EMPHASIS |
		blackfriday.EXTENSION_TABLES |
		blackfriday.EXTENSION_FENCED_CODE |
		blackfriday.EXTENSION_AUTOLINK |
		blackfriday.EXTENSION_STRIKETHROUGH |
		blackfriday.EXTENSION_SPACE_HEADERS |
		blackfriday.EXTENSION_FOOTNOTES |
		blackfriday.EXTENSION_HEADER_IDS |
		blackfriday.EXTENSION_AUTO_HEADER_IDS |
		0)
	titleHTML := blackfriday.Markdown(titleMD, renderer, extensions)
	contentHTML := blackfriday.Markdown(input, renderer, extensions)

	tocFlags := htmlFlags | blackfriday.HTML_TOC | blackfriday.HTML_OMIT_CONTENTS
	tocRenderer := blackfriday.HtmlRenderer(tocFlags, "", "")
	tocHTML := blackfriday.Markdown(input, tocRenderer, extensions)

	body := &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
	nodes, err := html.ParseFragment(bytes.NewReader(titleHTML), body)
	if err != nil {
		return fmt.Errorf("cannot parse generated html: %v", err)
	}
	if len(nodes) == 0 || nodes[0].Type != html.ElementNode || nodes[0].DataAtom != atom.H1 {
		return errors.New("markdown does not start with a header")
	}
	title := childText(nodes[0])

	var buf bytes.Buffer
	prettyPath := "/" + strings.TrimSuffix(path, ".md")
	if dir, file := filepath.Split(prettyPath); file == "index" {
		prettyPath = dir
	}
	data := struct {
		Path    string
		Title   string
		H1      template.HTML
		TOC     template.HTML
		Content template.HTML
	}{
		Path:    prettyPath,
		Title:   title,
		H1:      template.HTML(titleHTML),
		TOC:     template.HTML(tocHTML),
		Content: template.HTML(contentHTML),
	}
	if err := layout.Execute(&buf, data); err != nil {
		return fmt.Errorf("executing template: %v", err)
	}

	min, err := minify.Bytes(minifier, "text/html", buf.Bytes())
	if err != nil {
		return fmt.Errorf("cannot minify html: %v", err)
	}

	dst := filepath.Join(outputDir, strings.TrimSuffix(path, ".md")+".html")
	if err := writeFile(dst, min); err != nil {
		return err
	}
	return nil
}