// parseRef scans an HTML fragment for <blockquote cite="#comment-N">
// references and returns the set of referenced comment IDs.
func parseRef(data string) []cid {
	m := map[cid]struct{}{}
	t := html.NewTokenizer(strings.NewReader(data))
	for {
		t.Next()
		token := t.Token()
		if token.Type == html.ErrorToken {
			break
		}
		if token.Type == html.StartTagToken && token.DataAtom == atom.Blockquote {
			for _, attr := range token.Attr {
				if attr.Key == "cite" {
					if s := attr.Val; strings.HasPrefix(s, "#comment-") {
						id, err := strconv.ParseUint(s[len("#comment-"):], 10, 32)
						if err != nil {
							logger.Println("notification:", err.Error())
							continue
						}
						m[cid(id)] = struct{}{}
					}
					break
				}
			}
		}
	}
	// The map deduplicates IDs; flatten it into a slice for the caller.
	var ret []cid
	for k := range m {
		ret = append(ret, k)
	}
	return ret
}
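// parseRef relies on a cid type and a package-level logger that are not shown
// above. A minimal sketch of those declarations (the concrete width and the
// logger destination are assumptions, not the original definitions; assumes
// the log and os packages):
type cid uint32

var logger = log.New(os.Stderr, "", log.LstdFlags)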
// findProviderFromHeadLink scans an HTML document's <head> for
// <link rel="openid2.provider"> and <link rel="openid2.local_id"> tags
// and returns their href values.
func findProviderFromHeadLink(input io.Reader) (opEndpoint, opLocalId string, err error) {
	tokenizer := html.NewTokenizer(input)
	inHead := false
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			// The document may be malformed after a valid <link> tag was
			// already found; if we have an openid2.provider (and possibly an
			// openid2.local_id as well), ignore the error and return them.
			if len(opEndpoint) > 0 {
				return
			}
			return "", "", tokenizer.Err()
		case html.StartTagToken, html.EndTagToken:
			tk := tokenizer.Token()
			if tk.Data == "head" {
				if tt == html.StartTagToken {
					inHead = true
				} else {
					// </head>: the links can no longer appear.
					if len(opEndpoint) > 0 {
						return
					}
					return "", "", errors.New(
						"LINK with rel=openid2.provider not found")
				}
			} else if inHead && tk.Data == "link" {
				provider := false
				localId := false
				href := ""
				for _, attr := range tk.Attr {
					if attr.Key == "rel" {
						if attr.Val == "openid2.provider" {
							provider = true
						} else if attr.Val == "openid2.local_id" {
							localId = true
						}
					} else if attr.Key == "href" {
						href = attr.Val
					}
				}
				if provider && !localId && len(href) > 0 {
					opEndpoint = href
				} else if !provider && localId && len(href) > 0 {
					opLocalId = href
				}
			}
		}
	}
	// Unreachable: the loop only exits through the returns above, either on
	// a closing </head> or on a tokenizer error.
}
// Search for
//   <head>
//     <meta http-equiv="X-XRDS-Location" content="....">
func findMetaXrdsLocation(input io.Reader) (location string, err error) {
	tokenizer := html.NewTokenizer(input)
	inHead := false
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			return "", tokenizer.Err()
		case html.StartTagToken, html.EndTagToken:
			tk := tokenizer.Token()
			if tk.Data == "head" {
				if tt == html.StartTagToken {
					inHead = true
				} else {
					// </head>: the meta tag can no longer appear.
					return "", errors.New("Meta X-XRDS-Location not found")
				}
			} else if inHead && tk.Data == "meta" {
				ok := false
				content := ""
				for _, attr := range tk.Attr {
					if attr.Key == "http-equiv" && attr.Val == "X-XRDS-Location" {
						ok = true
					} else if attr.Key == "content" {
						content = attr.Val
					}
				}
				if ok && len(content) > 0 {
					return content, nil
				}
			}
		}
	}
	// Unreachable: the loop only exits through the returns above.
}
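// A sketch of how the two discovery helpers above might be driven during
// OpenID 2.0 discovery. The discoverHTML name, the fetching, and the
// precedence of the XRDS meta tag over the head links are assumptions about
// the surrounding resolver, not part of the original code; the body is
// buffered so each helper gets a fresh reader over the same HTML. Assumes
// the net/http, io, bytes, and fmt packages.
func discoverHTML(claimedID string) error {
	resp, err := http.Get(claimedID)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	// First pass: an explicit X-XRDS-Location meta tag wins.
	if location, err := findMetaXrdsLocation(bytes.NewReader(body)); err == nil {
		fmt.Println("XRDS document at:", location)
		return nil
	}
	// Second pass over the same bytes: fall back to the head <link> tags.
	opEndpoint, opLocalId, err := findProviderFromHeadLink(bytes.NewReader(body))
	if err != nil {
		return err
	}
	fmt.Println("provider:", opEndpoint, "local_id:", opLocalId)
	return nil
}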
// htmlFilter sanitizes user-supplied HTML: tags (and attributes) not present
// in the validAtom whitelist are escaped rather than rendered, stray end tags
// are escaped, and any tags left open at the end are closed.
func htmlFilter(content string) (string, error) {
	ret := "<p>"
	t := html.NewTokenizer(strings.NewReader(content))
	stack := make([]atom.Atom, 0)
L:
	for {
		t.Next()
		token := t.Token()
		str := token.String()
		switch token.Type {
		case html.StartTagToken, html.SelfClosingTagToken:
			// Keep the tag only if it is whitelisted and every one of its
			// attributes is allowed for that tag.
			ans := false
			if attrMap, ex := validAtom[token.DataAtom]; ex {
				ans = true
				for _, attr := range token.Attr {
					if _, ex := attrMap[attr.Key]; !ex {
						ans = false
						break
					}
				}
			}
			if ans {
				// Only start tags expect a matching close; pushing
				// self-closing tags would spuriously close them below.
				if token.Type == html.StartTagToken {
					stack = append(stack, token.DataAtom)
				}
				ret += str
			} else {
				ret += html.EscapeString(str)
			}
		case html.EndTagToken:
			// Pop down to the nearest matching open tag; escape the end tag
			// if nothing on the stack matches.
			top := len(stack) - 1
			for top >= 0 && stack[top] != token.DataAtom {
				top--
			}
			if top == -1 {
				ret += html.EscapeString(str)
			} else {
				stack = stack[:top]
				ret += str
			}
		case html.TextToken:
			ret += str
		case html.ErrorToken:
			break L
		}
	}
	if err := t.Err(); err != io.EOF {
		return "", err
	}
	// Close anything the input left open.
	for len(stack) > 0 {
		ret += "</" + stack[len(stack)-1].String() + ">"
		stack = stack[:len(stack)-1]
	}
	ret += "</p>"
	return ret, nil
}
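// validAtom is referenced above but not defined in the source. One plausible
// shape for the whitelist maps each allowed tag to its allowed attribute
// names; the concrete entries here are illustrative assumptions:
var validAtom = map[atom.Atom]map[string]struct{}{
	atom.P:          {},
	atom.B:          {},
	atom.I:          {},
	atom.Code:       {},
	atom.Pre:        {},
	atom.Blockquote: {"cite": {}},
	atom.A:          {"href": {}, "title": {}},
	atom.Img:        {"src": {}, "alt": {}},
}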
func TestNextTextFilter(t *testing.T) {
	src := `<html>
<p>
	<a name="foo"/>
	<small>
		<font face="Arial">
			Foo
			<sup>
				<u>
					<b>
						Bar
					</b>
				</u>
			</sup>
		</font>
	</small>
	<a href="/path/to/somewhere">
		<i>
			Baz
		</i>
	</a>
</p>
<p>
	<span>
		Ding
	</span>
</p>
</html>`
	expected := []string{
		"<p><a/>Foo<sup>Bar</sup><a>Baz</a></p>",
		"<p>Ding</p>",
	}
	r := bytes.NewBufferString(src)
	d := html.NewTokenizer(r)
	for _, v := range expected {
		node, err := NextTextFilter(d, "p", "a", "sup")
		if err != nil {
			t.Fatal(err)
		}
		if node.String() != v {
			t.Errorf("expected %q, got %q", v, node.String())
		}
	}
}
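// NextTextFilter, the function under test, is not shown in the source. A
// sketch consistent with the expectations above: names[0] is treated as the
// container element, the remaining names are the only nested tags preserved
// (attributes stripped), all other markup is dropped, and text is
// whitespace-trimmed. The signature, the fmt.Stringer return, and this whole
// reconstruction are assumptions. Assumes the bytes and fmt packages.
func NextTextFilter(z *html.Tokenizer, names ...string) (fmt.Stringer, error) {
	kept := make(map[string]bool, len(names))
	for _, n := range names {
		kept[n] = true
	}
	container := names[0]
	var buf bytes.Buffer
	depth := 0 // how many nested container elements we are inside
	for {
		switch z.Next() {
		case html.ErrorToken:
			return nil, z.Err()
		case html.StartTagToken:
			name, _ := z.TagName()
			if string(name) == container {
				depth++
			}
			if depth > 0 && kept[string(name)] {
				fmt.Fprintf(&buf, "<%s>", name)
			}
		case html.SelfClosingTagToken:
			name, _ := z.TagName()
			if depth > 0 && kept[string(name)] {
				fmt.Fprintf(&buf, "<%s/>", name)
			}
		case html.EndTagToken:
			name, _ := z.TagName()
			if depth > 0 && kept[string(name)] {
				fmt.Fprintf(&buf, "</%s>", name)
			}
			if string(name) == container {
				depth--
				if depth == 0 {
					return &buf, nil
				}
			}
		case html.TextToken:
			if depth > 0 {
				buf.Write(bytes.TrimSpace(z.Text()))
			}
		}
	}
}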
// process replaces <latex>...</latex> blocks in the content with <img> tags
// pointing at the rendered formula; everything else passes through unchanged.
func process(content string) string {
	ret := ""
	t := html.NewTokenizer(strings.NewReader(content))
	latex := false // are we inside a <latex> element?
	latexSrc := ""
L:
	for {
		t.Next()
		token := t.Token()
		str := token.String()
		if latex {
			switch token.Type {
			case html.ErrorToken:
				break L
			case html.EndTagToken:
				if token.Data == "latex" {
					// End of the formula: render it and emit the image.
					latex = false
					ret += fmt.Sprintf("<img src=\"%s\" alt=\"%s\"/>",
						genLaTeX(html.UnescapeString(latexSrc)), latexSrc)
					latexSrc = ""
				} else {
					latexSrc += str
				}
			default:
				latexSrc += str
			}
		} else {
			switch token.Type {
			case html.ErrorToken:
				break L
			case html.StartTagToken:
				if token.Data == "latex" {
					latex = true
				} else {
					ret += str
				}
			default:
				ret += str
			}
		}
	}
	return ret
}
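// genLaTeX is not shown in the source; it is expected to turn LaTeX source
// into an image and return that image's URL. A minimal placeholder under that
// assumption; the endpoint is hypothetical and the real implementation may
// rasterize locally instead. Assumes the net/url package.
func genLaTeX(src string) string {
	return "/latex/render?src=" + url.QueryEscape(src)
}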
// extractLinks collects the href values of all anchor tags in the response.
func extractLinks(resp io.Reader) []string {
	var links []string
	anchorTag := []byte("a")
	tkzer := html.NewTokenizer(resp)
	for {
		switch tkzer.Next() {
		case html.ErrorToken:
			// io.EOF marks the end of the document; any other tokenizer
			// error also ends the scan, so stop either way.
			return links
		case html.StartTagToken:
			tag, hasAttr := tkzer.TagName()
			if hasAttr && bytes.Equal(anchorTag, tag) {
				more := true
				for more {
					var key, value []byte
					key, value, more = tkzer.TagAttr()
					if string(key) == "href" {
						links = append(links, string(value))
						break
					}
				}
			}
		}
	}
}
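// A sketch of driving extractLinks against a live page; the URL is
// illustrative. Assumes the net/http, log, and fmt packages.
func main() {
	resp, err := http.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	for _, link := range extractLinks(resp.Body) {
		fmt.Println(link)
	}
}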