// ProcessHTML parses the given HTML from a Reader and fills in the OpenGraph structure.
func (og *OpenGraph) ProcessHTML(buffer io.Reader) error {
	z := html.NewTokenizer(buffer)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return nil
			}
			return z.Err()
		case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
			name, hasAttr := z.TagName()
			if atom.Lookup(name) == atom.Body {
				return nil // OpenGraph is only in <head>, so we don't need the body
			}
			if atom.Lookup(name) != atom.Meta || !hasAttr {
				continue
			}
			m := make(map[string]string)
			var key, val []byte
			for hasAttr {
				key, val, hasAttr = z.TagAttr()
				m[atom.String(key)] = string(val)
			}
			og.ProcessMeta(m)
		}
	}
}

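// A minimal usage sketch for ProcessHTML above (the OpenGraph type and its
// ProcessMeta method come from the surrounding package and are not shown here;
// the document string is illustrative only):
func exampleProcessHTML() {
	doc := `<html><head><meta property="og:title" content="Example"></head><body></body></html>`
	og := &OpenGraph{}
	if err := og.ProcessHTML(strings.NewReader(doc)); err != nil {
		log.Println(err)
	}
}
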
// htmlToText extracts the visible text from an HTML document, skipping the
// contents of any tags listed in ignoretag.
func htmlToText(r io.Reader) []byte {
	t := html.NewTokenizer(r)
	var out bytes.Buffer
	var ignorescore int
	for {
		switch token := t.Next(); token {
		case html.StartTagToken:
			if _, ok := ignoretag[t.Token().Data]; ok {
				ignorescore++
			}
		case html.EndTagToken:
			if _, ok := ignoretag[t.Token().Data]; ok {
				ignorescore--
			}
		case html.ErrorToken:
			return out.Bytes()
		case html.CommentToken:
			continue
		case html.TextToken:
			if ignorescore == 0 {
				text := strings.TrimSpace(t.Token().Data)
				if len(text) > 0 {
					fmt.Fprintln(&out, text)
				}
			}
		}
	}
}

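// ignoretag is referenced above but not defined in this snippet. A minimal
// plausible definition, assuming the usual non-visible elements are skipped;
// the exact tag set in the original code may differ:
var ignoretag = map[string]struct{}{
	"script":   {},
	"style":    {},
	"head":     {},
	"noscript": {},
}
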
// rewriteHTML scans the HTML for tags with url-valued attributes, and updates
// those values with the urlRewriter function. The updated HTML is output to the
// writer.
func rewriteHTML(reader io.Reader, writer io.Writer, urlRewriter func(string) string) error {
	// Note: This assumes the content is UTF-8.
	tokenizer := html.NewTokenizer(reader)

	var err error
	for err == nil {
		tokenType := tokenizer.Next()
		switch tokenType {
		case html.ErrorToken:
			err = tokenizer.Err()
		case html.StartTagToken, html.SelfClosingTagToken:
			token := tokenizer.Token()
			if urlAttrs, ok := atomsToAttrs[token.DataAtom]; ok {
				for i, attr := range token.Attr {
					if urlAttrs.Has(attr.Key) {
						token.Attr[i].Val = urlRewriter(attr.Val)
					}
				}
			}
			_, err = writer.Write([]byte(token.String()))
		default:
			_, err = writer.Write(tokenizer.Raw())
		}
	}
	if err != io.EOF {
		return err
	}
	return nil
}

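// atomsToAttrs is referenced above but not defined in this snippet. A minimal
// sketch of a plausible shape, assuming a simple string-set type with a Has
// method; the original code may use a different set implementation and a larger
// tag/attribute list.
type stringSet map[string]struct{}

func newStringSet(keys ...string) stringSet {
	s := stringSet{}
	for _, k := range keys {
		s[k] = struct{}{}
	}
	return s
}

func (s stringSet) Has(key string) bool {
	_, ok := s[key]
	return ok
}

var atomsToAttrs = map[atom.Atom]stringSet{
	atom.A:      newStringSet("href"),
	atom.Link:   newStringSet("href"),
	atom.Img:    newStringSet("src"),
	atom.Script: newStringSet("src"),
	atom.Form:   newStringSet("action"),
}
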
// isHTML reports whether an <html> start tag appears within the first 1KB of content.
func isHTML(content []byte) bool {
	isHTML := false
	if len(content) == 0 {
		return isHTML
	}
	if len(content) > 1024 {
		content = content[:1024]
	}
	z := html.NewTokenizer(bytes.NewReader(content))
	isFinish := false
	for !isFinish {
		switch z.Next() {
		case html.ErrorToken:
			isFinish = true
		case html.StartTagToken:
			tagName, _ := z.TagName()
			if bytes.Equal(tagName, []byte("html")) {
				isHTML = true
				isFinish = true
			}
		}
	}
	return isHTML
}

func scrape(r io.Reader) {
	z := html.NewTokenizer(r)
	buf := &bytes.Buffer{}
L:
	for {
		tt := z.Next()
		tok := z.Token()
		switch tt {
		case html.StartTagToken:
			// if you find a table row, inspect it
			if tok.DataAtom == atom.Tr {
				// check for correct class attr and then switch to
				// html.NewTokenizerFragment
			}
		case html.EndTagToken:
			// once you reach the end of the head, flush everything left in
			// the tokenizer to the buffer
			if tok.String() == "</head>" {
				buf.Write(z.Buffered())
				break L
			}
		case html.ErrorToken:
			// this is left in here for things like tracking pixels that have
			// the HTML content type, so our code doesn't break
			break L
		}
	}
}

// Autodiscover looks for an RSS or Atom <link rel="alternate"> tag in the page
// and returns its href, or ErrNoRssLink if none is found.
func Autodiscover(b []byte) (string, error) {
	r := bytes.NewReader(b)
	z := html.NewTokenizer(r)
	for {
		if z.Next() == html.ErrorToken {
			if err := z.Err(); err == io.EOF {
				break
			}
			return "", ErrNoRssLink
		}
		t := z.Token()
		switch t.DataAtom {
		case atom.Link:
			if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
				attrs := make(map[string]string)
				for _, a := range t.Attr {
					attrs[a.Key] = a.Val
				}
				if attrs["rel"] == "alternate" && attrs["href"] != "" &&
					(attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml") {
					return attrs["href"], nil
				}
			}
		}
	}
	return "", ErrNoRssLink
}

// ParseHtml rewrites an HTML document: inline <style> text is run through
// ParseCss, start and self-closing tags are rewritten by flushTagToken, and
// everything else is copied through unchanged.
func ParseHtml(r io.Reader, url string) ([]byte, error) {
	z := html.NewTokenizer(r)
	var newHtml []byte
	lastTag := ""
	for {
		tt := z.Next()
		rawHtmlBytes := z.Raw()
		switch tt {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return newHtml, nil
			}
			return nil, z.Err()
		case html.TextToken:
			rawHtml := strings.TrimSpace(string(rawHtmlBytes))
			if len(rawHtml) > 0 && lastTag == "style" {
				newCss := ParseCss(rawHtml, url)
				newHtml = append(newHtml, []byte(newCss)...)
			} else {
				newHtml = append(newHtml, rawHtmlBytes...)
			}
		case html.DoctypeToken, html.CommentToken, html.EndTagToken:
			newHtml = append(newHtml, rawHtmlBytes...)
		case html.StartTagToken:
			lastTag = flushTagToken(&newHtml, z, url)
		case html.SelfClosingTagToken:
			flushTagToken(&newHtml, z, url)
		}
		if tt != html.StartTagToken {
			lastTag = ""
		}
	}
}

// GetPriceForBestBuy fetches a Best Buy product page and returns the price
// found in the meta tag whose id contains "schemaorg-offer-price", or 0.0 on failure.
func GetPriceForBestBuy(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	z := html.NewTokenizer(resp.Body)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return 0.0
		case tt == html.StartTagToken:
			t := z.Token()
			isMeta := t.Data == "meta"
			if isMeta {
				for _, attr := range t.Attr {
					if attr.Key == "id" && strings.Contains(attr.Val, "schemaorg-offer-price") {
						nxt := z.Next()
						if nxt == html.TextToken {
							t = z.Token()
							return parseCurrency(t.Data)
						}
					}
				}
			}
		}
	}
}

// getLinks fetches the page at u and returns the links extracted from it via getURL.
func getLinks(u *url.URL) []*url.URL {
	links := make([]*url.URL, 0)
	resp, err := http.Get(u.String())
	if err != nil {
		logs.Log(fmt.Sprintf("Couldn't crawl %s", u))
		return links
	}
	defer resp.Body.Close()
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		tokenType := tokenizer.Next()
		switch tokenType {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			token := tokenizer.Token()
			if link, ok := getURL(u, token); ok {
				links = append(links, link)
			}
		}
	}
}

// parseTitle reads at most 8KB of the response and returns the text of the
// first <title> element, or the fallback string if none is found.
func parseTitle(resp io.Reader, fallback string) string {
	r := io.LimitedReader{
		R: resp,
		N: 8192,
	}
	h := html.NewTokenizer(&r)
	for {
		tt := h.Next()
		switch tt {
		case html.ErrorToken:
			return fallback
		case html.StartTagToken:
			tag, _ := h.TagName()
			if string(tag) == "title" {
				nt := h.Next()
				switch nt {
				case html.ErrorToken:
					return "Failed to parse title"
				case html.TextToken:
					return h.Token().Data
				}
			}
		}
	}
}

// obtainCsrf scans the page for a <meta name="csrf-token" content="..."> tag and
// stores its value; if a div with id "flash_alert" is found, its text is returned
// as an error.
func (w *WebClient) obtainCsrf(b io.Reader) error {
	var errorMessage error
	z := html.NewTokenizer(b)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return errorMessage
		case tt == html.SelfClosingTagToken:
			t := z.Token()
			isMeta := t.Data == "meta"
			if isMeta && len(t.Attr) > 1 {
				if (t.Attr[1].Key == "name") && (t.Attr[1].Val == "csrf-token") {
					w.csrf = t.Attr[0].Val
					log.Debugf("Csrf Token: %s", w.csrf)
				} else if (t.Attr[0].Key == "name") && (t.Attr[0].Val == "csrf-token") {
					w.csrf = t.Attr[1].Val
					log.Debugf("Csrf Token: %s", w.csrf)
				}
			}
		case tt == html.StartTagToken:
			t := z.Token()
			if (t.Data == "div") && len(t.Attr) > 0 && (t.Attr[0].Key == "id") && (t.Attr[0].Val == "flash_alert") {
				z.Next()
				errorMessage = errors.New(z.Token().String())
			}
		}
	}
}

// Crawl crawls the page at url_prefix+url and streams results to ch, sending
// "END!" when the page has been fully processed.
func Crawl(url string, ch chan string) {
	resp, err := http.Get(url_prefix + url)
	if err != nil {
		// signal the consumer even if the fetch failed
		ch <- "END!"
		return
	}
	defer resp.Body.Close()
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		token := tokenizer.Next()
		switch {
		case token == html.ErrorToken:
			// End of page
			ch <- "END!"
			return
		case token == html.StartTagToken:
			start_tt := tokenizer.Token()
			if start_tt.Data == "div" {
				if isSummary(start_tt) {
					getQ(*tokenizer, ch)
				}
			}
		}
	}
}

// avanza_get_sellprice
//
// Site: Avanza
// Gets the current sell price from a given http.Response.
func (this *Parse) avanza_get_sellprice(resp *http.Response) float64 {
	z := html.NewTokenizer(resp.Body)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return 0.0
		case tt == html.StartTagToken:
			t := z.Token()
			if isCatch := t.Data == "span"; isCatch {
				for _, attr := range t.Attr {
					if strings.Contains(attr.Val, "sellPrice") {
						z.Next()
						tok := z.Token()
						strval := strings.Replace(tok.String(), ",", ".", -1)
						value, _ := strconv.ParseFloat(strval, 64)
						return value
					}
				}
			}
		}
	}
}

// GetPriceForWalmart fetches a Walmart product page and returns the price parsed
// from the script block whose id contains "tb-djs-wml-base", or 0.0 on failure.
func GetPriceForWalmart(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	z := html.NewTokenizer(resp.Body)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return 0.0
		case tt == html.StartTagToken:
			t := z.Token()
			isScript := t.Data == "script"
			if isScript {
				for _, attr := range t.Attr {
					if attr.Key == "id" && strings.Contains(attr.Val, "tb-djs-wml-base") {
						nxt := z.Next()
						if nxt == html.TextToken {
							return parseJson(z.Token().Data)
						}
					}
				}
			}
		}
	}
}

// scrapePageWorker -- this is the function that does most of the work in parsing the HTML
func scrapePageWorker(page *io.ReadCloser, out chan [2]string, chFinished chan bool) {
	defer func() {
		chFinished <- true
	}()

	z := html.NewTokenizer(*page)

	// infinite loop to toss state tokens into a url map
	for {
		var result [2]string
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return
		case tt == html.StartTagToken:
			t := z.Token()
			isAnchor := t.Data == "a"
			if !isAnchor {
				continue
			}
			for _, attr := range t.Attr {
				if attr.Key == "id" {
					result[0] = attr.Val
				}
				if attr.Key == "data-href" {
					result[1] = attr.Val
					out <- result
				}
			}
		}
	} // end for
}

// getHTMLContent returns the text content of the first occurrence of the given
// tag, or the tokenizer error if the tag is never found.
func getHTMLContent(r io.Reader, tag []byte) (result string, err error) {
	z := html.NewTokenizer(r)
	valid := 0
	cacheLen := len(tag)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			err = z.Err()
			return
		case html.TextToken:
			if valid == 1 {
				return string(z.Text()), nil
			}
		case html.StartTagToken, html.EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == cacheLen && bytes.Equal(tn[0:cacheLen], tag) {
				if tt == html.StartTagToken {
					valid = 1
				} else {
					valid = 0
				}
			}
		}
	}
}

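// A minimal usage sketch for getHTMLContent above, extracting the <title> text
// of a document held in a string (the variable names are illustrative only):
func exampleGetTitle() {
	doc := `<html><head><title>Hello</title></head><body></body></html>`
	title, err := getHTMLContent(strings.NewReader(doc), []byte("title"))
	if err != nil && err != io.EOF {
		log.Println(err)
		return
	}
	fmt.Println(title) // Hello
}
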
// Find all children links on a page and the title of the page from an HTTP response
func (w Web) findAllLinks(httpBody io.Reader, baseURL string) (links []string, title string) {
	page := html.NewTokenizer(httpBody)
	for {
		tokenType := page.Next()
		// End of the page, we are done
		if tokenType == html.ErrorToken {
			return
		}
		token := page.Token()

		// Extract the page title.
		// React also uses <title> tags, but those carry special attributes.
		if tokenType == html.StartTagToken && token.DataAtom.String() == "title" && len(token.Attr) == 0 {
			page.Next()
			title = page.Token().Data
		}

		// Parse a link
		if tokenType == html.StartTagToken && token.DataAtom.String() == "a" {
			href, hasLink := w.extractLink(token)
			if hasLink && w.ShouldCrawl(baseURL, href) {
				links = append(links, w.unifyURL(href, baseURL))
			}
		}
	}
}

// load downloads the archive index from baseurl and collects the available
// Webots versions from the listed tarball names.
func (a *HttpWebotsArchive) load() error {
	resp, err := http.Get(a.baseurl)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	nameRx := regexp.MustCompile(fmt.Sprintf(`^webots-(.*)-%s.tar.bz2$`, a.arch))
	for {
		t := tokenizer.Next()
		if t == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				break
			}
			return err
		}
		if t != html.StartTagToken {
			continue
		}
		tName, hasAttrib := tokenizer.TagName()
		if string(tName) != "a" {
			continue
		}
		if !hasAttrib {
			continue
		}
		stopped := false
		for !stopped {
			key, val, next := tokenizer.TagAttr()
			// record whether this was the last attribute before filtering,
			// otherwise a trailing non-href attribute would loop forever
			stopped = !next
			if string(key) != "href" {
				continue
			}
			// we got a link, test if it has the right prefix
			matches := nameRx.FindStringSubmatch(string(val))
			if matches == nil {
				continue
			}
			v, err := ParseWebotsVersion(matches[1])
			if err != nil {
				return err
			}
			a.versions = append(a.versions, v)
		}
	}
	sort.Sort(&a.versions)
	return nil
}

// Returns the href attribute of a <link rel="shortcut icon"> tag or error if not found.
func FindIcon(b []byte) (string, error) {
	r := bytes.NewReader(b)
	z := html.NewTokenizer(r)
	for {
		if z.Next() == html.ErrorToken {
			if err := z.Err(); err == io.EOF {
				break
			}
			return "", ErrNoIcon
		}
		t := z.Token()
		switch t.DataAtom {
		case atom.Link:
			if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
				attrs := make(map[string]string)
				for _, a := range t.Attr {
					attrs[a.Key] = a.Val
				}
				if attrs["rel"] == "shortcut icon" && attrs["href"] != "" {
					return attrs["href"], nil
				}
			}
		}
	}
	return "", ErrNoIcon
}

// extractLinkUrls returns the href values of all anchor tags in the page.
func extractLinkUrls(page string) []string {
	z := html.NewTokenizer(strings.NewReader(page))
	hrefs := make([]string, 0, 10)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			// End of the document, we're done
			return hrefs
		case tt == html.StartTagToken:
			t := z.Token()
			isAnchor := t.Data == "a"
			if isAnchor {
				// we found a link
				for _, attr := range t.Attr {
					if attr.Key == "href" {
						hrefs = append(hrefs, attr.Val)
					}
				}
			}
		}
	}
}

// Preprocess escapes disallowed tags in a cleaner way, but does not fix
// nesting problems. Use with Clean.
func Preprocess(config *Config, fragment string) string {
	if config == nil {
		config = DefaultConfig
	}

	var buf bytes.Buffer
	write := func(raw string) {
		_, err := buf.WriteString(raw)
		// The only possible error is running out of memory.
		expectError(err, nil)
	}

	t := html.NewTokenizer(strings.NewReader(fragment))
	for {
		switch tok := t.Next(); tok {
		case html.ErrorToken:
			err := t.Err()
			// The only possible errors are from the Reader or from
			// the buffer capacity being exceeded. Neither can
			// happen with strings.NewReader as the string must
			// already fit into memory.
			expectError(err, io.EOF)
			if err == io.EOF {
				write(html.EscapeString(string(t.Raw())))
				return buf.String()
			}
		case html.TextToken:
			write(string(t.Raw()))
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			raw := string(t.Raw())
			tagName, _ := t.TagName()
			allowed := false
			if tag := atom.Lookup(tagName); tag != 0 {
				if _, ok := config.elem[tag]; ok {
					allowed = true
				}
			}
			if !allowed {
				if _, ok := config.elemCustom[string(tagName)]; ok {
					allowed = true
				}
			}
			if !allowed {
				raw = html.EscapeString(raw)
			}
			write(raw)
		case html.CommentToken:
			raw := string(t.Raw())
			if config.EscapeComments || !strings.HasPrefix(raw, "<!--") || !strings.HasSuffix(raw, "-->") {
				raw = html.EscapeString(raw)
			}
			write(raw)
		default:
			write(html.EscapeString(string(t.Raw())))
		}
	}
}

// ScrapeArticle fetches and parses the article.
// article should be provided as a *Article.
func ScrapeArticle(article Article) error {
	cookies := NewCookieJar()
	client := &http.Client{Jar: cookies}

	// build request
	req, err := http.NewRequest("GET", article.GetLink(), nil) // create http request
	if err != nil {
		log.Error("could not create article request:", err)
		return err
	}
	err = buildArticleHeader(req)
	if err != nil {
		log.Error("could not build article request:", err)
		return err
	}

	// send http request
	resp, err := client.Do(req)
	if err != nil {
		log.Error("error sending article request:", err)
		return err
	}
	defer resp.Body.Close()
	// TODO: check resp.Header to see if X-Article-Template is [full]

	// parse request
	parser := html.NewTokenizer(resp.Body)
	err = article.DoParse(parser) // parse the html body
	if err != nil {
		log.Error("error parsing article body:", err)
		return err
	}
	return nil
}

// retrieveLinks downloads the given url and returns all urls found on that page.
func retrieveLinks(url string) []string {
	resp, err := http.Get(url)
	links := make([]string, 0)
	if err != nil {
		logger.Error("Detected broken url", url)
		return links
	}
	defer resp.Body.Close()

	page := html.NewTokenizer(resp.Body)
	for {
		tokenType := page.Next()
		if tokenType == html.ErrorToken {
			return links
		}
		token := page.Token()
		if tokenType == html.StartTagToken && token.DataAtom.String() == "a" {
			for _, attr := range token.Attr {
				if attr.Key == "href" {
					links = append(links, attr.Val)
				}
			}
		}
	}
}

// GetTitle gets the title token of an HTML page
func GetTitle(resp *http.Response, url string) string {
	fURL := resp.Request.URL.String()
	z := html.NewTokenizer(resp.Body)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			return ""
		case html.StartTagToken:
			t := z.Token()
			if t.Data == "title" {
				tt = z.Next()
				t = z.Token()
				d := t.Data
				if len(d) > 450 {
					d = d[:450]
				}
				d = strings.TrimSpace(strings.Replace(d, "\n", " ", -1))
				if fURL != url {
					return fmt.Sprintf("%v (%v)", d, fURL)
				}
				return d
			}
		}
	}
}

// ParseHtml collects references from the document: for each known element, the
// value of its recognized attribute is recorded along with its RefType.
func ParseHtml(r io.Reader) map[string]RefType {
	refs := make(map[string]RefType)
	tokenizer := html.NewTokenizer(r)
	for {
		tt := tokenizer.Next()
		switch {
		case tt == html.ErrorToken:
			// the tokenizer has reached the end of the input
			return refs
		case tt == html.StartTagToken:
			token := tokenizer.Token()
			element, ok := knownElements[token.Data]
			if !ok {
				continue
			}
			for _, attr := range token.Attr {
				refType, ok := element[attr.Key]
				if !ok {
					continue
				}
				refs[attr.Val] = refType
				break
			}
		}
	}
}

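// knownElements and RefType are referenced above but not defined in this
// snippet. A minimal sketch of a plausible shape, with hypothetical RefType
// values; the real type and element/attribute pairs may differ.
type RefType int

const (
	RefLink RefType = iota
	RefImage
	RefScript
)

var knownElements = map[string]map[string]RefType{
	"a":      {"href": RefLink},
	"link":   {"href": RefLink},
	"img":    {"src": RefImage},
	"script": {"src": RefScript},
}
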
// Pick returns the values of the configured attribute from every tag in the
// document whose name matches p.TagName.
func (p *PickerAttr) Pick(r io.Reader) (data []string, err error) {
	z := html.NewTokenizer(r)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return data, nil
			}
			return data, z.Err()
		case html.StartTagToken:
			tagName, attr := z.TagName()
			if string(tagName) != p.TagName {
				continue
			}
			var key, value []byte
			for attr {
				key, value, attr = z.TagAttr()
				if string(key) == p.Attr {
					data = append(data, string(value))
				}
			}
		}
	}
}

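// PickerAttr is used above but its definition is not part of this snippet. A
// minimal sketch of the struct and its usage, assuming it only carries the tag
// name and the attribute to pick:
type PickerAttr struct {
	TagName string
	Attr    string
}

func examplePick() {
	doc := `<p><a href="/one">1</a> <a href="/two">2</a></p>`
	p := &PickerAttr{TagName: "a", Attr: "href"}
	hrefs, err := p.Pick(strings.NewReader(doc))
	if err != nil {
		log.Println(err)
		return
	}
	fmt.Println(hrefs) // [/one /two]
}
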
// Parse tokenizes the document and delegates the <head> and <body> sections to
// parseHead and parseBody, filling in the MemoryContent.
func (parser *HtmlContentParser) Parse(c *MemoryContent, in io.Reader) error {
	z := html.NewTokenizer(in)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			if z.Err() == io.EOF {
				return nil
			}
			return z.Err()
		case tt == html.StartTagToken:
			tag, _ := z.TagName()
			switch string(tag) {
			case "head":
				if err := parser.parseHead(z, c); err != nil {
					return err
				}
			case "body":
				if err := parser.parseBody(z, c); err != nil {
					return err
				}
			}
		}
	}
}

// Scrape scrapes a web page and populates the course struct.
func Scrape(r io.Reader) *Course {
	z := html.NewTokenizer(r)
	buf := &bytes.Buffer{}
L:
	for {
		tt := z.Next()
		tok := z.Token()
		//if tok.DataAtom != atom.Link {
		// for anything that isn't a link, just write the raw data to the
		// buffer
		buf.Write(z.Raw())
		//}
		switch tt {
		case html.StartTagToken:
			// if you find a link, replace it with our stylesheet
			if tok.DataAtom == atom.Link {
				// this isn't the correct stylesheet, just a placeholder
				//buf.Write([]byte(`<link rel="stylesheet" href="/cxo.css">`))
			}
		case html.EndTagToken:
			// once you reach the end of the head, flush everything left in
			// the tokenizer to the buffer
			if tok.String() == "</head>" {
				buf.Write(z.Buffered())
				break L
			}
		case html.ErrorToken:
			// this is left in here for things like tracking pixels that have
			// the HTML content type, so our code doesn't break
			break L
		}
	}
	// the Course is not yet built from buf here
	return nil
}

// Search for
//   <head>
//     <meta http-equiv="X-XRDS-Location" content="....">
func findMetaXrdsLocation(input io.Reader) (location string, err error) {
	tokenizer := html.NewTokenizer(input)
	inHead := false
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			return "", tokenizer.Err()
		case html.StartTagToken, html.EndTagToken:
			tk := tokenizer.Token()
			if tk.Data == "head" {
				if tt == html.StartTagToken {
					inHead = true
				} else {
					return "", errors.New("Meta X-XRDS-Location not found")
				}
			} else if inHead && tk.Data == "meta" {
				ok := false
				content := ""
				for _, attr := range tk.Attr {
					if attr.Key == "http-equiv" && strings.ToLower(attr.Val) == "x-xrds-location" {
						ok = true
					} else if attr.Key == "content" {
						content = attr.Val
					}
				}
				if ok && len(content) > 0 {
					return content, nil
				}
			}
		}
	}
}

// fetchHyperLink collects href values from <a> and <link> tags and src values
// from <img> and <script> tags in the response body.
func fetchHyperLink(httpBody io.Reader) []string {
	// drain any remaining body on exit (commonly done so the HTTP connection can be reused)
	defer ioutil.ReadAll(httpBody)
	links := make([]string, 0)
	body := html.NewTokenizer(httpBody)
	for {
		tokenType := body.Next()
		if tokenType == html.ErrorToken {
			return links
		}
		token := body.Token()
		if tokenType == html.StartTagToken {
			if token.DataAtom.String() == "a" || token.DataAtom.String() == "link" {
				for _, attribute := range token.Attr {
					if attribute.Key == "href" {
						links = append(links, attribute.Val)
					}
				}
			} else if token.DataAtom.String() == "img" || token.DataAtom.String() == "script" {
				for _, attribute := range token.Attr {
					if attribute.Key == "src" {
						links = append(links, attribute.Val)
					}
				}
			}
		}
	}
}