Code example #1
File: opengraph.go  Project: dyatlov/go-opengraph
// ProcessHTML parses given html from Reader interface and fills up OpenGraph structure
func (og *OpenGraph) ProcessHTML(buffer io.Reader) error {
	z := html.NewTokenizer(buffer)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return nil
			}
			return z.Err()
		case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
			name, hasAttr := z.TagName()
			if atom.Lookup(name) == atom.Body {
				return nil // OpenGraph is only in head, so we don't need body
			}
			if atom.Lookup(name) != atom.Meta || !hasAttr {
				continue
			}
			m := make(map[string]string)
			var key, val []byte
			for hasAttr {
				key, val, hasAttr = z.TagAttr()
				m[atom.String(key)] = string(val)
			}
			og.ProcessMeta(m)
		}
	}
}
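
A minimal usage sketch for ProcessHTML; the import path, the NewOpenGraph constructor, and the Title field are assumptions based on the dyatlov/go-opengraph project's public API:

package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/dyatlov/go-opengraph/opengraph" // assumed import path
)

func main() {
	og := opengraph.NewOpenGraph() // assumed constructor
	page := `<html><head><meta property="og:title" content="Example"/></head><body></body></html>`
	if err := og.ProcessHTML(strings.NewReader(page)); err != nil {
		log.Fatal(err)
	}
	fmt.Println(og.Title) // "Example", assuming ProcessMeta fills Title
}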
Code example #2
File: scrap.go  Project: husio/apps
func htmlToText(r io.Reader) []byte {
	t := html.NewTokenizer(r)

	var out bytes.Buffer

	var ignorescore int
	for {
		switch token := t.Next(); token {
		case html.StartTagToken:
			if _, ok := ignoretag[t.Token().Data]; ok {
				ignorescore++
			}
		case html.EndTagToken:
			if _, ok := ignoretag[t.Token().Data]; ok {
				ignorescore--
			}
		case html.ErrorToken:
			return out.Bytes()
		case html.CommentToken:
			continue
		case html.TextToken:
			if ignorescore == 0 {
				text := strings.TrimSpace(t.Token().Data)
				if len(text) > 0 {
					fmt.Fprintln(&out, text)
				}
			}
		}
	}
}
Code example #3
// rewriteHTML scans the HTML for tags with url-valued attributes, and updates
// those values with the urlRewriter function. The updated HTML is output to the
// writer.
func rewriteHTML(reader io.Reader, writer io.Writer, urlRewriter func(string) string) error {
	// Note: This assumes the content is UTF-8.
	tokenizer := html.NewTokenizer(reader)

	var err error
	for err == nil {
		tokenType := tokenizer.Next()
		switch tokenType {
		case html.ErrorToken:
			err = tokenizer.Err()
		case html.StartTagToken, html.SelfClosingTagToken:
			token := tokenizer.Token()
			if urlAttrs, ok := atomsToAttrs[token.DataAtom]; ok {
				for i, attr := range token.Attr {
					if urlAttrs.Has(attr.Key) {
						token.Attr[i].Val = urlRewriter(attr.Val)
					}
				}
			}
			_, err = writer.Write([]byte(token.String()))
		default:
			_, err = writer.Write(tokenizer.Raw())
		}
	}
	if err != io.EOF {
		return err
	}
	return nil
}
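
rewriteHTML references an atomsToAttrs table that the snippet does not include; its values only need a Has method. A hedged sketch of a compatible definition, assuming golang.org/x/net/html/atom is imported; the set type and the chosen tags are illustrative, not the source project's actual table:

// stringSet is a stand-in for the set type used by rewriteHTML;
// only the Has method is required.
type stringSet map[string]struct{}

func (s stringSet) Has(key string) bool {
	_, ok := s[key]
	return ok
}

func newStringSet(keys ...string) stringSet {
	s := make(stringSet, len(keys))
	for _, k := range keys {
		s[k] = struct{}{}
	}
	return s
}

// atomsToAttrs maps a tag to the attributes that carry URLs (a small
// illustrative subset).
var atomsToAttrs = map[atom.Atom]stringSet{
	atom.A:      newStringSet("href"),
	atom.Img:    newStringSet("src"),
	atom.Link:   newStringSet("href"),
	atom.Script: newStringSet("src"),
	atom.Form:   newStringSet("action"),
}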
Code example #4
File: body_parser.go  Project: ReanGD/go-web-search
func isHTML(content []byte) bool {
	isHTML := false
	if len(content) == 0 {
		return isHTML
	}
	if len(content) > 1024 {
		content = content[:1024]
	}

	z := html.NewTokenizer(bytes.NewReader(content))
	isFinish := false
	for !isFinish {
		switch z.Next() {
		case html.ErrorToken:
			isFinish = true
		case html.StartTagToken:
			tagName, _ := z.TagName()
			if bytes.Equal(tagName, []byte("html")) {
				isHTML = true
				isFinish = true
			}
		}
	}

	return isHTML
}
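
A quick exercise of the sniffer above (illustrative inputs, assuming fmt is imported):

func main() {
	fmt.Println(isHTML([]byte("<!DOCTYPE html><html><head></head></html>"))) // true
	fmt.Println(isHTML([]byte(`{"html": false}`)))                           // false
}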
Code example #5
File: main.go  Project: bentranter/bookstore
func scrape(r io.Reader) {
	z := html.NewTokenizer(r)
	buf := &bytes.Buffer{}

L:
	for {
		tt := z.Next()
		tok := z.Token()

		switch tt {
		case html.StartTagToken:
			// if you find a link, replace it with our stylesheet
			if tok.DataAtom == atom.Tr {
				// check for correct class attr and then switch to
				// html.NewTokenizerFragment
			}
		case html.EndTagToken:
			// once you reach the end of the head, flush everything left in
			// the tokenizer to the buffer
			if tok.String() == "</head>" {
				buf.Write(z.Buffered())
				break L
			}
		case html.ErrorToken:
			// this is left in here for things like tracking pixels that have
			// the HTML content type, so our code doesn't break
			break L
		}
	}
}
Code example #6
File: autodiscover.go  Project: kissthink/goread
func Autodiscover(b []byte) (string, error) {
	r := bytes.NewReader(b)
	z := html.NewTokenizer(r)
	for {
		if z.Next() == html.ErrorToken {
			if err := z.Err(); err == io.EOF {
				break
			} else {
				return "", ErrNoRssLink
			}
		}
		t := z.Token()
		switch t.DataAtom {
		case atom.Link:
			if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
				attrs := make(map[string]string)
				for _, a := range t.Attr {
					attrs[a.Key] = a.Val
				}
				if attrs["rel"] == "alternate" && attrs["href"] != "" &&
					(attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml") {
					return attrs["href"], nil
				}
			}
		}
	}
	return "", ErrNoRssLink
}
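
A usage sketch for Autodiscover; ErrNoRssLink is defined elsewhere in the goread project, and the page below is illustrative:

page := []byte(`<html><head>
<link rel="alternate" type="application/rss+xml" href="/feed.xml">
</head><body></body></html>`)
if href, err := Autodiscover(page); err == nil {
	fmt.Println("feed:", href) // feed: /feed.xml
}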
Code example #7
File: html.go  Project: gongshw/lighthouse
func ParseHtml(r io.Reader, url string) ([]byte, error) {
	z := html.NewTokenizer(r)
	var newHtml []byte
	lastTag := ""
	for {
		tt := z.Next()
		rawHtmlBytes := z.Raw()
		switch tt {
		case html.ErrorToken:
			e := z.Err()
			// compare against io.EOF directly rather than the error string
			if e == io.EOF {
				return newHtml, nil
			}
			return nil, e
		case html.TextToken:
			rawHtml := strings.TrimSpace(string(rawHtmlBytes))
			if len(rawHtml) > 0 && lastTag == "style" {
				newCss := ParseCss(rawHtml, url)
				newHtml = append(newHtml, []byte(newCss)...)
			} else {
				newHtml = append(newHtml, rawHtmlBytes...)
			}
		case html.DoctypeToken, html.CommentToken, html.EndTagToken:
			newHtml = append(newHtml, rawHtmlBytes...)
		case html.StartTagToken:
			lastTag = flushTagToken(&newHtml, z, url)
		case html.SelfClosingTagToken:
			flushTagToken(&newHtml, z, url)
		}
		if tt != html.StartTagToken {
			lastTag = ""
		}
	}
}
Code example #8
File: bestbuy.go  Project: vinaygaba/pricetell
func GetPriceForBestBuy(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	z := html.NewTokenizer(resp.Body)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return 0.0
		case tt == html.StartTagToken:
			t := z.Token()
			isMeta := t.Data == "meta"
			if isMeta {
				for _, attr := range t.Attr {
					if attr.Key == "id" && strings.Contains(attr.Val, "schemaorg-offer-price") {
						nxt := z.Next()
						if nxt == html.TextToken {
							t = z.Token()
							return parseCurrency(t.Data)
						}
					}
				}
			}
		}
	}
}
Code example #9
File: crawler.go  Project: fueledbymarvin/gocardless
func getLinks(u *url.URL) []*url.URL {

	resp, err := http.Get(u.String())
	if err != nil {
		logs.Log(fmt.Sprintf("Couldn't crawl %s", u))
		// resp is nil when the request fails, so stop here instead of panicking
		return nil
	}
	defer resp.Body.Close()

	links := make([]*url.URL, 0)
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		tokenType := tokenizer.Next()
		switch tokenType {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			token := tokenizer.Token()
			if link, ok := getURL(u, token); ok {
				links = append(links, link)
			}
		}
	}

}
Code example #10
File: main.go  Project: velour/holdmypage
func parseTitle(resp io.Reader, fallback string) string {
	r := io.LimitedReader{
		R: resp,
		N: 8192,
	}

	h := html.NewTokenizer(&r)
	for {
		tt := h.Next()
		switch tt {
		case html.ErrorToken:
			return fallback
		case html.StartTagToken:
			tag, _ := h.TagName()
			if string(tag) == "title" {
				nt := h.Next()
				switch nt {
				case html.ErrorToken:
					return "Failed to parse title"
				case html.TextToken:
					return h.Token().Data
				}
			}
		}
	}

}
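
A usage sketch for parseTitle; the fallback is returned only when no <title> text appears in the first 8192 bytes:

page := `<html><head><title>Hold My Page</title></head><body></body></html>`
fmt.Println(parseTitle(strings.NewReader(page), "untitled")) // Hold My Page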
Code example #11
File: setup.go  Project: odacremolbap/concerto
func (w *WebClient) obtainCsrf(b io.Reader) error {
	var errorMessage error
	z := html.NewTokenizer(b)

	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return errorMessage
		case tt == html.SelfClosingTagToken:
			t := z.Token()
			isMeta := t.Data == "meta"
			if isMeta && len(t.Attr) > 1 { // both branches index Attr[0] and Attr[1]
				if (t.Attr[1].Key == "name") && (t.Attr[1].Val == "csrf-token") {
					w.csrf = t.Attr[0].Val
					log.Debugf("Csrf Token: %s", w.csrf)
				} else if (t.Attr[0].Key == "name") && (t.Attr[0].Val == "csrf-token") {
					w.csrf = t.Attr[1].Val
					log.Debugf("Csrf Token: %s", w.csrf)
				}
			}
		case tt == html.StartTagToken:
			t := z.Token()
			if (t.Data == "div") && len(t.Attr) > 0 && (t.Attr[0].Key == "id") && (t.Attr[0].Val == "flash_alert") {
				z.Next()
				errorMessage = errors.New(z.Token().String())
			}
		}
	}

}
Code example #12
// crawl the page
func Crawl(url string, ch chan string) {
	resp, err := http.Get(url_prefix + url)
	if err != nil {
		// the request failed; signal the consumer and stop
		ch <- "END!"
		return
	}
	defer resp.Body.Close()
	tokenizer := html.NewTokenizer(resp.Body)

	for {
		token := tokenizer.Next()
		switch {
		case token == html.ErrorToken:
			// End of page
			ch <- "END!"
			return
		case token == html.StartTagToken:
			start_tt := tokenizer.Token()
			if start_tt.Data == "div" {
				//fmt.Println("get a div! %v", num)
				if isSummary(start_tt) {
					getQ(*tokenizer, ch)
				}
			}
		}
	}
}
Code example #13
File: parse.go  Project: Balzzanar/golang
// avanza_get_sellprice returns the current sell price parsed from a given
// http.Response from the Avanza site.
func (p *Parse) avanza_get_sellprice(resp *http.Response) float64 {
	z := html.NewTokenizer(resp.Body)

	for {
		tt := z.Next()

		switch {
		case tt == html.ErrorToken:
			return 0.0
		case tt == html.StartTagToken:
			t := z.Token()

			if isCatch := t.Data == "span"; isCatch {
				for _, attr := range t.Attr {
					if strings.Contains(attr.Val, "sellPrice") {
						z.Next()
						next := z.Token()
						strval := strings.Replace(next.String(), ",", ".", -1)
						value, err := strconv.ParseFloat(strval, 64)
						if err != nil {
							return 0.0
						}
						return value
					}
				}
			}
		}
	}
}
Code example #14
File: walmart.go  Project: vinaygaba/pricetell
func GetPriceForWalmart(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	z := html.NewTokenizer(resp.Body)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return 0.0
		case tt == html.StartTagToken:
			t := z.Token()
			isScript := t.Data == "script"
			if isScript {
				for _, attr := range t.Attr {
					if attr.Key == "id" && strings.Contains(attr.Val, "tb-djs-wml-base") {
						nxt := z.Next()
						if nxt == html.TextToken {
							return parseJson(z.Token().Data)
						}
					}
				}
			}
		}
	}
}
Code example #15
File: 2kcookies.go  Project: ohrodr/2kcookies
// scrapePageWorker -- this is the function that does most of the work in parsing the HTML
func scrapePageWorker(page *io.ReadCloser, out chan [2]string, chFinished chan bool) {
	defer func() {
		chFinished <- true
	}()
	z := html.NewTokenizer(*page)
	// infinite loop to toss state tokens into a url map
	for {
		var result [2]string
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			return
		case tt == html.StartTagToken:
			t := z.Token()

			if t.Data != "a" {
				continue
			}
			for _, attr := range t.Attr {
				if attr.Key == "id" {
					result[0] = attr.Val
				}
				if attr.Key == "data-href" {
					result[1] = attr.Val
					out <- result
				}
			}
		}
	} // end for
}
Code example #16
File: pipeline.go  Project: pedronasser/caddy-search
func getHTMLContent(r io.Reader, tag []byte) (result string, err error) {
	z := html.NewTokenizer(r)
	inTag := false

	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			// io.EOF here means the document ended before the tag was found
			return "", z.Err()
		case html.TextToken:
			if inTag {
				return string(z.Text()), nil
			}
		case html.StartTagToken, html.EndTagToken:
			tn, _ := z.TagName()
			// bytes.Equal compares lengths as well, so no separate length
			// check is needed
			if bytes.Equal(tn, tag) {
				inTag = tt == html.StartTagToken
			}
		}
	}
}
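
A usage sketch for getHTMLContent; when the tag never appears, it returns an empty string together with io.EOF:

page := `<html><head><title>Docs</title></head><body><p>hi</p></body></html>`
title, err := getHTMLContent(strings.NewReader(page), []byte("title"))
if err != nil {
	log.Fatal(err)
}
fmt.Println(title) // Docs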
Code example #17
File: web.go  Project: AntoineAugusti/crawler
// Find all children links on a page and the title of the page from an HTTP response
func (w Web) findAllLinks(httpBody io.Reader, baseURL string) (links []string, title string) {
	page := html.NewTokenizer(httpBody)
	for {
		tokenType := page.Next()
		// End of the page, we are done
		if tokenType == html.ErrorToken {
			return
		}
		token := page.Token()

		// Extract the page title
		// React also emits <title> tags, but those carry special attributes
		if tokenType == html.StartTagToken && token.DataAtom.String() == "title" && len(token.Attr) == 0 {
			page.Next()
			title = page.Token().Data
		}

		// Parse a link
		if tokenType == html.StartTagToken && token.DataAtom.String() == "a" {
			href, hasLink := w.extractLink(token)
			if hasLink && w.ShouldCrawl(baseURL, href) {
				links = append(links, w.unifyURL(href, baseURL))
			}
		}
	}
}
Code example #18
func (a *HttpWebotsArchive) load() error {
	resp, err := http.Get(a.baseurl)
	if err != nil {
		return err
	}
	// close the body once parsing is done
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)

	nameRx := regexp.MustCompile(fmt.Sprintf(`^webots-(.*)-%s.tar.bz2$`, a.arch))

	for {
		t := tokenizer.Next()
		if t == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				break
			}
			return err

		}

		if t != html.StartTagToken {
			continue
		}

		tName, hasAttrib := tokenizer.TagName()
		if string(tName) != "a" {
			continue
		}

		if !hasAttrib {
			continue
		}

		stopped := false
		for !stopped {
			key, val, next := tokenizer.TagAttr()
			// note whether this was the last attribute before filtering on
			// the key, so tags without an href still terminate the loop
			stopped = !next
			if string(key) != "href" {
				continue
			}
			// we got a link, test if it has the right prefix
			matches := nameRx.FindStringSubmatch(string(val))
			if matches == nil {
				continue
			}

			v, err := ParseWebotsVersion(matches[1])
			if err != nil {
				return err
			}

			a.versions = append(a.versions, v)
		}
	}

	sort.Sort(&a.versions)

	return nil
}
Code example #19
File: autodiscover.go  Project: kissthink/goread
// Returns the href attribute of a <link rel="shortcut icon"> tag or error if not found.
func FindIcon(b []byte) (string, error) {
	r := bytes.NewReader(b)
	z := html.NewTokenizer(r)
	for {
		if z.Next() == html.ErrorToken {
			if err := z.Err(); err == io.EOF {
				break
			} else {
				return "", ErrNoIcon
			}
		}
		t := z.Token()
		switch t.DataAtom {
		case atom.Link:
			if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
				attrs := make(map[string]string)
				for _, a := range t.Attr {
					attrs[a.Key] = a.Val
				}
				if attrs["rel"] == "shortcut icon" && attrs["href"] != "" {
					return attrs["href"], nil
				}
			}
		}
	}
	return "", ErrNoIcon
}
Code example #20
File: crawler.go  Project: carriercomm/gocrawl
func extractLinkUrls(page string) []string {

	z := html.NewTokenizer(strings.NewReader(page))

	// start empty: make([]string, 10) would leave ten empty strings in front
	hrefs := make([]string, 0, 10)

	for {
		tt := z.Next()

		switch {
		case tt == html.ErrorToken:
			// End of the document, we're done
			return hrefs
		case tt == html.StartTagToken:
			t := z.Token()
			isAnchor := t.Data == "a"
			if isAnchor {
				// we found a link
				attributes := t.Attr
				for _, attr := range attributes {
					if attr.Key == "href" {
						href := attr.Val
						hrefs = append(hrefs, href)
					}
				}
			}
		}
	}
}
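
With the slice initialisation corrected, a quick call looks like this:

func main() {
	page := `<p><a href="/a">A</a> and <a href="/b">B</a></p>`
	fmt.Println(extractLinkUrls(page)) // [/a /b]
}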
Code example #21
File: cleaner.go  Project: BenLubar/htmlcleaner
// Preprocess escapes disallowed tags in a cleaner way, but does not fix
// nesting problems. Use with Clean.
func Preprocess(config *Config, fragment string) string {
	if config == nil {
		config = DefaultConfig
	}

	var buf bytes.Buffer
	write := func(raw string) {
		_, err := buf.WriteString(raw)

		// The only possible error is running out of memory.
		expectError(err, nil)
	}

	t := html.NewTokenizer(strings.NewReader(fragment))
	for {
		switch tok := t.Next(); tok {
		case html.ErrorToken:
			err := t.Err()

			// The only possible errors are from the Reader or from
			// the buffer capacity being exceeded. Neither can
			// happen with strings.NewReader as the string must
			// already fit into memory.
			expectError(err, io.EOF)

			if err == io.EOF {
				write(html.EscapeString(string(t.Raw())))
				return buf.String()
			}
		case html.TextToken:
			write(string(t.Raw()))
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			raw := string(t.Raw())
			tagName, _ := t.TagName()
			allowed := false
			if tag := atom.Lookup(tagName); tag != 0 {
				if _, ok := config.elem[tag]; ok {
					allowed = true
				}
			}
			if !allowed {
				if _, ok := config.elemCustom[string(tagName)]; ok {
					allowed = true
				}
			}
			if !allowed {
				raw = html.EscapeString(raw)
			}
			write(raw)
		case html.CommentToken:
			raw := string(t.Raw())
			if config.EscapeComments || !strings.HasPrefix(raw, "<!--") || !strings.HasSuffix(raw, "-->") {
				raw = html.EscapeString(raw)
			}
			write(raw)
		default:
			write(html.EscapeString(string(t.Raw())))
		}
	}
}
Code example #22
File: interface.go  Project: opinionated/scraper-core
// ScrapeArticle fetches and parses the article.
// article should be provided as a *Article.
func ScrapeArticle(article Article) error {
	cookies := NewCookieJar()
	client := &http.Client{Jar: cookies}

	// build request
	req, err := http.NewRequest("GET", article.GetLink(), nil) // create http request
	if err != nil {
		log.Error("could not create article request:", err)
		return err
	}
	if err = buildArticleHeader(req); err != nil {
		log.Error("could not build article request:", err)
		return err
	}

	//send http request
	resp, err := client.Do(req)
	if err != nil {
		log.Error("error sending article request:", err)
		return err
	}
	defer resp.Body.Close()

	// TODO: check resp.Header to see if X-Article-Template is [full]

	// parse request
	parser := html.NewTokenizer(resp.Body)
	err = article.DoParse(parser) //parse the html body
	if err != nil {
		log.Error("error building article request:", err)
		return err
	}
	return nil
}
Code example #23
File: main.go  Project: NoUseFreak/blc
// retrieveLinks downloads the given url and returns all urls found on that page.
func retrieveLinks(url string) []string {
	resp, err := http.Get(url)
	links := make([]string, 0)
	if err != nil {
		logger.Error("Detected broken url", url)
		return links
	}
	defer resp.Body.Close()

	page := html.NewTokenizer(resp.Body)
	for {
		tokenType := page.Next()
		if tokenType == html.ErrorToken {
			return links
		}
		token := page.Token()
		if tokenType == html.StartTagToken && token.DataAtom.String() == "a" {
			for _, attr := range token.Attr {
				if attr.Key == "href" {
					links = append(links, attr.Val)
				}
			}
		}
	}
}
Code example #24
File: middleware.go  Project: nado/go-b0tsec
// GetTitle gets the title token of a HTML page
func GetTitle(resp *http.Response, url string) string {
	fURL := resp.Request.URL.String()
	z := html.NewTokenizer(resp.Body)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			return ""
		case html.StartTagToken:
			t := z.Token()
			if t.Data == "title" {
				tt = z.Next()
				t = z.Token()
				d := t.Data
				if len(d) > 450 {
					d = d[:450]
				}
				d = strings.TrimSpace(strings.Replace(d, "\n", " ", -1))
				if fURL != url {
					return fmt.Sprintf("%v (%v)", d, fURL)
				}
				return d
			}
		}
	}
}
Code example #25
File: html.go  Project: rdingwall/go-webcrawler
func ParseHtml(r io.Reader) map[string]RefType {
	refs := make(map[string]RefType)
	tokenizer := html.NewTokenizer(r)

	for {
		tt := tokenizer.Next()

		switch {
		case tt == html.ErrorToken:
			// the tokenizer is telling us it's reached the end
			return refs

		case tt == html.StartTagToken:
			token := tokenizer.Token()

			element, ok := knownElements[token.Data]
			if !ok {
				continue
			}

			for _, attr := range token.Attr {

				refType, ok := element[attr.Key]
				if !ok {
					continue
				}

				refs[attr.Val] = refType
				break
			}
		}
	}

}
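
ParseHtml depends on a knownElements table mapping a tag name to the attribute that holds the reference and its RefType. A hedged sketch of plausible definitions; the actual types in rdingwall/go-webcrawler may differ:

// RefType classifies where a reference was found (illustrative values).
type RefType int

const (
	LinkRef RefType = iota
	ImageRef
	ScriptRef
)

// knownElements: tag name -> attribute key -> reference type.
var knownElements = map[string]map[string]RefType{
	"a":      {"href": LinkRef},
	"link":   {"href": LinkRef},
	"img":    {"src": ImageRef},
	"script": {"src": ScriptRef},
}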
Code example #26
File: picker.go  Project: abhishekgahlot/go-crawler
func (p *PickerAttr) Pick(r io.Reader) (data []string, err error) {
	z := html.NewTokenizer(r)

	for {
		tt := z.Next()

		switch tt {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return data, nil
			}
			// any other error would make Next return ErrorToken forever
			return data, z.Err()

		case html.StartTagToken:
			tagName, attr := z.TagName()

			if string(tagName) != p.TagName {
				continue
			}

			var key, value []byte

			for attr {
				key, value, attr = z.TagAttr()

				if string(key) == p.Attr {
					data = append(data, string(value))
				}
			}
		}
	}

}
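
Pick only touches two fields of PickerAttr, so here is a minimal definition consistent with the method (the real struct may carry more), plus a usage sketch:

// PickerAttr collects the values of one attribute from every occurrence
// of one tag.
type PickerAttr struct {
	TagName string // e.g. "a"
	Attr    string // e.g. "href"
}

func main() {
	p := &PickerAttr{TagName: "a", Attr: "href"}
	links, err := p.Pick(strings.NewReader(`<a href="/x">x</a> <a href="/y">y</a>`))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(links) // [/x /y]
}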
Code example #27
func (parser *HtmlContentParser) Parse(c *MemoryContent, in io.Reader) error {
	z := html.NewTokenizer(in)
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			if z.Err() == io.EOF {
				return nil
			}
			return z.Err()
		case tt == html.StartTagToken:
			tag, _ := z.TagName()
			switch string(tag) {
			case "head":
				if err := parser.parseHead(z, c); err != nil {
					return err
				}
			case "body":
				if err := parser.parseBody(z, c); err != nil {
					return err
				}
			}
		}
	}
}
Code example #28
File: scrape.go  Project: bentranter/bookstore
// Scrape scrapes a web page and populates the course struct.
func Scrape(r io.Reader) *Course {
	z := html.NewTokenizer(r)
	buf := &bytes.Buffer{}

L:
	for {
		tt := z.Next()
		tok := z.Token()
		//if tok.DataAtom != atom.Link {
		// for anything that isn't a link, just write the raw data to the
		// buffer
		buf.Write(z.Raw())
		//}
		switch tt {
		case html.StartTagToken:
			// if you find a link, replace it with our stylesheet
			if tok.DataAtom == atom.Link {
				// this isn't the correct stylesheet, just a placeholder
				//buf.Write([]byte(`<link rel="stylesheet" href="/cxo.css">`))
			}
		case html.EndTagToken:
			// once you reach the end of the head, flush everything left in
			// the tokenizer to the buffer
			if tok.String() == "</head>" {
				buf.Write(z.Buffered())
				break L
			}
		case html.ErrorToken:
			// this is left in here for things like tracking pixels that have
			// the HTML content type, so our code doesn't break
			break L
		}
	}

	// TODO: construct the Course from buf; return nil until that is implemented
	return nil
}
Code example #29
File: yadis_discovery.go  Project: origami/openid-go
// Search for
// <head>
//    <meta http-equiv="X-XRDS-Location" content="....">
func findMetaXrdsLocation(input io.Reader) (location string, err error) {
	tokenizer := html.NewTokenizer(input)
	inHead := false
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			return "", tokenizer.Err()
		case html.StartTagToken, html.EndTagToken:
			tk := tokenizer.Token()
			if tk.Data == "head" {
				if tt == html.StartTagToken {
					inHead = true
				} else {
					return "", errors.New("Meta X-XRDS-Location not found")
				}
			} else if inHead && tk.Data == "meta" {
				ok := false
				content := ""
				for _, attr := range tk.Attr {
					if attr.Key == "http-equiv" &&
						strings.ToLower(attr.Val) == "x-xrds-location" {
						ok = true
					} else if attr.Key == "content" {
						content = attr.Val
					}
				}
				if ok && len(content) > 0 {
					return content, nil
				}
			}
		}
	}
	return "", errors.New("Meta X-XRDS-Location not found")
}
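
A usage sketch for findMetaXrdsLocation (illustrative page):

page := `<html><head><meta http-equiv="X-XRDS-Location" content="https://example.com/xrds"></head></html>`
if loc, err := findMetaXrdsLocation(strings.NewReader(page)); err == nil {
	fmt.Println(loc) // https://example.com/xrds
}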
Code example #30
File: webcrawler.go  Project: griefdlament/gowebbench
func fetchHyperLink(httpBody io.Reader) []string {
	// drain the remaining body on return so the underlying connection can be reused
	defer ioutil.ReadAll(httpBody)
	links := make([]string, 0)
	body := html.NewTokenizer(httpBody)
	for {
		tokenType := body.Next()
		if tokenType == html.ErrorToken {
			return links
		}
		token := body.Token()
		if tokenType == html.StartTagToken {
			if token.DataAtom.String() == "a" || token.DataAtom.String() == "link" {
				for _, attribute := range token.Attr {
					if attribute.Key == "href" {
						links = append(links, attribute.Val)
					}
				}
			} else if token.DataAtom.String() == "img" || token.DataAtom.String() == "script" {
				for _, attribute := range token.Attr {
					if attribute.Key == "src" {
						links = append(links, attribute.Val)
					}
				}
			}
		}
	}
}
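
A usage sketch wiring fetchHyperLink to a live response body:

resp, err := http.Get("https://example.com/")
if err != nil {
	log.Fatal(err)
}
defer resp.Body.Close()
for _, link := range fetchHyperLink(resp.Body) {
	fmt.Println(link)
}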