Ejemplo n.º 1
0
// main feeds one fixed HTML fragment to each of the three traverse_* helpers:
// once as a parsed node tree and twice as a fresh tokenizer.
func main() {
	const markup = `<p>Links:<a href="a1" class="test"/></p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`

	// Tree-based walk; the parse error is deliberately discarded here.
	root, _ := html.Parse(strings.NewReader(markup))
	traverse_html_node(root, 0)

	// Every tokenizer-based walk gets its own reader over the same fragment.
	traverse_html_tokenizer(html.NewTokenizer(strings.NewReader(markup)))
	traverse_html_token(html.NewTokenizer(strings.NewReader(markup)))
}
Ejemplo n.º 2
0
// TokenizePage scans the HTML read from r and returns the word tokens found
// in its text nodes together with the page title.
//
// Text that appears inside a <title> element is passed through cleanTitle and
// kept as the title; every other text token is split with
// bstrings.TokenizeWords and appended to the word list. Scanning stops at the
// first error token, which includes the end of the input.
func TokenizePage(r io.Reader) ([]string, string) {
	words := []string{}
	pageTitle := ""
	insideTitle := false
	tokenizer := html.NewTokenizer(r)

	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			return words, pageTitle
		}

		switch kind {
		case html.StartTagToken, html.EndTagToken:
			name, _ := tokenizer.TagName()
			if string(name) == "title" {
				// <title> switches the flag on, </title> switches it off.
				insideTitle = kind == html.StartTagToken
			}
		case html.TextToken:
			content := string(tokenizer.Text())
			if insideTitle {
				pageTitle = cleanTitle(content)
			} else {
				words = append(words, bstrings.TokenizeWords(content)...)
			}
		}
	}
}
Ejemplo n.º 3
0
// FindLinks tokenizes the HTML read from body in a background goroutine and
// streams one link for every href attribute found on an <a> start tag.
//
// The href value is used for both fields of the emitted link. Once the
// tokenizer reports an error token (which includes the end of the input) a
// sentinel link{"", ""} is sent and the channel is closed, so a consumer may
// either stop at the sentinel or range over the channel. The channel is
// unbuffered: the goroutine blocks until each link has been received.
func FindLinks(body io.Reader) chan link {
	c := make(chan link)

	go func() {
		// Closing after the sentinel lets range-style consumers terminate
		// instead of blocking forever on a channel nobody writes to anymore.
		defer close(c)

		z := html.NewTokenizer(body)
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				break
			}
			if tt != html.StartTagToken {
				continue
			}
			tn, hasAttr := z.TagName()
			if !hasAttr || string(tn) != "a" {
				continue
			}
			for {
				key, value, more := z.TagAttr()
				if string(key) == "href" {
					v := string(value)
					c <- link{v, v}
				}
				if !more {
					break
				}
			}
		}
		c <- link{"", ""}
	}()

	return c
}
Ejemplo n.º 4
0
// getLinks parses the response for links, doing its best with bad HTML.
//
// contents is wrapped in a charset.NewReader (content type "text/html") and
// tokenized. For every start tag that carries attributes and whose name is
// enabled in the set returned by getIncludedTags, parseAnchorAttrs is asked
// to extend the result slice.
//
// Reaching the end of the input is success. Any other tokenizer error is
// returned together with the links collected up to that point.
func getLinks(contents []byte) ([]*URL, error) {
	utf8Reader, err := charset.NewReader(bytes.NewReader(contents), "text/html")
	if err != nil {
		return nil, err
	}
	tokenizer := html.NewTokenizer(utf8Reader)

	var links []*URL
	tags := getIncludedTags()

	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// io.EOF only means the document ended; anything else is a
			// genuine read/decoding failure that must not look like success.
			if err := tokenizer.Err(); err != io.EOF {
				return links, err
			}
			return links, nil
		case html.StartTagToken:
			tagName, hasAttrs := tokenizer.TagName()
			if hasAttrs && tags[string(tagName)] {
				links = parseAnchorAttrs(tokenizer, links)
			}
		}
	}
}
Ejemplo n.º 5
0
// findMetaXrdsLocation searches the <head> of the HTML read from input for
//
//	<meta http-equiv="X-XRDS-Location" content="....">
//
// and returns the value of its content attribute. The tag is recognised both
// in its plain form and in its self-closing form (<meta ... />).
//
// If </head> is reached first, a "not found" error is returned. If the
// tokenizer stops first, its error is returned as-is (io.EOF at the end of
// the input).
func findMetaXrdsLocation(input io.Reader) (location string, err error) {
	tokenizer := html.NewTokenizer(input)
	inHead := false
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			return "", tokenizer.Err()
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			// XHTML-style documents write <meta ... />, which the tokenizer
			// reports as SelfClosingTagToken rather than StartTagToken.
			tk := tokenizer.Token()
			switch {
			case tk.Data == "head" && tt == html.StartTagToken:
				inHead = true
			case tk.Data == "head" && tt == html.EndTagToken:
				return "", errors.New("Meta X-XRDS-Location not found")
			case inHead && tk.Data == "meta":
				ok := false
				content := ""
				for _, attr := range tk.Attr {
					if attr.Key == "http-equiv" &&
						attr.Val == "X-XRDS-Location" {
						ok = true
					} else if attr.Key == "content" {
						content = attr.Val
					}
				}
				if ok && len(content) > 0 {
					return content, nil
				}
			}
		}
	}
}
Ejemplo n.º 6
0
// ExtractText concatenates every text token that appears between the <body>
// start tag and the </body> end tag of the HTML read from reader, then passes
// the collected text to remover and returns remover's result.
//
// A tokenizer error other than io.EOF aborts the extraction; it is returned
// together with an empty string and remover is not called.
func ExtractText(reader io.Reader, remover func(string) (string, error)) (string, error) {
	var collected bytes.Buffer
	insideBody := false
	tokenizer := html.NewTokenizer(reader)

	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			break
		}
		switch kind {
		case html.StartTagToken, html.EndTagToken:
			if tokenizer.Token().DataAtom == atom.Body {
				// <body> opens the capture window, </body> closes it.
				insideBody = kind == html.StartTagToken
			}
		case html.TextToken:
			if insideBody {
				collected.Write(tokenizer.Text())
			}
		}
	}

	if err := tokenizer.Err(); err != io.EOF {
		return "", err
	}
	return remover(collected.String())
}
Ejemplo n.º 7
0
// html_detect_content_type scans the leading bytes of a document for a <meta>
// tag (start or self-closing) from which detect_charset_by_token can derive a
// content type, and returns that content type.
//
// Scanning stops, and "" is returned, when
//   - the closing </head> tag is reached,
//   - the tokenizer reports an error token (which includes end of input), or
//   - the first opening tag that is not <meta> is something other than
//     <html>, i.e. the input does not look like an HTML document.
func html_detect_content_type(head []byte) string {
	z := html.NewTokenizer(bytes.NewReader(head))
	expect_html_root := true
scan:
	for tt := z.Next(); tt != html.ErrorToken; tt = z.Next() {
		t := z.Token()
		opening := tt == html.StartTagToken || tt == html.SelfClosingTagToken
		switch {
		case t.Data == "meta" && opening:
			if ct, ok := detect_charset_by_token(t.Attr); ok {
				return ct
			}
		case t.Data == "head" && tt == html.EndTagToken:
			// A bare break would only leave the switch and keep scanning the
			// body; the label is required to actually stop after </head>.
			break scan
		case expect_html_root && opening:
			// Not an HTML file if its first element is not <html>.
			if t.Data != "html" {
				break scan
			}
			expect_html_root = false
		}
	}
	return ""
}
Ejemplo n.º 8
0
// FindIcon returns the href attribute of the first <link rel="shortcut icon">
// tag (start or self-closing) in b that has a non-empty href. ErrNoIcon is
// returned when no such tag exists or when the tokenizer stops for any reason
// before one is found.
func FindIcon(b []byte) (string, error) {
	tokenizer := html.NewTokenizer(bytes.NewReader(b))
	for tokenizer.Next() != html.ErrorToken {
		tok := tokenizer.Token()
		if tok.DataAtom != atom.Link {
			continue
		}
		if tok.Type != html.StartTagToken && tok.Type != html.SelfClosingTagToken {
			continue
		}

		// When an attribute is repeated, its last occurrence wins.
		var rel, href string
		for _, attr := range tok.Attr {
			switch attr.Key {
			case "rel":
				rel = attr.Val
			case "href":
				href = attr.Val
			}
		}
		if rel == "shortcut icon" && href != "" {
			return href, nil
		}
	}
	// End of input and genuine tokenizer errors are reported the same way.
	return "", ErrNoIcon
}
Ejemplo n.º 9
0
// linkParser starts a goroutine that reads HTML pages from page_chan and
// sends the value of every href attribute found on an <a> start tag to the
// returned channel. Tag and attribute names are compared case-insensitively.
//
// A line is printed to standard output whenever a page has been fully
// tokenized. The returned channel is closed once page_chan is closed and
// drained.
func linkParser(page_chan chan string) <-chan string {
	link_chan := make(chan string)
	go func() {
		defer close(link_chan)
		for page := range page_chan {
			tokenizer := html.NewTokenizer(bytes.NewBufferString(page))
			for {
				kind := tokenizer.Next()
				if kind == html.ErrorToken {
					fmt.Println("\nFinished to parse page")
					break
				}
				if kind != html.StartTagToken {
					continue
				}
				tag := tokenizer.Token()
				if !strings.EqualFold(tag.Data, "A") {
					continue
				}
				for _, attr := range tag.Attr {
					if strings.EqualFold(attr.Key, "HREF") {
						link_chan <- attr.Val
					}
				}
			}
		}
	}()
	return link_chan
}
Ejemplo n.º 10
0
// TestPushHTML runs every entry of xhtmlNsSamples through the HTML tokenizer
// and checks the namespace state with checkState after each push (start or
// self-closing tag) and before each pop (end tag).
func TestPushHTML(t *testing.T) {
	xmlns := NewXmlNamespace()

	// Range over the slice that is actually indexed below. Ranging over a
	// different slice either skips samples or indexes out of range whenever
	// the two slices differ in length.
	for i := range xhtmlNsSamples {
		// j is the index of the expected prefix/uri for the current depth.
		j := 0
		z := html.NewTokenizer(strings.NewReader(xhtmlNsSamples[i].sample))
	tokens:
		for {
			switch z.Next() {
			case html.ErrorToken:
				// io.EOF is the normal end of a sample.
				if err := z.Err(); err != io.EOF {
					t.Fatal(err)
				}
				break tokens
			case html.StartTagToken, html.SelfClosingTagToken:
				xmlns.PushHTML(z.Token())
				checkState("push", j, xmlns, xhtmlNsSamples[i].prefix[j], xhtmlNsSamples[i].uri[j], t)
				j++
			case html.EndTagToken:
				j--
				checkState("pop", j, xmlns, xhtmlNsSamples[i].prefix[j], xhtmlNsSamples[i].uri[j], t)
				xmlns.Pop()
			}
		}
	}
}
Ejemplo n.º 11
0
// Sanitize removes <script> elements from the HTML in s.
//
// The first result is the re-serialized HTML with every script tag and
// everything between a script start tag and its end tag dropped. The second
// result is snipper applied to the concatenation of the text tokens that were
// kept.
//
// If the tokenizer fails with anything other than io.EOF, the input is
// returned unmodified together with snipper(s).
func Sanitize(s string) (string, string) {
	z := html.NewTokenizer(bytes.NewReader([]byte(s)))
	buf := &bytes.Buffer{}
	snip := &bytes.Buffer{}
	// scripts is the number of currently open <script> elements.
	scripts := 0
	for {
		if z.Next() == html.ErrorToken {
			if err := z.Err(); err != io.EOF {
				return s, snipper(s)
			}
			break
		}
		t := z.Token()
		switch {
		case t.DataAtom == atom.Script:
			switch t.Type {
			case html.StartTagToken:
				scripts++
			case html.EndTagToken:
				// A stray </script> must not push the counter below zero:
				// it would never return to 0 and all remaining content would
				// be silently discarded.
				if scripts > 0 {
					scripts--
				}
			}
		case scripts == 0:
			buf.WriteString(t.String())
			if t.Type == html.TextToken {
				snip.WriteString(t.String())
			}
		}
	}

	return buf.String(), snipper(snip.String())
}
Ejemplo n.º 12
0
// Autodiscover looks for a feed link in the HTML document b and returns its
// href.
//
// A match is a <link> tag (start or self-closing), seen while both the html
// and head flags are set, with rel="alternate", a non-empty href and a type
// of "application/rss+xml" or "application/atom+xml". Each flag is toggled by
// every token of the corresponding element (start or end).
//
// ErrNoRssLink is returned when no match is found before the tokenizer stops.
func Autodiscover(b []byte) (string, error) {
	tokenizer := html.NewTokenizer(bytes.NewReader(b))
	var inHtml, inHead bool
	for tokenizer.Next() != html.ErrorToken {
		tok := tokenizer.Token()
		switch tok.DataAtom {
		case atom.Html:
			inHtml = !inHtml
		case atom.Head:
			inHead = !inHead
		case atom.Link:
			if !inHead || !inHtml {
				continue
			}
			if tok.Type != html.StartTagToken && tok.Type != html.SelfClosingTagToken {
				continue
			}

			// When an attribute is repeated, its last occurrence wins.
			var rel, href, mime string
			for _, attr := range tok.Attr {
				switch attr.Key {
				case "rel":
					rel = attr.Val
				case "href":
					href = attr.Val
				case "type":
					mime = attr.Val
				}
			}
			isFeed := mime == "application/rss+xml" || mime == "application/atom+xml"
			if rel == "alternate" && href != "" && isFeed {
				return href, nil
			}
		}
	}

	// End of input and genuine tokenizer errors are reported the same way.
	return "", ErrNoRssLink
}
Ejemplo n.º 13
0
func GetAllLinks(data io.ReadCloser) (links []string, err error) {
	tokenizer := html.NewTokenizer(data)
	for {
		tokenizer.Next()
		token := tokenizer.Token()
		switch token.Type {
		case html.ErrorToken:
			return
		case html.EndTagToken:
		case html.CommentToken:
		case html.TextToken:
		case html.StartTagToken, html.SelfClosingTagToken:
			if *debug {
				log.Print("type ", token.Type)
				log.Print("data ", token.Data)
			}
			if token.Data == "a" {
				for _, a := range token.Attr {
					if a.Key == "href" {
						for _, ext := range strings.Split(*fileType, ",") {
							if strings.HasSuffix(a.Val, ext) {
								if strings.HasPrefix(a.Val, "//") {
									links = append(links, "http:"+a.Val)
								} else {
									links = append(links, a.Val)
								}
							}
						}
					}
				}
			}
		}
	}
	return
}
Ejemplo n.º 14
0
// bookshelfToBooks extracts books from the HTML of a Goodreads bookshelf.
//
// For every token the value of its first attribute is inspected. When that
// value contains "/book/show", the value of the following attribute is taken
// as the book title and, if it is not empty, the book is appended to the
// result. Scanning ends at the first error token (including end of input).
//
// The result contains exactly the books that were found: it is grown with
// append, so it is neither padded with empty entries nor limited to a fixed
// number of books. body is not closed.
func bookshelfToBooks(body io.ReadCloser) (books []Book) {
	z := html.NewTokenizer(body)

	books = make([]Book, 0, 100)
	for {
		if z.Next() == html.ErrorToken {
			return books
		}

		var book Book
		_, first, _ := z.TagAttr()
		if strings.Contains(string(first), "/book/show") {
			_, title, _ := z.TagAttr()
			book.title = string(title)
		} else if strings.Contains(string(first), "staticStars") {
			// NOTE(review): the rating is stored on a book that has no title
			// and is therefore never appended, so it is effectively lost.
			// It presumably belongs to a previously found book; confirm
			// against the page layout before wiring it up.
			_, stars, _ := z.TagAttr()
			book.rating = getRating(string(stars))
		}

		if book.title != "" {
			books = append(books, book)
		}
	}
}
Ejemplo n.º 15
0
// Parse builds a post.Post from the HTML read from reader.
//
// The href of every <a> start tag is handed to a LinkParser. Text tokens
// between <body> and </body> are fed to a fixed sequence of part parsers
// (receiver, sender, subject, post date, content). Each parser's return value
// decides what happens next: post.Next advances to the following parser,
// post.Stop ends parsing, and post.Error ends parsing with that parser's
// error.
//
// The returned error is, in order of precedence: the parser or tokenizer
// error that stopped the scan (io.EOF is not an error), or "malformed Post
// format" when the scan did not finish on the last parser. newPost is
// returned non-nil in every case.
func Parse(reader io.Reader) (newPost *post.Post, err error) {
	newPost = &post.Post{}
	currentIdx := 0
	parsers := []post.PartParser{&ReceiverParser{}, &SenderParser{}, &SubjectParser{}, &PostDateParser{}, &ContentParser{}}
	linkParser := &LinkParser{}
	bodyBlock := false

	z := html.NewTokenizer(reader)

loop:
	for {
		switch z.Next() {
		case html.StartTagToken:
			tk := z.Token()
			switch tk.DataAtom {
			case atom.Body:
				bodyBlock = true
			case atom.A:
				for _, attr := range tk.Attr {
					if attr.Key == "href" {
						linkParser.Parse(newPost, []byte(attr.Val))
					}
				}
			}
		case html.EndTagToken:
			if z.Token().DataAtom == atom.Body {
				bodyBlock = false
			}
		case html.TextToken:
			// After the last parser asked for the next one there is nobody
			// left to consume text. Skip it instead of indexing past the end
			// of parsers; the final check below reports the malformed input.
			if !bodyBlock || currentIdx >= len(parsers) {
				continue
			}
			switch parsers[currentIdx].Parse(newPost, z.Text()) {
			case post.Next:
				currentIdx++
			case post.Error:
				err = parsers[currentIdx].Err()
				break loop
			case post.Stop:
				break loop
			}
		case html.ErrorToken:
			if tokErr := z.Err(); tokErr != io.EOF {
				err = tokErr
			}
			break loop
		}
	}

	// Keep a more specific error from above rather than overwriting it.
	if err == nil && currentIdx != len(parsers)-1 {
		err = errors.New("malformed Post format")
	}

	return newPost, err
}
Ejemplo n.º 16
0
// findProviderFromHeadLink scans the <head> of the HTML read from input for
// <link> tags with rel="openid2.provider" and rel="openid2.local_id" and
// returns their href values as opEndpoint and opLocalId respectively. Link
// tags are recognised both in plain and in self-closing form (<link ... />).
//
// Scanning ends at </head> or at the first tokenizer error. In both cases a
// provider found up to that point is returned with a nil error, even if the
// rest of the document is malformed. Without a provider the result is a "not
// found" error at </head>, or the tokenizer's error (io.EOF at end of input).
func findProviderFromHeadLink(input io.Reader) (opEndpoint, opLocalId string, err error) {
	tokenizer := html.NewTokenizer(input)
	inHead := false
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			// Be happy with whatever was found before the document broke.
			if len(opEndpoint) > 0 {
				return opEndpoint, opLocalId, nil
			}
			return "", "", tokenizer.Err()
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			// XHTML-style documents write <link ... />, which the tokenizer
			// reports as SelfClosingTagToken rather than StartTagToken.
			tk := tokenizer.Token()
			switch {
			case tk.Data == "head" && tt == html.StartTagToken:
				inHead = true
			case tk.Data == "head" && tt == html.EndTagToken:
				if len(opEndpoint) > 0 {
					return opEndpoint, opLocalId, nil
				}
				return "", "", errors.New(
					"LINK with rel=openid2.provider not found")
			case inHead && tk.Data == "link":
				provider := false
				localId := false
				href := ""
				for _, attr := range tk.Attr {
					if attr.Key == "rel" {
						if attr.Val == "openid2.provider" {
							provider = true
						} else if attr.Val == "openid2.local_id" {
							localId = true
						}
					} else if attr.Key == "href" {
						href = attr.Val
					}
				}
				if provider && !localId && len(href) > 0 {
					opEndpoint = href
				} else if !provider && localId && len(href) > 0 {
					opLocalId = href
				}
			}
		}
	}
}
Ejemplo n.º 17
0
// Scan fetches surl and probes every same-host URL referenced by an href,
// action or src attribute anywhere in the page.
//
// Each candidate is normalised with c.FixUrl and skipped when empty or when
// c.IsRepeated reports it. For the remaining URLs on c.Host a control request
// is sent to a variant of the URL (the last "." of the path replaced by
// "test1337") and a second request to the URL itself. If the two status codes
// differ, the URL is shown via P.Show and appended to c.Resources and
// c.NewResources. When the page has been consumed, surl is appended to
// c.Crawled.
func (c *Crawl) Scan(surl string) {
	resp := c.R.LaunchNoRead("GET", surl, "")
	if resp == nil || resp.Body == nil {
		return
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for tokenizer.Next() != html.ErrorToken {
		for _, attr := range tokenizer.Token().Attr {
			if attr.Key != "href" && attr.Key != "action" && attr.Key != "src" {
				continue
			}

			target := c.FixUrl(attr.Val)
			if target == "" || c.IsRepeated(target) {
				continue
			}

			parsed, err := url.Parse(target)
			if err != nil || parsed.Host != c.Host {
				continue
			}

			// Build the control URL that is expected not to exist.
			// TODO (from the original author): said to crash with an index
			// out of range when the URL ends in a dot.
			// NOTE(review): slicing at dot+1 == len(path) is valid in Go, so
			// this may be a non-issue; confirm.
			probe := target
			if dot := strings.LastIndex(parsed.Path, "."); dot >= 0 {
				parsed.Path = parsed.Path[:dot] + "test1337" + parsed.Path[dot+1:]
				probe = parsed.String()
			}

			_, missingCode, _ := R.Get(probe)
			body, code, _ := R.Get(target)
			if code != missingCode {
				P.Show("c", code, len(body), target)
				c.Resources = append(c.Resources, target)
				c.NewResources = append(c.NewResources, target)
			}
		}
	}

	c.Crawled = append(c.Crawled, surl)
}
Ejemplo n.º 18
0
// main downloads the alphabetical license list from opensource.org, collects
// the identifier of every link whose href starts with "/licenses/" and whose
// remainder is not entirely lower case, and calls getLicense for each one.
func main() {
	urls := make([]string, 0, 75)

	resp, err := http.Get("http://opensource.org/licenses/alphabetical")
	if err != nil {
		fmt.Println(err)
		return
	}
	// The body must be closed so the underlying connection is released.
	defer resp.Body.Close()

	z := html.NewTokenizer(resp.Body)
	for {
		tok := z.Next()
		if tok == html.ErrorToken {
			break
		}
		if tok != html.StartTagToken {
			continue
		}

		tagName, hasAttr := z.TagName()
		if string(tagName) != "a" || !hasAttr {
			continue
		}

		// If href is repeated on the tag, the last occurrence wins.
		href := ""
		for {
			attr, val, more := z.TagAttr()
			if string(attr) == "href" {
				href = string(val)
			}
			if !more {
				break
			}
		}

		if !strings.HasPrefix(href, "/licenses/") {
			continue
		}
		href = strings.Replace(href, "/licenses/", "", 1)
		// All-lower-case remainders are skipped.
		if href == strings.ToLower(href) {
			continue
		}
		urls = append(urls, href)
	}

	for _, license := range urls {
		getLicense(license)
	}
}
Ejemplo n.º 19
0
// parse tokenizes the string s and builds an htmlDocument from it by calling
// parseToken repeatedly until its first result reports that the error token
// was reached.
func parse(s string) *htmlDocument {
	doc := new(htmlDocument)
	tokenizer := html.NewTokenizer(strings.NewReader(s))
	for {
		reachedEnd, _, _ := parseToken(tokenizer, doc, nil)
		if reachedEnd {
			break
		}
	}
	return doc
}
Ejemplo n.º 20
0
// TestXMLBasePushHTML runs every entry of xmlBaseTests through the HTML
// tokenizer. Each start tag is pushed onto a fresh XmlBase and each end tag
// pops it. Whenever a start tag carries the attribute named by the next
// expected resolution, the attribute value is resolved against the current
// base and compared with the expected IRI.
func TestXMLBasePushHTML(t *testing.T) {
cases:
	for i, v := range xmlBaseTests {
		xmlbase, err := NewXmlBase("")
		if err != nil {
			t.Fatal(i, err)
		}

		if verbose {
			fmt.Println(i, "created", xmlbase.baseUri, xmlbase.depth)
		}

		z := html.NewTokenizer(strings.NewReader(v.example))
		// r is the index of the next expected resolution in v.resolve.
		r := 0
		for {
			switch z.Next() {
			case html.ErrorToken:
				if err := z.Err(); err != io.EOF {
					t.Fatal(i, err)
				}
				// End of this example: move on to the next test case.
				// Returning here would skip every case after the first.
				continue cases
			case html.StartTagToken:
				node := z.Token()
				xmlbase.PushHTML(node)
				if verbose {
					fmt.Println(i, "pushed", xmlbase.baseUri, xmlbase.depth)
				}
				for _, attr := range node.Attr {
					// Once all expectations are consumed, further attributes
					// have nothing to be compared against.
					if r >= len(v.resolve) || attr.Key != v.resolve[r].html.Key {
						continue
					}
					if verbose {
						fmt.Println(i, "verify", attr, v.resolve[r].iri)
					}

					iri, err := xmlbase.Resolve(attr.Val)
					if err != nil {
						t.Fatal(i, r, err)
					}

					if iri != v.resolve[r].iri {
						t.Fatalf("%d %d expected '%s', got '%s'", i, r, v.resolve[r].iri, iri)
					}
					r++
				}

			case html.EndTagToken:
				xmlbase.Pop()
				if verbose {
					fmt.Println(i, "popped", xmlbase.baseUri, xmlbase.depth)
				}
			}
		}
	}
}
Ejemplo n.º 21
0
// fetch_description downloads Project Euler problem number num and returns
// the text contained in its <div class="problem_content"> element.
//
// Text tokens are collected from the moment that div is opened until the div
// nesting depth drops below the depth at which it was opened. If the input
// ends first, whatever was captured so far is returned and a diagnostic line
// is printed.
//
// On a failed HTTP request the string "ok" is returned together with the
// error. A tokenizer error other than io.EOF is returned together with the
// text captured so far.
func fetch_description(num int) (n string, err error) {
	n = "ok"

	url := fmt.Sprintf("http://projecteuler.net/problem=%d", num)
	resp, err := http.Get(url)
	if err != nil {
		fmt.Printf("Error fetching description: %v", err.Error())
		return n, err
	}
	// Every return path below leaves through here, so the body is always
	// closed and the connection released.
	defer resp.Body.Close()

	var desc bytes.Buffer
	z := html.NewTokenizer(resp.Body)

	in_desc := false
	desc_depth := 0
	depth := 0
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			fmt.Printf("returning ErrorToken, captured %v", desc.String())
			if tokErr := z.Err(); tokErr != io.EOF {
				return desc.String(), tokErr
			}
			return desc.String(), nil
		case html.TextToken:
			if in_desc {
				desc.Write(z.Text())
			}
		case html.StartTagToken, html.EndTagToken:
			tn, hasAttr := z.TagName()
			if string(tn) != "div" {
				continue
			}

			if tt == html.EndTagToken {
				depth--
				if in_desc && depth < desc_depth {
					return desc.String(), nil
				}
				continue
			}

			depth++
			// Look at every attribute: class is not guaranteed to come first.
			for hasAttr {
				var key, val []byte
				key, val, hasAttr = z.TagAttr()
				if string(key) == "class" && string(val) == "problem_content" {
					in_desc = true
					desc_depth = depth
				}
			}
		}
	}
}
Ejemplo n.º 22
0
// getVersionFromVimDotOrg looks up the latest version of a vim plugin on
// vim.org.
//
// scriptID is the ID of the script on vim.org. The function returns the
// version string, or an empty string and an error when the request fails or
// no version is found before the tokenizer stops (in which case the error is
// the tokenizer's, io.EOF at the end of the page).
func getVersionFromVimDotOrg(scriptID string) (string, error) {
	resp, err := http.Get("http://www.vim.org/scripts/script.php?script_id=" + scriptID)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)

	// vim.org does not annotate its markup very well, so the column of the
	// data table currently being looked at is tracked by counting <td> cells
	// whose class contains "rowodd" or "roweven". Versions are in column two.
	dataCells := 0
	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			// Either the HTML cannot be tokenized or the page is exhausted;
			// in both cases no usable version was found.
			return "", tokenizer.Err()
		}

		switch kind {
		case html.StartTagToken:
			cell := tokenizer.Token()
			if cell.DataAtom != atom.Td {
				continue
			}
			for _, attr := range cell.Attr {
				if attr.Key != "class" {
					continue
				}
				if strings.Contains(attr.Val, "rowodd") || strings.Contains(attr.Val, "roweven") {
					dataCells++
				}
			}
		case html.EndTagToken:
			// The end of a table row resets the column counter.
			if tokenizer.Token().DataAtom == atom.Tr {
				dataCells = 0
			}
		case html.TextToken:
			// vim.org lists the most recent version first, so the first
			// non-blank text in the second column is the answer.
			text := tokenizer.Token().String()
			if dataCells == 2 && strings.TrimSpace(text) != "" {
				return text, nil
			}
		}
	}
}
Ejemplo n.º 23
0
Archivo: crwl.go Proyecto: rsdk/crwlr
// parseHtml receives a complete HTML page, counts the words of its text
// tokens and queues the links it contains.
//
// Every href of an <a> start tag is resolved against the page URL. A resolved
// link is sent to chan_urls (with the link depth incremented) when its scheme
// is "http", it is not yet recorded in the global crwldurls map and the page's
// link depth is below MaxLinkDepth. Hrefs that cannot be parsed are skipped,
// and no link is queued at all when the page URL itself cannot be parsed.
//
// When the tokenizer stops (an error token is also produced at the end of the
// data), the word-count map is sent to chan_urlindexes.
func parseHtml(a HTTPRESP) {
	d := html.NewTokenizer(a.FD)
	words := make(map[string]int)

	// The base URL is the same for every link on the page: parse it once and
	// remember whether that worked, instead of dereferencing a nil URL later.
	base_url, baseErr := url.Parse(a.URL)

	for {
		tokenType := d.Next()
		if tokenType == html.ErrorToken {
			chan_urlindexes <- URLINDEX{a.URL, words} // publish the word map
			return
		}
		token := d.Token()
		switch tokenType {
		case html.StartTagToken: // <tag>
			if token.Data != "a" || baseErr != nil {
				continue
			}
			for _, element := range token.Attr {
				if element.Key != "href" {
					continue
				}
				// The error has to be checked before the result is used:
				// resolving a nil reference panics.
				ref_url, err := url.Parse(element.Val)
				if err != nil {
					continue
				}
				// Combined URL, or ref_url itself if it is already absolute.
				comp_url := base_url.ResolveReference(ref_url)
				link := comp_url.String()
				// Only links that are not in the global link map yet.
				if comp_url.Scheme == "http" && !crwldurls[link] && a.LINKDEPTH < MaxLinkDepth {
					// Record the URL so it is not queued a second time.
					crwldurls[link] = true
					chan_urls <- URL{link, a.LINKDEPTH + 1}
				}
			}

		case html.TextToken: // text between start and end tag
			// Split on whitespace and count each word.
			// TODO: separate words from punctuation more carefully, e.g. by trimming.
			for _, word := range strings.Fields(token.Data) {
				words[word]++
			}
		}
	}
}
Ejemplo n.º 24
0
// Clean returns the text content of the post's HTML: all text tokens are
// concatenated, every match of ws is replaced by a single space and
// surrounding white space is trimmed.
func (p *Post) Clean() string {
	var text bytes.Buffer
	tokenizer := html.NewTokenizer(strings.NewReader(string(p.HTML)))
	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			break
		}
		if kind == html.TextToken {
			text.Write(tokenizer.Text())
		}
	}
	collapsed := ws.ReplaceAll(text.Bytes(), []byte{' '})
	return string(bytes.TrimSpace(collapsed))
}
Ejemplo n.º 25
0
// Clean returns the sanitized HTML (based on a tag and attribute whitelist) and
// the text contents of s. Links are made relative to u, if non-nil.
//
// Tags listed in AcceptableElements are kept after their attributes have been
// filtered by cleanAttributes; all other tags are dropped. For elements in
// UnacceptableElementsWithEndTag everything that is not a tag (text,
// comments, ...) between the start and end tag is dropped as well.
//
// u is used with its query and fragment removed. The caller's URL is not
// modified. If the tokenizer fails with anything other than io.EOF, s is
// returned unchanged for both results.
func Clean(s string, u *url.URL) (string, string) {
	r := bytes.NewReader([]byte(strings.TrimSpace(s)))
	z := html.NewTokenizer(r)
	buf := &bytes.Buffer{}
	strip := &bytes.Buffer{}
	// skip is the number of currently open elements whose content is dropped.
	skip := 0
	if u != nil {
		// Strip query and fragment on a private copy so the caller's URL is
		// left untouched.
		base := *u
		base.RawQuery = ""
		base.Fragment = ""
		u = &base
	}
	for {
		if z.Next() == html.ErrorToken {
			if err := z.Err(); err != io.EOF {
				return s, s
			}
			break
		}

		t := z.Token()
		switch {
		case t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken:
			if AcceptableElements[t.Data] {
				cleanAttributes(u, &t)
				buf.WriteString(t.String())
			} else if UnacceptableElementsWithEndTag[t.Data] && t.Type != html.SelfClosingTagToken {
				skip++
			}
		case t.Type == html.EndTagToken:
			if AcceptableElements[t.Data] {
				buf.WriteString(t.String())
			} else if UnacceptableElementsWithEndTag[t.Data] && skip > 0 {
				// A stray end tag must not push the counter below zero: it
				// would never return to 0 and all remaining text would be
				// silently discarded.
				skip--
			}
		case skip == 0:
			buf.WriteString(t.String())
			if t.Type == html.TextToken {
				strip.WriteString(t.String())
			}
		}
	}

	return buf.String(), strip.String()
}
Ejemplo n.º 26
0
// main downloads the PATH full-schedules page and, for every <a> start tag
// whose href begins with "schedules/", prints the absolute URL of the linked
// schedule and passes it to fetch.
func main() {
	start := "http://www.panynj.gov/path/full-schedules.html"
	res, err := http.Get(start)
	if err != nil {
		log.Fatal(err)
	}

	tokenizer := html.NewTokenizer(res.Body)
	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			break
		}
		if kind != html.StartTagToken {
			continue
		}
		name, _ := tokenizer.TagName()
		if string(name) != "a" {
			continue
		}

		// TagAttr is called at least once, then for as long as it reports
		// that more attributes follow.
		for more := true; more; {
			var key, value []byte
			key, value, more = tokenizer.TagAttr()
			if string(key) != "href" {
				continue
			}
			href := string(value)
			if !strings.HasPrefix(href, "schedules/") {
				continue
			}
			// path.Join collapses the "//" after the scheme into a single
			// slash, so it is restored afterwards.
			joined := path.Join(path.Dir(start), href)
			target := strings.Replace(joined, ":/", "://", 1)
			fmt.Printf("%s\n", target)
			fetch(target)
		}
	}
	res.Body.Close()
}
Ejemplo n.º 27
0
// _parseHTML tokenizes the HTML read from r and sends a resource on ch for
// every start or self-closing tag whose lower-cased name is a key of
// attrNameMap and whose mapped attribute is present with a non-empty value.
//
// Attribute names are matched case-insensitively; only the first matching
// attribute of a tag is considered, and anything from the first "#" of its
// value onwards is cut off. ch is closed when the function returns, which
// happens at the first error token (including end of input).
func _parseHTML(r io.Reader, ch chan<- resource) {
	defer close(ch)

	tokenizer := html.NewTokenizer(r)
	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			return
		}
		if kind != html.StartTagToken && kind != html.SelfClosingTagToken {
			continue
		}

		rawName, hasAttr := tokenizer.TagName()
		if !hasAttr {
			continue
		}
		tag := strings.ToLower(string(rawName))
		attrName, known := attrNameMap[tag]
		if !known {
			continue
		}

		wanted := strings.ToLower(attrName)
		found := ""
		for hasAttr {
			var key, val []byte
			key, val, hasAttr = tokenizer.TagAttr()
			if strings.ToLower(string(key)) == wanted {
				found = strings.Split(string(val), "#")[0]
				break
			}
		}
		if found != "" {
			ch <- resource{tag, found}
		}
	}
}
Ejemplo n.º 28
0
// ParseLink returns the value of every href attribute found on an <a> start
// tag in the HTML read from reader, in document order. The result is empty
// but non-nil when there are none. Scanning ends at the first error token
// (including end of input).
func ParseLink(reader io.Reader) []string {
	hrefs := []string{}
	tokenizer := html.NewTokenizer(reader)
	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			break
		}
		tag := tokenizer.Token()
		if kind != html.StartTagToken || tag.DataAtom.String() != "a" {
			continue
		}
		for _, attr := range tag.Attr {
			if attr.Key == "href" {
				hrefs = append(hrefs, attr.Val)
			}
		}
	}
	return hrefs
}
Ejemplo n.º 29
0
// Extract tokenizes the HTML read from r and passes every token to
// extractor.HandleToken. It returns nil when the input ends normally and the
// tokenizer's error otherwise.
func Extract(r io.Reader, extractor Extractor) error {
	tokenizer := html.NewTokenizer(r)
	for tokenizer.Next() != html.ErrorToken {
		extractor.HandleToken(tokenizer.Token())
	}
	// io.EOF marks the regular end of the document and is not reported.
	if err := tokenizer.Err(); err != io.EOF {
		return err
	}
	return nil
}
Ejemplo n.º 30
0
// ParseWeather extracts "<time> : <value>" lines from the HTML read from r.
//
// Nothing is collected until a start tag is seen for which hasTablPluieClass
// reports true. From then on the non-blank text of <td> cells is paired up:
// the first text of a pair is the time, the second its value. Scanning ends
// at an end tag for which hasTablPluieClass reports true, or at the first
// error token (including end of input).
func ParseWeather(r io.Reader) []string {
	forecasts := []string{}
	tokenizer := html.NewTokenizer(r)
	var (
		insideTable bool   // a matching start tag has been seen
		wantText    bool   // the most recent start tag was a <td>
		pendingHour string // first half of a pair, waiting for its value
	)

	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			return forecasts
		}

		switch kind {
		case html.StartTagToken:
			if insideTable {
				name, _ := tokenizer.TagName()
				wantText = string(name) == "td"
			} else if hasTablPluieClass(tokenizer) {
				insideTable = true
			}
		case html.EndTagToken:
			wantText = false
			if hasTablPluieClass(tokenizer) {
				return forecasts
			}
		case html.TextToken:
			if !wantText {
				continue
			}
			text := strings.TrimSpace(string(tokenizer.Text()))
			if text == "" {
				continue
			}
			if pendingHour == "" {
				pendingHour = text
				continue
			}
			forecasts = append(forecasts,
				fmt.Sprintf("%s : %s", pendingHour, text))
			pendingHour = ""
		}
	}
}