Exemple #1
0
func buildHTML(tokenizer *html.Tokenizer) (s string, err error) {
	buf := new(bytes.Buffer)

	bp := 0
	if tag, _ := tokenizer.TagName(); string(tag) == "div" {
		div := tokenizer.Raw()
		buf.Write(div)
		bp = len(div)
		err = nextToken(tokenizer)
	}

	ep := bp
	for err != io.EOF {
		if err != nil && err != io.EOF {
			return
		}
		ep = buf.Len()
		b := tokenizer.Raw()
		if _, err := buf.Write(b); err != nil {
			return "", err
		}
		err = nextToken(tokenizer)
	}
	b := buf.Bytes()
	if bp > 0 {
		b = b[bp:ep]
	}
	return string(b), nil
}
func skipSubtreeIfUicRemove(z *html.Tokenizer, tt html.TokenType, tagName string, attrs []html.Attribute) bool {
	_, foundRemoveTag := getAttr(attrs, UicRemove)
	if !foundRemoveTag {
		return false
	}

	if isSelfClosingTag(tagName, tt) {
		return true
	}

	depth := 0
	for {
		tt := z.Next()
		tag, _ := z.TagName()

		switch {
		case tt == html.ErrorToken:
			return true
		case tt == html.StartTagToken && !isSelfClosingTag(string(tag), tt):
			depth++
		case tt == html.EndTagToken:
			depth--
			if depth < 0 {
				return true
			}
		}
	}
}
Exemple #3
0
func flushTagToken(htmlBuf *[]byte, tz *html.Tokenizer, url string) string {
	*htmlBuf = append(*htmlBuf, '<')
	tagName, hasAttr := tz.TagName()
	*htmlBuf = append(*htmlBuf, tagName...)
	if hasAttr {
		for {
			attrKey, attrValue, hasMore := tz.TagAttr()
			*htmlBuf = append(*htmlBuf, ' ')
			*htmlBuf = append(*htmlBuf, attrKey...)
			*htmlBuf = append(*htmlBuf, '=', '"')
			if tagAttrToProxy[string(tagName)][string(attrKey)] {
				urlInAttr := string(attrValue)
				*htmlBuf = append(*htmlBuf, []byte(GetProxiedUrl(urlInAttr, url))...)
			} else {
				*htmlBuf = append(*htmlBuf, attrValue...)
			}
			*htmlBuf = append(*htmlBuf, '"')
			if !hasMore {
				break
			}
		}
	}
	*htmlBuf = append(*htmlBuf, '>')
	if string(tagName) == "head" {
		*htmlBuf = append(*htmlBuf, []byte(getJsHookTag())...)
	}
	return string(tagName)
}
func (item *AnimeConventionItem) readResgiterNowurl(t *html.Tokenizer) {
	t.Next()
	if _, hasmore := t.TagName(); hasmore {
		if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") {
			item.registerNowURL = string(val)
		}
	}
}
func (item *AnimeConventionItem) readNameAndLink(t *html.Tokenizer) {
	if label := t.Next(); label == html.StartTagToken {
		_, hasmore := t.TagName()
		if hasmore {
			if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") {
				item.siteURL = string(val)
			}
		}
	}
	if label := t.Next(); label == html.TextToken {
		item.name = string(t.Text())
	}
}
Exemple #6
0
func parseTableX86(z *html.Tokenizer, in *Intrinsic) *Intrinsic {
	in.Performance = make(map[string]Timing)

	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			return in
		case html.StartTagToken, html.EndTagToken:
			tn, _ := z.TagName()
			tns := strings.ToLower(string(tn))
			switch tns {
			case "tr":
				if tt == html.StartTagToken {
					n := 0
					p := Timing{}
					for {
						tt = z.Next()
						tn, _ = z.TagName()
						tns = strings.ToLower(string(tn))
						if tt == html.EndTagToken && tns == "tr" {
							break
						}
						if tt == html.StartTagToken && tns == "td" {
							switch n {
							case 0:
								p.Arch = getText(z)
							case 1:
								p.Latency, _ = strconv.ParseFloat(getText(z), 64)
							case 2:
								p.Throughput, _ = strconv.ParseFloat(getText(z), 64)
								in.Performance[p.Arch] = p
							}
							n++
						}
					}
				} else {
					panic("tr ended")
				}
			case "table":
				if tt == html.EndTagToken {
					return in
				} else {
					panic("table started")

				}
			}
		}
	}
}
func (item *AnimeConventionItem) Parse(t *html.Tokenizer) {
	for {
		label := t.Next()
		switch label {
		case html.ErrorToken:
			fmt.Errorf("%v\n", t.Err())
			return
		case html.TextToken:
			switch string(t.Text()) {
			case "Advance Rates:":
				//fmt.Println("rate")
				item.readadvanceRate(t)
			case "At-Door Rates:":
				item.readatDoorRate(t)
			}
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			tag, hasmore := t.TagName()
			if strings.EqualFold(string(tag), "big") {
				item.readResgiterNowurl(t)
			} else if hasmore {
				key, val, hasmore := t.TagAttr()
				if strings.EqualFold(string(key), "itemprop") {
					//fmt.Println(string(val))
					switch string(val) {
					case "description":
						item.readDescription(t)
					case "latitude":
						item.readLatitude(t)
					case "longitude":
						item.readLongitude(t)
					case "startDate":
						item.readStartDate(t)
					case "endDate":
						item.readEndDate(t)
					case "location":
						item.readLocation(t)
					case "addressLocality":
						item.readCity(t)
					case "addressRegion":
						item.readState(t)
					case "addressCountry":
						item.readCountry(t, hasmore)
					case "name":
						item.readNameAndLink(t)
					}
				}
			}
		}
	}
}
func (item *AnimeConventionItem) readLocation(t *html.Tokenizer) {
	for {
		if label := t.Next(); label == html.StartTagToken {
			_, hasmore := t.TagName()
			if hasmore {
				if _, val, _ := t.TagAttr(); strings.EqualFold(string(val), "name") {
					break
				}
			}
		}
	}
	if label := t.Next(); label == html.TextToken {
		item.location = string(t.Text())
	}
}
func (item *AnimeConventionItem) readRates(t *html.Tokenizer) string {
	rates := ""
	for {
		label := t.Next()
		if label == html.EndTagToken {
			val, _ := t.TagName()
			if strings.EqualFold(string(val), "p") {
				break
			}
		}
		if label == html.TextToken {
			rates = strings.Join([]string{rates, string(t.Text())}, "\n")
		}
	}
	return strings.TrimSpace(rates)
}
func parseFragment(z *html.Tokenizer) (f Fragment, dependencies []*FetchDefinition, err error) {
	attrs := make([]html.Attribute, 0, 10)
	dependencies = make([]*FetchDefinition, 0, 0)

	buff := bytes.NewBuffer(nil)
forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)

		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return nil, nil, z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if string(tag) == UicInclude {
				if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil {
					return nil, nil, err
				} else {
					fmt.Fprintf(buff, replaceTextStart)
					// Enhancement: WriteOut sub tree, to allow alternative content
					//              for optional includes.
					fmt.Fprintf(buff, replaceTextEnd)
					continue
				}
			}

			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}

		case tt == html.EndTagToken:
			if string(tag) == UicFragment || string(tag) == UicTail {
				break forloop
			}
		}
		buff.Write(raw)
	}

	return StringFragment(buff.String()), dependencies, nil
}
func (parser *HtmlContentParser) parseHead(z *html.Tokenizer, c *MemoryContent) error {
	attrs := make([]html.Attribute, 0, 10)
	headBuff := bytes.NewBuffer(nil)

forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)

		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
			if string(tag) == "script" && attrHasValue(attrs, "type", ScriptTypeMeta) {
				if err := parseMetaJson(z, c); err != nil {
					return err
				}
				continue
			}
		case tt == html.EndTagToken:
			if string(tag) == "head" {
				break forloop
			}
		}
		headBuff.Write(raw)
	}

	s := headBuff.String()
	st := strings.Trim(s, " \n")
	if len(st) > 0 {
		c.head = StringFragment(st)
	}
	return nil
}
func parseMetaJson(z *html.Tokenizer, c *MemoryContent) error {
	tt := z.Next()
	if tt != html.TextToken {
		return fmt.Errorf("expected text node for meta json, but found %v, (%s)", tt.String(), z.Raw())
	}

	bytes := z.Text()
	err := json.Unmarshal(bytes, &c.meta)
	if err != nil {
		return fmt.Errorf("error while parsing json from meta json element: %v", err.Error())
	}

	tt = z.Next()
	tag, _ := z.TagName()
	if tt != html.EndTagToken || string(tag) != "script" {
		return fmt.Errorf("Tag not properly ended. Expected </script>, but found %s", z.Raw())
	}

	return nil
}
func skipCompleteTag(z *html.Tokenizer, tagName string) error {
forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.EndTagToken:
			tagAsString := string(tag)
			if tagAsString == tagName {
				break forloop
			}
		}
	}
	return nil
}
func (list *AnimeConventionList) Parse(t *html.Tokenizer) {
	for {
		next := t.Next()
		switch next {
		case html.ErrorToken:
			return
		case html.TextToken:
			//fmt.Println(string(t.Text()))
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			_, hasmore := t.TagName()
			if hasmore {
				key, val, _ := t.TagAttr()
				if strings.EqualFold(string(key), "href") && strings.HasPrefix(string(val), "/events/info.shtml") {
					var item = &AnimeConventionItem{}
					item.url = strings.Join([]string{"http://animecons.com", string(val)}, "")
					list.taskNum++
					go item.crawlInformation(&list.ConventionList)
					//time.Sleep(100 * time.Millisecond)
				}
			}
		}
	}
}
Exemple #15
0
func getTextR(z *html.Tokenizer) string {
	r := ""
	depth := 1
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			panic(z.Err())
		case html.TextToken:
			r += string(z.Text())
		case html.StartTagToken:
			tn, _ := z.TagName()
			tns := strings.ToLower(string(tn))
			switch tns {
			case "div":
				r += "\r"
				depth++
			case "span":
				r += "'"
				depth++
			}
		case html.EndTagToken:
			tn, _ := z.TagName()
			tns := strings.ToLower(string(tn))
			switch tns {
			case "div":
				depth--
			case "span":
				r += "'"
				depth--
			}
		}
		if depth == 0 {
			return r
		}
	}
}
Exemple #16
0
func extractTitleFromTree(z *html.Tokenizer) string {
	depth := 0
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			return ""
		case html.TextToken:
			if depth > 0 {
				title := strings.TrimSpace(string(z.Text()))
				lower := strings.ToLower(title)
				if strings.HasPrefix(lower, "imgur") {
					return ""
				}
				return title
			}
		case html.StartTagToken:
			tn, _ := z.TagName()
			if string(tn) == "title" {
				depth++
			}
		}
	}
}
Exemple #17
0
func buildTokenArray(tokenizer *xhtml.Tokenizer) []tagInfo {
	tokens := []tagInfo{}
	for tt := tokenizer.Next(); tt != xhtml.ErrorToken; tt = tokenizer.Next() {
		switch tt {
		case xhtml.TextToken:
			txt := string(tokenizer.Text())
			if len(tokens) == 0 {
				info := tagInfo{
					raw: txt,
				}
				tokens = append(tokens, info)
			}
			tn, _ := tokenizer.TagName()
			key, val, _ := tokenizer.TagAttr()
			info := tagInfo{
				tag: string(tn),
				key: string(key),
				val: string(val),
				txt: txt,
			}
			tokens = append(tokens, info)
		case xhtml.StartTagToken:
			tn, _ := tokenizer.TagName()
			key, val, _ := tokenizer.TagAttr()
			info := tagInfo{
				tag: string(tn),
				key: string(key),
				val: string(val),
			}
			tokens = append(tokens, info)
		case xhtml.SelfClosingTagToken, xhtml.EndTagToken:
			tn, _ := tokenizer.TagName()
			key, val, _ := tokenizer.TagAttr()
			info := tagInfo{
				tag:        string(tn),
				key:        string(key),
				val:        string(val),
				closingTag: true,
			}
			tokens = append(tokens, info)
		}
	}
	return tokens
}
Exemple #18
0
// getTagName gets a tagName from tokenizer.
func getTagName(tokenizer *html.Tokenizer) string {
	tagName, _ := tokenizer.TagName()
	return string(tagName)
}
func (parser *HtmlContentParser) parseBody(z *html.Tokenizer, c *MemoryContent) error {
	attrs := make([]html.Attribute, 0, 10)
	bodyBuff := bytes.NewBuffer(nil)

	attrs = readAttributes(z, attrs)
	if len(attrs) > 0 {
		c.bodyAttributes = StringFragment(joinAttrs(attrs))
	}

forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)

		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
			if string(tag) == UicFragment {
				if f, deps, err := parseFragment(z); err != nil {
					return err
				} else {
					c.body[getFragmentName(attrs)] = f
					for _, dep := range deps {
						c.requiredContent[dep.URL] = dep
					}
				}
				continue
			}
			if string(tag) == UicTail {
				if f, deps, err := parseFragment(z); err != nil {
					return err
				} else {
					c.tail = f
					for _, dep := range deps {
						c.requiredContent[dep.URL] = dep
					}
				}
				continue
			}
			if string(tag) == UicFetch {
				if fd, err := getFetch(z, attrs); err != nil {
					return err
				} else {
					c.requiredContent[fd.URL] = fd
					continue
				}
			}
			if string(tag) == UicInclude {
				if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil {
					return err
				} else {
					bodyBuff.WriteString(replaceTextStart)
					// Enhancement: WriteOut sub tree, to allow alternative content
					//              for optional includes.
					bodyBuff.WriteString(replaceTextEnd)
					continue
				}
			}

		case tt == html.EndTagToken:
			if string(tag) == "body" {
				break forloop
			}
		}
		bodyBuff.Write(raw)
	}

	s := bodyBuff.String()
	if _, defaultFragmentExists := c.body[""]; !defaultFragmentExists {
		if st := strings.Trim(s, " \n"); len(st) > 0 {
			c.body[""] = StringFragment(st)
		}
	}

	return nil
}
Exemple #20
0
func pullNode(tokens *html.Tokenizer, root *Markup) {
	var node *Markup

	for {
		token := tokens.Next()

		switch token {
		case html.ErrorToken:
			return

		case html.TextToken, html.CommentToken, html.DoctypeToken:
			text := strings.TrimSpace(string(tokens.Text()))

			if text == "" {
				continue
			}

			if token == html.CommentToken {
				text = "<!--" + text + "-->"
			}

			if node != nil {
				NewText(text).Apply(node)
				continue
			}

			NewText(text).Apply(root)
			continue

		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			if token == html.EndTagToken {
				node = nil
				return
			}

			tagName, hasAttr := tokens.TagName()

			node = NewMarkup(string(tagName), token == html.SelfClosingTagToken)
			node.Apply(root)

			if hasAttr {
			attrLoop:
				for {
					key, val, more := tokens.TagAttr()

					NewAttr(string(key), string(val)).Apply(node)

					if !more {
						break attrLoop
					}
				}
			}

			if token == html.SelfClosingTagToken {
				continue
			}

			pullNode(tokens, node)
		}
	}
}