// getInclude builds the start and end replacement markers for an include
// element from its src and required attributes.
func getInclude(z *html.Tokenizer, attrs []html.Attribute) (startMarker, endMarker string, err error) {
	var srcString string
	if url, hasUrl := getAttr(attrs, "src"); !hasUrl {
		return "", "", fmt.Errorf("include definition without src %s", z.Raw())
	} else {
		srcString = strings.TrimSpace(url.Val)
		if strings.HasPrefix(srcString, "#") {
			srcString = srcString[1:]
		}
	}

	required := false
	if r, hasRequired := getAttr(attrs, "required"); hasRequired {
		if requiredBool, err := strconv.ParseBool(r.Val); err != nil {
			return "", "", fmt.Errorf("error parsing bool in %s: %s", z.Raw(), err.Error())
		} else {
			required = requiredBool
		}
	}

	if required {
		return fmt.Sprintf("§[> %s]§", srcString), "", nil
	}
	return fmt.Sprintf("§[#> %s]§", srcString), fmt.Sprintf("§[/%s]§", srcString), nil
}
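// getAttr is used above but not defined in this section. A minimal sketch,
// assuming it simply scans the attribute list for the first matching key
// (the real implementation may differ):
func getAttr(attrs []html.Attribute, name string) (html.Attribute, bool) {
	for _, a := range attrs {
		if a.Key == name {
			return a, true
		}
	}
	return html.Attribute{}, false
}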
// buildHTML writes the raw tokens into a buffer until EOF. If the subtree
// starts with a div, the enclosing div markup is stripped from the result.
func buildHTML(tokenizer *html.Tokenizer) (s string, err error) {
	buf := new(bytes.Buffer)
	bp := 0
	if tag, _ := tokenizer.TagName(); string(tag) == "div" {
		div := tokenizer.Raw()
		buf.Write(div)
		bp = len(div)
		err = nextToken(tokenizer)
	}

	ep := bp
	for err != io.EOF {
		if err != nil && err != io.EOF {
			return
		}
		ep = buf.Len()
		b := tokenizer.Raw()
		if _, err := buf.Write(b); err != nil {
			return "", err
		}
		err = nextToken(tokenizer)
	}

	b := buf.Bytes()
	if bp > 0 {
		b = b[bp:ep]
	}
	return string(b), nil
}
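// nextToken is not shown in this section. A minimal sketch of what buildHTML
// relies on, assuming it advances the tokenizer and surfaces io.EOF at the
// end of the input (assumption, not the actual implementation):
func nextToken(tokenizer *html.Tokenizer) error {
	if tokenizer.Next() == html.ErrorToken {
		return tokenizer.Err() // io.EOF once the input is exhausted
	}
	return nil
}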
// parseFragment parses the subtree of a fragment or tail element and returns
// it as a StringFragment, together with the fetch definitions discovered in it.
func parseFragment(z *html.Tokenizer) (f Fragment, dependencies []*FetchDefinition, err error) {
	attrs := make([]html.Attribute, 0, 10)
	dependencies = make([]*FetchDefinition, 0, 0)

	buff := bytes.NewBuffer(nil)
forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)

		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return nil, nil, z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if string(tag) == UicInclude {
				if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil {
					return nil, nil, err
				} else {
					buff.WriteString(replaceTextStart)
					// Enhancement: WriteOut sub tree, to allow alternative content
					// for optional includes.
					buff.WriteString(replaceTextEnd)
					continue
				}
			}

			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
		case tt == html.EndTagToken:
			if string(tag) == UicFragment || string(tag) == UicTail {
				break forloop
			}
		}
		buff.Write(raw)
	}

	return StringFragment(buff.String()), dependencies, nil
}
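// byteCopy is referenced above to defend against readAttributes mutating the
// slice returned by z.Raw(). A minimal sketch under that assumption:
func byteCopy(in []byte) []byte {
	out := make([]byte, len(in))
	copy(out, in)
	return out
}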
// parseHead reads the content of the head element into c.head. Meta-JSON
// script elements are parsed into c.meta, and subtrees marked for removal
// (skipSubtreeIfUicRemove) are skipped.
func (parser *HtmlContentParser) parseHead(z *html.Tokenizer, c *MemoryContent) error {
	attrs := make([]html.Attribute, 0, 10)
	headBuff := bytes.NewBuffer(nil)

forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)

		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
			if string(tag) == "script" && attrHasValue(attrs, "type", ScriptTypeMeta) {
				if err := parseMetaJson(z, c); err != nil {
					return err
				}
				continue
			}
		case tt == html.EndTagToken:
			if string(tag) == "head" {
				break forloop
			}
		}
		headBuff.Write(raw)
	}

	s := headBuff.String()
	st := strings.Trim(s, " \n")
	if len(st) > 0 {
		c.head = StringFragment(st)
	}
	return nil
}
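// attrHasValue is used above to detect meta-JSON script elements. A minimal
// sketch, assuming it checks for an attribute with the given key and value
// (assumption; the real implementation may differ):
func attrHasValue(attrs []html.Attribute, name string, value string) bool {
	attr, found := getAttr(attrs, name)
	return found && attr.Val == value
}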
// parseMetaJson parses the text content of a meta-JSON script element into
// c.meta and verifies that the element is properly closed.
func parseMetaJson(z *html.Tokenizer, c *MemoryContent) error {
	tt := z.Next()
	if tt != html.TextToken {
		return fmt.Errorf("expected text node for meta json, but found %v, (%s)", tt.String(), z.Raw())
	}

	data := z.Text()
	err := json.Unmarshal(data, &c.meta)
	if err != nil {
		return fmt.Errorf("error while parsing json from meta json element: %v", err.Error())
	}

	tt = z.Next()
	tag, _ := z.TagName()
	if tt != html.EndTagToken || string(tag) != "script" {
		return fmt.Errorf("tag not properly ended. Expected </script>, but found %s", z.Raw())
	}

	return nil
}
// parseToken reads the next token and appends it to the document tree. It
// returns whether an error token was read, whether the parent element was
// closed, and the name of an end tag that has not been assigned yet.
func parseToken(tokenizer *html.Tokenizer, htmlDoc *htmlDocument, parent *tagElement) (bool, bool, string) {
	tokenType := tokenizer.Next()
	switch tokenType {
	case html.ErrorToken:
		return true, false, ""
	case html.TextToken:
		text := string(tokenizer.Text())
		if strings.TrimSpace(text) == "" {
			break
		}
		textElement := &textElement{text: text}
		appendElement(htmlDoc, parent, textElement)
	case html.StartTagToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
		for {
			errorToken, parentEnded, unsetEndTag := parseToken(tokenizer, htmlDoc, tagElement)
			if errorToken {
				return true, false, ""
			}
			if parentEnded {
				if unsetEndTag != "" {
					return false, false, unsetEndTag
				}
				break
			}
			if unsetEndTag != "" {
				return false, false, setEndTagRaw(tokenizer, tagElement, unsetEndTag)
			}
		}
	case html.EndTagToken:
		return false, true, setEndTagRaw(tokenizer, parent, getTagName(tokenizer))
	case html.DoctypeToken, html.SelfClosingTagToken, html.CommentToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
	}
	return false, false, ""
}
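// getTagName is used above but defined elsewhere. A minimal sketch, assuming
// it only exposes the tag name of the current token as a string; the real
// implementation may treat doctype and comment tokens specially:
func getTagName(tokenizer *html.Tokenizer) string {
	tag, _ := tokenizer.TagName()
	return string(tag)
}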
// getFetch builds a FetchDefinition from the attributes of a fetch element.
func getFetch(z *html.Tokenizer, attrs []html.Attribute) (*FetchDefinition, error) {
	fd := &FetchDefinition{}

	url, hasUrl := getAttr(attrs, "src")
	if !hasUrl {
		return nil, fmt.Errorf("fetch definition without src %s", z.Raw())
	}
	fd.URL = strings.TrimSpace(url.Val)

	if name, hasName := getAttr(attrs, "name"); hasName {
		fd.Name = name.Val
	} else {
		fd.Name = urlToName(fd.URL)
	}

	if timeout, hasTimeout := getAttr(attrs, "timeout"); hasTimeout {
		if timeoutInt, err := strconv.Atoi(timeout.Val); err != nil {
			return nil, fmt.Errorf("error parsing timeout in %s: %s", z.Raw(), err.Error())
		} else {
			fd.Timeout = time.Millisecond * time.Duration(timeoutInt)
		}
	}

	if required, hasRequired := getAttr(attrs, "required"); hasRequired {
		if requiredBool, err := strconv.ParseBool(required.Val); err != nil {
			return nil, fmt.Errorf("error parsing bool in %s: %s", z.Raw(), err.Error())
		} else {
			fd.Required = requiredBool
		}
	}

	if attr, found := getAttr(attrs, "discoveredby"); found {
		fd.DiscoveredBy(attr.Val)
	}

	return fd, nil
}
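// A hypothetical usage sketch for getFetch. The attribute values are invented
// for illustration; z is assumed to be positioned on the fetch element:
func exampleGetFetch(z *html.Tokenizer) (*FetchDefinition, error) {
	attrs := []html.Attribute{
		{Key: "src", Val: "http://example.com/content"},
		{Key: "timeout", Val: "250"}, // interpreted as milliseconds
		{Key: "required", Val: "true"},
	}
	return getFetch(z, attrs) // fd.Timeout == 250ms, fd.Required == true
}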
// parseBody reads the content of the body element into c. Fragment, tail,
// fetch and include elements are handled specially; everything outside of
// named fragments ends up in the default fragment c.body[""].
func (parser *HtmlContentParser) parseBody(z *html.Tokenizer, c *MemoryContent) error {
	attrs := make([]html.Attribute, 0, 10)
	bodyBuff := bytes.NewBuffer(nil)

	attrs = readAttributes(z, attrs)
	if len(attrs) > 0 {
		c.bodyAttributes = StringFragment(joinAttrs(attrs))
	}

forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)

		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
			if string(tag) == UicFragment {
				if f, deps, err := parseFragment(z); err != nil {
					return err
				} else {
					c.body[getFragmentName(attrs)] = f
					for _, dep := range deps {
						c.requiredContent[dep.URL] = dep
					}
				}
				continue
			}
			if string(tag) == UicTail {
				if f, deps, err := parseFragment(z); err != nil {
					return err
				} else {
					c.tail = f
					for _, dep := range deps {
						c.requiredContent[dep.URL] = dep
					}
				}
				continue
			}
			if string(tag) == UicFetch {
				if fd, err := getFetch(z, attrs); err != nil {
					return err
				} else {
					c.requiredContent[fd.URL] = fd
					continue
				}
			}
			if string(tag) == UicInclude {
				if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil {
					return err
				} else {
					bodyBuff.WriteString(replaceTextStart)
					// Enhancement: WriteOut sub tree, to allow alternative content
					// for optional includes.
					bodyBuff.WriteString(replaceTextEnd)
					continue
				}
			}
		case tt == html.EndTagToken:
			if string(tag) == "body" {
				break forloop
			}
		}
		bodyBuff.Write(raw)
	}

	s := bodyBuff.String()
	if _, defaultFragmentExists := c.body[""]; !defaultFragmentExists {
		if st := strings.Trim(s, " \n"); len(st) > 0 {
			c.body[""] = StringFragment(st)
		}
	}
	return nil
}
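// getFragmentName is used above to key fragments in c.body. A minimal sketch,
// assuming the name is taken from a "name" attribute and the empty string
// selects the default fragment (assumption, not the actual implementation):
func getFragmentName(attrs []html.Attribute) string {
	if name, found := getAttr(attrs, "name"); found {
		return name.Val
	}
	return ""
}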
// setEndTagRaw sets an endTagRaw to the parent.
func setEndTagRaw(tokenizer *html.Tokenizer, parent *tagElement, tagName string) string {
	if parent != nil && parent.tagName == tagName {
		parent.endTagRaw = string(tokenizer.Raw())
		return ""
	}
	return tagName
}