func setClasses(n *html.Node, attr *html.Attribute, classes string) { classes = strings.TrimSpace(classes) if classes == "" { removeAttr(n, "class") return } attr.Val = classes }
func (p Parser) setAttr(selection *goquery.Selection, attr string, value string) { if selection.Size() > 0 { node := selection.Get(0) var attrs []html.Attribute for _, a := range node.Attr { if a.Key != attr { newAttr := new(html.Attribute) newAttr.Key = a.Key newAttr.Val = a.Val attrs = append(attrs, *newAttr) } } newAttr := new(html.Attribute) newAttr.Key = attr newAttr.Val = value attrs = append(attrs, *newAttr) node.Attr = attrs } }
func cleanURL(c *Config, a atom.Atom, attr *html.Attribute) bool { if a != atom.Href && a != atom.Src && a != atom.Poster { return true } u, err := url.Parse(attr.Val) if err != nil { return false } if c.ValidateURL != nil && !c.ValidateURL(u) { return false } attr.Val = u.String() return true }
func nodesCleanup(nodes []*html.Node) bool { changed := false for _, n := range nodes { if n.Type == html.ElementNode && n.Data == "script" || n.Type == html.CommentNode { if n.Parent != nil { n.Parent.RemoveChild(n) changed = true } break } if n.Type == html.ElementNode { // Remove all 'on*' attributes, and any that contain 'javascript:' attrs := []html.Attribute{} for _, a := range n.Attr { if strings.HasPrefix(a.Key, "on") { changed = true break } i := strings.Index(a.Val, "javascript:") if i != -1 { onlySpace := true if i > 0 { for _, r := range a.Val[:i] { if !unicode.IsSpace(r) { onlySpace = false break } } } if onlySpace { changed = true break } } attrs = append(attrs, a) } n.Attr = attrs // Add a target attribute to the article links if n.Data == "a" { var attr *html.Attribute for i, a := range n.Attr { if a.Key == "target" { attr = &n.Attr[i] break } } val := "feed-article" if attr == nil { n.Attr = append(n.Attr, html.Attribute{Key: "target", Val: val}) } else { attr.Val = val } changed = true } } children := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { children = append(children, c) } if len(children) > 0 { if nodesCleanup(children) { changed = true } } } return changed }
// sanitizeAttrs takes a set of element attribute policies and the global // attribute policies and applies them to the []html.Attribute returning a set // of html.Attributes that match the policies func (p *Policy) sanitizeAttrs( elementName string, attrs []html.Attribute, aps map[string]attrPolicy, ) []html.Attribute { if len(attrs) == 0 { return attrs } // Builds a new attribute slice based on the whether the attribute has been // whitelisted explicitly or globally. cleanAttrs := []html.Attribute{} for _, htmlAttr := range attrs { // Is there an element specific attribute policy that applies? if ap, ok := aps[htmlAttr.Key]; ok { if ap.regexp != nil { if ap.regexp.MatchString(htmlAttr.Val) { cleanAttrs = append(cleanAttrs, htmlAttr) continue } } else { cleanAttrs = append(cleanAttrs, htmlAttr) continue } } // Is there a global attribute policy that applies? if ap, ok := p.globalAttrs[htmlAttr.Key]; ok { if ap.regexp != nil { if ap.regexp.MatchString(htmlAttr.Val) { cleanAttrs = append(cleanAttrs, htmlAttr) } } else { cleanAttrs = append(cleanAttrs, htmlAttr) } } } if len(cleanAttrs) == 0 { // If nothing was allowed, let's get out of here return cleanAttrs } // cleanAttrs now contains the attributes that are permitted if linkable(elementName) { if p.requireParseableURLs { // Ensure URLs are parseable: // - a.href // - area.href // - link.href // - blockquote.cite // - q.cite // - img.src // - script.src tmpAttrs := []html.Attribute{} for _, htmlAttr := range cleanAttrs { switch elementName { case "a", "area", "link": if htmlAttr.Key == "href" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) case "blockquote", "q": if htmlAttr.Key == "cite" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) case "img", "script": if htmlAttr.Key == "src" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) default: tmpAttrs = append(tmpAttrs, htmlAttr) } } cleanAttrs = tmpAttrs } if (p.requireNoFollow || p.requireNoFollowFullyQualifiedLinks || p.addTargetBlankToFullyQualifiedLinks) && len(cleanAttrs) > 0 { // Add rel="nofollow" if a "href" exists switch elementName { case "a", "area", "link": var hrefFound bool var externalLink bool for _, htmlAttr := range cleanAttrs { if htmlAttr.Key == "href" { hrefFound = true u, err := url.Parse(htmlAttr.Val) if err != nil { continue } if u.Host != "" { externalLink = true } continue } } if hrefFound { var noFollowFound bool var targetFound bool addNoFollow := (p.requireNoFollow || externalLink && p.requireNoFollowFullyQualifiedLinks) addTargetBlank := (externalLink && p.addTargetBlankToFullyQualifiedLinks) tmpAttrs := []html.Attribute{} for _, htmlAttr := range cleanAttrs { var appended bool if htmlAttr.Key == "rel" && addNoFollow { if strings.Contains(htmlAttr.Val, "nofollow") { noFollowFound = true tmpAttrs = append(tmpAttrs, htmlAttr) } else { htmlAttr.Val += " nofollow" noFollowFound = true tmpAttrs = append(tmpAttrs, htmlAttr) } appended = true } if elementName == "a" && htmlAttr.Key == "target" && addTargetBlank { if strings.Contains(htmlAttr.Val, "_blank") { targetFound = true tmpAttrs = append(tmpAttrs, htmlAttr) } else { htmlAttr.Val = "_blank" targetFound = true tmpAttrs = append(tmpAttrs, htmlAttr) } appended = true } if !appended { tmpAttrs = append(tmpAttrs, htmlAttr) } } if noFollowFound || targetFound { cleanAttrs = tmpAttrs } if addNoFollow && !noFollowFound { rel := html.Attribute{} rel.Key = "rel" rel.Val = "nofollow" cleanAttrs = append(cleanAttrs, rel) } if elementName == "a" && addTargetBlank && !targetFound { rel := html.Attribute{} rel.Key = "target" rel.Val = "_blank" cleanAttrs = append(cleanAttrs, rel) } } default: } } } return cleanAttrs }
//从nodes中找到node 根据index 和 属性 先index func findNodeformNodesbyIndexOrPro(nodes []*goquery.Selection, index *int, m map[string]string, Type string, visible bool) { switch { case Type == OPTION || Type == RADIO: for _, v := range nodes { for _, vv := range v.Get(0).Attr { if vv.Key == VALUE { if vv.Val == m[VALUE] { if Type == RADIO { v.SetAttr("checked", "checked") } else { v.SetAttr("selected", "selected") } return } } } } if visible { var node html.Node node.Data = nodes[0].Get(0).Data node.Type = nodes[0].Get(0).Type attr := make([]html.Attribute, 0, 2) var tr html.Attribute tr.Key = VALUE tr.Val = m[VALUE] attr = append(attr, tr) if Type == RADIO { tr.Key = "checked" tr.Val = "checked" } else { tr.Key = "selected" tr.Val = "selected" } attr = append(attr, tr) tr.Key = TYPE tr.Val = Type attr = append(attr, tr) node.Attr = attr nodes[0].Parent().AppendNodes(&node) } return default: } if len(nodes) <= *index { return } for k, v := range m { nodes[*index].SetAttr(k, v) } *index++ }
// sanitizeAttrs takes a set of element attribute policies and the global // attribute policies and applies them to the []html.Attribute returning a set // of html.Attributes that match the policies func (p *Policy) sanitizeAttrs( elementName string, attrs []html.Attribute, aps map[string]attrPolicy, ) []html.Attribute { if len(attrs) == 0 { return attrs } // Builds a new attribute slice based on the whether the attribute has been // whitelisted explicitly or globally. cleanAttrs := []html.Attribute{} for _, htmlAttr := range attrs { // Is there an element specific attribute policy that applies? if ap, ok := aps[htmlAttr.Key]; ok { if ap.regexp != nil { if ap.regexp.MatchString(htmlAttr.Val) { cleanAttrs = append(cleanAttrs, htmlAttr) continue } } else { cleanAttrs = append(cleanAttrs, htmlAttr) continue } } // Is there a global attribute policy that applies? if ap, ok := p.globalAttrs[htmlAttr.Key]; ok { if ap.regexp != nil { if ap.regexp.MatchString(htmlAttr.Val) { cleanAttrs = append(cleanAttrs, htmlAttr) } } else { cleanAttrs = append(cleanAttrs, htmlAttr) } } } if len(cleanAttrs) == 0 { // If nothing was allowed, let's get out of here return cleanAttrs } // cleanAttrs now contains the attributes that are permitted if linkable(elementName) { if p.requireParseableURLs { // Ensure URLs are parseable: // - a.href // - area.href // - link.href // - blockquote.cite // - q.cite // - img.src // - script.src tmpAttrs := []html.Attribute{} for _, htmlAttr := range cleanAttrs { switch elementName { case "a", "area", "link": if htmlAttr.Key == "href" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) case "blockquote", "q": if htmlAttr.Key == "cite" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) case "img", "script": if htmlAttr.Key == "src" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) default: tmpAttrs = append(tmpAttrs, htmlAttr) } } cleanAttrs = tmpAttrs } if (p.requireNoFollow || p.requireNoFollowFullyQualifiedLinks || p.addTargetBlankToFullyQualifiedLinks) && len(cleanAttrs) > 0 { // Add rel="nofollow" if a "href" exists switch elementName { case "a", "area", "link": var hrefFound bool var externalLink bool for _, htmlAttr := range cleanAttrs { if htmlAttr.Key == "href" { hrefFound = true u, err := url.Parse(htmlAttr.Val) if err != nil { continue } if u.Host != "" { externalLink = true } continue } } if hrefFound { var ( noFollowFound bool targetBlankFound bool ) addNoFollow := (p.requireNoFollow || externalLink && p.requireNoFollowFullyQualifiedLinks) addTargetBlank := (externalLink && p.addTargetBlankToFullyQualifiedLinks) tmpAttrs := []html.Attribute{} for _, htmlAttr := range cleanAttrs { var appended bool if htmlAttr.Key == "rel" && addNoFollow { if strings.Contains(htmlAttr.Val, "nofollow") { noFollowFound = true tmpAttrs = append(tmpAttrs, htmlAttr) appended = true } else { htmlAttr.Val += " nofollow" noFollowFound = true tmpAttrs = append(tmpAttrs, htmlAttr) appended = true } } if elementName == "a" && htmlAttr.Key == "target" { if htmlAttr.Val == "_blank" { targetBlankFound = true } if addTargetBlank && !targetBlankFound { htmlAttr.Val = "_blank" targetBlankFound = true tmpAttrs = append(tmpAttrs, htmlAttr) appended = true } } if !appended { tmpAttrs = append(tmpAttrs, htmlAttr) } } if noFollowFound || targetBlankFound { cleanAttrs = tmpAttrs } if addNoFollow && !noFollowFound { rel := html.Attribute{} rel.Key = "rel" rel.Val = "nofollow" cleanAttrs = append(cleanAttrs, rel) } if elementName == "a" && addTargetBlank && !targetBlankFound { rel := html.Attribute{} rel.Key = "target" rel.Val = "_blank" targetBlankFound = true cleanAttrs = append(cleanAttrs, rel) } if targetBlankFound { // target="_blank" has a security risk that allows the // opened window/tab to issue JavaScript calls against // window.opener, which in effect allow the destination // of the link to control the source: // https://dev.to/ben/the-targetblank-vulnerability-by-example // // To mitigate this risk, we need to add a specific rel // attribute if it is not already present. // rel="noopener" // // Unfortunately this is processing the rel twice (we // already looked at it earlier ^^) as we cannot be sure // of the ordering of the href and rel, and whether we // have fully satisfied that we need to do this. This // double processing only happens *if* target="_blank" // is true. var noOpenerAdded bool tmpAttrs := []html.Attribute{} for _, htmlAttr := range cleanAttrs { var appended bool if htmlAttr.Key == "rel" { if strings.Contains(htmlAttr.Val, "noopener") { noOpenerAdded = true tmpAttrs = append(tmpAttrs, htmlAttr) } else { htmlAttr.Val += " noopener" noOpenerAdded = true tmpAttrs = append(tmpAttrs, htmlAttr) } appended = true } if !appended { tmpAttrs = append(tmpAttrs, htmlAttr) } } if noOpenerAdded { cleanAttrs = tmpAttrs } else { // rel attr was not found, or else noopener would // have been added already rel := html.Attribute{} rel.Key = "rel" rel.Val = "noopener" cleanAttrs = append(cleanAttrs, rel) } } } default: } } } return cleanAttrs }