func bodyToUTF8(body []byte, contentType string) (*transform.Reader, error) { enc, _, _ := charset.DetermineEncoding(body, contentType) if enc == encoding.Nop { return nil, werrors.New(ErrEncodingNotFound) } return transform.NewReader(bytes.NewReader(body), enc.NewDecoder()), nil }
func (m *minificationText) parseNode(node *html.Node) (*html.Node, error) { switch node.Type { case html.DocumentNode: // +children -attr (first node) return m.parseChildren(node) case html.ElementNode: // +children +attr return m.parseElements(node) case html.TextNode: // -children -attr return m.parseText(node) default: // ErrorNode, CommentNode, DoctypeNode return nil, werrors.New(ErrUnexpectedNodeType) } }
func (extractor *dataExtractor) parseNode(node *html.Node) error { if !extractor.meta.MetaTagIndex { return nil } switch node.Type { case html.DocumentNode: return extractor.parseChildren(node) case html.ElementNode: return extractor.parseElements(node) case html.CommentNode, html.TextNode, html.DoctypeNode: // skip return nil default: return werrors.New(ErrUnexpectedNodeType) } }
func (m *minificationHTML) parseNode(node *html.Node) (*html.Node, error) { switch node.Type { case html.DocumentNode: // +children -attr (first node) return m.parseChildren(node) case html.ElementNode: // +children +attr return m.parseElements(node) case html.TextNode: // -children -attr return node.NextSibling, nil case html.DoctypeNode: // ignore return m.removeNode(node, false) case html.CommentNode: // remove return m.removeNode(node, false) default: return nil, werrors.New(ErrUnexpectedNodeType) } }
func (m *minificationText) parseElements(node *html.Node) (*html.Node, error) { switch node.DataAtom { case atom.Head, atom.Html: return m.parseChildren(node) case atom.Body, atom.Div: next, err := m.parseChildren(node) if err != nil { return next, err } child := node.FirstChild if child == nil { next = m.removeTag(node) } else if child == node.LastChild && child.DataAtom == atom.Div { m.openTag(child) } return next, err default: return nil, werrors.New(ErrUnexpectedTag) } }
func checkContentType(header *http.Header) (string, error) { contentTypeArr, ok := (*header)["Content-Type"] if !ok || len(contentTypeArr) == 0 { return "", werrors.New(ErrNotFountContentType) } contentType := contentTypeArr[0] mediatype, _, err := mime.ParseMediaType(contentType) if err != nil { return "", werrors.NewFields(ErrParseContentType, zap.String("detail", err.Error()), zap.String("content_type", contentType)) } if mediatype != "text/html" { return "", werrors.NewEx(zap.InfoLevel, InfoUnsupportedMimeFormat, zap.String("content_type", contentType)) } return contentType, nil }
func (r *responseParser) processBody(body []byte, contentType string) (database.State, error) { if !isHTML(body) { return database.StateParseError, werrors.New(ErrBodyNotHTML) } bodyReader, err := bodyToUTF8(body, contentType) if err != nil { return database.StateEncodingError, err } node, err := html.Parse(bodyReader) if err != nil { return database.StateParseError, werrors.NewDetails(ErrHTMLParse, err) } parser, err := RunDataExtrator(r.hostMng, node, r.meta.GetURL()) if err != nil { return database.StateParseError, err } if !parser.MetaTagIndex { return database.StateNoFollow, werrors.NewLevel(zap.InfoLevel, WarnPageNotIndexed) } parser.WrongURLsToLog(r.logger) var buf bytes.Buffer err = bodyMinification(node, &buf) if err != nil { return database.StateParseError, err } r.URLs = parser.URLs r.meta.SetContent(proxy.NewContent(buf.Bytes(), parser.GetTitle())) r.logger.Debug(DbgBodySize, zap.Int("size", buf.Len())) return database.StateSuccess, nil }