Exemple #1
0
// bodyToUTF8 wraps body in a transform.Reader that runs it through the decoder
// of the encoding charset.DetermineEncoding reports for body and contentType.
//
// It returns ErrEncodingNotFound when the reported encoding is encoding.Nop.
//
// NOTE(review): charset.DetermineEncoding appears to return encoding.Nop when
// it sniffs undeclared but valid UTF-8 content, so this guard may reject
// bodies that are already UTF-8 — confirm that this is intended.
func bodyToUTF8(body []byte, contentType string) (*transform.Reader, error) {
	// The detected encoding name and the "certain" flag are deliberately unused.
	detected, _, _ := charset.DetermineEncoding(body, contentType)
	if detected == encoding.Nop {
		return nil, werrors.New(ErrEncodingNotFound)
	}

	decoder := detected.NewDecoder()
	return transform.NewReader(bytes.NewReader(body), decoder), nil
}
// parseNode dispatches node to the handler that matches its type and returns
// that handler's result: text nodes go to parseText, element nodes to
// parseElements and the document node to parseChildren. Any other node type
// yields ErrUnexpectedNodeType.
func (m *minificationText) parseNode(node *html.Node) (*html.Node, error) {
	switch kind := node.Type; kind {
	case html.TextNode: // no children, no attributes
		return m.parseText(node)
	case html.ElementNode: // children and attributes
		return m.parseElements(node)
	case html.DocumentNode: // first node: children, no attributes
		return m.parseChildren(node)
	}
	// Error, comment and doctype nodes (and anything else) are not expected.
	return nil, werrors.New(ErrUnexpectedNodeType)
}
// parseNode routes node to the extractor routine for its type.
//
// Nothing is done, and nil is returned, while extractor.meta.MetaTagIndex is
// false. Otherwise the document node is handed to parseChildren, element nodes
// to parseElements, and comment, text and doctype nodes are skipped. Any other
// node type yields ErrUnexpectedNodeType.
func (extractor *dataExtractor) parseNode(node *html.Node) error {
	// The index flag is checked first so node is not touched when it is false.
	switch {
	case !extractor.meta.MetaTagIndex:
		return nil
	case node.Type == html.DocumentNode:
		return extractor.parseChildren(node)
	case node.Type == html.ElementNode:
		return extractor.parseElements(node)
	case node.Type == html.CommentNode,
		node.Type == html.TextNode,
		node.Type == html.DoctypeNode:
		// Nothing to extract from these.
		return nil
	}
	return werrors.New(ErrUnexpectedNodeType)
}
// parseNode dispatches node by type and returns whatever the chosen handler
// returns. The document node goes to parseChildren and element nodes to
// parseElements; a text node is left untouched and its NextSibling is
// returned; doctype and comment nodes are passed to removeNode. Any other node
// type yields ErrUnexpectedNodeType.
func (m *minificationHTML) parseNode(node *html.Node) (*html.Node, error) {
	switch node.Type {
	case html.DoctypeNode, html.CommentNode:
		// Both kinds are handled the same way: via removeNode.
		return m.removeNode(node, false)
	case html.TextNode: // no children, no attributes
		return node.NextSibling, nil
	case html.ElementNode: // children and attributes
		return m.parseElements(node)
	case html.DocumentNode: // first node: children, no attributes
		return m.parseChildren(node)
	}
	return nil, werrors.New(ErrUnexpectedNodeType)
}
// parseElements handles an element node according to its tag.
//
// <html> and <head> only have their children parsed. <body> and <div> have
// their children parsed first; afterwards an element left without children is
// passed to removeTag (whose result becomes the returned node), and an element
// whose single remaining child is a <div> has that child passed to openTag.
// Every other tag yields ErrUnexpectedTag.
func (m *minificationText) parseElements(node *html.Node) (*html.Node, error) {
	switch node.DataAtom {
	case atom.Html, atom.Head:
		return m.parseChildren(node)
	case atom.Div, atom.Body:
		// Handled below the switch.
	default:
		return nil, werrors.New(ErrUnexpectedTag)
	}

	next, err := m.parseChildren(node)
	if err != nil {
		return next, err
	}

	// Inspect what is left of the children after they have been parsed.
	first := node.FirstChild
	switch {
	case first == nil:
		next = m.removeTag(node)
	case first == node.LastChild && first.DataAtom == atom.Div:
		m.openTag(first)
	}
	return next, nil
}
// checkContentType validates the "Content-Type" entry of header and returns
// its first value unchanged when the media type is "text/html".
//
// Errors:
//   - ErrNotFountContentType when the entry is missing or has no values;
//   - ErrParseContentType when mime.ParseMediaType rejects the value;
//   - InfoUnsupportedMimeFormat (info level) for any other media type.
func checkContentType(header *http.Header) (string, error) {
	// A missing key yields a nil slice, so the length check covers both cases.
	values := (*header)["Content-Type"]
	if len(values) == 0 {
		return "", werrors.New(ErrNotFountContentType)
	}

	raw := values[0]
	mediaType, _, parseErr := mime.ParseMediaType(raw)
	switch {
	case parseErr != nil:
		return "", werrors.NewFields(ErrParseContentType,
			zap.String("detail", parseErr.Error()),
			zap.String("content_type", raw))
	case mediaType != "text/html":
		return "", werrors.NewEx(zap.InfoLevel, InfoUnsupportedMimeFormat,
			zap.String("content_type", raw))
	}

	// The full header value (parameters included) is returned, not just the
	// media type.
	return raw, nil
}
// processBody runs the whole processing chain for a response body and returns
// the resulting database state.
//
// Steps, in order:
//  1. reject bodies for which isHTML is false (StateParseError);
//  2. wrap the body with bodyToUTF8 (StateEncodingError on failure);
//  3. parse it with html.Parse (StateParseError on failure);
//  4. run RunDataExtrator on the tree (StateParseError on failure); when the
//     result's MetaTagIndex is false, stop with StateNoFollow and an
//     info-level WarnPageNotIndexed error;
//  5. log the extractor's wrong URLs and minify the tree with
//     bodyMinification (StateParseError on failure);
//  6. store the extracted URLs in r.URLs and the minified content plus title
//     in r.meta, then log the minified size at debug level.
//
// On success it returns StateSuccess and a nil error.
func (r *responseParser) processBody(body []byte, contentType string) (database.State, error) {
	if !isHTML(body) {
		return database.StateParseError, werrors.New(ErrBodyNotHTML)
	}

	decoded, err := bodyToUTF8(body, contentType)
	if err != nil {
		return database.StateEncodingError, err
	}

	root, err := html.Parse(decoded)
	if err != nil {
		return database.StateParseError, werrors.NewDetails(ErrHTMLParse, err)
	}

	extracted, err := RunDataExtrator(r.hostMng, root, r.meta.GetURL())
	switch {
	case err != nil:
		return database.StateParseError, err
	case !extracted.MetaTagIndex:
		return database.StateNoFollow, werrors.NewLevel(zap.InfoLevel, WarnPageNotIndexed)
	}
	extracted.WrongURLsToLog(r.logger)

	minified := new(bytes.Buffer)
	if err := bodyMinification(root, minified); err != nil {
		return database.StateParseError, err
	}

	// Results are published only after every step has succeeded.
	r.URLs = extracted.URLs
	r.meta.SetContent(proxy.NewContent(minified.Bytes(), extracted.GetTitle()))
	r.logger.Debug(DbgBodySize, zap.Int("size", minified.Len()))

	return database.StateSuccess, nil
}