Beispiel #1
0
// readBody reads body to the end and returns its bytes, decoding them
// according to contentEncoding.
//
// Supported encodings:
//   - "gzip": body is read through a gzip reader, which is closed afterwards.
//     A failure to open, read or close it is reported as ErrReadGZipResponse;
//     a read error takes precedence over a close error.
//   - "identity" or "": body is read as-is; a failure is reported as
//     ErrReadResponse.
//
// Any other value yields ErrUnknownContentEncoding carrying the value in the
// "encoding" field. On a read failure the bytes returned by ReadAll are passed
// back alongside the error; on the other failures an empty slice is returned.
func readBody(contentEncoding string, body io.Reader) ([]byte, error) {
	switch contentEncoding {
	case "gzip":
		zr, err := gzip.NewReader(body)
		if err != nil {
			return []byte{}, werrors.NewDetails(ErrReadGZipResponse, err)
		}

		data, err := ioutil.ReadAll(zr)
		// The reader is always closed, but its error only matters when the
		// read itself succeeded.
		closeErr := zr.Close()
		if err == nil {
			err = closeErr
		}
		if err != nil {
			return data, werrors.NewDetails(ErrReadGZipResponse, err)
		}
		return data, nil

	case "identity", "":
		data, err := ioutil.ReadAll(body)
		if err != nil {
			return data, werrors.NewDetails(ErrReadResponse, err)
		}
		return data, nil

	default:
		return []byte{}, werrors.NewFields(ErrUnknownContentEncoding, zap.String("encoding", contentEncoding))
	}
}
// Run handles one HTTP response: it evaluates status code and headers with
// processMeta, reads the body with readBody, closes it, and hands the bytes to
// processBody.
//
// The response body is closed on every path. If processMeta fails, its error
// is returned and this method sets no state itself. Otherwise the state on
// r.meta is set to StateAnswerError when reading or closing the body fails, to
// the state reported by processBody when body processing fails, and to
// StateSuccess when everything succeeds. timeTrack is deferred with a start
// time taken after the body has been read and closed.
func (r *responseParser) Run(response *http.Response) error {
	contentType, contentEncoding, metaErr := r.processMeta(response.StatusCode, &response.Header)
	if metaErr != nil {
		// Best-effort close; the meta error is the one worth reporting.
		_ = response.Body.Close()
		return metaErr
	}

	payload, readErr := readBody(contentEncoding, response.Body)
	closeErr := response.Body.Close()
	defer r.timeTrack(time.Now())

	switch {
	case readErr != nil:
		r.meta.SetState(database.StateAnswerError)
		return readErr
	case closeErr != nil:
		r.meta.SetState(database.StateAnswerError)
		return werrors.NewDetails(ErrCloseResponseBody, closeErr)
	}

	if state, err := r.processBody(payload, contentType); err != nil {
		r.meta.SetState(state)
		return err
	}

	r.meta.SetState(database.StateSuccess)
	return nil
}
Beispiel #3
0
// initByHostName registers hostName with the manager.
//
// It resolves the base URL through m.resolveURL, obtains a status code and
// body from m.readRobotTxt, builds robots.txt data from them, and stores a new
// host record via db.AddHost. Only when all of that succeeds does it record
// the returned host ID in m.hosts and the "Googlebot" robots.txt group in
// m.robotsTxt. A robots.txt construction failure is reported as
// ErrCreateRobotsTxtFromURL; other errors are returned as received.
func (m *hostsManager) initByHostName(db proxy.DbHost, hostName string) error {
	baseURL, err := m.resolveURL(hostName)
	if err != nil {
		return err
	}

	status, robotsBody, err := m.readRobotTxt(hostName)
	if err != nil {
		return err
	}

	rules, err := robotstxt.FromStatusAndBytes(status, robotsBody)
	if err != nil {
		return werrors.NewDetails(ErrCreateRobotsTxtFromURL, err)
	}

	id, err := db.AddHost(proxy.NewHost(hostName, status, robotsBody), baseURL)
	if err != nil {
		return err
	}

	m.hosts[hostName] = id
	m.robotsTxt[id] = rules.FindGroup("Googlebot")
	return nil
}
Beispiel #4
0
// bodyMinification runs the HTML minification pass and then the text
// minification pass over node, and finally renders the tree into buf through a
// buffered writer.
//
// An error from either pass is returned unchanged and stops processing before
// anything is written. An error from rendering or flushing is reported as
// ErrRenderHTML.
func bodyMinification(node *html.Node, buf io.Writer) error {
	htmlPass := minificationHTML{}
	err := htmlPass.Run(node)
	if err != nil {
		return err
	}

	textPass := minificationText{}
	if err = textPass.Run(node); err != nil {
		return err
	}

	out := bufio.NewWriter(buf)
	if err = html.Render(out, node); err == nil {
		// Rendering only fills the buffer; flushing pushes it to buf.
		err = out.Flush()
	}
	if err != nil {
		return werrors.NewDetails(ErrRenderHTML, err)
	}

	return nil
}
// processBody turns a raw response body into stored page content.
//
// Steps, in order: reject bodies for which isHTML is false, convert the body
// with bodyToUTF8 using contentType, parse it as HTML, run RunDataExtrator on
// the tree, log the extractor's wrong URLs, and minify the tree into a buffer.
// On success the extracted URLs are stored in r.URLs, the minified bytes and
// the extracted title are set as content on r.meta, and the minified size is
// logged at debug level.
//
// The returned state accompanies the error:
//   - StateParseError: body is not HTML, parsing, extraction or minification
//     failed;
//   - StateEncodingError: bodyToUTF8 failed;
//   - StateNoFollow: the extractor's MetaTagIndex is false (returned with an
//     info-level WarnPageNotIndexed error);
//   - StateSuccess with a nil error otherwise.
func (r *responseParser) processBody(body []byte, contentType string) (database.State, error) {
	if !isHTML(body) {
		return database.StateParseError, werrors.New(ErrBodyNotHTML)
	}

	utf8Body, err := bodyToUTF8(body, contentType)
	if err != nil {
		return database.StateEncodingError, err
	}

	doc, err := html.Parse(utf8Body)
	if err != nil {
		return database.StateParseError, werrors.NewDetails(ErrHTMLParse, err)
	}

	extracted, err := RunDataExtrator(r.hostMng, doc, r.meta.GetURL())
	if err != nil {
		return database.StateParseError, err
	}
	if !extracted.MetaTagIndex {
		return database.StateNoFollow, werrors.NewLevel(zap.InfoLevel, WarnPageNotIndexed)
	}
	extracted.WrongURLsToLog(r.logger)

	minified := new(bytes.Buffer)
	if err := bodyMinification(doc, minified); err != nil {
		return database.StateParseError, err
	}

	r.URLs = extracted.URLs
	r.meta.SetContent(proxy.NewContent(minified.Bytes(), extracted.GetTitle()))

	r.logger.Debug(DbgBodySize, zap.Int("size", minified.Len()))
	return database.StateSuccess, nil
}