func readBody(contentEncoding string, body io.Reader) ([]byte, error) { var err error result := []byte{} if contentEncoding == "gzip" { reader, err := gzip.NewReader(body) if err != nil { return result, werrors.NewDetails(ErrReadGZipResponse, err) } result, err = ioutil.ReadAll(reader) if err == nil { err = reader.Close() } else { _ = reader.Close() } if err != nil { return result, werrors.NewDetails(ErrReadGZipResponse, err) } } else if contentEncoding == "identity" || contentEncoding == "" { result, err = ioutil.ReadAll(body) if err != nil { return result, werrors.NewDetails(ErrReadResponse, err) } } else { return result, werrors.NewFields(ErrUnknownContentEncoding, zap.String("encoding", contentEncoding)) } return result, nil }
func (r *responseParser) Run(response *http.Response) error { contentType, contentEncoding, err := r.processMeta(response.StatusCode, &response.Header) if err != nil { _ = response.Body.Close() return err } body, err := readBody(contentEncoding, response.Body) closeErr := response.Body.Close() defer r.timeTrack(time.Now()) if err != nil { r.meta.SetState(database.StateAnswerError) return err } if closeErr != nil { r.meta.SetState(database.StateAnswerError) return werrors.NewDetails(ErrCloseResponseBody, closeErr) } state, err := r.processBody(body, contentType) if err != nil { r.meta.SetState(state) return err } r.meta.SetState(database.StateSuccess) return nil }
func (m *hostsManager) initByHostName(db proxy.DbHost, hostName string) error { baseURL, err := m.resolveURL(hostName) if err != nil { return err } statusCode, body, err := m.readRobotTxt(hostName) if err != nil { return err } robot, err := robotstxt.FromStatusAndBytes(statusCode, body) if err != nil { return werrors.NewDetails(ErrCreateRobotsTxtFromURL, err) } host := proxy.NewHost(hostName, statusCode, body) hostID, err := db.AddHost(host, baseURL) if err == nil { m.hosts[hostName] = hostID m.robotsTxt[hostID] = robot.FindGroup("Googlebot") } return err }
func bodyMinification(node *html.Node, buf io.Writer) error { htmlMinification := minificationHTML{} err := htmlMinification.Run(node) if err == nil { textMinification := minificationText{} err = textMinification.Run(node) } if err == nil { wbuf := bufio.NewWriter(buf) err = html.Render(wbuf, node) if err == nil { err = wbuf.Flush() } if err != nil { return werrors.NewDetails(ErrRenderHTML, err) } } return err }
func (r *responseParser) processBody(body []byte, contentType string) (database.State, error) { if !isHTML(body) { return database.StateParseError, werrors.New(ErrBodyNotHTML) } bodyReader, err := bodyToUTF8(body, contentType) if err != nil { return database.StateEncodingError, err } node, err := html.Parse(bodyReader) if err != nil { return database.StateParseError, werrors.NewDetails(ErrHTMLParse, err) } parser, err := RunDataExtrator(r.hostMng, node, r.meta.GetURL()) if err != nil { return database.StateParseError, err } if !parser.MetaTagIndex { return database.StateNoFollow, werrors.NewLevel(zap.InfoLevel, WarnPageNotIndexed) } parser.WrongURLsToLog(r.logger) var buf bytes.Buffer err = bodyMinification(node, &buf) if err != nil { return database.StateParseError, err } r.URLs = parser.URLs r.meta.SetContent(proxy.NewContent(buf.Bytes(), parser.GetTitle())) r.logger.Debug(DbgBodySize, zap.Int("size", buf.Len())) return database.StateSuccess, nil }