func readBody(contentEncoding string, body io.Reader) ([]byte, error) { var err error result := []byte{} if contentEncoding == "gzip" { reader, err := gzip.NewReader(body) if err != nil { return result, werrors.NewDetails(ErrReadGZipResponse, err) } result, err = ioutil.ReadAll(reader) if err == nil { err = reader.Close() } else { _ = reader.Close() } if err != nil { return result, werrors.NewDetails(ErrReadGZipResponse, err) } } else if contentEncoding == "identity" || contentEncoding == "" { result, err = ioutil.ReadAll(body) if err != nil { return result, werrors.NewDetails(ErrReadResponse, err) } } else { return result, werrors.NewFields(ErrUnknownContentEncoding, zap.String("encoding", contentEncoding)) } return result, nil }
func (m *hostsManager) resolveURL(hostName string) (string, error) { hostURL := NormalizeURL(&url.URL{Scheme: "http", Host: hostName}) response, err := http.Get(hostURL) if err == nil { err = response.Body.Close() if response.StatusCode != 200 { return "", werrors.NewFields(ErrResolveBaseURL, zap.Int("status_code", response.StatusCode), zap.String("url", hostURL)) } } if err != nil { return "", werrors.NewFields(ErrGetRequest, zap.String("details", err.Error()), zap.String("url", hostURL)) } return response.Request.URL.String(), nil }
// NewHTMLMetadata - create new HTMLMetadata struct func NewHTMLMetadata(hostMng *hostsManager, urlStr string) (*HTMLMetadata, error) { baseURL, err := url.Parse(urlStr) if err != nil { return nil, werrors.NewFields(ErrParseBaseURL, zap.String("details", err.Error()), zap.String("parsed_url", urlStr)) } return &HTMLMetadata{ URLs: make(map[string]sql.NullInt64), wrongURLs: make(map[string]string), title: "", MetaTagIndex: true, baseURL: baseURL, hostMng: hostMng, }, nil }
func (m *hostsManager) initByDb(db proxy.DbHost) error { hosts, err := db.GetHosts() if err != nil { return err } for id, host := range hosts { hostName := host.GetName() robot, err := robotstxt.FromStatusAndBytes(host.GetRobotsTxt()) if err != nil { return werrors.NewFields(ErrCreateRobotsTxtFromDb, zap.String("host", hostName), zap.String("details", err.Error())) } m.hosts[hostName] = id m.robotsTxt[id] = robot.FindGroup("Googlebot") } return nil }
func (m *hostsManager) readRobotTxt(hostName string) (int, []byte, error) { var body []byte robotsURL := NormalizeURL(&url.URL{Scheme: "http", Host: hostName, Path: "robots.txt"}) response, err := http.Get(robotsURL) if err == nil { body, err = ioutil.ReadAll(response.Body) closeErr := response.Body.Close() if err == nil { err = closeErr } } if err != nil { return 0, body, werrors.NewFields(ErrGetRequest, zap.String("details", err.Error()), zap.String("url", robotsURL)) } return response.StatusCode, body, nil }
func checkContentType(header *http.Header) (string, error) { contentTypeArr, ok := (*header)["Content-Type"] if !ok || len(contentTypeArr) == 0 { return "", werrors.New(ErrNotFountContentType) } contentType := contentTypeArr[0] mediatype, _, err := mime.ParseMediaType(contentType) if err != nil { return "", werrors.NewFields(ErrParseContentType, zap.String("detail", err.Error()), zap.String("content_type", contentType)) } if mediatype != "text/html" { return "", werrors.NewEx(zap.InfoLevel, InfoUnsupportedMimeFormat, zap.String("content_type", contentType)) } return contentType, nil }