Esempio n. 1
0
// readBody consumes body completely and returns its bytes, decoding them
// according to contentEncoding.
//
// Supported values (compared as exact strings):
//   - "gzip": body is decompressed through a gzip reader;
//   - "identity" or "": body is returned as-is.
//
// Any other value yields an ErrUnknownContentEncoding error carrying the
// offending encoding. On a read failure the bytes read so far are returned
// together with the wrapped error.
func readBody(contentEncoding string, body io.Reader) ([]byte, error) {
	switch contentEncoding {
	case "gzip":
		gz, openErr := gzip.NewReader(body)
		if openErr != nil {
			return []byte{}, werrors.NewDetails(ErrReadGZipResponse, openErr)
		}

		data, readErr := ioutil.ReadAll(gz)
		// The reader is always closed; its error only matters when the read
		// itself succeeded.
		closeErr := gz.Close()
		if readErr != nil {
			return data, werrors.NewDetails(ErrReadGZipResponse, readErr)
		}
		if closeErr != nil {
			return data, werrors.NewDetails(ErrReadGZipResponse, closeErr)
		}
		return data, nil

	case "identity", "":
		data, readErr := ioutil.ReadAll(body)
		if readErr != nil {
			return data, werrors.NewDetails(ErrReadResponse, readErr)
		}
		return data, nil

	default:
		return []byte{}, werrors.NewFields(ErrUnknownContentEncoding, zap.String("encoding", contentEncoding))
	}
}
Esempio n. 2
0
// resolveURL sends a GET request to "http://<hostName>" (after NormalizeURL)
// and returns the URL of the request that produced the response.
//
// Errors:
//   - ErrGetRequest when the GET itself fails, or when closing the response
//     body fails after a 200 response;
//   - ErrResolveBaseURL when the response status is anything other than 200.
func (m *hostsManager) resolveURL(hostName string) (string, error) {
	target := NormalizeURL(&url.URL{Scheme: "http", Host: hostName})

	resp, getErr := http.Get(target)
	if getErr != nil {
		return "", werrors.NewFields(ErrGetRequest,
			zap.String("details", getErr.Error()),
			zap.String("url", target))
	}

	// The body content is not needed; only the status and the request URL are.
	closeErr := resp.Body.Close()

	// A non-200 status is reported in preference to any close error.
	if resp.StatusCode != http.StatusOK {
		return "", werrors.NewFields(ErrResolveBaseURL,
			zap.Int("status_code", resp.StatusCode),
			zap.String("url", target))
	}
	if closeErr != nil {
		return "", werrors.NewFields(ErrGetRequest,
			zap.String("details", closeErr.Error()),
			zap.String("url", target))
	}

	return resp.Request.URL.String(), nil
}
Esempio n. 3
0
// NewHTMLMetadata parses urlStr as the base URL and returns a fresh
// HTMLMetadata bound to hostMng, with empty URL maps, an empty title and
// MetaTagIndex set to true.
//
// It returns an ErrParseBaseURL error (including the parse details and the
// offending string) when urlStr cannot be parsed.
func NewHTMLMetadata(hostMng *hostsManager, urlStr string) (*HTMLMetadata, error) {
	parsed, parseErr := url.Parse(urlStr)
	if parseErr != nil {
		return nil, werrors.NewFields(ErrParseBaseURL,
			zap.String("details", parseErr.Error()),
			zap.String("parsed_url", urlStr))
	}

	meta := &HTMLMetadata{}
	meta.URLs = make(map[string]sql.NullInt64)
	meta.wrongURLs = make(map[string]string)
	meta.title = ""
	meta.MetaTagIndex = true
	meta.baseURL = parsed
	meta.hostMng = hostMng

	return meta, nil
}
Esempio n. 4
0
// initByDb fills the manager's host and robots.txt tables from the hosts
// returned by db.GetHosts.
//
// For every host it records name -> id in m.hosts and stores the "Googlebot"
// group of the host's parsed robots.txt in m.robotsTxt under the same id.
// A GetHosts error is returned unchanged; a robots.txt that cannot be parsed
// aborts the loop with an ErrCreateRobotsTxtFromDb error, leaving entries
// already processed in place.
func (m *hostsManager) initByDb(db proxy.DbHost) error {
	stored, err := db.GetHosts()
	if err != nil {
		return err
	}

	for hostID, entry := range stored {
		name := entry.GetName()

		// GetRobotsTxt's results are passed straight through as the
		// arguments of FromStatusAndBytes.
		rules, parseErr := robotstxt.FromStatusAndBytes(entry.GetRobotsTxt())
		if parseErr != nil {
			return werrors.NewFields(ErrCreateRobotsTxtFromDb,
				zap.String("host", name),
				zap.String("details", parseErr.Error()))
		}

		m.hosts[name] = hostID
		m.robotsTxt[hostID] = rules.FindGroup("Googlebot")
	}

	return nil
}
Esempio n. 5
0
// readRobotTxt downloads robots.txt for hostName over plain HTTP and returns
// the response status code together with the body bytes.
//
// Any failure (the GET, reading the body, or closing it) is reported as an
// ErrGetRequest error with status 0; a read error takes precedence over a
// close error. Bytes read before a failure are still returned.
func (m *hostsManager) readRobotTxt(hostName string) (int, []byte, error) {
	target := NormalizeURL(&url.URL{Scheme: "http", Host: hostName, Path: "robots.txt"})

	// wrap builds the error returned for every failure path.
	wrap := func(cause error) error {
		return werrors.NewFields(ErrGetRequest,
			zap.String("details", cause.Error()),
			zap.String("url", target))
	}

	resp, getErr := http.Get(target)
	if getErr != nil {
		return 0, nil, wrap(getErr)
	}

	payload, readErr := ioutil.ReadAll(resp.Body)
	// The body is closed whether or not the read succeeded.
	closeErr := resp.Body.Close()

	if readErr != nil {
		return 0, payload, wrap(readErr)
	}
	if closeErr != nil {
		return 0, payload, wrap(closeErr)
	}

	return resp.StatusCode, payload, nil
}
Esempio n. 6
0
// checkContentType verifies that header declares an HTML document and returns
// the raw value of its first "Content-Type" entry (parameters such as charset
// included).
//
// Errors:
//   - ErrNotFountContentType when header is nil or has no "Content-Type"
//     value;
//   - ErrParseContentType when the value is not a valid media type;
//   - an info-level InfoUnsupportedMimeFormat error when the media type is
//     anything other than "text/html".
func checkContentType(header *http.Header) (string, error) {
	// A nil pointer would panic on dereference; treat it as "no content type".
	if header == nil {
		return "", werrors.New(ErrNotFountContentType)
	}

	// A missing key yields a nil slice, so the length check covers both the
	// absent and the empty case.
	values := (*header)["Content-Type"]
	if len(values) == 0 {
		return "", werrors.New(ErrNotFountContentType)
	}
	contentType := values[0]

	mediaType, _, err := mime.ParseMediaType(contentType)
	if err != nil {
		return "", werrors.NewFields(ErrParseContentType,
			zap.String("detail", err.Error()),
			zap.String("content_type", contentType))
	}

	if mediaType != "text/html" {
		return "", werrors.NewEx(zap.InfoLevel, InfoUnsupportedMimeFormat,
			zap.String("content_type", contentType))
	}

	return contentType, nil
}