func SanitizeUrl(in string) string { url, err := purell.NormalizeURLString(in, purell.FlagsSafe|purell.FlagRemoveTrailingSlash|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveUnnecessaryHostDots|purell.FlagRemoveEmptyPortSeparator) if err != nil { return in } return url }
func (r *RequestBundle) releaseAddress(address string) error { // start instrumentation var err error address, err = purell.NormalizeURLString(address, purell.FlagsSafe) if err != nil { r.Log.Error(err.Error()) return err } reply := r.Repo.client.Hget("urls_to_ids", address) // report the repo call to instrumentation if reply.Err != nil { r.Log.Error(reply.Err.Error()) return reply.Err } if reply.Type == redis.ReplyNil { return nil } was, err := reply.Str() if err != nil { r.Log.Error(err.Error()) return err } reply = r.Repo.client.Hdel("urls_to_ids", address) // report the repo call to instrumentation if reply.Err != nil { r.Log.Error(err.Error()) return reply.Err } r.Audit("urls_to_ids", address, was, "") // report repo calls to instrumentation // stop instrumentation return nil }
func sanitizeURLWithFlags(in string, f purell.NormalizationFlags) string { s, err := purell.NormalizeURLString(in, f) if err != nil { return in } // Temporary workaround for the bug fix and resulting // behavioral change in purell.NormalizeURLString(): // a leading '/' was inadvertently added to relative links, // but no longer, see #878. // // I think the real solution is to allow Hugo to // make relative URL with relative path, // e.g. "../../post/hello-again/", as wished by users // in issues #157, #622, etc., without forcing // relative URLs to begin with '/'. // Once the fixes are in, let's remove this kludge // and restore SanitizeURL() to the way it was. // -- @anthonyfok, 2015-02-16 // // Begin temporary kludge u, err := url.Parse(s) if err != nil { panic(err) } if len(u.Path) > 0 && !strings.HasPrefix(u.Path, "/") { u.Path = "/" + u.Path } return u.String() // End temporary kludge //return s }
func Save(u string) (content *Content, err error) { url, err := purell.NormalizeURLString(u, purell.FlagsSafe|purell.FlagAddTrailingSlash) md5sum := Md5sum(url) coll, err := sess.Collection("urls") // checkError(err) err = coll.Find(db.Cond{"md5": md5sum}).One(&content) if err == db.ErrNoMoreRows { // not found, create one content = &Content{Url: url, Md5: md5sum} } else { checkError(err) } log.Printf("fetching %s...", url) resp, _ := http.Get(url) bytes, _ := ioutil.ReadAll(resp.Body) content.Header = resp.Header content.Content = string(bytes) log.Printf("saving %s...", url) if content.Id == "" { content.Id = bson.NewObjectIdWithTime(time.Now()) } _, err = coll.Append(content) return }
func (r *RequestBundle) getIDFromAddress(address string) (uint64, error) { // start instrumentation var err error address, err = purell.NormalizeURLString(address, purell.FlagsSafe) if err != nil { r.Log.Error(err.Error()) return uint64(0), err } reply := r.Repo.client.Hget("urls_to_ids", address) // report repo call to instrumentation if reply.Err != nil { r.Log.Error(reply.Err.Error()) return uint64(0), reply.Err } idstr, err := reply.Str() if err != nil { r.Log.Error(err.Error()) return uint64(0), err } id, err := strconv.ParseUint(idstr, 10, 64) if err != nil { r.Log.Error(err.Error()) return uint64(0), err } return id, nil // stop instrumentation }
func SanitizeUrl(in string) string { url, err := purell.NormalizeURLString(in, purell.FlagsUsuallySafeGreedy|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveUnnecessaryHostDots|purell.FlagRemoveEmptyPortSeparator) if err != nil { return in } return url }
func ExampleNormalizeURLString() { if normalized, err := purell.NormalizeURLString("hTTp://someWEBsite.com:80/Amazing%41%3f/url/", purell.FlagsAllGreedy); err != nil { panic(err) } else { fmt.Print(normalized) } //Output: http://somewebsite.com:80/Amazing%3F/url/ }
// buildURL constructs an URL to make a call to the Nexus API func (c *Client) buildURL(path string) string { endpoint, err := purell.NormalizeURLString( c.endpoint+path, purell.FlagLowercaseScheme|purell.FlagLowercaseScheme|purell.FlagLowercaseHost|purell.FlagRemoveDuplicateSlashes, ) if err != nil { panic(err) } return endpoint }
func UrlPrep(ugly bool, in string) string { if ugly { x := Uglify(SanitizeUrl(in)) return x } else { x := PrettifyUrl(SanitizeUrl(in)) url, err := purell.NormalizeURLString(x, purell.FlagAddTrailingSlash) if err != nil { return in } return url } }
func URLPrep(ugly bool, in string) string { if ugly { x := Uglify(SanitizeURL(in)) return x } x := PrettifyURL(SanitizeURL(in)) if path.Ext(x) == ".xml" { return x } url, err := purell.NormalizeURLString(x, purell.FlagAddTrailingSlash) if err != nil { fmt.Printf("ERROR returned by NormalizeURLString. Returning in = %q\n", in) return in } return url }
func (r *RequestBundle) reserveAddress(address string, id uint64) (bool, error) { // start instrumentation var err error address, err = purell.NormalizeURLString(address, purell.FlagsSafe) if err != nil { r.Log.Error(err.Error()) return false, err } reply := r.Repo.client.Hsetnx("urls_to_ids", address, id) // report repo call to instrumentation if reply.Err != nil { r.Log.Error(reply.Err.Error()) return false, reply.Err } r.Audit("urls_to_ids", address, "", strconv.FormatUint(id, 10)) // report repo calls to instrumentation // stop instrumentation return reply.Bool() }
// Parse the seeds URL strings to URL objects, and return the URL objects slice, // along with the count of distinct hosts. func (this *Crawler) parseSeeds(seeds []string) ([]*url.URL, int) { // Translate seeds strings to URLs, normalized right away (to allow host count) hosts := make([]string, 0, len(seeds)) parsedSeeds := make([]*url.URL, 0, len(seeds)) for _, s := range seeds { if u, e := purell.NormalizeURLString(s, this.Options.URLNormalizationFlags); e != nil { this.logFunc(LogError, "ERROR parsing seed %s\n", s) } else { if parsed, e := url.Parse(u); e != nil { this.logFunc(LogError, "ERROR parsing normalized seed %s\n", u) } else { parsedSeeds = append(parsedSeeds, parsed) if indexInStrings(hosts, parsed.Host) == -1 { hosts = append(hosts, parsed.Host) } } } } return parsedSeeds, len(hosts) }
// Convert URLs to a standard form for comparison. func normalizeURL(url string) string { if n, err := purell.NormalizeURLString(url, purell.FlagsUsuallySafe); err != nil { return n } return url }