Ejemplo n.º 1
0
// SanitizeUrl returns in normalized by purell using the FlagsSafe set combined
// with removal of the trailing slash, dot segments, duplicate slashes,
// unnecessary host dots and an empty port separator.
// If normalization fails, in is returned unchanged.
func SanitizeUrl(in string) string {
	flags := purell.FlagsSafe |
		purell.FlagRemoveTrailingSlash |
		purell.FlagRemoveDotSegments |
		purell.FlagRemoveDuplicateSlashes |
		purell.FlagRemoveUnnecessaryHostDots |
		purell.FlagRemoveEmptyPortSeparator

	if normalized, err := purell.NormalizeURLString(in, flags); err == nil {
		return normalized
	}
	// Best effort: fall back to the caller's input when it cannot be normalized.
	return in
}
Ejemplo n.º 2
0
// releaseAddress deletes the entry for address from the "urls_to_ids" hash.
//
// The address is normalized with purell.FlagsSafe before it is used as the
// hash field. If no entry exists for the normalized address the call is a
// no-op and nil is returned. On success the removal is recorded through
// r.Audit with the previous value. Every failure is logged via r.Log and
// returned to the caller.
func (r *RequestBundle) releaseAddress(address string) error {
	// start instrumentation
	var err error
	address, err = purell.NormalizeURLString(address, purell.FlagsSafe)
	if err != nil {
		r.Log.Error(err.Error())
		return err
	}
	reply := r.Repo.client.Hget("urls_to_ids", address)
	// report the repo call to instrumentation
	if reply.Err != nil {
		r.Log.Error(reply.Err.Error())
		return reply.Err
	}
	if reply.Type == redis.ReplyNil {
		// Nothing is stored for this address, so there is nothing to release.
		return nil
	}
	// Remember the current value so the audit entry can record what was removed.
	was, err := reply.Str()
	if err != nil {
		r.Log.Error(err.Error())
		return err
	}
	reply = r.Repo.client.Hdel("urls_to_ids", address)
	// report the repo call to instrumentation
	if reply.Err != nil {
		// Log the Hdel error itself: err is nil at this point (reply.Str
		// succeeded), so calling err.Error() here would panic.
		r.Log.Error(reply.Err.Error())
		return reply.Err
	}
	r.Audit("urls_to_ids", address, was, "")
	// report repo calls to instrumentation
	// stop instrumentation
	return nil
}
Ejemplo n.º 3
0
// sanitizeURLWithFlags normalizes in with purell using the flags f and then
// makes sure a non-empty path begins with '/'.
// If purell cannot normalize the input, in is returned unchanged. It panics
// if the normalized string cannot be parsed by url.Parse.
func sanitizeURLWithFlags(in string, f purell.NormalizationFlags) string {
	normalized, normErr := purell.NormalizeURLString(in, f)
	if normErr != nil {
		return in
	}

	// Temporary kludge (@anthonyfok, 2015-02-16):
	// purell.NormalizeURLString() used to add a leading '/' to relative
	// links by accident; that bug was fixed upstream (see #878), which
	// changed the behavior here. The leading '/' is re-added below to keep
	// the old output.
	//
	// The proper solution is probably to let Hugo produce relative URLs
	// with relative paths, e.g. "../../post/hello-again/", as requested in
	// issues #157, #622, etc., without forcing them to begin with '/'.
	// Once that is in place, drop this kludge and simply return the
	// normalized string.
	parsed, parseErr := url.Parse(normalized)
	if parseErr != nil {
		panic(parseErr)
	}
	if parsed.Path != "" && parsed.Path[0] != '/' {
		parsed.Path = "/" + parsed.Path
	}
	return parsed.String()
}
Ejemplo n.º 4
0
// Save fetches the page at u and stores its response headers and body in the
// "urls" collection.
//
// The URL is first normalized with purell (FlagsSafe plus a trailing slash)
// and the collection is searched for a record whose "md5" equals
// Md5sum(normalized URL); when none is found a new Content is created.
// The record is then filled from an HTTP GET of the normalized URL and
// appended to the collection.
//
// It returns the Content together with any error from the final append.
// Normalization, collection lookup, HTTP and body-read failures return a nil
// Content and the error.
func Save(u string) (content *Content, err error) {
	normalized, err := purell.NormalizeURLString(u, purell.FlagsSafe|purell.FlagAddTrailingSlash)
	if err != nil {
		return nil, err
	}
	md5sum := Md5sum(normalized)

	coll, err := sess.Collection("urls")
	if err != nil {
		return nil, err
	}

	err = coll.Find(db.Cond{"md5": md5sum}).One(&content)
	if err == db.ErrNoMoreRows {
		// not found, create one
		content = &Content{Url: normalized, Md5: md5sum}
	} else {
		checkError(err)
		if err != nil {
			// checkError returned; without a record there is nothing to
			// fill in, so report the lookup failure instead of
			// dereferencing a nil Content below.
			return nil, err
		}
	}

	log.Printf("fetching %s...", normalized)
	resp, err := http.Get(normalized)
	if err != nil {
		return nil, err
	}
	// The body must always be closed so the underlying connection is released.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	content.Header = resp.Header
	content.Content = string(body)

	log.Printf("saving %s...", normalized)
	if content.Id == "" {
		content.Id = bson.NewObjectIdWithTime(time.Now())
	}
	// NOTE(review): Append is used even when an existing record was found
	// above — confirm this is the intended way to persist an update.
	_, err = coll.Append(content)

	return content, err
}
Ejemplo n.º 5
0
// getIDFromAddress looks up the numeric id stored for address in the
// "urls_to_ids" hash.
//
// The address is normalized with purell.FlagsSafe before the lookup and the
// stored value is parsed as a base-10 unsigned 64-bit integer. Every failure
// is logged via r.Log and returned together with a zero id.
func (r *RequestBundle) getIDFromAddress(address string) (uint64, error) {
	// start instrumentation

	// fail logs e and produces the (0, e) result shared by all error paths.
	fail := func(e error) (uint64, error) {
		r.Log.Error(e.Error())
		return 0, e
	}

	normalized, err := purell.NormalizeURLString(address, purell.FlagsSafe)
	if err != nil {
		return fail(err)
	}

	reply := r.Repo.client.Hget("urls_to_ids", normalized)
	// report repo call to instrumentation
	if reply.Err != nil {
		return fail(reply.Err)
	}

	raw, err := reply.Str()
	if err != nil {
		return fail(err)
	}

	id, err := strconv.ParseUint(raw, 10, 64)
	if err != nil {
		return fail(err)
	}

	// stop instrumentation
	return id, nil
}
Ejemplo n.º 6
0
// SanitizeUrl returns in normalized by purell using the
// FlagsUsuallySafeGreedy set combined with removal of duplicate slashes,
// unnecessary host dots and an empty port separator.
// If normalization fails, in is returned unchanged.
func SanitizeUrl(in string) string {
	flags := purell.FlagsUsuallySafeGreedy |
		purell.FlagRemoveDuplicateSlashes |
		purell.FlagRemoveUnnecessaryHostDots |
		purell.FlagRemoveEmptyPortSeparator

	if normalized, err := purell.NormalizeURLString(in, flags); err == nil {
		return normalized
	}
	// Best effort: fall back to the caller's input when it cannot be normalized.
	return in
}
Ejemplo n.º 7
0
// ExampleNormalizeURLString normalizes a sample URL string with
// purell.FlagsAllGreedy and prints the result; it panics if normalization
// fails.
func ExampleNormalizeURLString() {
	const raw = "hTTp://someWEBsite.com:80/Amazing%41%3f/url/"

	normalized, err := purell.NormalizeURLString(raw, purell.FlagsAllGreedy)
	if err != nil {
		panic(err)
	}
	fmt.Print(normalized)
	//Output: http://somewebsite.com:80/Amazing%3F/url/
}
Ejemplo n.º 8
0
// buildURL constructs a URL to make a call to the Nexus API by appending path
// to the client's endpoint and normalizing the result: the scheme and host
// are lowercased and duplicate slashes are removed.
//
// It panics if the combined string cannot be normalized.
func (c *Client) buildURL(path string) string {
	// FlagLowercaseScheme used to be OR-ed in twice; the duplicate operand was
	// redundant and dropping it leaves the flag value unchanged.
	endpoint, err := purell.NormalizeURLString(
		c.endpoint+path,
		purell.FlagLowercaseScheme|purell.FlagLowercaseHost|purell.FlagRemoveDuplicateSlashes,
	)

	if err != nil {
		panic(err)
	}

	return endpoint
}
Ejemplo n.º 9
0
func UrlPrep(ugly bool, in string) string {
	if ugly {
		x := Uglify(SanitizeUrl(in))
		return x
	} else {
		x := PrettifyUrl(SanitizeUrl(in))
		url, err := purell.NormalizeURLString(x, purell.FlagAddTrailingSlash)
		if err != nil {
			return in
		}
		return url
	}
}
Ejemplo n.º 10
0
func URLPrep(ugly bool, in string) string {
	if ugly {
		x := Uglify(SanitizeURL(in))
		return x
	}
	x := PrettifyURL(SanitizeURL(in))
	if path.Ext(x) == ".xml" {
		return x
	}
	url, err := purell.NormalizeURLString(x, purell.FlagAddTrailingSlash)
	if err != nil {
		fmt.Printf("ERROR returned by NormalizeURLString. Returning in = %q\n", in)
		return in
	}
	return url
}
Ejemplo n.º 11
0
// reserveAddress tries to map address to id in the "urls_to_ids" hash using
// Hsetnx.
//
// The address is normalized with purell.FlagsSafe before it is used as the
// hash field. Normalization and repository errors are logged via r.Log and
// returned with false. Otherwise an audit entry is written and the boolean
// (and error) decoded from the Hsetnx reply is returned.
func (r *RequestBundle) reserveAddress(address string, id uint64) (bool, error) {
	// start instrumentation
	normalized, err := purell.NormalizeURLString(address, purell.FlagsSafe)
	if err != nil {
		r.Log.Error(err.Error())
		return false, err
	}

	reply := r.Repo.client.Hsetnx("urls_to_ids", normalized, id)
	// report repo call to instrumentation
	if hsetErr := reply.Err; hsetErr != nil {
		r.Log.Error(hsetErr.Error())
		return false, hsetErr
	}

	// NOTE(review): the audit entry is written before the reply is decoded,
	// so it is recorded whatever reply.Bool() reports — confirm this is
	// intended.
	newValue := strconv.FormatUint(id, 10)
	r.Audit("urls_to_ids", normalized, "", newValue)
	// report repo calls to instrumentation
	// stop instrumentation
	return reply.Bool()
}
Ejemplo n.º 12
0
// parseSeeds normalizes each seed string with the crawler's configured
// normalization flags and parses it into a URL object.
//
// It returns the successfully parsed URLs, in input order, together with the
// number of distinct hosts among them. Seeds that cannot be normalized or
// parsed are reported through logFunc at LogError level and skipped.
func (this *Crawler) parseSeeds(seeds []string) ([]*url.URL, int) {
	// Normalizing up front lets equivalent hosts be counted only once.
	var (
		distinctHosts = make([]string, 0, len(seeds))
		result        = make([]*url.URL, 0, len(seeds))
	)

	for _, seed := range seeds {
		normalized, err := purell.NormalizeURLString(seed, this.Options.URLNormalizationFlags)
		if err != nil {
			this.logFunc(LogError, "ERROR parsing seed %s\n", seed)
			continue
		}

		parsed, err := url.Parse(normalized)
		if err != nil {
			this.logFunc(LogError, "ERROR parsing normalized seed %s\n", normalized)
			continue
		}

		result = append(result, parsed)
		if indexInStrings(distinctHosts, parsed.Host) == -1 {
			distinctHosts = append(distinctHosts, parsed.Host)
		}
	}

	return result, len(distinctHosts)
}
Ejemplo n.º 13
0
// normalizeURL converts url to a standard form for comparison using purell's
// FlagsUsuallySafe set.
//
// If the URL cannot be normalized, the input is returned unchanged so callers
// still have a value to compare.
func normalizeURL(url string) string {
	n, err := purell.NormalizeURLString(url, purell.FlagsUsuallySafe)
	if err != nil {
		// The previous condition was inverted: it returned n only on failure
		// and the raw input on success, so nothing was ever normalized.
		return url
	}
	return n
}