Example #1
// "Constructor", parses the given string JSON reference
func (r *Ref) parse(jsonReferenceString string) error {

	parsed, err := url.Parse(jsonReferenceString)
	if err != nil {
		return err
	}

	r.referenceURL, _ = url.Parse(purell.NormalizeURL(parsed, purell.FlagsSafe|purell.FlagRemoveDuplicateSlashes))
	refURL := r.referenceURL

	if refURL.Scheme != "" && refURL.Host != "" {
		r.HasFullURL = true
	} else {
		if refURL.Path != "" {
			r.HasURLPathOnly = true
		} else if refURL.RawQuery == "" && refURL.Fragment != "" {
			r.HasFragmentOnly = true
		}
	}

	r.HasFileScheme = refURL.Scheme == "file"
	r.HasFullFilePath = strings.HasPrefix(refURL.Path, "/")

	// an invalid JSON pointer error means the URL has no JSON pointer fragment; simply ignore it
	r.referencePointer, _ = jsonpointer.New(refURL.Fragment)

	return nil
}
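As a minimal standalone sketch of what this flag set does (not part of the original package), normalizing a mixed-case reference with FlagsSafe|FlagRemoveDuplicateSlashes lowercases the scheme and host and collapses repeated slashes, while leaving the JSON pointer fragment intact:

package main

import (
	"fmt"

	"github.com/PuerkitoBio/purell"
)

func main() {
	// same flags as parse above; NormalizeURLString parses and normalizes in one call
	s, err := purell.NormalizeURLString(
		"HTTP://Example.com//definitions//Pet#/properties/name",
		purell.FlagsSafe|purell.FlagRemoveDuplicateSlashes,
	)
	if err != nil {
		panic(err)
	}
	fmt.Println(s) // expected: http://example.com/definitions/Pet#/properties/name
}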
Example #2
// NewACIArchive creates a new aci-archive distribution from the provided distribution URI.
func NewACIArchive(u *url.URL) (Distribution, error) {
	c, err := parseCIMD(u)
	if err != nil {
		return nil, fmt.Errorf("cannot parse URI: %q: %v", u.String(), err)
	}
	if c.Type != TypeACIArchive {
		return nil, fmt.Errorf("illegal ACI archive distribution type: %q", c.Type)
	}

	// This should be a valid URL
	data, err := url.QueryUnescape(c.Data)
	if err != nil {
		return nil, errwrap.Wrap(fmt.Errorf("error unescaping url %q", c.Data), err)
	}
	aciu, err := url.Parse(data)
	if err != nil {
		return nil, errwrap.Wrap(fmt.Errorf("error parsing url %q", c.Data), err)
	}

	// normalize the URI with a sorted query so it is ready for comparison
	purell.NormalizeURL(u, purell.FlagSortQuery)

	str := u.String()
	if path := aciu.String(); filepath.Ext(path) == schema.ACIExtension {
		str = path
	}

	return &ACIArchive{
		cimdURL:      u,
		transportURL: aciu,
		str:          str,
	}, nil
}
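A hedged usage sketch in the same package; the CIMD URI layout cimd:aci-archive:v=0:<query-escaped transport URL> is an assumption based on rkt's distribution scheme, and parseCIMD is not shown here:

import (
	"log"
	"net/url"
)

func exampleACIArchive() {
	// assumed CIMD form: the data part is the query-escaped file:// URL of the archive
	u, err := url.Parse("cimd:aci-archive:v=0:file%3A%2F%2F%2Fvar%2Flib%2Fimages%2Fetcd.aci")
	if err != nil {
		log.Fatal(err)
	}
	d, err := NewACIArchive(u)
	if err != nil {
		log.Fatal(err)
	}
	_ = d // the comparison string ends up being the transport URL, since it has the .aci extension
}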
Example #3
// cloneForRedirect returns a new URLContext for the given destination URL,
// keeping the same sourceURL and normalizedSourceURL.
func (uc *URLContext) cloneForRedirect(dst *url.URL, normFlags purell.NormalizationFlags) *URLContext {
	var src, normalizedSrc *url.URL
	if uc.sourceURL != nil {
		src = &url.URL{}
		*src = *uc.sourceURL
	}
	if src == nil && uc.url != nil {
		// if the current context doesn't have a source URL, use its URL as
		// source (e.g. for a seed URL that triggers a redirect)
		src = &url.URL{}
		*src = *uc.url
	}

	if uc.normalizedSourceURL != nil {
		normalizedSrc = &url.URL{}
		*normalizedSrc = *uc.normalizedSourceURL
	}
	if normalizedSrc == nil {
		normalizedSrc = &url.URL{}
		*normalizedSrc = *uc.normalizedURL
	}

	rawDst := &url.URL{}
	*rawDst = *dst
	purell.NormalizeURL(dst, normFlags)
	return &URLContext{
		HeadBeforeGet:       uc.HeadBeforeGet,
		State:               uc.State,
		url:                 rawDst,
		normalizedURL:       dst,
		sourceURL:           src,
		normalizedSourceURL: normalizedSrc,
	}
}
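Note that purell.NormalizeURL mutates the *url.URL it is given and returns the normalized string, which is why cloneForRedirect copies dst into rawDst before normalizing. A minimal sketch of that behavior, assuming only the standard library and purell:

package main

import (
	"fmt"
	"net/url"

	"github.com/PuerkitoBio/purell"
)

func main() {
	u, _ := url.Parse("http://Example.com/a/../b")
	raw := *u // copy first, as cloneForRedirect does with rawDst
	_ = purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments)
	fmt.Println(raw.String()) // http://Example.com/a/../b (untouched copy)
	fmt.Println(u.String())   // http://example.com/b (normalized in place)
}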
Example #4
// grabURLs looks for rel-canonical, og:url and rel-shortlink URLs.
// It returns the canonical URL (or "") and a list of all URLs (including baseURL).
func grabURLs(root *html.Node, baseURL *url.URL) (string, []string) {

	dbug := Debug.URLLogger

	canonical := ""
	all := make(map[string]struct{})

	// start with base URL
	u := purell.NormalizeURL(baseURL, purell.FlagsSafe)
	if u != "" {
		all[u] = struct{}{}
	}

	// look for canonical urls first
	for _, link := range urlSels.ogUrl.MatchAll(root) {
		txt := getAttr(link, "content")
		u, err := sanitiseURL(txt, baseURL)
		if err != nil {
			dbug.Printf("Reject og:url %s (%s)\n", txt, err)
			continue
		}

		dbug.Printf("Accept og:url %s\n", u)
		all[u] = struct{}{}
		canonical = u
	}
	for _, link := range urlSels.relCanonical.MatchAll(root) {
		txt := getAttr(link, "href")
		u, err := sanitiseURL(txt, baseURL)
		if err != nil {
			dbug.Printf("Reject rel-canonical %s (%s)\n", txt, err)
			continue
		}

		dbug.Printf("Accept rel-canonical %s\n", u)
		all[u] = struct{}{}
		canonical = u
	}

	// look for other (non-canonical) urls
	for _, link := range urlSels.relShortlink.MatchAll(root) {
		txt := getAttr(link, "href")
		u, err := sanitiseURL(txt, baseURL)
		if err != nil {
			dbug.Printf("Reject rel-shortlink %s (%s)\n", txt, err)
			continue
		}
		dbug.Printf("Accept rel-shortlink %s\n", u)
		all[u] = struct{}{}
	}

	// build up list of alternates
	allList := make([]string, 0, 8)
	for u := range all {
		allList = append(allList, u)
	}

	return canonical, allList
}
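A hedged usage sketch, assuming it lives in the same package as grabURLs (urlSels and getAttr are package helpers not shown here):

import (
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

func exampleGrabURLs(pageHTML string) error {
	base, err := url.Parse("https://news.example.com/2020/01/story.html")
	if err != nil {
		return err
	}
	root, err := html.Parse(strings.NewReader(pageHTML))
	if err != nil {
		return err
	}
	canonical, all := grabURLs(root, base)
	_ = canonical // "" when neither og:url nor rel-canonical matched
	_ = all       // always contains at least the normalized base URL
	return nil
}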
Example #5
// Handler function for creating a new short URL.
func create(w http.ResponseWriter, r *http.Request) {
	// We only allow POST requests, because URLs can be far too long for GET
	// requests. For anything that's not POST, we error out.
	if r.Method != "POST" {
		http.Error(w, "Method Not Supported", http.StatusMethodNotAllowed)
		return
	}

	c := appengine.NewContext(r)

	originalURLString := r.PostFormValue("OriginalUrl")
	callback := r.PostFormValue("callback")

	if "" == callback {
		w.Header().Set("Content-Type", "application/json; charset=UTF-8")
	} else {
		w.Header().Set("Content-Type", "application/javascript; charset=UTF-8")
	}

	// The sequence of checks to ensure that the original URL makes sense.
	// 1. If it is given.
	if originalURLString == "" {
		w.Write(output(c, StatusFailure, "Missing URL to be shortened", callback))
		return
	}

	// 2. If the URL parses properly
	originalURL, err := url.Parse(originalURLString)
	if err != nil {
		w.Write(output(c, StatusFailure, "URL cannot be parsed", callback))
		return
	}

	// 3. If the URL is absolute
	if !originalURL.IsAbs() {
		w.Write(output(c, StatusFailure, "URL is not absolute", callback))
		return
	}

	// 4. If the URL's domain is already the same as the current Yordle
	// instance, we just return the exact same URL.
	if originalURL.Host == r.Host {
		w.Write(output(c, StatusSuccess, originalURLString, callback))
		return
	}

	// Now we normalize the URL so that we don't have to store duplicates
	originalURLString = purell.NormalizeURL(originalURL, purell.FlagsSafe)

	id, err := shorturl.Persist(originalURLString, c)
	if err != nil {
		c.Errorf("Error persisting %s. Message: %s", originalURLString, err.Error())
		w.Write(output(c, StatusFailure, "Error Persisting URL", callback))
		return
	}

	c.Infof("Successfully created short url for '%s', ID: %d", originalURL, id)
	w.Write(output(c, StatusSuccess, fmt.Sprintf("http://%s/%s", r.Host, base62.Encode(id)), callback))
}
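A hedged client-side sketch; the /create route is a hypothetical assumption, and only the OriginalUrl form field name is taken from the handler above:

package main

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
)

func main() {
	// POST the long URL to the (assumed) shortener endpoint
	resp, err := http.PostForm("http://localhost:8080/create", url.Values{
		"OriginalUrl": {"https://example.com/some/long/article/path?x=1"},
	})
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	io.Copy(os.Stdout, resp.Body) // JSON payload with the shortened URL on success
}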
Example #6
func (this *Crawler) urlToURLContext(u, src *url.URL) *URLContext {
	var rawSrc *url.URL

	rawU := *u
	purell.NormalizeURL(u, this.Options.URLNormalizationFlags)
	if src != nil {
		rawSrc = &url.URL{}
		*rawSrc = *src
		purell.NormalizeURL(src, this.Options.URLNormalizationFlags)
	}

	return &URLContext{
		this.Options.HeadBeforeGet,
		nil,
		&rawU,
		u,
		rawSrc,
		src,
	}
}
Example #7
// This function has been taken from https://github.com/aybabtme/crawler/blob/master/util.go
func cleanFromURLString(from *url.URL, link string) (*url.URL, error) {
	u, err := url.Parse(link)
	if err != nil {
		return nil, err
	}
	if u.Host == "" {
		u.Scheme = from.Scheme
		u.Host = from.Host
	}
	uStr := purell.NormalizeURL(u, purell.FlagsUsuallySafeGreedy)

	return from.Parse(uStr)
}
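A usage sketch in the same package, under the assumption that purell.FlagsUsuallySafeGreedy removes dot segments and trailing slashes:

import (
	"fmt"
	"net/url"
)

func exampleClean() error {
	from, err := url.Parse("https://example.com/blog/")
	if err != nil {
		return err
	}
	// host-less link inherits scheme and host from the source page
	clean, err := cleanFromURLString(from, "/posts/./42/")
	if err != nil {
		return err
	}
	fmt.Println(clean) // expected: https://example.com/posts/42
	return nil
}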
Example #8
func sanitiseURL(link string, baseURL *url.URL) (string, error) {
	u, err := baseURL.Parse(link)
	if err != nil {
		return "", err
	}

	// we're only interested in articles, so reject obviously-not-article urls
	if (u.Path == "/" || u.Path == "") && len(u.RawQuery) == 0 {
		return "", fmt.Errorf("obviously not article")
	}

	return purell.NormalizeURL(u, purell.FlagsSafe), nil
}
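A usage sketch in the same package (the host and paths are arbitrary):

import (
	"fmt"
	"net/url"
)

func exampleSanitise() {
	base, _ := url.Parse("https://news.example.com/2020/01/story.html")

	// relative links are resolved against the base URL, then normalized
	u, err := sanitiseURL("other-story.html?page=2", base)
	fmt.Println(u, err) // https://news.example.com/2020/01/other-story.html?page=2 <nil>

	// bare roots with no query are rejected
	_, err = sanitiseURL("/", base)
	fmt.Println(err) // obviously not article
}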
Example #9
// NewAppcFromApp returns an Appc distribution from an appc App discovery string
func NewAppcFromApp(app *discovery.App) Distribution {
	rawuri := NewCIMDString(TypeAppc, distAppcVersion, url.QueryEscape(app.Name.String()))

	var version string
	labels := types.Labels{}
	for n, v := range app.Labels {
		if n == "version" {
			version = v
		}

		labels = append(labels, types.Label{Name: n, Value: v})
	}

	if len(labels) > 0 {
		queries := make([]string, len(labels))
		rawuri += "?"
		for i, l := range labels {
			queries[i] = fmt.Sprintf("%s=%s", l.Name, url.QueryEscape(l.Value))
		}
		rawuri += strings.Join(queries, "&")
	}

	u, err := url.Parse(rawuri)
	if err != nil {
		panic(fmt.Errorf("cannot parse URI %q: %v", rawuri, err))
	}

	// normalize the URI with a sorted query so it is ready for comparison
	purell.NormalizeURL(u, purell.FlagSortQuery)

	str := app.Name.String()
	if version != "" {
		str += fmt.Sprintf(":%s", version)
	}

	labelsort.By(labelsort.RankedName).Sort(labels)
	for _, l := range labels {
		if l.Name != "version" {
			str += fmt.Sprintf(",%s=%s", l.Name, l.Value)
		}
	}

	return &Appc{
		cimd: u,
		app:  app.Copy(),
		str:  str,
	}
}
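One design note, not part of the original code: the standard library's url.Values.Encode already returns the query sorted by key, so a sketch like the following would make the later FlagSortQuery pass redundant for the label part (Encode also escapes keys, unlike the manual loop above):

import (
	"fmt"
	"net/url"
)

func exampleSortedQuery() {
	v := url.Values{}
	v.Set("os", "linux")
	v.Set("arch", "amd64")
	fmt.Println(v.Encode()) // arch=amd64&os=linux (keys come out sorted)
}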
Example #10
func normalizeURL(r *http.Request) *http.Request {
	normalized := purell.NormalizeURL(r.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagSortQuery)
	newurl, err := url.Parse(normalized)
	if err != nil {
		debug("Error parsing normalized URL", normalized)
		return nil
	}
	//	if r.URL.String() != newurl.String() {
	//		debug("normalize:", r.URL.String(), "to", newurl.String())
	//	}
	req, err := http.NewRequest("GET", newurl.String(), nil)
	if err != nil {
		return nil
	}
	return req
}
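A test-style sketch of the function above, assuming it sits in a package with net/http/httptest available (the URL is arbitrary):

import (
	"fmt"
	"net/http/httptest"
)

func exampleNormalizeRequest() {
	// build an incoming request with a messy absolute URL
	r := httptest.NewRequest("GET", "http://Example.com//a/../b?z=1&a=2", nil)
	if req := normalizeURL(r); req != nil {
		fmt.Println(req.URL.String()) // expected: http://example.com/b?a=2&z=1
	}
}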
Example #11
// Normalize returns normalized URL string.
// Behavior:
// 1. Remove unnecessary host dots.
// 2. Remove default port (http://localhost:80 becomes http://localhost).
// 3. Remove duplicate slashes.
// 4. Remove unnecessary dots from path.
// 5. Sort query parameters.
// 6. Decode host IP into decimal numbers.
// 7. Handle escape values.
// 8. Decode Punycode domains into UTF8 representation.
func Normalize(u *url.URL) (string, error) {
	host, port, err := SplitHostPort(u)
	if err != nil {
		return "", err
	}
	if err := checkHost(host); err != nil {
		return "", err
	}

	// Decode Punycode.
	host, err = idna.ToUnicode(host)
	if err != nil {
		return "", err
	}

	u.Host = strings.ToLower(host)
	if port != "" {
		u.Host += ":" + port
	}
	u.Scheme = strings.ToLower(u.Scheme)

	return purell.NormalizeURL(u, normalizeFlags), nil
}
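Normalize depends on helpers not shown here (SplitHostPort, checkHost, normalizeFlags), so it cannot run standalone; a minimal sketch of just its Punycode step:

package main

import (
	"fmt"

	"golang.org/x/net/idna"
)

func main() {
	// decode an IDNA/Punycode host into its UTF-8 representation
	host, err := idna.ToUnicode("xn--bcher-kva.example")
	if err != nil {
		panic(err)
	}
	fmt.Println(host) // bücher.example
}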
Example #12
// NormalizeURL normalizes the given URL.
func NormalizeURL(u *url.URL) string {
	removeUtmFromQuery(u)
	return purell.NormalizeURL(u, defaultNormalizationFlags)
}
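removeUtmFromQuery is defined elsewhere in the package; a hypothetical sketch of what such a helper might look like (the utm_ prefix rule is an assumption):

import (
	"net/url"
	"strings"
)

// hypothetical: strip analytics-style tracking parameters before normalizing
func removeUtmFromQuery(u *url.URL) {
	q := u.Query()
	for k := range q {
		if strings.HasPrefix(k, "utm_") {
			q.Del(k)
		}
	}
	u.RawQuery = q.Encode() // note: Encode re-sorts the remaining keys
}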
Example #13
// Enqueue the URLs returned from the worker, as long as they comply with the
// selection policies.
func (this *Crawler) enqueueUrls(res *workerResponse) (cnt int) {
	for _, u := range res.harvestedUrls {
		var isVisited, enqueue, head bool
		var hr HeadRequestMode

		// Normalize URL
		purell.NormalizeURL(u, DefaultNormalizationFlags)
		_, isVisited = this.visited[u.String()]

		// Filter the URL - TODO : Priority is ignored at the moment
		if enqueue, _, hr = this.Options.Extender.Filter(u, res.sourceUrl, isVisited); !enqueue {
			// Filter said NOT to use this url, so continue with next
			this.logFunc(LogIgnored, "ignore on filter policy: %s", u.String())
			continue
		}

		// Even if filter said to use the URL, it still MUST be absolute, http(s)-prefixed,
		// and comply with the same host policy if requested.
		if !u.IsAbs() {
			// Only absolute URLs are processed, so ignore
			this.logFunc(LogIgnored, "ignore on absolute policy: %s", u.String())

		} else if !strings.HasPrefix(u.Scheme, "http") {
			this.logFunc(LogIgnored, "ignore on scheme policy: %s", u.String())

		} else if res.sourceUrl != nil && u.Host != res.sourceUrl.Host && this.Options.SameHostOnly {
			// Only allow URLs coming from the same host
			this.logFunc(LogIgnored, "ignore on same host policy: %s", u.String())

		} else {
			// All is good, visit this URL (robots.txt verification is done by worker)

			// Launch worker if required
			w, ok := this.workers[u.Host]
			if !ok {
				// No worker exists for this host, launch a new one
				w = this.launchWorker(u)
				// Automatically enqueue the robots.txt URL as first in line
				if robUrl, e := getRobotsTxtUrl(u); e != nil {
					this.Options.Extender.Error(newCrawlError(e, CekParseRobots, u))
					this.logFunc(LogError, "ERROR parsing robots.txt from %s: %s", u.String(), e.Error())
				} else {
					this.logFunc(LogEnqueued, "enqueue: %s", robUrl.String())
					this.Options.Extender.Enqueued(robUrl, res.sourceUrl)
					w.pop.stack(&workerCommand{robUrl, false})
				}
			}

			cnt++
			this.logFunc(LogEnqueued, "enqueue: %s", u.String())
			this.Options.Extender.Enqueued(u, res.sourceUrl)
			switch hr {
			case HrmIgnore:
				head = false
			case HrmRequest:
				head = true
			default:
				head = this.Options.HeadBeforeGet
			}
			w.pop.stack(&workerCommand{u, head})
			this.pushPopRefCount++

			// Once it is stacked, it WILL be visited eventually, so add it to the
			// visited map (unless denied by robots.txt, but that is out of our hands;
			// for all we care, it is visited).
			if !isVisited {
				this.visited[u.String()] = '0'
			}
		}
	}
	return
}
Example #14
// Enqueue the URLs returned from the worker, as long as they comply with the
// selection policies.
func (this *Crawler) enqueueUrls(res *workerResponse) (cnt int) {
	for _, u := range res.harvestedUrls {
		var isVisited, forceEnqueue bool

		// Normalize URL
		purell.NormalizeURL(u, DefaultNormalizationFlags)
		_, isVisited = this.visited[u.String()]

		// If a selector callback is specified, use this to filter URL
		if this.Options.URLSelector != nil {
			if forceEnqueue = this.Options.URLSelector(u, res.sourceUrl, isVisited); !forceEnqueue {
				// Custom selector said NOT to use this url, so continue with next
				this.logFunc(LogIgnored, "ignore on custom selector policy: %s\n", u.String())
				continue
			}
		}

		// Even if custom selector said to use the URL, it still MUST be absolute, http(s)-prefixed,
		// and comply with the same host policy if requested.
		if !u.IsAbs() {
			// Only absolute URLs are processed, so ignore
			this.logFunc(LogIgnored, "ignore on absolute policy: %s\n", u.String())

		} else if !strings.HasPrefix(u.Scheme, "http") {
			this.logFunc(LogIgnored, "ignore on scheme policy: %s\n", u.String())

		} else if res.sourceUrl != nil && u.Host != res.sourceUrl.Host && this.Options.SameHostOnly {
			// Only allow URLs coming from the same host
			this.logFunc(LogIgnored, "ignore on same host policy: %s\n", u.String())

		} else if !isVisited || forceEnqueue {
			// All is good, visit this URL (robots.txt verification is done by worker)

			// Launch worker if required
			w, ok := this.workers[u.Host]
			if !ok {
				w = this.launchWorker(u)
				// Automatically enqueue the robots.txt URL as first in line
				if robUrl, e := getRobotsTxtUrl(u); e != nil {
					this.logFunc(LogError, "ERROR parsing robots.txt from %s: %s\n", u.String(), e.Error())
				} else {
					this.logFunc(LogEnqueued, "enqueue: %s\n", robUrl.String())
					w.pop.stack(robUrl)
				}
			}

			cnt++
			this.logFunc(LogEnqueued, "enqueue: %s\n", u.String())
			w.pop.stack(u)
			this.pushPopRefCount++

			// Once it is stacked, it WILL be visited eventually, so add it to the visited map
			if !isVisited {
				this.visited[u.String()] = '0'
			}

		} else {
			this.logFunc(LogIgnored, "ignore on already visited policy: %s\n", u.String())
		}
	}
	return
}