// "Constructor", parses the given string JSON reference func (r *Ref) parse(jsonReferenceString string) error { parsed, err := url.Parse(jsonReferenceString) if err != nil { return err } r.referenceURL, _ = url.Parse(purell.NormalizeURL(parsed, purell.FlagsSafe|purell.FlagRemoveDuplicateSlashes)) refURL := r.referenceURL if refURL.Scheme != "" && refURL.Host != "" { r.HasFullURL = true } else { if refURL.Path != "" { r.HasURLPathOnly = true } else if refURL.RawQuery == "" && refURL.Fragment != "" { r.HasFragmentOnly = true } } r.HasFileScheme = refURL.Scheme == "file" r.HasFullFilePath = strings.HasPrefix(refURL.Path, "/") // invalid json-pointer error means url has no json-pointer fragment. simply ignore error r.referencePointer, _ = jsonpointer.New(refURL.Fragment) return nil }
// NewACIArchive creates a new aci-archive distribution from the provided distribution uri.
func NewACIArchive(u *url.URL) (Distribution, error) {
	c, err := parseCIMD(u)
	if err != nil {
		return nil, fmt.Errorf("cannot parse URI: %q: %v", u.String(), err)
	}
	if c.Type != TypeACIArchive {
		return nil, fmt.Errorf("illegal ACI archive distribution type: %q", c.Type)
	}

	// This should be a valid URL
	data, err := url.QueryUnescape(c.Data)
	if err != nil {
		return nil, errwrap.Wrap(fmt.Errorf("error unescaping url %q", c.Data), err)
	}
	aciu, err := url.Parse(data)
	if err != nil {
		return nil, errwrap.Wrap(fmt.Errorf("error parsing url %q", c.Data), err)
	}

	// save the URI as sorted to make it ready for comparison
	purell.NormalizeURL(u, purell.FlagSortQuery)

	str := u.String()
	if path := aciu.String(); filepath.Ext(path) == schema.ACIExtension {
		str = path
	}

	return &ACIArchive{
		cimdURL:      u,
		transportURL: aciu,
		str:          str,
	}, nil
}
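// Note on the discarded return value above: purell.NormalizeURL applies its
// flags by mutating the *url.URL in place and returns the normalized string,
// so calling it purely for the side effect is intentional. A minimal sketch:
//
//	u, _ := url.Parse("http://example.com/path?b=2&a=1")
//	_ = purell.NormalizeURL(u, purell.FlagSortQuery)
//	// u.String() == "http://example.com/path?a=1&b=2"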
// cloneForRedirect returns a new URLContext for the given destination URL,
// keeping the same sourceURL and normalizedSourceURL.
func (uc *URLContext) cloneForRedirect(dst *url.URL, normFlags purell.NormalizationFlags) *URLContext {
	var src, normalizedSrc *url.URL
	if uc.sourceURL != nil {
		src = &url.URL{}
		*src = *uc.sourceURL
	}
	if src == nil && uc.url != nil {
		// if the current context doesn't have a source URL, use its URL as
		// source (e.g. for a seed URL that triggers a redirect)
		src = &url.URL{}
		*src = *uc.url
	}

	if uc.normalizedSourceURL != nil {
		normalizedSrc = &url.URL{}
		*normalizedSrc = *uc.normalizedSourceURL
	}
	if normalizedSrc == nil {
		normalizedSrc = &url.URL{}
		*normalizedSrc = *uc.normalizedURL
	}

	rawDst := &url.URL{}
	*rawDst = *dst
	purell.NormalizeURL(dst, normFlags)
	return &URLContext{
		HeadBeforeGet:       uc.HeadBeforeGet,
		State:               uc.State,
		url:                 rawDst,
		normalizedURL:       dst,
		sourceURL:           src,
		normalizedSourceURL: normalizedSrc,
	}
}
// grabURLs looks for rel-canonical, og:url and rel-shortlink URLs.
// Returns the canonical URL (or "") and a list of all URLs (including baseURL).
func grabURLs(root *html.Node, baseURL *url.URL) (string, []string) {
	dbug := Debug.URLLogger
	canonical := ""
	all := make(map[string]struct{})

	// start with base URL
	u := purell.NormalizeURL(baseURL, purell.FlagsSafe)
	if u != "" {
		all[u] = struct{}{}
	}

	// look for canonical URLs first (rel-canonical runs second, so it
	// overrides og:url when both are present)
	for _, link := range urlSels.ogUrl.MatchAll(root) {
		txt := getAttr(link, "content")
		u, err := sanitiseURL(txt, baseURL)
		if err != nil {
			dbug.Printf("Reject og:url %s (%s)\n", txt, err)
			continue
		}
		dbug.Printf("Accept og:url %s\n", u)
		all[u] = struct{}{}
		canonical = u
	}

	for _, link := range urlSels.relCanonical.MatchAll(root) {
		txt := getAttr(link, "href")
		u, err := sanitiseURL(txt, baseURL)
		if err != nil {
			dbug.Printf("Reject rel-canonical %s (%s)\n", txt, err)
			continue
		}
		dbug.Printf("Accept rel-canonical %s\n", u)
		all[u] = struct{}{}
		canonical = u
	}

	// look for other (non-canonical) URLs
	for _, link := range urlSels.relShortlink.MatchAll(root) {
		txt := getAttr(link, "href")
		u, err := sanitiseURL(txt, baseURL)
		if err != nil {
			dbug.Printf("Reject rel-shortlink %s (%s)\n", txt, err)
			continue
		}
		dbug.Printf("Accept rel-shortlink %s\n", u)
		all[u] = struct{}{}
	}

	// build up list of alternates
	allList := make([]string, 0, 8)
	for u := range all {
		allList = append(allList, u)
	}
	return canonical, allList
}
// Handler function for creating a new short URL.
func create(w http.ResponseWriter, r *http.Request) {
	// We only allow POST requests because URLs can be way too long for GETs.
	// Thus for anything that's not POST, we error out.
	if "POST" != r.Method {
		http.Error(w, "Method Not Supported", http.StatusMethodNotAllowed)
		return
	}

	c := appengine.NewContext(r)
	originalURLString := r.PostFormValue("OriginalUrl")
	callback := r.PostFormValue("callback")
	if "" == callback {
		w.Header().Set("Content-Type", "application/json; charset=UTF-8")
	} else {
		w.Header().Set("Content-Type", "application/javascript; charset=UTF-8")
	}

	// The sequence of checks to ensure that the original URL makes sense.
	// 1. If it is given.
	if originalURLString == "" {
		w.Write(output(c, StatusFailure, "Missing URL to be shortened", callback))
		return
	}
	// 2. If the URL parses properly.
	originalURL, err := url.Parse(originalURLString)
	if err != nil {
		w.Write(output(c, StatusFailure, "URL cannot be parsed", callback))
		return
	}
	// 3. If the URL is absolute.
	if !originalURL.IsAbs() {
		w.Write(output(c, StatusFailure, "URL is not absolute", callback))
		return
	}
	// 4. If the URL's domain is already the same as the current Yordle
	// instance, we just return the exact same URL.
	if originalURL.Host == r.Host {
		w.Write(output(c, StatusSuccess, originalURLString, callback))
		return
	}

	// Now we normalize the URL so that we don't have to store duplicates.
	originalURLString = purell.NormalizeURL(originalURL, purell.FlagsSafe)

	id, err := shorturl.Persist(originalURLString, c)
	if err != nil {
		c.Errorf("Error persisting %s. Message: %s", originalURLString, err.Error())
		w.Write(output(c, StatusFailure, "Error Persisting URL", callback))
		return
	}
	c.Infof("Successfully created short url for '%s', ID: %d", originalURL, id)
	w.Write(output(c, StatusSuccess, fmt.Sprintf("http://%s/%s", r.Host, base62.Encode(id)), callback))
}
func (this *Crawler) urlToURLContext(u, src *url.URL) *URLContext {
	var rawSrc *url.URL

	rawU := *u
	purell.NormalizeURL(u, this.Options.URLNormalizationFlags)
	if src != nil {
		rawSrc = &url.URL{}
		*rawSrc = *src
		purell.NormalizeURL(src, this.Options.URLNormalizationFlags)
	}

	return &URLContext{
		this.Options.HeadBeforeGet,
		nil,
		&rawU,
		u,
		rawSrc,
		src,
	}
}
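// Why urlToURLContext keeps two copies: NormalizeURL mutates its argument in
// place, so rawU preserves the URL exactly as harvested while u becomes the
// canonical form used for de-duplication. A minimal sketch (flags illustrative):
//
//	u, _ := url.Parse("http://Example.com:80/page")
//	raw := *u
//	purell.NormalizeURL(u, purell.FlagsSafe)
//	// raw.String() == "http://Example.com:80/page"
//	// u.String()   == "http://example.com/page"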
// This function has been taken from https://github.com/aybabtme/crawler/blob/master/util.go
func cleanFromURLString(from *url.URL, link string) (*url.URL, error) {
	u, err := url.Parse(link)
	if err != nil {
		return nil, err
	}
	if u.Host == "" {
		u.Scheme = from.Scheme
		u.Host = from.Host
	}
	uStr := purell.NormalizeURL(u, purell.FlagsUsuallySafeGreedy)
	clean, err := from.Parse(uStr)
	return clean, err
}
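// A minimal usage sketch for cleanFromURLString (values illustrative; with
// purell.FlagsUsuallySafeGreedy, dot segments and trailing slashes are removed):
//
//	base, _ := url.Parse("https://example.com/blog/post")
//	clean, _ := cleanFromURLString(base, "/a/b/../c/")
//	// clean.String() == "https://example.com/a/c"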
func sanitiseURL(link string, baseURL *url.URL) (string, error) {
	u, err := baseURL.Parse(link)
	if err != nil {
		return "", err
	}

	// we're only interested in articles, so reject obviously-not-article urls
	if (u.Path == "/" || u.Path == "") && len(u.RawQuery) == 0 {
		return "", fmt.Errorf("obviously not article")
	}

	return purell.NormalizeURL(u, purell.FlagsSafe), nil
}
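// A minimal sketch of the article filter above (values illustrative):
//
//	base, _ := url.Parse("http://example.com/")
//	_, err := sanitiseURL("/", base)           // rejected: "obviously not article"
//	u, _ := sanitiseURL("/news/story-1", base) // u == "http://example.com/news/story-1"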
// NewAppcFromApp returns an Appc distribution from an appc App discovery string
func NewAppcFromApp(app *discovery.App) Distribution {
	rawuri := NewCIMDString(TypeAppc, distAppcVersion, url.QueryEscape(app.Name.String()))

	var version string
	labels := types.Labels{}
	for n, v := range app.Labels {
		if n == "version" {
			version = v
		}
		labels = append(labels, types.Label{Name: n, Value: v})
	}

	if len(labels) > 0 {
		queries := make([]string, len(labels))
		rawuri += "?"
		for i, l := range labels {
			queries[i] = fmt.Sprintf("%s=%s", l.Name, url.QueryEscape(l.Value))
		}
		rawuri += strings.Join(queries, "&")
	}

	u, err := url.Parse(rawuri)
	if err != nil {
		panic(fmt.Errorf("cannot parse URI %q: %v", rawuri, err))
	}

	// save the URI as sorted to make it ready for comparison
	purell.NormalizeURL(u, purell.FlagSortQuery)

	str := app.Name.String()
	if version != "" {
		str += fmt.Sprintf(":%s", version)
	}

	labelsort.By(labelsort.RankedName).Sort(labels)
	for _, l := range labels {
		if l.Name != "version" {
			str += fmt.Sprintf(",%s=%s", l.Name, l.Value)
		}
	}

	return &Appc{
		cimd: u,
		app:  app.Copy(),
		str:  str,
	}
}
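// A hypothetical sketch of the two representations built above (the exact CIMD
// prefix depends on NewCIMDString and distAppcVersion; values illustrative):
//
//	// app "example.com/app" with labels version=v1.0, os=linux yields roughly:
//	// cimd: cimd:appc:v=<distAppcVersion>:example.com%2Fapp?os=linux&version=v1.0
//	//       (query sorted by purell.FlagSortQuery, ready for comparison)
//	// str:  "example.com/app:v1.0,os=linux" (version first, other labels in labelsort order)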
func normalizeURL(r *http.Request) *http.Request {
	normalized := purell.NormalizeURL(r.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagSortQuery)
	newurl, err := url.Parse(normalized)
	if err != nil {
		// a normalized URL that fails to re-parse leaves nothing to request,
		// so bail out rather than dereference a nil URL below
		debug("Error parsing normalized URL", normalized)
		return nil
	}
	// if r.URL.String() != newurl.String() {
	// 	debug("normalize:", r.URL.String(), "to", newurl.String())
	// }
	req, err := http.NewRequest("GET", newurl.String(), nil)
	if err != nil {
		return nil
	}
	return req
}
// Normalize returns the normalized URL string.
// Behavior:
//  1. Remove unnecessary host dots.
//  2. Remove default port (http://localhost:80 becomes http://localhost).
//  3. Remove duplicate slashes.
//  4. Remove unnecessary dots from path.
//  5. Sort query parameters.
//  6. Decode host IP into decimal numbers.
//  7. Handle escape values.
//  8. Decode Punycode domains into UTF8 representation.
func Normalize(u *url.URL) (string, error) {
	host, port, err := SplitHostPort(u)
	if err != nil {
		return "", err
	}
	if err := checkHost(host); err != nil {
		return "", err
	}

	// Decode Punycode.
	host, err = idna.ToUnicode(host)
	if err != nil {
		return "", err
	}

	u.Host = strings.ToLower(host)
	if port != "" {
		u.Host += ":" + port
	}
	u.Scheme = strings.ToLower(u.Scheme)

	return purell.NormalizeURL(u, normalizeFlags), nil
}
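// A hypothetical sketch of Normalize (assumes normalizeFlags includes purell's
// default-port-removal, dot-segment-removal and query-sorting flags, per the
// behavior list above):
//
//	u, _ := url.Parse("HTTP://Example.com:80/a/./b/../c?b=2&a=1")
//	s, _ := Normalize(u)
//	// s == "http://example.com/a/c?a=1&b=2"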
// NormalizeURL normalizes the URL after stripping UTM parameters from its query.
func NormalizeURL(u *url.URL) string {
	removeUtmFromQuery(u)
	return purell.NormalizeURL(u, defaultNormalizationFlags)
}
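// removeUtmFromQuery is defined elsewhere in the package; a plausible sketch of
// such a helper (an assumption, not the original implementation) that drops
// utm_* tracking parameters before normalization:
func removeUtmFromQuery(u *url.URL) {
	q := u.Query()
	for name := range q {
		// utm_source, utm_medium, utm_campaign, etc.
		if strings.HasPrefix(name, "utm_") {
			q.Del(name)
		}
	}
	u.RawQuery = q.Encode()
}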
// Enqueue the URLs returned from the worker, provided they comply with the
// selection policies.
func (this *Crawler) enqueueUrls(res *workerResponse) (cnt int) {
	for _, u := range res.harvestedUrls {
		var isVisited, enqueue, head bool
		var hr HeadRequestMode

		// Normalize URL
		purell.NormalizeURL(u, DefaultNormalizationFlags)
		_, isVisited = this.visited[u.String()]

		// Filter the URL - TODO : Priority is ignored at the moment
		if enqueue, _, hr = this.Options.Extender.Filter(u, res.sourceUrl, isVisited); !enqueue {
			// Filter said NOT to use this url, so continue with next
			this.logFunc(LogIgnored, "ignore on filter policy: %s", u.String())
			continue
		}

		// Even if the filter said to use the URL, it still MUST be absolute,
		// http(s)-prefixed, and comply with the same host policy if requested.
		if !u.IsAbs() {
			// Only absolute URLs are processed, so ignore
			this.logFunc(LogIgnored, "ignore on absolute policy: %s", u.String())
		} else if !strings.HasPrefix(u.Scheme, "http") {
			this.logFunc(LogIgnored, "ignore on scheme policy: %s", u.String())
		} else if res.sourceUrl != nil && u.Host != res.sourceUrl.Host && this.Options.SameHostOnly {
			// Only allow URLs coming from the same host
			this.logFunc(LogIgnored, "ignore on same host policy: %s", u.String())
		} else {
			// All is good, visit this URL (robots.txt verification is done by the worker)

			// Launch worker if required
			w, ok := this.workers[u.Host]
			if !ok {
				// No worker exists for this host, launch a new one
				w = this.launchWorker(u)
				// Automatically enqueue the robots.txt URL as first in line
				if robUrl, e := getRobotsTxtUrl(u); e != nil {
					this.Options.Extender.Error(newCrawlError(e, CekParseRobots, u))
					this.logFunc(LogError, "ERROR parsing robots.txt from %s: %s", u.String(), e.Error())
				} else {
					this.logFunc(LogEnqueued, "enqueue: %s", robUrl.String())
					this.Options.Extender.Enqueued(robUrl, res.sourceUrl)
					w.pop.stack(&workerCommand{robUrl, false})
				}
			}

			cnt++
			this.logFunc(LogEnqueued, "enqueue: %s", u.String())
			this.Options.Extender.Enqueued(u, res.sourceUrl)
			switch hr {
			case HrmIgnore:
				head = false
			case HrmRequest:
				head = true
			default:
				head = this.Options.HeadBeforeGet
			}
			w.pop.stack(&workerCommand{u, head})
			this.pushPopRefCount++

			// Once it is stacked, it WILL be visited eventually, so add it to the
			// visited map (unless denied by robots.txt, but this is out of our
			// hands; for all we care, it is visited).
			if !isVisited {
				this.visited[u.String()] = '0'
			}
		}
	}
	return
}
// Enqueue the URLs returned from the worker, provided they comply with the
// selection policies.
func (this *Crawler) enqueueUrls(res *workerResponse) (cnt int) {
	for _, u := range res.harvestedUrls {
		var isVisited, forceEnqueue bool

		// Normalize URL
		purell.NormalizeURL(u, DefaultNormalizationFlags)
		_, isVisited = this.visited[u.String()]

		// If a selector callback is specified, use it to filter the URL
		if this.Options.URLSelector != nil {
			if forceEnqueue = this.Options.URLSelector(u, res.sourceUrl, isVisited); !forceEnqueue {
				// Custom selector said NOT to use this url, so continue with next
				this.logFunc(LogIgnored, "ignore on custom selector policy: %s\n", u.String())
				continue
			}
		}

		// Even if the custom selector said to use the URL, it still MUST be absolute,
		// http(s)-prefixed, and comply with the same host policy if requested.
		if !u.IsAbs() {
			// Only absolute URLs are processed, so ignore
			this.logFunc(LogIgnored, "ignore on absolute policy: %s\n", u.String())
		} else if !strings.HasPrefix(u.Scheme, "http") {
			this.logFunc(LogIgnored, "ignore on scheme policy: %s\n", u.String())
		} else if res.sourceUrl != nil && u.Host != res.sourceUrl.Host && this.Options.SameHostOnly {
			// Only allow URLs coming from the same host
			this.logFunc(LogIgnored, "ignore on same host policy: %s\n", u.String())
		} else if !isVisited || forceEnqueue {
			// All is good, visit this URL (robots.txt verification is done by the worker)

			// Launch worker if required
			w, ok := this.workers[u.Host]
			if !ok {
				w = this.launchWorker(u)
				// Automatically enqueue the robots.txt URL as first in line
				if robUrl, e := getRobotsTxtUrl(u); e != nil {
					this.logFunc(LogError, "ERROR parsing robots.txt from %s: %s\n", u.String(), e.Error())
				} else {
					this.logFunc(LogEnqueued, "enqueue: %s\n", robUrl.String())
					w.pop.stack(robUrl)
				}
			}

			cnt++
			this.logFunc(LogEnqueued, "enqueue: %s\n", u.String())
			w.pop.stack(u)
			this.pushPopRefCount++

			// Once it is stacked, it WILL be visited eventually, so add it to the
			// visited map
			if !isVisited {
				this.visited[u.String()] = '0'
			}
		} else {
			this.logFunc(LogIgnored, "ignore on already visited policy: %s\n", u.String())
		}
	}
	return
}