func GitClone(repo url.URL, destination string) error { gitPath, err := exec.LookPath("git") if err != nil { return err } branch := repo.Fragment repo.Fragment = "" gitUrl := repo.String() err = performGitClone(gitPath, []string{ "--depth", "1", "--recursive", gitUrl, destination, }, branch) if err != nil { err = performGitClone(gitPath, []string{ "--recursive", gitUrl, destination, }, branch) if err != nil { return fmt.Errorf("Failed to clone git repository at %s", gitUrl) } } return nil }
func GetLink(links *bolt.Bucket, stats *het.CountStats, url *url.URL) (het.Link, error) { url.Fragment = "" lbytes := links.Get([]byte(url.String())) link := het.Link{} if lbytes != nil { // link already exists, return early json.Unmarshal(lbytes, &link) // follow redirects in the links bucket if link.Redirect { return GetLink(links, stats, &link.URL) } return link, nil } resp, err := http.Get(url.String()) if err != nil { return link, err } defer resp.Body.Close() finalURL := resp.Request.URL finalURL.Fragment = "" link = het.Link{ URL: *finalURL, StatusCode: resp.StatusCode, ContentType: resp.Header.Get("Content-Type"), LastModified: strings.Trim(resp.Header.Get("Last-Modified"), " \t\n"), } lbytes, err = json.Marshal(&link) if err != nil { log.Fatal(err) } links.Put([]byte(finalURL.String()), lbytes) stats.LinkCount++ // redirect link if finalURL.String() != url.String() { lrbytes, err := json.Marshal(&het.Link{ URL: *finalURL, Redirect: true, }) if err != nil { log.Fatal(err) } links.Put([]byte(url.String()), lrbytes) stats.LinkCount++ } return link, nil }
// NewResourceLocation builds the location of the resource identified by id
// beneath the requested URL: id is joined onto the path of reqURL, and the
// query string and fragment are dropped. The fields of reqURL are left
// untouched because the work is done on a copy.
func NewResourceLocation(reqURL *url.URL, id string) string {
	location := *reqURL
	location.RawQuery, location.Fragment = "", ""
	location.Path = path.Join(location.Path, id)
	return location.String()
}
func (c *Crawler) enqueue(link *url.URL, base *url.URL) { if base != nil { link = base.ResolveReference(link) link.Fragment = "" } if link.Path == "" { link.Path = "/" } link.Fragment = "" if link.Host != "" { link.Host = c.normalize_host(link.Host) } if c.known.Exists([]byte(link.String())) { return } c.known.Insert([]byte(link.String())) c.report_found(link) if link.Scheme == "http" { if c.domains[link.Host] { c.waiter.Add(1) c.queue.Enqueue(&task{url: link}) return } else { c.report_ignored(link, 0, "external domain") return } } else { c.report_ignored(link, 0, "wrong scheme: "+link.Scheme) return } /*c.waiter.Add(1)*/ /*c.queue <- u.String()*/ }
// MungeNoProtocolURL will take a URL returned from net/url.Parse and make // corrections to the URL when no protocol is specified in the Scheme, where there // are valid protocol-less git url spec formats that result in either file:// or ssh:// protocol usage; // if an explicit protocol is already specified, then the // URL is left unchaged and the method simply returns with no error reported, // since the URL is func (h *stiGit) MungeNoProtocolURL(source string, uri *url.URL) error { if uri == nil { return nil } // only deal with protocol-less url's if uri.Scheme != "" { return nil } details, mods, err := ParseFile(source) if err != nil { return err } if details.BadRef { return fmt.Errorf("bad reference following # in %s", source) } if !details.FileExists { mods2, err := ParseSSH(source) if err != nil { glog.Errorf("ssh git clone spec error: %v", err) return err } mods = mods2 } // update the either file or ssh url accordingly if mods != nil { if len(mods.User) > 0 { uri.User = url.User(mods.User) } if len(mods.Scheme) > 0 { uri.Scheme = mods.Scheme } if len(mods.Host) > 0 { uri.Host = mods.Host } if len(mods.Path) > 0 { uri.Path = mods.Path } if len(mods.Ref) > 0 { uri.Fragment = mods.Ref } } return nil }
// Clean returns the sanitized HTML (based on a tag and attribute whitelist) and // the text contents of s. Links are made relative to u, if non-nil. func Clean(s string, u *url.URL) (string, string) { r := bytes.NewReader([]byte(strings.TrimSpace(s))) z := html.NewTokenizer(r) buf := &bytes.Buffer{} strip := &bytes.Buffer{} skip := 0 if u != nil { u.RawQuery = "" u.Fragment = "" } for { if z.Next() == html.ErrorToken { if err := z.Err(); err == io.EOF { break } else { return s, s } } t := z.Token() if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken { if !AcceptableElements[t.Data] { if UnacceptableElementsWithEndTag[t.Data] && t.Type != html.SelfClosingTagToken { skip += 1 } } else { cleanAttributes(u, &t) buf.WriteString(t.String()) } } else if t.Type == html.EndTagToken { if !AcceptableElements[t.Data] { if UnacceptableElementsWithEndTag[t.Data] { skip -= 1 } } else { buf.WriteString(t.String()) } } else if skip == 0 { buf.WriteString(t.String()) if t.Type == html.TextToken { strip.WriteString(t.String()) } } } return buf.String(), strip.String() }
// RepositoryURL creates the public URL for the named git repo. If both config.URL and // request are nil, the returned URL will be nil. func RepositoryURL(config *Config, name string, r *http.Request) *url.URL { var url url.URL switch { case config.URL != nil: url = *config.URL case r != nil: url = *r.URL url.Host = r.Host url.Scheme = "http" default: return nil } url.Path = "/" + name url.RawQuery = "" url.Fragment = "" return &url }
// ensureCanonical rewrites u in place into a canonical form used to reduce
// redundancy between equivalent URLs, and reports whether u has a supported
// scheme. URLs that are neither http nor https are left untouched and yield
// false. For the rest, https is folded into http, one trailing slash is
// removed from the path, and the fragment is cleared.
func ensureCanonical(u *url.URL) bool {
	switch u.Scheme {
	case "http", "https":
		// supported, canonicalised below
	default:
		return false
	}

	u.Scheme = "http"
	u.Fragment = ""
	if n := len(u.Path); n > 0 && u.Path[n-1] == '/' {
		u.Path = u.Path[:n-1]
	}
	return true
}
// GitClone clones repo into destination using the git binary found on PATH.
//
// The URL fragment, if present, names the branch to check out and is stripped
// from the clone URL. A shallow recursive clone (--depth 1, with -b for the
// branch) is attempted first. If that fails, a full recursive clone is made
// and the branch, if any, is checked out in a separate step.
func GitClone(repo url.URL, destination string) error {
	gitPath, err := exec.LookPath("git")
	if err != nil {
		return err
	}

	// repo is a copy, so clearing the fragment does not affect the caller.
	branch := repo.Fragment
	repo.Fragment = ""
	gitUrl := repo.String()

	// git long options take two dashes; "-depth" is rejected by git, which
	// made the shallow attempt fail unconditionally.
	args := []string{"clone", "--depth", "1"}
	if branch != "" {
		args = append(args, "-b", branch)
	}
	args = append(args, "--recursive", gitUrl, destination)
	if err := exec.Command(gitPath, args...).Run(); err == nil {
		return nil
	}

	// The shallow clone failed; fall back to a full clone.
	if err := exec.Command(gitPath, "clone", "--recursive", gitUrl, destination).Run(); err != nil {
		return fmt.Errorf("Failed to clone git repository at %s", gitUrl)
	}
	if branch == "" {
		return nil
	}

	checkout := exec.Command(gitPath, "--git-dir="+destination+"/.git", "--work-tree="+destination, "checkout", branch)
	if err := checkout.Run(); err != nil {
		return fmt.Errorf("Failed to checkout branch '%s' for git repository at %s", branch, gitUrl)
	}
	return nil
}
func Normalize(u *url.URL) error { if !utf8.ValidString(u.String()) { return fmt.Errorf("normalize URL: invalid UTF-8 string: %q", u.String()) } u.Scheme = strings.ToLower(u.Scheme) if u.Scheme != "http" && u.Scheme != "https" { return fmt.Errorf("normalize URL: unsupported scheme: %v", u.Scheme) } host, port, err := net.SplitHostPort(u.Host) if err != nil { // missing port host, port = u.Host, "" } if host == "" { return errors.New("normalize URL: empty host") } else if v, err := validateHost(host); err != nil { return fmt.Errorf("normalize URL: invalid host %q: %v", host, err) } else { u.Host = v } if (u.Scheme == "http" && port == "80") || (u.Scheme == "https" && port == "443") { port = "" } if port != "" { u.Host = net.JoinHostPort(u.Host, port) } clean := func(pth string) string { p := path.Clean(pth) if p == "." { p = "" } else if strings.HasSuffix(pth, "/") && !strings.HasSuffix(p, "/") { p += "/" } return p } u.Path = clean(u.Path) u.RawPath = clean(u.RawPath) u.Fragment = "" return nil }
// cloneArgs builds the argument list for a recursive "git clone" of remoteURL
// into root.
//
// A shallow clone ("--depth 1") is requested only when the URL has no
// fragment. For http(s) remotes it additionally requires that a HEAD request
// to the info/refs?service=git-upload-pack endpoint succeeds and answers with
// the "application/x-git-upload-pack-advertisement" content type.
//
// The fragment of remoteURL is cleared in place so that it is not part of the
// URL passed to git.
func cloneArgs(remoteURL *url.URL, root string) []string {
	args := []string{"clone", "--recursive"}
	shallow := remoteURL.Fragment == ""

	if shallow && strings.HasPrefix(remoteURL.Scheme, "http") {
		res, err := http.Head(fmt.Sprintf("%s/info/refs?service=git-upload-pack", remoteURL))
		if err != nil {
			shallow = false
		} else {
			// Close the body so the underlying connection is released.
			res.Body.Close()
			if res.Header.Get("Content-Type") != "application/x-git-upload-pack-advertisement" {
				shallow = false
			}
		}
	}

	if shallow {
		args = append(args, "--depth", "1")
	}

	remoteURL.Fragment = ""
	return append(args, remoteURL.String(), root)
}
//ReorderAndCrop removes the anchor (#sth) Fragment, //sorts, removes and encodes query string parameters //and lowercases Host func ReorderAndCrop(conf *config.ParsingConfig, url *url.URL) { url.Path = strings.TrimSuffix(strings.TrimSpace(url.Path), "/") if conf.StripQueryString { url.RawQuery = "" } else { stringURL := url.String() query := url.Query() for _, filter := range conf.Params { if filter.Regex.MatchString(stringURL) { //If only specified params are relevent if filter.Include { for key := range query { //Check if param is allowed found := false for _, param := range filter.Params { if param == key { found = true } } //If not remove if !found { query.Del(key) } } //If params are irrelevant } else { for _, param := range filter.Params { query.Del(param) } } } } url.RawQuery = query.Encode() } url.Fragment = "" url.Host = strings.ToLower(url.Host) if conf.StripWWW { url.Host = StripWWW(url.Host) } }
func (ca *CrawlerApp) Crawl(rootURL *url.URL, depth int) { defer ca.waitGroup.Done() if depth <= 0 { ca.Errors <- errors.New("Reached max depth") return } rootURL.Fragment = "" ca.mutex.Lock() if _, found := ca.Visited[rootURL.String()]; found { ca.mutex.Unlock() return } else if !found { ca.Visited[rootURL.String()] = true ca.mutex.Unlock() } results, err := ca.Fetch(rootURL.String()) if err != nil { ca.Errors <- err return } ca.PrettyPrint(rootURL, results) for internalURLString, _ := range results.internalURLs { internalURL, err := url.Parse(internalURLString) if err != nil { ca.Errors <- err return } ca.waitGroup.Add(1) go ca.Crawl(internalURL, depth-1) } }
func (e *Extractor) links(u *url.URL, doc *goquery.Document) ([]*url.URL, error) { urls := make([]*url.URL, 0, 5) doc.Find("a[href]").Each(func(i int, sel *goquery.Selection) { val, _ := sel.Attr("href") u, err := u.Parse(val) if err != nil { e.log.WithError(err).Errorf("Error resolving URL %s", val) return } u.Fragment = "" if u.Path != "" && u.Path[len(u.Path)-1:] == "/" { u.Path = u.Path[:len(u.Path)-1] } urls = append(urls, u) }) return urls, nil }
// removeFragment clears the fragment of u in place and returns the same
// pointer. The returned error is always nil.
func removeFragment(u *url.URL) (*url.URL, error) {
	if u.Fragment != "" {
		u.Fragment = ""
	}
	return u, nil
}
func pushURL(l *lua.State, u *url.URL) { l.NewTable() var urlFunc = map[string]func(*url.URL) lua.Function{ "isAbs": urlIsAbs, "parse": urlParse, "requestURI": urlRequestURI, "string": urlString, } for name, goFn := range urlFunc { l.PushGoFunction(goFn(u)) l.SetField(-2, name) } l.NewTable() getHook := func(l *lua.State) int { key := lua.CheckString(l, 2) switch key { case "scheme": l.PushString(u.Scheme) case "opaque": l.PushString(u.Opaque) case "host": l.PushString(u.Host) case "path": l.PushString(u.Path) case "rawQuery": l.PushString(u.RawQuery) case "fragment": l.PushString(u.Fragment) default: return 0 } return 1 } l.PushGoFunction(getHook) l.SetField(-2, "__index") setHook := func(l *lua.State) int { key := lua.CheckString(l, 2) val := lua.CheckString(l, 3) switch key { case "scheme": u.Scheme = val case "opaque": u.Opaque = val case "host": u.Host = val case "path": u.Path = val case "rawQuery": u.RawQuery = val case "fragment": u.Fragment = val default: l.RawSet(1) } return 0 } l.PushGoFunction(setHook) l.SetField(-2, "__newindex") l.SetMetaTable(-2) }
// ParseWithSocket parses url like this: mysql://root:pass@unix(/var/run/mysql.socket)/db // and normal urls. func ParseWithSocket(url_ string) (*url.URL, error) { u := new(url.URL) s := strings.SplitN(url_, "://", 2) if len(s) != 2 { return nil, e.New("invalid url") } u.Scheme = s[0] rest := "" s = strings.SplitN(s[1], "@", 2) if len(s) == 1 { rest = s[0] } else if len(s) == 2 { userpass := strings.SplitN(s[0], ":", 2) if len(userpass) == 1 { u.User = url.User(userpass[0]) } else if len(userpass) == 2 { pass, err := url.QueryUnescape(userpass[1]) if err != nil { return nil, e.New(err) } u.User = url.UserPassword(userpass[0], pass) } else { return nil, e.New("invalid user password") } rest = s[1] } else { return nil, e.New("invalid user string") } unix := regUnix.FindAllString(rest, 1) if len(unix) == 1 { u.Host = unix[0] rest = strings.TrimSpace(regUnix.ReplaceAllLiteralString(rest, "")) q := strings.Index(rest, "?") f := strings.Index(rest, "#") pend := f if q > f { pend = q } i := strings.Index(rest, "/") if i != -1 && pend != -1 { u.Path = rest[i:pend] rest = rest[pend:] } else if i == -1 && pend != -1 { rest = rest[pend:] } else if i != -1 && pend == -1 { u.Path = rest[i:] pathInHost(u) return u, nil } else if i == -1 && pend == -1 { pathInHost(u) return u, nil } } else if len(unix) == 0 { q := strings.Index(rest, "?") f := strings.Index(rest, "#") pend := f ff := f if f == -1 { ff = math.MaxInt64 } if q < ff && q >= 0 { pend = q } i := strings.Index(rest, "/") if i != -1 && pend != -1 { u.Host = rest[:i] u.Path = rest[i:pend] rest = rest[pend:] } else if i == -1 && pend != -1 { u.Host = rest[:pend] rest = rest[pend:] } else if i != -1 && pend == -1 { u.Host = rest[:i] u.Path = rest[i:] pathInHost(u) return u, nil } else if i == -1 && pend == -1 { u.Host = rest pathInHost(u) return u, nil } } else { return nil, e.New("socket address is invalid") } pathInHost(u) q := strings.Index(rest, "?") f := strings.Index(rest, "#") if q+1 >= len(rest) { return nil, 
e.New("error parsing query") } if f+1 >= len(rest) { return nil, e.New("error parsing fragment") } if q != -1 && f != -1 && q <= f { u.RawQuery = rest[q+1 : f] u.Fragment = rest[f+1:] } else if q != -1 && f == -1 { u.RawQuery = rest[q+1:] } else if q == -1 && f != -1 { u.Fragment = rest[f+1:] } else if q == -1 && f == -1 { return u, nil } else { return nil, e.New("error parsing query and fragment %v, %v", q, f) } return u, nil }
// Set attempts to set a string value to an address func (a *Addr) Set(value string) error { var addr *url.URL isURL := a.isURL(value) if isURL { parsed, err := url.Parse(value) if err != nil { return fmt.Errorf("not a valid URL: %v", err) } addr = parsed } else { addr = &url.URL{ Scheme: a.DefaultScheme, Host: value, } if len(addr.Scheme) == 0 { addr.Scheme = "tcp" } } if strings.Contains(addr.Host, ":") { host, port, err := net.SplitHostPort(addr.Host) if err != nil { return fmt.Errorf("not a valid host:port: %v", err) } portNum, err := strconv.ParseUint(port, 10, 64) if err != nil { return fmt.Errorf("not a valid port: %v", err) } a.Host = host a.Port = int(portNum) } else { port := 0 if !isURL { port = a.DefaultPort } if port == 0 { switch addr.Scheme { case "http": port = 80 case "https": port = 443 default: return fmt.Errorf("no port specified") } } a.Host = addr.Host a.Port = port addr.Host = net.JoinHostPort(addr.Host, strconv.FormatInt(int64(a.Port), 10)) } if !a.AllowPrefix { addr.Path = "" } addr.RawQuery = "" addr.Fragment = "" if value != a.Value { a.Provided = true } a.URL = addr a.Value = value return nil }
// urlWithoutRef returns the string form of url with its fragment removed.
// url is passed by value, so the caller's URL keeps its fragment.
func urlWithoutRef(url url.URL) string {
	stripped := url
	stripped.Fragment = ""
	return stripped.String()
}
// removeFragment clears the fragment of u in place.
func removeFragment(u *url.URL) {
	if u.Fragment != "" {
		u.Fragment = ""
	}
}
// RemoveFragment removes the #fragment from url, modifying it in place.
func RemoveFragment(url *url.URL) {
	if url.Fragment != "" {
		url.Fragment = ""
	}
}