// GitClone clones the git repository at repo into destination using the git
// binary found on PATH. The URL fragment is taken as the branch name and is
// stripped from the URL that is handed to git. A recursive clone limited to
// depth 1 is tried first; when that fails a full recursive clone is attempted
// before giving up.
func GitClone(repo url.URL, destination string) error {
	gitPath, err := exec.LookPath("git")
	if err != nil {
		return err
	}

	// repo is passed by value, so clearing the fragment stays local.
	branch := repo.Fragment
	repo.Fragment = ""
	gitUrl := repo.String()

	shallowArgs := []string{"--depth", "1", "--recursive", gitUrl, destination}
	if err = performGitClone(gitPath, shallowArgs, branch); err == nil {
		return nil
	}

	// The shallow attempt failed; retry without the depth limit.
	fullArgs := []string{"--recursive", gitUrl, destination}
	if err = performGitClone(gitPath, fullArgs, branch); err != nil {
		return fmt.Errorf("Failed to clone git repository at %s", gitUrl)
	}

	return nil
}
Exemple #2
0
// GetLink returns the link record for url, looking it up in the links bucket
// first and fetching it over HTTP when no record is stored yet.
//
// The fragment of url is cleared in place before it is used as a key. A stored
// record flagged as a redirect is followed by looking up its target. A freshly
// fetched link is stored under the final URL of the response; when that differs
// from the requested URL, a redirect record pointing at the final URL is stored
// under the requested URL as well. stats.LinkCount is incremented once for
// every record that was actually written.
//
// An error is returned when a stored record cannot be decoded, when the HTTP
// request fails, or when writing to the bucket fails.
func GetLink(links *bolt.Bucket, stats *het.CountStats, url *url.URL) (het.Link, error) {
	url.Fragment = ""
	key := url.String()

	link := het.Link{}
	if stored := links.Get([]byte(key)); stored != nil {
		// link already exists, return early
		if err := json.Unmarshal(stored, &link); err != nil {
			return het.Link{}, err
		}

		// follow redirects in the links bucket
		if link.Redirect {
			return GetLink(links, stats, &link.URL)
		}

		return link, nil
	}

	resp, err := http.Get(key)
	if err != nil {
		return link, err
	}

	defer resp.Body.Close()

	finalURL := resp.Request.URL
	finalURL.Fragment = ""
	finalKey := finalURL.String()

	link = het.Link{
		URL:          *finalURL,
		StatusCode:   resp.StatusCode,
		ContentType:  resp.Header.Get("Content-Type"),
		LastModified: strings.Trim(resp.Header.Get("Last-Modified"), " \t\n"),
	}

	lbytes, err := json.Marshal(&link)
	if err != nil {
		log.Fatal(err)
	}

	// Only count the link once it has really been persisted.
	if err := links.Put([]byte(finalKey), lbytes); err != nil {
		return het.Link{}, err
	}
	stats.LinkCount++

	// redirect link
	if finalKey != key {
		lrbytes, err := json.Marshal(&het.Link{
			URL:      *finalURL,
			Redirect: true,
		})

		if err != nil {
			log.Fatal(err)
		}

		if err := links.Put([]byte(key), lrbytes); err != nil {
			return het.Link{}, err
		}
		stats.LinkCount++
	}

	return link, nil
}
Exemple #3
0
// NewResourceLocation returns the location of a resource as a string, formed
// by joining id onto the path of reqURL. The query string and the fragment of
// the request URL are dropped. reqURL itself is not modified because all
// changes are made on a copy.
func NewResourceLocation(reqURL *url.URL, id string) string {
	loc := *reqURL
	loc.Path = path.Join(reqURL.Path, id)
	loc.RawQuery, loc.Fragment = "", ""
	return loc.String()
}
Exemple #4
0
// enqueue normalizes link and hands it to the crawl queue if it has not been
// seen before.
//
// When base is non-nil, link is first resolved against it. An empty path
// becomes "/", the fragment is dropped and a non-empty host is passed through
// normalize_host. Links already present in c.known are ignored silently; new
// ones are recorded there and reported as found. Only "http" links whose host
// is listed in c.domains are queued; everything else is reported as ignored.
func (c *Crawler) enqueue(link *url.URL, base *url.URL) {
	if base != nil {
		link = base.ResolveReference(link)
	}

	if link.Path == "" {
		link.Path = "/"
	}
	link.Fragment = ""
	if link.Host != "" {
		link.Host = c.normalize_host(link.Host)
	}

	key := []byte(link.String())
	if c.known.Exists(key) {
		return
	}
	c.known.Insert(key)

	c.report_found(link)

	switch {
	case link.Scheme != "http":
		c.report_ignored(link, 0, "wrong scheme: "+link.Scheme)
	case !c.domains[link.Host]:
		c.report_ignored(link, 0, "external domain")
	default:
		c.waiter.Add(1)
		c.queue.Enqueue(&task{url: link})
	}
}
Exemple #5
0
// MungeNoProtocolURL corrects a URL returned by net/url.Parse when no protocol
// was given in its Scheme. Some protocol-less git URL spec formats are valid
// and result in either file:// or ssh:// usage; for those, source is parsed
// again with ParseFile (and with ParseSSH when the file does not exist) and
// the user, scheme, host, path and ref that were found are copied into uri.
//
// A nil uri, or a uri that already has an explicit protocol, is left unchanged
// and nil is returned. An error is returned when parsing fails or when source
// carries a bad reference after '#'.
func (h *stiGit) MungeNoProtocolURL(source string, uri *url.URL) error {
	// only protocol-less URLs need any work
	if uri == nil || uri.Scheme != "" {
		return nil
	}

	details, mods, err := ParseFile(source)
	if err != nil {
		return err
	}
	if details.BadRef {
		return fmt.Errorf("bad reference following # in %s", source)
	}

	if !details.FileExists {
		// not an existing file, so interpret source as an ssh clone spec
		sshMods, sshErr := ParseSSH(source)
		if sshErr != nil {
			glog.Errorf("ssh git clone spec error: %v", sshErr)
			return sshErr
		}
		mods = sshMods
	}

	if mods == nil {
		return nil
	}

	// copy over whichever pieces of the file or ssh URL were discovered
	if mods.User != "" {
		uri.User = url.User(mods.User)
	}
	if mods.Scheme != "" {
		uri.Scheme = mods.Scheme
	}
	if mods.Host != "" {
		uri.Host = mods.Host
	}
	if mods.Path != "" {
		uri.Path = mods.Path
	}
	if mods.Ref != "" {
		uri.Fragment = mods.Ref
	}
	return nil
}
Exemple #6
0
// Clean returns the sanitized HTML (based on a tag and attribute whitelist) and
// the text contents of s. Links are made relative to u, if non-nil.
//
// Elements listed in AcceptableElements are kept and their attributes are run
// through cleanAttributes. Elements listed in UnacceptableElementsWithEndTag
// are dropped together with the text nested inside them; other unacceptable
// tags are dropped but the text around them is kept. If the tokenizer reports
// an error other than io.EOF, s is returned unchanged for both results.
//
// Note that the RawQuery and Fragment of u are cleared in place.
func Clean(s string, u *url.URL) (string, string) {
	z := html.NewTokenizer(strings.NewReader(strings.TrimSpace(s)))
	buf := &bytes.Buffer{}
	strip := &bytes.Buffer{}
	// skip counts the currently open elements whose content must be dropped.
	skip := 0
	if u != nil {
		u.RawQuery = ""
		u.Fragment = ""
	}
	for {
		if z.Next() == html.ErrorToken {
			if z.Err() == io.EOF {
				break
			}
			return s, s
		}

		t := z.Token()
		switch t.Type {
		case html.StartTagToken, html.SelfClosingTagToken:
			if AcceptableElements[t.Data] {
				cleanAttributes(u, &t)
				buf.WriteString(t.String())
			} else if UnacceptableElementsWithEndTag[t.Data] && t.Type != html.SelfClosingTagToken {
				skip++
			}
		case html.EndTagToken:
			if AcceptableElements[t.Data] {
				buf.WriteString(t.String())
			} else if UnacceptableElementsWithEndTag[t.Data] && skip > 0 {
				// Never let a stray end tag push the counter below zero;
				// a negative count would suppress all text that follows.
				skip--
			}
		default:
			if skip == 0 {
				buf.WriteString(t.String())
				if t.Type == html.TextToken {
					strip.WriteString(t.String())
				}
			}
		}
	}

	return buf.String(), strip.String()
}
Exemple #7
0
// RepositoryURL creates the public URL for the named git repo. The base is a
// copy of config.URL when that is set; otherwise it is derived from the request
// r, using the request's Host and the "http" scheme. If both config.URL and r
// are nil, the returned URL is nil.
//
// The path of the result is "/" followed by name, and any query string or
// fragment of the base is dropped.
func RepositoryURL(config *Config, name string, r *http.Request) *url.URL {
	var repoURL url.URL

	if config.URL != nil {
		repoURL = *config.URL
	} else if r != nil {
		repoURL = *r.URL
		repoURL.Host = r.Host
		repoURL.Scheme = "http"
	} else {
		return nil
	}

	repoURL.Path = "/" + name
	repoURL.RawQuery, repoURL.Fragment = "", ""
	return &repoURL
}
Exemple #8
0
// ensureCanonical rewrites u in place into a canonical form and reports
// whether that was possible. Only http and https URLs are accepted; for any
// other scheme false is returned and u is left untouched.
//
// To reduce redundancy, https is folded into http, one trailing slash is
// removed from the path and the fragment is cleared.
func ensureCanonical(u *url.URL) bool {
	switch u.Scheme {
	case "http", "https":
		// supported
	default:
		return false
	}

	u.Scheme = "http"

	if n := len(u.Path); n > 0 && u.Path[n-1] == '/' {
		u.Path = u.Path[:n-1]
	}

	u.Fragment = ""

	return true
}
// GitClone clones the git repository at repo into destination using the git
// binary found on PATH. The URL fragment, when present, names the branch to
// check out and is stripped from the URL passed to git.
//
// A shallow (--depth 1) recursive clone of the requested branch is tried
// first. If that fails, a full recursive clone is made and the branch is
// checked out afterwards.
//
// An error is returned when git cannot be found, when both clone attempts
// fail, or when the branch checkout after the full clone fails.
func GitClone(repo url.URL, destination string) error {
	gitPath, err := exec.LookPath("git")
	if err != nil {
		return err
	}

	// repo is passed by value, so clearing the fragment stays local.
	branch := repo.Fragment
	repo.Fragment = ""
	gitUrl := repo.String()

	// The long option needs two dashes; "-depth" is rejected by git, which
	// made the shallow attempt fail every time.
	args := []string{
		"clone",
		"--depth",
		"1",
	}

	if branch != "" {
		args = append(args, "-b", branch)
	}

	args = append(args, "--recursive", gitUrl, destination)

	if err = exec.Command(gitPath, args...).Run(); err == nil {
		return nil
	}

	// Shallow clone failed: fall back to a full clone.
	if err = exec.Command(gitPath, "clone", "--recursive", gitUrl, destination).Run(); err != nil {
		return fmt.Errorf("Failed to clone git repository at %s", gitUrl)
	}

	if branch != "" {
		checkout := exec.Command(gitPath, "--git-dir="+destination+"/.git", "--work-tree="+destination, "checkout", branch)
		if err = checkout.Run(); err != nil {
			return fmt.Errorf("Failed to checkout branch '%s' for git repository at %s", branch, gitUrl)
		}
	}

	return nil
}
Exemple #10
0
// Normalize rewrites u in place into a normalized form.
//
// The scheme is lowercased and must be http or https. The host is replaced by
// the value returned from validateHost, and the port is dropped when it is the
// default for the scheme (80 for http, 443 for https). Path and RawPath are
// cleaned with path.Clean, keeping a trailing slash if there was one, and the
// fragment is removed.
//
// An error is returned when the URL is not valid UTF-8, when the scheme is
// unsupported, or when the host is empty or rejected by validateHost.
func Normalize(u *url.URL) error {
	if raw := u.String(); !utf8.ValidString(raw) {
		return fmt.Errorf("normalize URL: invalid UTF-8 string: %q", raw)
	}

	u.Scheme = strings.ToLower(u.Scheme)
	switch u.Scheme {
	case "http", "https":
		// supported
	default:
		return fmt.Errorf("normalize URL: unsupported scheme: %v", u.Scheme)
	}

	host, port, err := net.SplitHostPort(u.Host)
	if err != nil { // missing port
		host, port = u.Host, ""
	}
	if host == "" {
		return errors.New("normalize URL: empty host")
	}
	validHost, hostErr := validateHost(host)
	if hostErr != nil {
		return fmt.Errorf("normalize URL: invalid host %q: %v", host, hostErr)
	}
	u.Host = validHost

	// Only a non-default port is kept in the host.
	isDefaultPort := (u.Scheme == "http" && port == "80") ||
		(u.Scheme == "https" && port == "443")
	if port != "" && !isDefaultPort {
		u.Host = net.JoinHostPort(u.Host, port)
	}

	// tidy cleans a path but preserves a trailing slash, which path.Clean
	// would otherwise strip.
	tidy := func(orig string) string {
		cleaned := path.Clean(orig)
		if cleaned == "." {
			return ""
		}
		if strings.HasSuffix(orig, "/") && !strings.HasSuffix(cleaned, "/") {
			return cleaned + "/"
		}
		return cleaned
	}
	u.Path = tidy(u.Path)
	u.RawPath = tidy(u.RawPath)
	u.Fragment = ""
	return nil
}
Exemple #11
0
// cloneArgs builds the argument list for a recursive "git clone" of remoteURL
// into root.
//
// A shallow clone ("--depth 1") is requested only when remoteURL carries no
// fragment. For http(s) remotes the server is additionally probed with a HEAD
// request to its info/refs endpoint, and the shallow clone is kept only if the
// reply has the git-upload-pack advertisement content type.
//
// The fragment of remoteURL is cleared in place so that it does not end up in
// the URL given to git.
func cloneArgs(remoteURL *url.URL, root string) []string {
	args := []string{"clone", "--recursive"}
	shallow := len(remoteURL.Fragment) == 0

	if shallow && strings.HasPrefix(remoteURL.Scheme, "http") {
		res, err := http.Head(fmt.Sprintf("%s/info/refs?service=git-upload-pack", remoteURL))
		if err != nil {
			shallow = false
		} else {
			// The body must be closed even for HEAD, or the connection leaks.
			res.Body.Close()
			if res.Header.Get("Content-Type") != "application/x-git-upload-pack-advertisement" {
				shallow = false
			}
		}
	}

	if shallow {
		args = append(args, "--depth", "1")
	}

	if remoteURL.Fragment != "" {
		remoteURL.Fragment = ""
	}

	return append(args, remoteURL.String(), root)
}
Exemple #12
0
// ReorderAndCrop normalizes url in place according to conf.
//
// Surrounding whitespace and a trailing slash are trimmed from the path, the
// anchor (#sth) fragment is removed and the host is lowercased (and passed
// through StripWWW when conf.StripWWW is set). The query string is either
// dropped entirely (conf.StripQueryString) or filtered by every entry of
// conf.Params whose regex matches the URL and then re-encoded, which also
// sorts the parameters by key.
func ReorderAndCrop(conf *config.ParsingConfig, url *url.URL) {
	url.Path = strings.TrimSuffix(strings.TrimSpace(url.Path), "/")

	if conf.StripQueryString {
		url.RawQuery = ""
	} else {
		rawURL := url.String()
		values := url.Query()
		for _, filter := range conf.Params {
			if !filter.Regex.MatchString(rawURL) {
				continue
			}

			if !filter.Include {
				// The listed params are irrelevant: remove exactly those.
				for _, name := range filter.Params {
					values.Del(name)
				}
				continue
			}

			// Only the listed params are relevant: remove all the others.
			for key := range values {
				allowed := false
				for _, name := range filter.Params {
					if name == key {
						allowed = true
					}
				}
				if !allowed {
					values.Del(key)
				}
			}
		}
		url.RawQuery = values.Encode()
	}

	url.Fragment = ""

	url.Host = strings.ToLower(url.Host)
	if conf.StripWWW {
		url.Host = StripWWW(url.Host)
	}
}
Exemple #13
0
// Crawl fetches rootURL and then crawls every internal URL found on it in a
// new goroutine, with the remaining depth reduced by one.
//
// waitGroup.Done is deferred, so every call is expected to be matched by an
// earlier waitGroup.Add(1), as is done here for the spawned goroutines. The
// fragment of rootURL is cleared in place, and URLs already recorded in
// ca.Visited are skipped. Problems (depth exhausted, fetch failure, an
// unparsable internal URL) are sent on ca.Errors and end this call.
func (ca *CrawlerApp) Crawl(rootURL *url.URL, depth int) {
	defer ca.waitGroup.Done()

	if depth <= 0 {
		ca.Errors <- errors.New("Reached max depth")
		return
	}

	rootURL.Fragment = ""
	key := rootURL.String()

	// Check and mark the URL as visited under a single lock.
	ca.mutex.Lock()
	_, seen := ca.Visited[key]
	if !seen {
		ca.Visited[key] = true
	}
	ca.mutex.Unlock()
	if seen {
		return
	}

	results, err := ca.Fetch(key)
	if err != nil {
		ca.Errors <- err
		return
	}

	ca.PrettyPrint(rootURL, results)

	for raw := range results.internalURLs {
		child, err := url.Parse(raw)
		if err != nil {
			ca.Errors <- err
			return
		}

		ca.waitGroup.Add(1)
		go ca.Crawl(child, depth-1)
	}
}
Exemple #14
0
// links collects the targets of all <a href> elements in doc, resolved
// against u. Fragments are dropped and one trailing slash is removed from
// each path. An href that cannot be resolved is logged and skipped. The
// returned error is always nil.
func (e *Extractor) links(u *url.URL, doc *goquery.Document) ([]*url.URL, error) {
	found := make([]*url.URL, 0, 5)

	collect := func(i int, sel *goquery.Selection) {
		href, _ := sel.Attr("href")

		target, err := u.Parse(href)
		if err != nil {
			e.log.WithError(err).Errorf("Error resolving URL %s", href)
			return
		}

		target.Fragment = ""
		if n := len(target.Path); n > 0 && target.Path[n-1] == '/' {
			target.Path = target.Path[:n-1]
		}

		found = append(found, target)
	}
	doc.Find("a[href]").Each(collect)

	return found, nil
}
Exemple #15
0
// removeFragment clears the fragment of u in place and returns the same
// pointer. The returned error is always nil.
func removeFragment(u *url.URL) (*url.URL, error) {
	if u.Fragment != "" {
		u.Fragment = ""
	}
	return u, nil
}
Exemple #16
0
// pushURL pushes a new Lua table wrapping u onto the stack of l.
//
// The table gets the function fields "isAbs", "parse", "requestURI" and
// "string", each built from u. It is then given a metatable whose __index and
// __newindex hooks map the keys "scheme", "opaque", "host", "path", "rawQuery"
// and "fragment" onto the corresponding fields of u, so values are read from
// and written to the Go URL itself. For any other key, the __index hook
// returns nothing and the __newindex hook stores the value in the table with
// a raw set.
func pushURL(l *lua.State, u *url.URL) {
	// index serves reads of the URL fields straight from u.
	index := func(s *lua.State) int {
		switch lua.CheckString(s, 2) {
		case "scheme":
			s.PushString(u.Scheme)
		case "opaque":
			s.PushString(u.Opaque)
		case "host":
			s.PushString(u.Host)
		case "path":
			s.PushString(u.Path)
		case "rawQuery":
			s.PushString(u.RawQuery)
		case "fragment":
			s.PushString(u.Fragment)
		default:
			return 0
		}
		return 1
	}

	// newIndex routes writes of the URL fields into u.
	newIndex := func(s *lua.State) int {
		key, val := lua.CheckString(s, 2), lua.CheckString(s, 3)
		switch key {
		case "scheme":
			u.Scheme = val
		case "opaque":
			u.Opaque = val
		case "host":
			u.Host = val
		case "path":
			u.Path = val
		case "rawQuery":
			u.RawQuery = val
		case "fragment":
			u.Fragment = val
		default:
			s.RawSet(1)
		}
		return 0
	}

	methods := []struct {
		name string
		make func(*url.URL) lua.Function
	}{
		{"isAbs", urlIsAbs},
		{"parse", urlParse},
		{"requestURI", urlRequestURI},
		{"string", urlString},
	}

	// The table that represents the URL.
	l.NewTable()
	for _, m := range methods {
		l.PushGoFunction(m.make(u))
		l.SetField(-2, m.name)
	}

	// Its metatable.
	l.NewTable()

	l.PushGoFunction(index)
	l.SetField(-2, "__index")

	l.PushGoFunction(newIndex)
	l.SetField(-2, "__newindex")

	l.SetMetaTable(-2)
}
Exemple #17
0
// ParseWithSocket parses url like this: mysql://root:pass@unix(/var/run/mysql.socket)/db
// and normal urls.
//
// The input is split into scheme, optional user[:password], host, path, query
// and fragment. When the part after the credentials matches regUnix, the match
// becomes the host and is removed before the path is looked for. The path ends
// at the first '?' or '#', and a '/' that only occurs inside the query or the
// fragment is not treated as the start of a path. pathInHost is applied to the
// URL once host and path are known.
//
// An error is returned when the "://" separator is missing, when the password
// cannot be unescaped, when the socket address is invalid, or when the query
// or fragment is malformed.
func ParseWithSocket(url_ string) (*url.URL, error) {
	u := new(url.URL)
	s := strings.SplitN(url_, "://", 2)
	if len(s) != 2 {
		return nil, e.New("invalid url")
	}
	u.Scheme = s[0]
	rest := ""
	s = strings.SplitN(s[1], "@", 2)
	if len(s) == 1 {
		rest = s[0]
	} else if len(s) == 2 {
		userpass := strings.SplitN(s[0], ":", 2)
		if len(userpass) == 1 {
			u.User = url.User(userpass[0])
		} else if len(userpass) == 2 {
			pass, err := url.QueryUnescape(userpass[1])
			if err != nil {
				return nil, e.New(err)
			}
			u.User = url.UserPassword(userpass[0], pass)
		} else {
			return nil, e.New("invalid user password")
		}
		rest = s[1]
	} else {
		return nil, e.New("invalid user string")
	}

	unix := regUnix.FindAllString(rest, 1)
	if len(unix) == 1 {
		u.Host = unix[0]
		rest = strings.TrimSpace(regUnix.ReplaceAllLiteralString(rest, ""))
		q := strings.Index(rest, "?")
		f := strings.Index(rest, "#")
		// The path ends at whichever of '?' or '#' comes first.
		pend := f
		if q >= 0 && (f == -1 || q < f) {
			pend = q
		}
		i := strings.Index(rest, "/")
		if pend != -1 && i > pend {
			// The slash belongs to the query or fragment, not to a path.
			i = -1
		}
		if i != -1 && pend != -1 {
			u.Path = rest[i:pend]
			rest = rest[pend:]
		} else if i == -1 && pend != -1 {
			rest = rest[pend:]
		} else if i != -1 && pend == -1 {
			u.Path = rest[i:]
			pathInHost(u)
			return u, nil
		} else if i == -1 && pend == -1 {
			pathInHost(u)
			return u, nil
		}
	} else if len(unix) == 0 {
		q := strings.Index(rest, "?")
		f := strings.Index(rest, "#")
		// The path ends at whichever of '?' or '#' comes first.
		pend := f
		ff := f
		if f == -1 {
			ff = math.MaxInt64
		}
		if q < ff && q >= 0 {
			pend = q
		}
		i := strings.Index(rest, "/")
		if pend != -1 && i > pend {
			// The slash belongs to the query or fragment, not to a path.
			i = -1
		}
		if i != -1 && pend != -1 {
			u.Host = rest[:i]
			u.Path = rest[i:pend]
			rest = rest[pend:]
		} else if i == -1 && pend != -1 {
			u.Host = rest[:pend]
			rest = rest[pend:]
		} else if i != -1 && pend == -1 {
			u.Host = rest[:i]
			u.Path = rest[i:]
			pathInHost(u)
			return u, nil
		} else if i == -1 && pend == -1 {
			u.Host = rest
			pathInHost(u)
			return u, nil
		}
	} else {
		return nil, e.New("socket address is invalid")
	}

	pathInHost(u)

	// From here on rest starts at the query or fragment delimiter.
	q := strings.Index(rest, "?")
	f := strings.Index(rest, "#")

	if q+1 >= len(rest) {
		return nil, e.New("error parsing query")
	}
	if f+1 >= len(rest) {
		return nil, e.New("error parsing fragment")
	}

	if q != -1 && f != -1 && q <= f {
		u.RawQuery = rest[q+1 : f]
		u.Fragment = rest[f+1:]
	} else if q != -1 && f == -1 {
		u.RawQuery = rest[q+1:]
	} else if q == -1 && f != -1 {
		u.Fragment = rest[f+1:]
	} else if q == -1 && f == -1 {
		return u, nil
	} else {
		return nil, e.New("error parsing query and fragment %v, %v", q, f)
	}
	return u, nil
}
Exemple #18
0
// Set attempts to set a string value to an address.
//
// When a.isURL reports value to be a URL it is parsed as one; otherwise value
// is used as the host, with a.DefaultScheme (or "tcp" if that is empty) as the
// scheme. A host that contains ':' is split into host and port. Without an
// explicit port, a.DefaultPort is used for non-URL values, falling back to 80
// for http and 443 for https; any other scheme is an error.
//
// On success a.Host, a.Port, a.URL and a.Value are updated, and a.Provided is
// set when value differs from the previous a.Value. The stored URL never
// carries a query or fragment, and carries a path only if a.AllowPrefix is set.
func (a *Addr) Set(value string) error {
	isURL := a.isURL(value)

	var addr *url.URL
	if !isURL {
		scheme := a.DefaultScheme
		if len(scheme) == 0 {
			scheme = "tcp"
		}
		addr = &url.URL{
			Scheme: scheme,
			Host:   value,
		}
	} else {
		parsed, err := url.Parse(value)
		if err != nil {
			return fmt.Errorf("not a valid URL: %v", err)
		}
		addr = parsed
	}

	if strings.Contains(addr.Host, ":") {
		host, portStr, err := net.SplitHostPort(addr.Host)
		if err != nil {
			return fmt.Errorf("not a valid host:port: %v", err)
		}
		portNum, err := strconv.ParseUint(portStr, 10, 64)
		if err != nil {
			return fmt.Errorf("not a valid port: %v", err)
		}
		a.Host, a.Port = host, int(portNum)
	} else {
		// No explicit port: use the configured default or the scheme's.
		port := 0
		if !isURL {
			port = a.DefaultPort
		}
		if port == 0 {
			switch addr.Scheme {
			case "http":
				port = 80
			case "https":
				port = 443
			default:
				return fmt.Errorf("no port specified")
			}
		}
		a.Host, a.Port = addr.Host, port
		addr.Host = net.JoinHostPort(addr.Host, strconv.FormatInt(int64(a.Port), 10))
	}

	if !a.AllowPrefix {
		addr.Path = ""
	}
	addr.RawQuery, addr.Fragment = "", ""

	if value != a.Value {
		a.Provided = true
	}
	a.URL, a.Value = addr, value

	return nil
}
Exemple #19
0
// urlWithoutRef returns the string form of url with its fragment removed.
// url is passed by value, so the caller's URL keeps its fragment.
func urlWithoutRef(url url.URL) string {
	stripped := url
	stripped.Fragment = ""
	return stripped.String()
}
Exemple #20
0
// removeFragment clears the fragment of u in place.
func removeFragment(u *url.URL) {
	if u.Fragment == "" {
		return
	}
	u.Fragment = ""
}
Exemple #21
0
/*
 * RemoveFragment strips the #fragment part from url, modifying it in place.
 */
func RemoveFragment(url *url.URL) {
	stripped := *url
	stripped.Fragment = ""
	*url = stripped
}