// genUrl maps a page URL to the URL of the file it is stored under.
//
// url may or may not carry a scheme (http://a.com or a.com); getRelativeUrl
// may pass the http://a.com form.
//
// The query string (from the first '?', including any fragment after it) or,
// when there is no '?', the fragment (from the first '#') is split off first.
// Then:
//
//   - If what remains is exactly the host (with or without scheme), or its
//     last segment is classified by genFilename as a directory, the generated
//     file name is appended and the query string / fragment is re-attached:
//     a.com/a/?id=12 => a.com/a/<defaultFilename>?id=12.
//   - If genFilename returns a replacement name (a.php => a.html), the last
//     path segment is replaced by it; query string and fragment are dropped.
//   - Otherwise the URL is returned without its query string and fragment.
func (this *LeaSpider) genUrl(url string) string {
	// strings.Index yields byte offsets, so the string is sliced by bytes.
	queryParam, fragment := "", "" // each keeps its leading '?' / '#'
	if pos := strings.Index(url, "?"); pos != -1 {
		queryParam, url = url[pos:], url[:pos]
	} else if pos := strings.Index(url, "#"); pos != -1 {
		fragment, url = url[pos:], url[:pos]
	}

	// The URL is just the host.
	if url == this.host || url == this.schemeAndHost {
		return url + "/" + this.defaultFilename + queryParam + fragment
	}

	name, appendName := this.genFilename(url)
	if name == "" {
		return url
	}
	if appendName {
		// Drop one trailing slash first so that "a.com/a/" becomes
		// "a.com/a/<name>" rather than "a.com/a//<name>".
		return strings.TrimSuffix(url, "/") + "/" + name + queryParam + fragment
	}

	// a.php => a.html: swap the last path segment for the generated name.
	dir := ""
	if i := strings.LastIndex(url, "/"); i != -1 {
		dir = url[:i]
	}
	return dir + "/" + name
}
// Example no. 2
// 0
// trimQueryParams returns url with its query string and fragment removed,
// i.e. cut at the first '?' or '#', whichever comes first. A url containing
// neither character is returned unchanged.
func trimQueryParams(url string) string {
	// IndexAny reports a byte offset, so the prefix is taken by byte slicing.
	if pos := strings.IndexAny(url, "?#"); pos != -1 {
		return url[:pos]
	}
	return url
}
// genFilename decides which file name the page at url should be saved under.
// Only the last "/"-separated segment of url is examined:
//
//   - No extension (a.com/b/c/d, or a URL ending in "/"): the URL is treated
//     as a directory. Returns (this.defaultFilename, true); true means the
//     caller has to append the name to the URL.
//   - .php, .jsp, .asp or .aspx, compared case-insensitively: returns the
//     segment with its extension swapped for ".html" (a.php => a.html) and
//     false, meaning the name replaces the last segment.
//   - Any other extension: returns ("", true); no file name is generated.
func (this *LeaSpider) genFilename(url string) (string, bool) {
	// Take the last path segment; without any "/" the whole string is it.
	last := url
	if i := strings.LastIndex(url, "/"); i != -1 {
		last = url[i+1:]
	}

	ext := filepath.Ext(last)
	switch strings.ToLower(ext) {
	case "":
		return this.defaultFilename, true // needs to be appended to the URL
	case ".php", ".jsp", ".asp", ".aspx":
		// last is already a single segment, so only the extension is cut.
		return last[:len(last)-len(ext)] + ".html", false
	}
	return "", true
}
// parseUrl extracts the scheme and host from url and stores them in
// this.scheme, this.host and this.schemeAndHost.
//
// The scheme is "http://" only when url starts with exactly that prefix;
// every other input, including a URL without a scheme, is recorded as
// "https://". The host is whatever follows the scheme up to the first "/",
// "?" or "#", so http://a.com?id=1 yields the host a.com.
func (this *LeaSpider) parseUrl(url string) {
	this.scheme = "https://"
	if strings.HasPrefix(url, "http://") {
		this.scheme = "http://"
	}

	// http://lealife.com/b/c => lealife.com
	// Only a leading scheme is removed; the same text further inside the URL
	// (e.g. in a query parameter) is left alone.
	rest := strings.TrimPrefix(url, this.scheme)
	this.host = rest
	if end := strings.IndexAny(rest, "/?#"); end != -1 {
		this.host = rest[:end]
	}

	this.schemeAndHost = this.scheme + this.host
}
// Example no. 5
// 0
func main() {
	regular := "http:\\/|https:\\/|javascript:|mailto:|" class=|@.*?\\..+"
	reg := regexp.MustCompile(regular)
	url := "javascript:"

	println(reg.MatchString(url))
	filename := filepath.Dir("a/b/c/aaabceeeeeeee.php") // a.php
	println(filename)
	filename = util.Substr(filename, 0, len(filename)-len(".php")) // a
	println(filename)

	// _, err2 := os.Open("D:\\a.jpg")
	_, err2 := os.Stat("D:/a.jpg")
	if err2 != nil && os.IsNotExist(err2) {
		println("file not exist!\n")
	} else {
		println("file exists")
	}

	println(trimQueryParams("a.jgp33333##"))

	regular = "(?i)(src=|href=)[\"']([^#].*?)[\"']"
	reg = regexp.MustCompile(regular)
	println(reg.MatchString("url(a)"))
	println()
	re := reg.FindAllStringSubmatch("src='xaaxx3333333333331'  href=\"aaaxx\"", -1)
	for _, each := range re {
		fmt.Println(each[2])
	}

	url = "http://www.a.comddd/b/c"
	url = strings.Replace(url, "http://", "", 1)
	index := strings.Index(url, "/")
	if index == -1 {
		println(url)
	} else {
		println(util.Substr(url, 0, index))
	}

	println("--------------")
	arr := []string{"life", "ax", "cdj"}
	sort.Strings(arr)
	fmt.Println(arr)

	cUrl := "//lifedddddddddddddddddddd"
	println(util.Substring(cUrl, 2))

	queryParam, fragment := "", ""
	url = "http://a.com?id=12#ddd"
	pos := strings.Index(url, "?")
	if pos != -1 {
		queryParam = util.Substring(url, pos)
		url = util.Substr(url, 0, pos)
	} else {
		pos = strings.Index(url, "#")
		if pos != -1 {
			fragment = util.Substring(url, pos)
			url = util.Substr(url, 0, pos)
		}
	}

	println(queryParam, fragment)

	urlArr := strings.Split("a/b/c/d.html", "/")
	urlArr = urlArr[:len(urlArr)-1]
	println(strings.Join(urlArr, "/"))
}