예제 #1
0
// genUrl rewrites url so that it ends in a concrete file name.
//
// url may or may not carry a scheme ("http://a.com" or "a.com"). Everything
// from the first '?' or '#' onward is split off before the path is examined
// and is re-attached only when the default file name is appended:
//
//	a.com/a/?id=12     -> a.com/a/<defaultFilename>?id=12
//	a.com              -> a.com/<defaultFilename>        (url is the bare host)
//	a.com/b/c.php?id=1 -> a.com/b/c.html                 (query is dropped)
//	a.com/s.css?v=1    -> a.com/s.css                    (query is dropped)
//
// The name to use, and whether it is appended or replaces the last path
// segment, is decided by this.genFilename.
func (this *Crawler) genUrl(url string) string {
	// suffix keeps its leading '?' or '#'. Searching for both markers at
	// once keeps a fragment that precedes a '?' out of the path.
	suffix := ""
	if pos := strings.IndexAny(url, "?#"); pos != -1 {
		suffix = url[pos:]
		url = url[:pos]
	}

	// The bare host, with or without scheme, maps to the default file.
	if url == this.host || url == this.schemeAndHost {
		return url + "/" + this.defaultFilename + suffix
	}

	name, appendName := this.genFilename(url)
	if name == "" {
		// Nothing to rewrite; the URL is returned without query/fragment.
		return url
	}

	if appendName {
		// Trim a trailing slash first so "a.com/a/" does not turn into
		// "a.com/a//index.html".
		return strings.TrimSuffix(url, "/") + "/" + name + suffix
	}

	// Script page such as a.php => a.html: replace the last path segment.
	// The query/fragment is not re-attached on this path.
	segments := strings.Split(url, "/")
	return strings.Join(segments[:len(segments)-1], "/") + "/" + name
}
예제 #2
0
// trimQueryParams returns url with the query string and the fragment removed:
// the text is cut at the first '?', and the remainder is then cut at the
// first '#'.
func (this *Crawler) trimQueryParams(url string) string {
	// Order matters only in that '?' is handled before '#', as each cut
	// operates on the result of the previous one.
	for _, marker := range []string{"?", "#"} {
		if cut := strings.Index(url, marker); cut != -1 {
			// Presumably keeps the part of url before position cut;
			// util.Substr is defined outside this file.
			url = util.Substr(url, 0, cut)
		}
	}
	return url
}
예제 #3
0
// genFilename decides which file name the last path segment of url should
// map to. It returns the name and whether the caller has to append it to the
// URL (true) or substitute it for the last segment (false):
//
//	a.com/b/c/d   -> (defaultFilename, true)   no extension: treated as a directory
//	a.com/b/a.php -> ("a.html", false)         .php/.jsp/.asp/.aspx become .html
//	a.com/b/s.css -> ("", true)                any other extension: no rewrite
//
// The extension comparison is case-insensitive; the base name keeps its
// original case.
func (this *Crawler) genFilename(url string) (string, bool) {
	// Only the text after the final '/' matters. It is a URL segment, so it
	// is used as-is rather than being normalised as an OS path.
	last := url
	if i := strings.LastIndex(url, "/"); i != -1 {
		last = url[i+1:]
	}

	rawExt := filepath.Ext(last)
	switch strings.ToLower(rawExt) {
	case "":
		return this.defaultFilename, true // caller appends it to the URL
	case ".php", ".jsp", ".asp", ".aspx":
		// Strip the extension by byte length so multi-byte names stay intact.
		return last[:len(last)-len(rawExt)] + ".html", false
	}
	return "", true
}
예제 #4
0
// parseUrl derives this.scheme, this.host and this.schemeAndHost from url.
//
// The scheme is "http://" when url starts with exactly that prefix and
// "https://" in every other case, including URLs given without a scheme.
// The host is whatever follows the scheme up to the first '/', '?' or '#':
//
//	http://lealife.com/b/c -> scheme "http://",  host "lealife.com"
//	a.com?x=1              -> scheme "https://", host "a.com"
func (this *Crawler) parseUrl(url string) {
	if strings.HasPrefix(url, "http://") {
		this.scheme = "http://"
	} else {
		this.scheme = "https://"
	}

	// Strip the scheme only where it is a prefix; a scheme-less URL may
	// still contain "https://" further along (e.g. inside its query).
	rest := strings.TrimPrefix(url, this.scheme)

	// The host ends where the path, query or fragment begins.
	if end := strings.IndexAny(rest, "/?#"); end != -1 {
		rest = rest[:end]
	}
	this.host = rest

	this.schemeAndHost = this.scheme + this.host
}