예제 #1
0
//htt
// genUrl rewrites url so that it ends in a concrete file name.
//
// The trailing "?..." part (or, when there is no "?", the trailing "#..."
// part) is split off first, delimiter included, and the remaining url is
// then handled as follows:
//
//   - if it equals this.host or this.agreementAndHost, the result is
//     url + "/" + this.defaultFilename + the split-off suffix;
//   - otherwise this.genFilename(url) is consulted. An empty file name
//     returns the stripped url as is. If the second result is true the file
//     name is appended as a new path segment followed by the suffix; if it is
//     false the last "/"-separated segment is replaced by the file name
//     (e.g. a.php => a.html).
//
// NOTE(review): the suffix is dropped both when genFilename returns an empty
// name and when the last segment is replaced — confirm this is intended.
// NOTE(review): strings.Index yields a byte offset; whether com.Substring and
// com.Substr expect byte or rune offsets is not visible here — confirm.
func (this *Tingo) genUrl(url string) string {
	// suffix holds everything from the first "?" onwards, or from the first
	// "#" onwards when url has no "?". The delimiter itself is included.
	suffix := ""
	cut := strings.Index(url, "?")
	if cut == -1 {
		cut = strings.Index(url, "#")
	}
	if cut != -1 {
		suffix = com.Substring(url, cut)
		url = com.Substr(url, 0, cut)
	}

	// The url is just the host: point it at the default file.
	if url == this.host || url == this.agreementAndHost {
		return url + "/" + this.defaultFilename + suffix
	}

	name, appendName := this.genFilename(url)
	if name == "" {
		return url
	}
	if appendName {
		return url + "/" + name + suffix
	}

	// Swap the last path segment for the generated name (a.php => a.html).
	segments := strings.Split(url, "/")
	parent := strings.Join(segments[:len(segments)-1], "/")
	return parent + "/" + name
}
예제 #2
0
// handleHTML scans content for src= / href= attributes (case-insensitive,
// single- or double-quoted, value not starting with '#'), rewrites some of
// them in content, writes the resulting content via this.writeFile(realPUrl,
// ...), and returns the links collected along the way.
//
// pUrl is only used for logging; realPUrl is passed to this.getRalativeUrl,
// this.writeFile and is the base for filepath.Dir.
//
// For every match the link is first normalised: a leading "//" is replaced
// by this.agreement, a leading "/" is prefixed with this.agreementAndHost.
// this.getRalativeUrl(realPUrl, link) then returns a replacement link and a
// status that selects what happens:
//
//   - -1: nothing is collected; the attribute in content is rewritten only
//     if the link originally started with "//". (Per the original comment
//     this status means the link was already handled — confirm against
//     getRalativeUrl.)
//   - 1: the replacement is trimmed of "/" on both ends, runs of "//" are
//     collapsed, the attribute in content is rewritten, and the normalised
//     link with the first occurrence of this.agreement removed is collected.
//   - anything else: this.cleanUrl(dir(realPUrl) + "/" + replacement) is
//     collected and content is left unchanged for this match.
//
// Rewritten attributes always use double quotes, whatever the original used.
func (this *Tingo) handleHTML(pUrl, realPUrl, content string) (children []string) {
	linkPattern := regexp.MustCompile("(?i)(src=|href=)[\"']([^#].*?)[\"']")
	matches := linkPattern.FindAllStringSubmatch(content, -1)

	log.Println(pUrl + " => " + realPUrl)
	log.Println(pUrl + " 含有: ")

	parentDir := filepath.Dir(realPUrl)

	for _, m := range matches {
		// m[0] is the whole attribute, m[1] the `src=` / `href=` part,
		// m[2] the quoted value.
		attr, attrPrefix, rawLink := m[0], m[1], m[2]

		link := rawLink
		schemeRelative := strings.HasPrefix(link, "//")
		switch {
		case schemeRelative:
			link = this.agreement + com.Substring(link, 2)
		case strings.HasPrefix(link, "/"):
			link = this.agreementAndHost + link
		}

		// Original author's notes: if the url is a directory a file is
		// created for it; if the url starts with the same host, the url in
		// content becomes relative to it.
		localLink, status := this.getRalativeUrl(realPUrl, link)

		switch status {
		case -1:
			// Already handled according to the original comment; only a
			// "//"-prefixed link still needs its text replaced in content.
			if schemeRelative {
				content = strings.Replace(content, attr, attrPrefix+"\""+localLink+"\"", -1)
			}
		case 1:
			localLink = strings.Trim(localLink, "/")
			// Collapse every run of slashes down to a single "/".
			for strings.Contains(localLink, "//") {
				localLink = strings.Replace(localLink, "//", "/", -1)
			}
			log.Println(rawLink + " >>>>>> " + localLink)
			content = strings.Replace(content, attr, attrPrefix+"\""+localLink+"\"", -1)
			// Strip the scheme; this link is collected without cleanUrl.
			children = append(children, strings.Replace(link, this.agreement, "", 1))
		default:
			children = append(children, this.cleanUrl(parentDir+"/"+localLink))
		}
	}

	// Persist the rewritten content. The collected children are returned
	// whether or not the write succeeds, so the result is not acted on here.
	_ = this.writeFile(realPUrl, content)
	return children
}
예제 #3
0
// Fetch runs a complete fetch for url, with targetPath handed to
// this.handleTargetPath (the save location, per the original comment).
//
// The steps, in order: trim surrounding whitespace from url, let
// this.parseUrl inspect it, set up the target path, drop the leading
// len(this.agreement) characters from url (the "http://" or "https://"
// prefix, per the original comment), pass the remainder to this.handleUrl,
// block on this.wg.Wait(), and finally call this.handleExceptionUrl.
func (this *Tingo) Fetch(url, targetPath string) {
	trimmed := strings.TrimSpace(url)

	// parseUrl and handleTargetPath populate receiver state used below.
	// NOTE(review): assumes parseUrl sets this.agreement — confirm.
	this.parseUrl(trimmed)
	this.handleTargetPath(targetPath)

	// Remove the scheme prefix before handing the url on.
	withoutScheme := com.Substring(trimmed, len(this.agreement))
	this.handleUrl(withoutScheme, false)

	// Wait blocks until the wait group's counter is back to zero.
	this.wg.Wait()

	// Deal with the urls that failed along the way.
	this.handleExceptionUrl()
}