//htt func (this *Tingo) genUrl(url string) string { queryParam, fragment := "", "" // 包含?,# paramIndex := strings.Index(url, "?") if paramIndex != -1 { queryParam = com.Substring(url, paramIndex) //"?"后边的参数 url = com.Substr(url, 0, paramIndex) } else { paramIndex = strings.Index(url, "#") if paramIndex != -1 { fragment = com.Substring(url, paramIndex) //"#"后边的参数 url = com.Substr(url, 0, paramIndex) } } // 如果url == host if url == this.host || url == this.agreementAndHost { return url + "/" + this.defaultFilename + queryParam + fragment } genFilename, needApend := this.genFilename(url) if genFilename != "" { if needApend { url += "/" + genFilename + queryParam + fragment } else { // 是a.php => a.html urlArr := strings.Split(url, "/") urlArr = urlArr[:len(urlArr)-1] url = strings.Join(urlArr, "/") + "/" + genFilename } } return url }
func (this *Tingo) handleHTML(pUrl, realPUrl, content string) (children []string) { regular := "(?i)(src=|href=)[\"']([^#].*?)[\"']" reg := regexp.MustCompile(regular) re := reg.FindAllStringSubmatch(content, -1) log.Println(pUrl + " => " + realPUrl) log.Println(pUrl + " 含有: ") baseDir := filepath.Dir(realPUrl) for _, each := range re { rawFullUrl := each[0] rawFullUrlPrefix := each[1] rawCUrl := each[2] cUrl := rawCUrl prefixNotHttp := false if strings.HasPrefix(cUrl, "//") { cUrl = this.agreement + com.Substring(cUrl, 2) prefixNotHttp = true } else if strings.HasPrefix(cUrl, "/") { cUrl = this.agreementAndHost + cUrl } // 如果这个url是一个目录, 新建一个文件 // 如果这个url是以http://a.com开头的, host是一样的, // 那么content的url是相对于该url // 生成的url, 如果是目录, 会生成一个文件 cRealUrl, ok := this.getRalativeUrl(realPUrl, cUrl) // 表示已处理过, 是相对目录了, 必须把内容的替换掉 // 但要处理的还是之前的链接http:// if ok == -1 { // 如果之前//替换成了http:// if prefixNotHttp { content = strings.Replace(content, rawFullUrl, rawFullUrlPrefix+"\""+cRealUrl+"\"", -1) } continue } if ok == 1 { cRealUrl = strings.Trim(cRealUrl, "/") // 把//变成/ for strings.Index(cRealUrl, "//") != -1 { cRealUrl = strings.Replace(cRealUrl, "//", "/", -1) } log.Println(rawCUrl + " >>>>>> " + cRealUrl) content = strings.Replace(content, rawFullUrl, rawFullUrlPrefix+"\""+cRealUrl+"\"", -1) cUrl = strings.Replace(cUrl, this.agreement, "", 1) // 把sheme去掉, do children = append(children, cUrl) // 不需要clean } else { children = append(children, this.cleanUrl(baseDir+"/"+cRealUrl)) } } // 把content保存起来 if !this.writeFile(realPUrl, content) { return } return }
func (this *Tingo) Fetch(url, targetPath string) { url = strings.TrimSpace(url) //去两边的空格 //格式化url this.parseUrl(url) //保存路径 this.handleTargetPath(targetPath) //去掉"http://"或是"https://" url = com.Substring(url, len(this.agreement)) this.handleUrl(url, false) //Wait方法阻塞直到WaitGroup计数器减为0 this.wg.Wait() // 处理异常 this.handleExceptionUrl() }