// 生成真实的url // 传来的url可能是http://a.com, 也可能是a.com // getRelativeUrl传来的可以是http://a.com // url = a.com/a/?id=12&id=1221, 那么genUrl=a.com/a/index.html?id=121 func (this *LeaSpider) genUrl(url string) string { // 去掉?后面的 queryParam, fragment := "", "" // 包含?,# pos := strings.Index(url, "?") if pos != -1 { queryParam = util.Substring(url, pos) url = util.Substr(url, 0, pos) } else { pos = strings.Index(url, "#") if pos != -1 { fragment = util.Substring(url, pos) url = util.Substr(url, 0, pos) } } // 如果url == host if url == this.host || url == this.schemeAndHost { return url + "/" + this.defaultFilename + queryParam + fragment } genFilename, needApend := this.genFilename(url) if genFilename != "" { if needApend { url += "/" + genFilename + queryParam + fragment } else { // 是a.php => a.html urlArr := strings.Split(url, "/") urlArr = urlArr[:len(urlArr)-1] url = strings.Join(urlArr, "/") + "/" + genFilename } } return url }
// trimQueryParams returns url with its query string and fragment removed,
// i.e. the part of url before the first "?" or "#". A url containing neither
// character is returned unchanged.
func trimQueryParams(url string) string {
	// Cutting at the first "?" and then at the first "#" of what remains is
	// the same as cutting at whichever of the two characters comes first.
	if end := strings.IndexAny(url, "?#"); end != -1 {
		return url[:end]
	}
	return url
}
// 如果url是 a.com/b/c/d // 生成一个文件a.com/b/c/d/d_leaui_index.html // 返回 d_leaui_index.html // 如果不是一个目录, 返回"" func (this *LeaSpider) genFilename(url string) (string, bool) { urlArr := strings.Split(url, "/") if urlArr != nil { last := urlArr[len(urlArr)-1] ext := strings.ToLower(filepath.Ext(last)) if ext == "" { return this.defaultFilename, true // 需要append到url后面 } else if util.InArray([]string{".php", ".jsp", ".asp", ".aspx"}, ext) { filename := filepath.Base(last) // a.php filename = util.Substr(filename, 0, len(filename)-len(ext)) // a return filename + ".html", false } } return "", true }
// 处理url, 得到scheme, host func (this *LeaSpider) parseUrl(url string) { if strings.HasPrefix(url, "http://") { this.scheme = "http://" } else { this.scheme = "https://" } // http://lealife.com/b/c url = strings.Replace(url, this.scheme, "", 1) index := strings.Index(url, "/") if index == -1 { this.host = url } else { this.host = util.Substr(url, 0, index) } this.schemeAndHost = this.scheme + this.host }
func main() { regular := "http:\\/|https:\\/|javascript:|mailto:|" class=|@.*?\\..+" reg := regexp.MustCompile(regular) url := "javascript:" println(reg.MatchString(url)) filename := filepath.Dir("a/b/c/aaabceeeeeeee.php") // a.php println(filename) filename = util.Substr(filename, 0, len(filename)-len(".php")) // a println(filename) // _, err2 := os.Open("D:\\a.jpg") _, err2 := os.Stat("D:/a.jpg") if err2 != nil && os.IsNotExist(err2) { println("file not exist!\n") } else { println("file exists") } println(trimQueryParams("a.jgp33333##")) regular = "(?i)(src=|href=)[\"']([^#].*?)[\"']" reg = regexp.MustCompile(regular) println(reg.MatchString("url(a)")) println() re := reg.FindAllStringSubmatch("src='xaaxx3333333333331' href=\"aaaxx\"", -1) for _, each := range re { fmt.Println(each[2]) } url = "http://www.a.comddd/b/c" url = strings.Replace(url, "http://", "", 1) index := strings.Index(url, "/") if index == -1 { println(url) } else { println(util.Substr(url, 0, index)) } println("--------------") arr := []string{"life", "ax", "cdj"} sort.Strings(arr) fmt.Println(arr) cUrl := "//lifedddddddddddddddddddd" println(util.Substring(cUrl, 2)) queryParam, fragment := "", "" url = "http://a.com?id=12#ddd" pos := strings.Index(url, "?") if pos != -1 { queryParam = util.Substring(url, pos) url = util.Substr(url, 0, pos) } else { pos = strings.Index(url, "#") if pos != -1 { fragment = util.Substring(url, pos) url = util.Substr(url, 0, pos) } } println(queryParam, fragment) urlArr := strings.Split("a/b/c/d.html", "/") urlArr = urlArr[:len(urlArr)-1] println(strings.Join(urlArr, "/")) }