func extractLinks(runtimeConfig RuntimeConfig, fileUrl string, fileName []byte, body []byte) { // siteUrlStr := string(fileName) // siteUrlStr = strings.TrimLeft(siteUrlStr, "data/") // siteUrlStr = "http://" + siteUrlStr // log.Debug("fileName to Url:", string(fileName), ",", siteUrlStr) siteUrlStr := fileUrl siteConfig := runtimeConfig.TaskConfig siteUrlByte := []byte(siteUrlStr) log.Debug("enter links extract,", siteUrlStr) if siteConfig.SkipPageParsePattern.Match(siteUrlByte) { log.Debug("hit SkipPageParsePattern pattern,", siteUrlStr) return } log.Debug("parsing external links:", siteUrlStr, ",using:", siteConfig.LinkUrlExtractRegex) matches := siteConfig.LinkUrlExtractRegex.FindAllSubmatch(body, -1) log.Debug("extract links with pattern,total matchs:", len(matches), " match result,", string(fileName)) xIndex := 0 for _, match := range matches { log.Debug("dealing with match result,", xIndex) xIndex = xIndex + 1 url := match[siteConfig.LinkUrlExtractRegexGroupIndex] filterUrl := formatUrlForFilter(url) log.Debug("url clean result:", string(filterUrl), ",original url:", string(url)) filteredUrl := string(filterUrl) //filter error link if filteredUrl == "" { log.Debug("filteredUrl is empty,continue") continue } result1 := strings.HasPrefix(filteredUrl, "#") if result1 { log.Debug("filteredUrl started with: # ,continue") continue } result2 := strings.HasPrefix(filteredUrl, "javascript:") if result2 { log.Debug("filteredUrl started with: javascript: ,continue") continue } hit := false // l.Lock(); // defer l.Unlock(); if runtimeConfig.Storage.CheckWalkedUrl(filterUrl) || runtimeConfig.Storage.CheckFetchedUrl(filterUrl) || runtimeConfig.Storage.CheckPendingFetchUrl(filterUrl) { log.Debug("hit bloomFilter,continue") hit = true continue } if !hit { currentUrlStr := string(url) currentUrlStr = strings.Trim(currentUrlStr, " ") seedUrlStr := siteUrlStr seedURI, err := ParseRequestURI(seedUrlStr) if err != nil { log.Error("ParseSeedURI failed!: ", seedUrlStr, " , ", err) 
continue } currentURI1, err := ParseRequestURI(currentUrlStr) currentURI := currentURI1 if err != nil { if strings.Contains(err.Error(), "invalid URI for request") { log.Debug("invalid URI for request,fix relative url,original:", currentUrlStr) // log.Debug("old relatived url,", currentUrlStr) //page based relative urls currentUrlStr = "http://" + seedURI.Host + "/" + currentUrlStr currentURI1, err = ParseRequestURI(currentUrlStr) currentURI = currentURI1 if err != nil { log.Error("ParseCurrentURI internal failed!: ", currentUrlStr, " , ", err) continue } log.Debug("new relatived url,", currentUrlStr) } else { log.Error("ParseCurrentURI failed!: ", currentUrlStr, " , ", err) continue } } // relative links if currentURI == nil || currentURI.Host == "" { if strings.HasPrefix(currentURI.Path, "/") { //root based relative urls log.Debug("old relatived url,", currentUrlStr) currentUrlStr = "http://" + seedURI.Host + currentUrlStr log.Debug("new relatived url,", currentUrlStr) } else { log.Debug("old relatived url,", currentUrlStr) //page based relative urls urlPath := getRootUrl(currentURI) currentUrlStr = "http://" + urlPath + currentUrlStr log.Debug("new relatived url,", currentUrlStr) } } else { log.Debug("host:", currentURI.Host, " ", currentURI.Host == "") //resolve domain specific filter if siteConfig.FollowSameDomain { if siteConfig.FollowSubDomain { //TODO handler com.cn and .com,using a TLC-domain list } else if seedURI.Host != currentURI.Host { log.Debug("domain mismatch,", seedURI.Host, " vs ", currentURI.Host) //continue } //TODO follow all or list of domain } } if len(siteConfig.LinkUrlMustContain) > 0 { if !util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustContain) { log.Debug("link does not hit must-contain,ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain) continue } } if len(siteConfig.LinkUrlMustNotContain) > 0 { if util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustNotContain) { log.Debug("link hit must-not-contain,ignore,", 
currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain) continue } } //normalize url currentUrlStr = MustNormalizeURLString(currentUrlStr, FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes| FlagRemoveUnnecessaryHostDots|FlagRemoveDuplicateSlashes|FlagRemoveFragment) log.Debug("normalized url:", currentUrlStr) currentUrlByte := []byte(currentUrlStr) if !(runtimeConfig.Storage.CheckWalkedUrl(currentUrlByte) || runtimeConfig.Storage.CheckFetchedUrl(currentUrlByte) || runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte)) { //bloomFilter.Lookup(currentUrlByte) { // if(CheckIgnore(currentUrlStr)){} // log.Info("enqueue fetch: ", currentUrlStr) // broker.Publish(kafka.NewMessage(currentUrlByte)) //copied form fetchTask,TODO refactor //checking fetchUrlPattern log.Debug("started check fetchUrlPattern,", currentUrlStr) if siteConfig.FetchUrlPattern.Match(currentUrlByte) { log.Debug("match fetch url pattern,", currentUrlStr) if len(siteConfig.FetchUrlMustNotContain) > 0 { if util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustNotContain) { log.Debug("hit FetchUrlMustNotContain,ignore,", currentUrlStr) continue } } if len(siteConfig.FetchUrlMustContain) > 0 { if !util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustContain) { log.Debug("not hit FetchUrlMustContain,ignore,", currentUrlStr) continue } } } else { log.Debug("does not hit FetchUrlPattern ignoring,", currentUrlStr) continue } if !runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte) { log.Debug("log new pendingFetch url", currentUrlStr) runtimeConfig.Storage.LogPendingFetchUrl(runtimeConfig.PathConfig.PendingFetchLog, currentUrlStr) runtimeConfig.Storage.AddPendingFetchUrl(currentUrlByte) } else { log.Debug("hit new pendingFetch filter,ignore:", currentUrlStr) } // pendingUrls <- currentUrlByte // TODO pendingFetchFilter bloomFilter.Add(currentUrlByte) } else { log.Debug("hit bloom filter,ignore:", currentUrlStr) } // bloomFilter.Add([]byte(filterUrl)) } else { log.Debug("hit bloom 
filter,ignore,", string(url)) } log.Debug("exit links extract,", siteUrlStr) } //TODO 处理ruled fetch pattern log.Info("all links within ", siteUrlStr, " is done") }
// fetchUrl fetches url's content within the given timeout, applying the
// task's fetch-url and saving-url rules, persisting the page via Save, and
// recording fetched/failed outcomes in runtimeConfig.Storage.
// NOTE(review): the offsets parameter is not used anywhere in this body.
func fetchUrl(url []byte, timeout time.Duration, runtimeConfig RuntimeConfig, offsets *RoutingOffset) {
	// hard deadline for the whole fetch attempt
	t := time.NewTimer(timeout)
	defer t.Stop()
	resource := string(url)
	log.Debug("enter fetchUrl method:", resource)
	config := runtimeConfig.TaskConfig

	// skip urls that were already fetched
	if runtimeConfig.Storage.CheckFetchedUrl(url) {
		return
	}

	path := getSavedPath(runtimeConfig, url)
	if runtimeConfig.Storage.CheckSavedFile(path) {
		// the file already exists on disk: mark it saved and optionally
		// re-queue the local copy for link parsing instead of fetching again
		log.Warn("file is already saved,skip fetch.", path)
		runtimeConfig.Storage.AddSavedUrl(url)
		//re-parse local's previous saved page
		if runtimeConfig.ParseUrlsFromPreviousSavedPage {
			if !runtimeConfig.Storage.CheckParsedFile([]byte(path)) {
				log.Debug("previous saved page send to parse-queue:", path)
				runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
			}
		}
		return
	}

	//checking fetchUrlPattern
	log.Debug("started check fetchUrlPattern,", config.FetchUrlPattern, ",", resource)
	if config.FetchUrlPattern.Match(url) {
		log.Debug("match fetch url pattern,", resource)
		if len(config.FetchUrlMustNotContain) > 0 {
			if util.ContainStr(resource, config.FetchUrlMustNotContain) {
				log.Debug("hit FetchUrlMustNotContain,ignore,", resource, " , ", config.FetchUrlMustNotContain)
				return
			}
		}
		if len(config.FetchUrlMustContain) > 0 {
			if !util.ContainStr(resource, config.FetchUrlMustContain) {
				log.Debug("not hit FetchUrlMustContain,ignore,", resource, " , ", config.FetchUrlMustContain)
				return
			}
		}
	} else {
		log.Debug("does not hit FetchUrlPattern ignoring,", resource)
		return
	}

	log.Debug("start fetch url,", resource)
	// buffered so the goroutine's final send never blocks, even when this
	// function has already returned on timeout
	flg := make(chan bool, 1)
	go func() {
		body, err := HttpGetWithCookie(resource, config.Cookie)
		if err == nil {
			if body != nil {
				//todo parse urls from this page
				log.Debug("started check savingUrlPattern,", config.SavingUrlPattern, ",", string(url))
				if config.SavingUrlPattern.Match(url) {
					log.Debug("match saving url pattern,", resource)
					if len(config.SavingUrlMustNotContain) > 0 {
						if util.ContainStr(resource, config.SavingUrlMustNotContain) {
							log.Debug("hit SavingUrlMustNotContain,ignore,", resource, " , ", config.SavingUrlMustNotContain)
							goto exitPage
						}
					}
					if len(config.SavingUrlMustContain) > 0 {
						if !util.ContainStr(resource, config.SavingUrlMustContain) {
							log.Debug("not hit SavingUrlMustContain,ignore,", resource, " , ", config.SavingUrlMustContain)
							goto exitPage
						}
					}
					_, err := Save(runtimeConfig, path, body)
					if err == nil {
						log.Info("saved:", path)
						//todo saved per shard
						runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
					} else {
						log.Info("error while saved:", path, ",", err)
						goto exitPage
					}
				} else {
					log.Debug("does not hit SavingUrlPattern ignoring,", resource)
				}
			}
			// the url is marked fetched even when saving was skipped; the
			// goto jumps above land on exitPage and bypass this mark
			runtimeConfig.Storage.AddFetchedUrl(url)
		exitPage:
			log.Debug("exit fetchUrl method:", resource)
		} else {
			// runtimeConfig.Storage.AddFetchFailedUrl(url)
			runtimeConfig.Storage.LogFetchFailedUrl(runtimeConfig.PathConfig.FetchFailedLog, resource)
		}
		flg <- true
	}()

	// wait on the channel; with the timeout in place this select cannot leak
	select {
	case <-t.C:
		// the fetch goroutine is NOT cancelled here; it may still complete
		// and save its result in the background
		log.Error("fetching url time out,", resource)
	case <-flg:
		log.Debug("fetching url normal exit,", resource)
		return
	}
}