func FetchGo(runtimeConfig RuntimeConfig, taskC *chan []byte, quitC *chan bool, offsets *RoutingOffset) {
	shard := offsets.Shard
	log.Info("fetch task started. shard:", shard)

	go func() {
		for {
			url := <-*taskC
			log.Debug("shard:", shard, ",url received:", string(url))

			if !runtimeConfig.Storage.CheckFetchedUrl(url) {
				timeout := 10 * time.Second

				// if fetchFilter.Lookup(url) {
				//	log.Debug("hit fetch filter, ignore,", string(url))
				//	continue
				// }
				// fetchFilter.Add(url)

				log.Debug("shard:", shard, ",url cool, start fetching:", string(url))
				fetchUrl(url, timeout, runtimeConfig, offsets)

				if runtimeConfig.TaskConfig.FetchDelayThreshold > 0 {
					log.Debug("sleep ", runtimeConfig.TaskConfig.FetchDelayThreshold, "ms to control crawling speed")
					time.Sleep(time.Duration(runtimeConfig.TaskConfig.FetchDelayThreshold) * time.Millisecond)
					log.Debug("wake up now, continue crawling")
				}
			} else {
				log.Debug("hit fetch-bloomfilter, ignore,", string(url))
			}
		}
	}()

	<-*quitC
	log.Info("fetch task exit. shard:", shard)
}
func DeleteHandler(w http.ResponseWriter, r *http.Request) {
	n := new(storage.Needle)
	vid, fid, _ := parseURLPath(r.URL.Path)
	volumeId, _ := strconv.ParseUint(vid, 10, 64)
	n.ParsePath(fid)

	if *IsDebug {
		log.Info("deleting", n)
	}

	// remember the cookie from the request; store.Read overwrites n with the stored needle
	cookie := n.Cookie
	count, err := store.Read(volumeId, n)
	if err != nil {
		m := make(map[string]uint32)
		m["size"] = 0
		writeJson(w, r, m)
		return
	}

	if n.Cookie != cookie {
		log.Info("delete with unmatching cookie from ", r.RemoteAddr, "agent", r.UserAgent())
		return
	}

	n.Size = 0
	store.Delete(volumeId, n)
	m := make(map[string]uint32)
	m["size"] = uint32(count)
	writeJson(w, r, m)
}
func main() {
	flag.Parse()

	// volume master block
	log.Info("Volume Size Limit is", *volumeSizeLimitMB, "MB")
	mapper = directory.NewMapper(*metaFolder, "directory", uint64(*volumeSizeLimitMB)*1024*1024)

	// weedfs master handlers
	http.HandleFunc("/dir/assign", dirAssignHandler)
	http.HandleFunc("/dir/lookup", dirLookupHandler)
	http.HandleFunc("/dir/join", dirJoinHandler)
	http.HandleFunc("/dir/status", dirStatusHandler)

	// start weedfs master
	go func() {
		log.Info("Start directory service at http://127.0.0.1:" + strconv.Itoa(*port))
		if e := http.ListenAndServe(":"+strconv.Itoa(*port), nil); e != nil {
			log.Error("Fail to start:", e)
		}
	}()

	// volume block
	// TODO: now defaults to 1G; should this value come from the server?
	store = storage.NewStore(*storePort, *publicUrl, *chunkFolder, *volumes)
	defer store.Close()

	// weedfs volume handlers
	http.HandleFunc("/", storeHandler)
	http.HandleFunc("/status", statusHandler)
	http.HandleFunc("/add_volume", addVolumeHandler)

	go func() {
		for {
			store.Join(*metaServer)
			time.Sleep(time.Duration(float32(*pulse*1e3)*(1+rand.Float32())) * time.Millisecond)
		}
	}()
	log.Info("store joined at", *metaServer)

	// start weedfs volume server on the main goroutine,
	// so the process stays alive while it serves
	log.Info("Start storage service at http://127.0.0.1:"+strconv.Itoa(*storePort), "public url", *publicUrl)
	if e := http.ListenAndServe(":"+strconv.Itoa(*storePort), nil); e != nil {
		log.Error("Fail to start:", e)
	}
}
func Start(runtimeConfig RuntimeConfig) {
	config = runtimeConfig
	http.HandleFunc("/", index)
	log.Info("http server is up, http://localhost:8001/")
	// ListenAndServe blocks, so log readiness first
	if err := http.ListenAndServe(":8001", nil); err != nil {
		log.Error("http server failed:", err)
	}
}
func NewStore(port int, publicUrl, dirname string, volumeListString string) (s *Store) {
	s = &Store{Port: port, PublicUrl: publicUrl, dir: dirname}
	s.volumes = make(map[uint64]*Volume)
	s.AddVolume(volumeListString)
	log.Info("Store started on dir:", dirname, "with", len(s.volumes), "volumes")
	return
}
func (m *Mapper) saveSequence() {
	log.Info("Saving file id sequence", m.FileIdSequence, "to", path.Join(m.dir, m.fileName+".seq"))
	seqFile, e := os.OpenFile(path.Join(m.dir, m.fileName+".seq"), os.O_CREATE|os.O_WRONLY, 0644)
	if e != nil {
		log.Error("Sequence file save error:", e)
		return
	}
	defer seqFile.Close()
	encoder := gob.NewEncoder(seqFile)
	encoder.Encode(m.FileIdSequence)
}
func dirJoinHandler(w http.ResponseWriter, r *http.Request) {
	s := r.RemoteAddr[0:strings.Index(r.RemoteAddr, ":")+1] + r.FormValue("port")
	publicUrl := r.FormValue("publicUrl")
	volumes := new([]storage.VolumeInfo)
	json.Unmarshal([]byte(r.FormValue("volumes")), volumes)
	if *IsDebug {
		log.Info(s, "volumes", r.FormValue("volumes"))
	}
	mapper.Add(*directory.NewMachine(s, publicUrl, *volumes))
}
func NewMapper(dirname string, filename string, volumeSizeLimit uint64) (m *Mapper) {
	m = &Mapper{dir: dirname, fileName: filename}
	m.vid2machineId = make(map[uint32]int)
	m.volumeSizeLimit = volumeSizeLimit
	m.Writers = make([]uint32, 0)
	m.Machines = make([]*Machine, 0)

	seqFile, se := os.OpenFile(path.Join(m.dir, m.fileName+".seq"), os.O_RDONLY, 0644)
	if se != nil {
		m.FileIdSequence = FileIdSaveInterval
		log.Info("Setting file id sequence", m.FileIdSequence)
	} else {
		defer seqFile.Close()
		decoder := gob.NewDecoder(seqFile)
		decoder.Decode(&m.FileIdSequence)
		log.Info("Loading file id sequence", m.FileIdSequence, "=>", m.FileIdSequence+FileIdSaveInterval)
		// in case the server stopped between save intervals, skip ahead a full interval
		m.FileIdSequence += FileIdSaveInterval
	}
	return
}
func initBloomFilter(bloomFilterPersistFileName string) *Filter {
	var bloomFilter = new(Filter)
	// load a persisted bloom filter, or initialize a new one
	if util.CheckFileExists(bloomFilterPersistFileName) {
		log.Debug("found bloomFilter, start reload,", bloomFilterPersistFileName)
		n, err := ioutil.ReadFile(bloomFilterPersistFileName)
		if err != nil {
			log.Error("bloomFilter:", bloomFilterPersistFileName, err)
		}
		if err := bloomFilter.GobDecode(n); err != nil {
			log.Error("bloomFilter:", bloomFilterPersistFileName, err)
		}
		log.Info("bloomFilter successfully reloaded:", bloomFilterPersistFileName)
	} else {
		probItems := config.GetIntConfig("BloomFilter", "ItemSize", 100000)
		log.Debug("initializing bloom-filter", bloomFilterPersistFileName, ",virtual size is,", probItems)
		bloomFilter = NewFilter(fnv.New64(), probItems)
		log.Info("bloomFilter successfully initialized:", bloomFilterPersistFileName)
	}
	return bloomFilter
}
func GetHandler(w http.ResponseWriter, r *http.Request) {
	n := new(storage.Needle)
	vid, fid, ext := parseURLPath(r.URL.Path)
	volumeId, _ := strconv.ParseUint(vid, 10, 64)
	n.ParsePath(fid)

	if *IsDebug {
		log.Info("volume", volumeId, "reading", n)
	}

	// remember the cookie from the request; store.Read overwrites n with the stored needle
	cookie := n.Cookie
	count, e := store.Read(volumeId, n)
	if *IsDebug {
		log.Info("read bytes", count, "error", e)
	}
	if n.Cookie != cookie {
		log.Info("request with unmatching cookie from ", r.RemoteAddr, "agent", r.UserAgent())
		return
	}
	if ext != "" {
		// mime.TypeByExtension expects a leading dot; parseURLPath strips it
		w.Header().Set("Content-Type", mime.TypeByExtension("."+ext))
	}
	w.Write(n.Data)
}
func persistBloomFilter(bloomFilterPersistFileName string, bloomFilter *Filter) {
	// save the bloom filter to disk
	m, err := bloomFilter.GobEncode()
	if err != nil {
		log.Error(err)
		return
	}
	err = ioutil.WriteFile(bloomFilterPersistFileName, m, 0600)
	if err != nil {
		panic(err)
	}
	log.Info("bloomFilter safely persisted.")
}
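// Usage sketch (illustrative, not part of the original source): initBloomFilter
// and persistBloomFilter are symmetric, so a lifecycle would presumably look
// like the function below. The file name is made up, and Filter.Add is assumed
// from the fetchFilter.Add/bloomFilter.Add calls elsewhere in this code.
func exampleBloomFilterLifecycle() {
	fileName := "data/filters/example.bloomfilter"
	filter := initBloomFilter(fileName) // reload if persisted, else create anew
	filter.Add([]byte("http://example.com/"))
	persistBloomFilter(fileName, filter) // write back for the next run
}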
func ParseGo(pendingUrls chan []byte, runtimeConfig RuntimeConfig, quit *chan bool, offsets *RoutingOffset) {
	log.Info("parsing task started.")
	path := runtimeConfig.PathConfig.SavedFileLog

	// tail the local saved-file log: read all lines; on EOF, wait 2s, then
	// reopen the file and try again. TODO: maybe check the last-modified time.
waitFile:
	if !util.CheckFileExists(path) {
		log.Trace("waiting for file to be created,", path)
		time.Sleep(10 * time.Millisecond)
		goto waitFile
	}

	offset := runtimeConfig.Storage.LoadOffset(runtimeConfig.PathConfig.SavedFileLog + ".offset")
	FetchFileWithOffset(runtimeConfig, path, offset)
}
func (m *Mapper) PickForWrite(c string) (string, int, MachineInfo, error) {
	len_writers := len(m.Writers)
	if len_writers <= 0 {
		log.Info("No more writable volumes!")
		return "", 0, m.Machines[rand.Intn(len(m.Machines))].Server, errors.New("No more writable volumes!")
	}
	vid := m.Writers[rand.Intn(len_writers)]
	// machine ids in vid2machineId are 1-based; 0 means the volume is on no machine
	machine_id := m.vid2machineId[vid]
	if machine_id > 0 {
		machine := m.Machines[machine_id-1]
		fileId, count := m.NextFileId(c)
		if count == 0 {
			return "", 0, m.Machines[rand.Intn(len(m.Machines))].Server, errors.New("Strange count:" + c)
		}
		return NewFileId(vid, fileId, rand.Uint32()).String(), count, machine.Server, nil
	}
	return "", 0, m.Machines[rand.Intn(len(m.Machines))].Server, errors.New("Strangely vid " + strconv.FormatUint(uint64(vid), 10) + " is on no machine!")
}
func parseURLPath(path string) (vid, fid, ext string) {
	sepIndex := strings.LastIndex(path, "/")
	commaIndex := strings.LastIndex(path[sepIndex:], ",")
	if commaIndex <= 0 {
		if "favicon.ico" != path[sepIndex+1:] {
			log.Info("unknown file id", path[sepIndex+1:])
		}
		return
	}
	commaIndex += sepIndex // LastIndex searched a slice; make the index absolute
	dotIndex := strings.LastIndex(path[sepIndex:], ".")
	vid = path[sepIndex+1 : commaIndex]
	fid = path[commaIndex+1:]
	ext = ""
	if dotIndex > 0 {
		dotIndex += sepIndex
		fid = path[commaIndex+1 : dotIndex]
		ext = path[dotIndex+1:]
	}
	return
}
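// Worked example (illustrative, not in the original source): for a volume
// server path like "/3,01637037d6.jpg", parseURLPath is expected to yield
// vid "3", fid "01637037d6" and ext "jpg"; without an extension
// ("/3,01637037d6"), ext stays empty.
func exampleParseURLPath() {
	vid, fid, ext := parseURLPath("/3,01637037d6.jpg")
	log.Info("vid:", vid, "fid:", fid, "ext:", ext) // vid: 3 fid: 01637037d6 ext: jpg
}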
func (this *FsStore) LoadOffset(fileName string) int64 {
	log.Debug("start init offsets,", fileName)
	if util.CheckFileExists(fileName) {
		log.Debug("found offset file, start loading,", fileName)
		n, err := ioutil.ReadFile(fileName)
		if err != nil {
			log.Error("offset", fileName, ",", err)
			return 0
		}
		ret, err := strconv.ParseInt(string(n), 10, 64)
		if err != nil {
			log.Error("offset", fileName, ",", err)
			return 0
		}
		log.Info("init offsets successfully,", fileName, ":", ret)
		return ret
	}
	return 0
}
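// Counterpart sketch (hypothetical, not in the original source): LoadOffset
// parses the offset back as plain base-10 text, so persisting it presumably
// just writes the number out the same way. SaveOffset is an assumed name.
func (this *FsStore) SaveOffset(fileName string, offset int64) {
	// write the offset as base-10 text, matching what LoadOffset expects
	err := ioutil.WriteFile(fileName, []byte(strconv.FormatInt(offset, 10)), 0600)
	if err != nil {
		log.Error("offset", fileName, ",", err)
	}
}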
func LoadNeedleMap(file *os.File) *NeedleMap {
	nm := NewNeedleMap(file)
	bytes := make([]byte, 16*RowsToRead)
	count, e := nm.indexFile.Read(bytes)
	if count > 0 {
		fstat, _ := file.Stat()
		log.Info("Loading index file", fstat.Name(), "size", fstat.Size())
	}
	for count > 0 && e == nil {
		// each index entry is a 16-byte record: key (8) | offset (4) | size (4)
		for i := 0; i < count; i += 16 {
			key := util.BytesToUint64(bytes[i : i+8])
			offset := util.BytesToUint32(bytes[i+8 : i+12])
			size := util.BytesToUint32(bytes[i+12 : i+16])
			if offset > 0 {
				nm.m[key] = &NeedleValue{Offset: offset, Size: size}
			} else {
				// a zero offset marks a deletion
				delete(nm.m, key)
			}
		}
		count, e = nm.indexFile.Read(bytes)
	}
	return nm
}
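// Layout sketch (illustrative, not in the original source): each index entry
// consumed by LoadNeedleMap is a fixed 16-byte record. Assuming the util
// helpers are big-endian, an equivalent encoder using only the standard
// library's encoding/binary package would be:
func encodeIndexEntry(key uint64, offset uint32, size uint32) []byte {
	bytes := make([]byte, 16)
	binary.BigEndian.PutUint64(bytes[0:8], key)     // needle key
	binary.BigEndian.PutUint32(bytes[8:12], offset) // offset in the volume file; 0 marks a deletion
	binary.BigEndian.PutUint32(bytes[12:16], size)  // needle data size
	return bytes
}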
func shutdown(offsets []*RoutingOffset, quitChannels []*chan bool, offsets2 []*RoutingOffset, quitChannels2 []*chan bool, quit chan bool) {
	log.Debug("start shutting down")
	for i := range quitChannels {
		log.Debug("send exit signal to channel,", i)
		*quitChannels[i] <- true
	}
	for i := range quitChannels2 {
		log.Debug("send exit signal to channel,", i)
		*quitChannels2[i] <- true
	}
	log.Info("sent quit signal to goroutines, done")
	// for i := range offsets {
	//	// TODO persist offsets
	//	log.Info("persist offset,", i, ":", offsets[i].Offset, ",", offsets[i].shard)
	// }
	// log.Info("persist kafka offsets done")
	quit <- true
	log.Debug("finished shutting down")
}
func post(url string, cookie string, postStr string) []byte {
	log.Debug("let's post:", url)
	client := &http.Client{
		CheckRedirect: nil,
	}

	postBytesReader := bytes.NewReader([]byte(postStr))
	request, _ := http.NewRequest("POST", url, postBytesReader)
	request.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
	request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	request.Header.Set("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3")
	request.Header.Set("Accept-Encoding", "gzip,deflate,sdch")
	request.Header.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
	request.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
	request.Header.Set("Cache-Control", "max-age=0")
	request.Header.Set("Connection", "keep-alive")
	request.Header.Set("Referer", url)

	if len(cookie) > 0 {
		log.Debug("dealing with cookie:", cookie)
		// cookie comes in as "name1=value1;name2=value2"
		array := strings.Split(cookie, ";")
		for _, item := range array {
			pair := strings.Split(item, "=")
			if len(pair) == 2 {
				cookieObj := http.Cookie{Name: pair[0], Value: pair[1]}
				request.AddCookie(&cookieObj)
			} else {
				log.Info("error, invalid cookie segment:", item)
			}
		}
	}

	resp, err := client.Do(request)
	if err != nil {
		log.Error(url, err)
		return nil
	}
	defer resp.Body.Close()

	// transparently decompress a gzipped response body
	var reader io.ReadCloser
	switch resp.Header.Get("Content-Encoding") {
	case "gzip":
		reader, err = gzip.NewReader(resp.Body)
		if err != nil {
			log.Error(url, err)
			return nil
		}
		defer reader.Close()
	default:
		reader = resp.Body
	}

	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Error(url, err)
		return nil
	}
	return body
}
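// Usage sketch (illustrative, not in the original source): post takes a raw
// "name1=value1;name2=value2" cookie string and a form-encoded body, and
// returns nil when the request or the body read fails. The url, cookie and
// form fields below are made up.
func examplePost() {
	body := post("http://example.com/login", "token=abc;uid=42", "user=foo&pass=bar")
	if body != nil {
		log.Debug("response size:", len(body))
	}
}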
// fetch the url's content
func fetchUrl(url []byte, timeout time.Duration, runtimeConfig RuntimeConfig, offsets *RoutingOffset) {
	t := time.NewTimer(timeout)
	defer t.Stop()

	resource := string(url)
	log.Debug("enter fetchUrl method:", resource)

	config := runtimeConfig.TaskConfig

	if runtimeConfig.Storage.CheckFetchedUrl(url) {
		return
	}

	path := getSavedPath(runtimeConfig, url)
	if runtimeConfig.Storage.CheckSavedFile(path) {
		log.Warn("file is already saved, skip fetch.", path)
		runtimeConfig.Storage.AddSavedUrl(url)
		// re-parse the previously saved local page
		if runtimeConfig.ParseUrlsFromPreviousSavedPage {
			if !runtimeConfig.Storage.CheckParsedFile([]byte(path)) {
				log.Debug("previously saved page sent to parse-queue:", path)
				runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
			}
		}
		return
	}

	// check fetchUrlPattern
	log.Debug("started check fetchUrlPattern,", config.FetchUrlPattern, ",", resource)
	if config.FetchUrlPattern.Match(url) {
		log.Debug("match fetch url pattern,", resource)
		if len(config.FetchUrlMustNotContain) > 0 {
			if util.ContainStr(resource, config.FetchUrlMustNotContain) {
				log.Debug("hit FetchUrlMustNotContain, ignore,", resource, " , ", config.FetchUrlMustNotContain)
				return
			}
		}
		if len(config.FetchUrlMustContain) > 0 {
			if !util.ContainStr(resource, config.FetchUrlMustContain) {
				log.Debug("not hit FetchUrlMustContain, ignore,", resource, " , ", config.FetchUrlMustContain)
				return
			}
		}
	} else {
		log.Debug("does not hit FetchUrlPattern, ignoring,", resource)
		return
	}

	log.Debug("start fetch url,", resource)
	flg := make(chan bool, 1)

	go func() {
		body, err := HttpGetWithCookie(resource, config.Cookie)
		if err == nil {
			if body != nil {
				// TODO: parse urls from this page
				log.Debug("started check savingUrlPattern,", config.SavingUrlPattern, ",", string(url))
				if config.SavingUrlPattern.Match(url) {
					log.Debug("match saving url pattern,", resource)
					if len(config.SavingUrlMustNotContain) > 0 {
						if util.ContainStr(resource, config.SavingUrlMustNotContain) {
							log.Debug("hit SavingUrlMustNotContain, ignore,", resource, " , ", config.SavingUrlMustNotContain)
							goto exitPage
						}
					}
					if len(config.SavingUrlMustContain) > 0 {
						if !util.ContainStr(resource, config.SavingUrlMustContain) {
							log.Debug("not hit SavingUrlMustContain, ignore,", resource, " , ", config.SavingUrlMustContain)
							goto exitPage
						}
					}

					_, err := Save(runtimeConfig, path, body)
					if err == nil {
						log.Info("saved:", path)
						// TODO: save per shard
						runtimeConfig.Storage.LogSavedFile(runtimeConfig.PathConfig.SavedFileLog, resource+"|||"+path)
					} else {
						log.Info("error while saving:", path, ",", err)
						goto exitPage
					}
				} else {
					log.Debug("does not hit SavingUrlPattern, ignoring,", resource)
				}
			}
			runtimeConfig.Storage.AddFetchedUrl(url)
		exitPage:
			log.Debug("exit fetchUrl method:", resource)
		} else {
			// runtimeConfig.Storage.AddFetchFailedUrl(url)
			runtimeConfig.Storage.LogFetchFailedUrl(runtimeConfig.PathConfig.FetchFailedLog, resource)
		}
		flg <- true
	}()

	// wait on the channels; with the timeout in place, this cannot leak
	select {
	case <-t.C:
		log.Error("fetching url timed out,", resource)
	case <-flg:
		log.Debug("fetching url normal exit,", resource)
		return
	}
}
func main() {
	flag.StringVar(&seedUrl, "seed", "http://example.com", "the seed url, where everything starts")
	flag.StringVar(&logLevel, "log", "info", "setting log level, options: trace, debug, info, warn, error")
	flag.Parse()

	defer log.Flush()

	runtime.GOMAXPROCS(runtime.NumCPU())

	log.SetInitLogging(logLevel)

	runtimeConfig.PathConfig = new(PathConfig)
	runtimeConfig.ClusterConfig = new(ClusterConfig)
	runtimeConfig.ClusterConfig.Name = config.GetStringConfig("cluster", "name", "gopa")

	// per cluster: data/gopa/
	runtimeConfig.PathConfig.Home = config.GetStringConfig("path", "home", "cluster/"+runtimeConfig.ClusterConfig.Name+"/")

	runtimeConfig.PathConfig.Data = config.GetStringConfig("path", "data", "")
	if runtimeConfig.PathConfig.Data == "" {
		runtimeConfig.PathConfig.Data = runtimeConfig.PathConfig.Home + "/" + "data/"
	}

	runtimeConfig.PathConfig.Log = config.GetStringConfig("path", "log", "")
	if runtimeConfig.PathConfig.Log == "" {
		runtimeConfig.PathConfig.Log = runtimeConfig.PathConfig.Home + "/" + "log/"
	}

	runtimeConfig.PathConfig.WebData = config.GetStringConfig("path", "webdata", "")
	if runtimeConfig.PathConfig.WebData == "" {
		runtimeConfig.PathConfig.WebData = runtimeConfig.PathConfig.Data + "/" + "webdata/"
	}

	runtimeConfig.PathConfig.TaskData = config.GetStringConfig("path", "taskdata", "")
	if runtimeConfig.PathConfig.TaskData == "" {
		runtimeConfig.PathConfig.TaskData = runtimeConfig.PathConfig.Data + "/" + "taskdata/"
	}

	runtimeConfig.StoreWebPageTogether = config.GetBoolConfig("Global", "StoreWebPageTogether", true)

	runtimeConfig.TaskConfig = parseConfig()

	// set default logging
	logPath := runtimeConfig.PathConfig.Log + "/" + runtimeConfig.TaskConfig.Name + "/gopa.log"
	log.SetLogging(logLevel, logPath)

	runtimeConfig.ParseUrlsFromSavedFileLog = config.GetBoolConfig("Switch", "ParseUrlsFromSavedFileLog", true)
	runtimeConfig.LoadTemplatedFetchJob = config.GetBoolConfig("Switch", "LoadTemplatedFetchJob", true)
	runtimeConfig.LoadRuledFetchJob = config.GetBoolConfig("Switch", "LoadRuledFetchJob", false)
	runtimeConfig.LoadPendingFetchJobs = config.GetBoolConfig("Switch", "LoadPendingFetchJobs", true)
	runtimeConfig.HttpEnabled = config.GetBoolConfig("Switch", "HttpEnabled", true)
	runtimeConfig.ParseUrlsFromPreviousSavedPage = config.GetBoolConfig("Switch", "ParseUrlsFromPreviousSavedPage", false)

	runtimeConfig.ArrayStringSplitter = config.GetStringConfig("CrawlerRule", "ArrayStringSplitter", ",")
	runtimeConfig.GoProfEnabled = config.GetBoolConfig("CrawlerRule", "GoProfEnabled", false)

	runtimeConfig.WalkBloomFilterFileName = config.GetStringConfig("BloomFilter", "WalkBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/walk.bloomfilter")
	runtimeConfig.FetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "FetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/fetch.bloomfilter")
	runtimeConfig.ParseBloomFilterFileName = config.GetStringConfig("BloomFilter", "ParseBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/parse.bloomfilter")
	runtimeConfig.PendingFetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "PendingFetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/pending_fetch.bloomfilter")

	runtimeConfig.PathConfig.SavedFileLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_parse.files"
	runtimeConfig.PathConfig.PendingFetchLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_fetch.urls"
	runtimeConfig.PathConfig.FetchFailedLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/failed_fetch.urls"

	runtimeConfig.MaxGoRoutine = config.GetIntConfig("Global", "MaxGoRoutine", 2)
	if runtimeConfig.MaxGoRoutine < 2 {
		runtimeConfig.MaxGoRoutine = 2
	}
	log.Debug("maxGoRoutine:", runtimeConfig.MaxGoRoutine)
	log.Debug("path.home:", runtimeConfig.PathConfig.Home)

	os.MkdirAll(runtimeConfig.PathConfig.Home, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.Data, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.Log, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.WebData, 0777)
	os.MkdirAll(runtimeConfig.PathConfig.TaskData, 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath, 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/tasks/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/filters/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/urls/", 0777)
	os.MkdirAll(runtimeConfig.TaskConfig.WebDataPath, 0777)

	runtimeConfig.RuledFetchConfig = new(RuledFetchConfig)
	runtimeConfig.RuledFetchConfig.UrlTemplate = config.GetStringConfig("RuledFetch", "UrlTemplate", "")
	runtimeConfig.RuledFetchConfig.From = config.GetIntConfig("RuledFetch", "From", 0)
	runtimeConfig.RuledFetchConfig.To = config.GetIntConfig("RuledFetch", "To", 10)
	runtimeConfig.RuledFetchConfig.Step = config.GetIntConfig("RuledFetch", "Step", 1)
	runtimeConfig.RuledFetchConfig.LinkExtractPattern = config.GetStringConfig("RuledFetch", "LinkExtractPattern", "")
	runtimeConfig.RuledFetchConfig.LinkTemplate = config.GetStringConfig("RuledFetch", "LinkTemplate", "")

	if seedUrl == "" || seedUrl == "http://example.com" {
		log.Error("no seed was given. type:\"gopa -h\" for help.")
		os.Exit(1)
	}

	log.Info("[gopa] " + runtimeConfig.Version + " is on.")

	runtimeConfig.Storage = &fsstore.FsStore{}

	runtimeConfig.Storage.InitWalkBloomFilter(runtimeConfig.WalkBloomFilterFileName)
	runtimeConfig.Storage.InitFetchBloomFilter(runtimeConfig.FetchBloomFilterFileName)
	runtimeConfig.Storage.InitParseBloomFilter(runtimeConfig.ParseBloomFilterFileName)
	runtimeConfig.Storage.InitPendingFetchBloomFilter(runtimeConfig.PendingFetchBloomFilterFileName)

	// pprof server
	if runtimeConfig.GoProfEnabled {
		go func() {
			log.Info("pprof server is up, http://localhost:6060/debug/pprof")
			log.Info(http.ListenAndServe("localhost:6060", nil))
		}()
	}

	// http server
	if runtimeConfig.HttpEnabled {
		go func() {
			httpServ.Start(runtimeConfig)
		}()
	}

	// add the default http protocol if missing
	if !strings.HasPrefix(seedUrl, "http") {
		seedUrl = "http://" + seedUrl
	}

	maxGoRoutine := runtimeConfig.MaxGoRoutine
	fetchQuitChannels := make([]*chan bool, maxGoRoutine)   // shutdown signal, one per fetch goroutine
	fetchTaskChannels := make([]*chan []byte, maxGoRoutine) // fetch task channels
	fetchOffsets := make([]*RoutingOffset, maxGoRoutine)    // kafka fetch offsets

	parseQuitChannels := make([]*chan bool, 2)           // shutdown signals for parse goroutines
	parseOffsets := make([]*RoutingOffset, maxGoRoutine) // parse offsets

	shutdownSignal := make(chan bool, 1)
	finalQuitSignal := make(chan bool, 1)

	// handle the exit event
	exitEventChannel := make(chan os.Signal, 1)
	signal.Notify(exitEventChannel, syscall.SIGINT)
	signal.Notify(exitEventChannel, os.Interrupt)

	go func() {
		s := <-exitEventChannel
		log.Debug("got signal:", s)
		if s == os.Interrupt || s == syscall.SIGINT {
			log.Warn("got signal:os.Interrupt, saving data and exiting")
			// defer os.Exit(0)
			runtimeConfig.Storage.PersistBloomFilter()

			// wait for the workers to exit
			log.Info("waiting for workers to exit")
			go shutdown(fetchOffsets, fetchQuitChannels, parseOffsets, parseQuitChannels, shutdownSignal)
			<-shutdownSignal
			log.Info("workers shutdown")
			finalQuitSignal <- true
		}
	}()

	// start the fetchers
	for i := 0; i < maxGoRoutine; i++ {
		quitC := make(chan bool, 1)
		taskC := make(chan []byte)

		fetchQuitChannels[i] = &quitC
		fetchTaskChannels[i] = &taskC

		offset := new(RoutingOffset)
		// offset.Offset = initOffset(runtimeConfig, "fetch", i)
		offset.Shard = i
		fetchOffsets[i] = offset

		go task.FetchGo(runtimeConfig, &taskC, &quitC, offset)
	}

	c2 := make(chan bool, 1)
	parseQuitChannels[0] = &c2
	offset2 := new(RoutingOffset)
	// offset2.Offset = initOffset(runtimeConfig, "parse", 0)
	offset2.Shard = 0
	parseOffsets[0] = offset2

	pendingFetchUrls := make(chan []byte)

	// fetch rule: all urls -> persisted to store -> fetched from store -> pushed to pendingFetchUrls -> redistributed to sharded goroutines -> fetch -> save webpage to store -> done
	// parse rule: url saved to store -> local path persisted to store -> fetched to pendingParseFiles -> redistributed to sharded goroutines -> parse -> clean urls -> enqueue to url store -> done

	// send the seed to the task queue
	go func() {
		// note: the seed will not be persisted
		log.Debug("sending seed to fetch queue,", seedUrl)
		pendingFetchUrls <- []byte(seedUrl)
	}()

	// start the local saved-file parser
	if runtimeConfig.ParseUrlsFromSavedFileLog {
		go task.ParseGo(pendingFetchUrls, runtimeConfig, &c2, offset2)
	}

	// redistribute pendingFetchUrls to the sharded workers
	go func() {
		for {
			url := <-pendingFetchUrls
			if !runtimeConfig.Storage.CheckWalkedUrl(url) {
				if runtimeConfig.Storage.CheckFetchedUrl(url) {
					log.Warn("doesn't hit walk bloomfilter but hits fetch bloomfilter, ignore as well,", string(url))
					runtimeConfig.Storage.AddWalkedUrl(url)
					continue
				}
				// distribute to a random fetch shard
				randomShard := rand.Intn(maxGoRoutine)
				log.Debug("publish:", string(url), ",shard:", randomShard)
				runtimeConfig.Storage.AddWalkedUrl(url)
				*fetchTaskChannels[randomShard] <- url
			} else {
				log.Trace("hit walk or fetch bloomfilter, just ignore,", string(url))
			}
		}
	}()

	// load predefined fetch jobs
	if runtimeConfig.LoadTemplatedFetchJob {
		go func() {
			if util.CheckFileExists(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt") {
				templates := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt")
				ids := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/id.txt")
				for _, id := range ids {
					for _, template := range templates {
						log.Trace("id:", id)
						log.Trace("template:", template)
						url := strings.Replace(template, "{id}", id, -1)
						log.Debug("new task from template:", url)
						pendingFetchUrls <- []byte(url)
					}
				}
				log.Info("templated download is done.")
			}
		}()
	}

	// fetch urls from saved pages
	if runtimeConfig.LoadPendingFetchJobs {
		c3 := make(chan bool, 1)
		parseQuitChannels[1] = &c3
		offset3 := new(RoutingOffset)
		// offset3.Offset = initOffset(runtimeConfig, "fetch_from_saved", 0)
		offset3.Shard = 0
		parseOffsets[1] = offset3
		go task.LoadTaskFromLocalFile(pendingFetchUrls, &runtimeConfig, &c3, offset3)
	}

	// parse fetch-failed jobs, ignoring the walk-filter
	// TODO

	if runtimeConfig.LoadRuledFetchJob {
		log.Debug("start ruled fetch")
		go func() {
			if runtimeConfig.RuledFetchConfig.UrlTemplate != "" {
				for i := runtimeConfig.RuledFetchConfig.From; i <= runtimeConfig.RuledFetchConfig.To; i += runtimeConfig.RuledFetchConfig.Step {
					url := strings.Replace(runtimeConfig.RuledFetchConfig.UrlTemplate, "{id}", strconv.FormatInt(int64(i), 10), -1)
					log.Debug("add ruled url:", url)
					pendingFetchUrls <- []byte(url)
				}
			} else {
				log.Error("ruled template is empty, ignore")
			}
		}()
	}

	<-finalQuitSignal
	log.Info("[gopa] is down")
}
func (this *FsStore) TaskEnqueue(url []byte) {
	log.Info("task enqueue:", string(url))
}
func extractLinks(runtimeConfig RuntimeConfig, fileUrl string, fileName []byte, body []byte) {
	// siteUrlStr := string(fileName)
	// siteUrlStr = strings.TrimLeft(siteUrlStr, "data/")
	// siteUrlStr = "http://" + siteUrlStr
	// log.Debug("fileName to Url:", string(fileName), ",", siteUrlStr)
	siteUrlStr := fileUrl
	siteConfig := runtimeConfig.TaskConfig
	siteUrlByte := []byte(siteUrlStr)
	log.Debug("enter links extract,", siteUrlStr)

	if siteConfig.SkipPageParsePattern.Match(siteUrlByte) {
		log.Debug("hit SkipPageParsePattern pattern,", siteUrlStr)
		return
	}

	log.Debug("parsing external links:", siteUrlStr, ",using:", siteConfig.LinkUrlExtractRegex)
	matches := siteConfig.LinkUrlExtractRegex.FindAllSubmatch(body, -1)
	log.Debug("extract links with pattern, total matches:", len(matches), ", match result,", string(fileName))

	xIndex := 0
	for _, match := range matches {
		log.Debug("dealing with match result,", xIndex)
		xIndex = xIndex + 1

		url := match[siteConfig.LinkUrlExtractRegexGroupIndex]
		filterUrl := formatUrlForFilter(url)
		log.Debug("url clean result:", string(filterUrl), ",original url:", string(url))
		filteredUrl := string(filterUrl)

		// filter out invalid links
		if filteredUrl == "" {
			log.Debug("filteredUrl is empty, continue")
			continue
		}
		if strings.HasPrefix(filteredUrl, "#") {
			log.Debug("filteredUrl starts with: #, continue")
			continue
		}
		if strings.HasPrefix(filteredUrl, "javascript:") {
			log.Debug("filteredUrl starts with: javascript:, continue")
			continue
		}

		// l.Lock()
		// defer l.Unlock()
		if runtimeConfig.Storage.CheckWalkedUrl(filterUrl) || runtimeConfig.Storage.CheckFetchedUrl(filterUrl) || runtimeConfig.Storage.CheckPendingFetchUrl(filterUrl) {
			log.Debug("hit bloomFilter, continue")
			continue
		}

		currentUrlStr := strings.Trim(string(url), " ")

		seedUrlStr := siteUrlStr
		seedURI, err := ParseRequestURI(seedUrlStr)
		if err != nil {
			log.Error("ParseSeedURI failed!: ", seedUrlStr, " , ", err)
			continue
		}

		currentURI, err := ParseRequestURI(currentUrlStr)
		if err != nil {
			if strings.Contains(err.Error(), "invalid URI for request") {
				log.Debug("invalid URI for request, fixing relative url, original:", currentUrlStr)
				// page-based relative urls
				currentUrlStr = "http://" + seedURI.Host + "/" + currentUrlStr
				currentURI, err = ParseRequestURI(currentUrlStr)
				if err != nil {
					log.Error("ParseCurrentURI internal failed!: ", currentUrlStr, " , ", err)
					continue
				}
				log.Debug("new relative url,", currentUrlStr)
			} else {
				log.Error("ParseCurrentURI failed!: ", currentUrlStr, " , ", err)
				continue
			}
		}

		// relative links
		if currentURI == nil || currentURI.Host == "" {
			if strings.HasPrefix(currentURI.Path, "/") {
				// root-based relative urls
				log.Debug("old relative url,", currentUrlStr)
				currentUrlStr = "http://" + seedURI.Host + currentUrlStr
				log.Debug("new relative url,", currentUrlStr)
			} else {
				// page-based relative urls
				log.Debug("old relative url,", currentUrlStr)
				urlPath := getRootUrl(currentURI)
				currentUrlStr = "http://" + urlPath + currentUrlStr
				log.Debug("new relative url,", currentUrlStr)
			}
		} else {
			log.Debug("host:", currentURI.Host, " ", currentURI.Host == "")
			// resolve the domain-specific filter
			if siteConfig.FollowSameDomain {
				if siteConfig.FollowSubDomain {
					// TODO handle com.cn and .com, using a TLD list
				} else if seedURI.Host != currentURI.Host {
					log.Debug("domain mismatch,", seedURI.Host, " vs ", currentURI.Host)
					// continue
				}
				// TODO follow all or a list of domains
			}
		}

		if len(siteConfig.LinkUrlMustContain) > 0 {
			if !util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustContain) {
				log.Debug("link does not hit must-contain, ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustContain)
				continue
			}
		}
		if len(siteConfig.LinkUrlMustNotContain) > 0 {
			if util.ContainStr(currentUrlStr, siteConfig.LinkUrlMustNotContain) {
				log.Debug("link hit must-not-contain, ignore,", currentUrlStr, " , ", siteConfig.LinkUrlMustNotContain)
				continue
			}
		}

		// normalize the url
		currentUrlStr = MustNormalizeURLString(currentUrlStr, FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes|
			FlagRemoveUnnecessaryHostDots|FlagRemoveDuplicateSlashes|FlagRemoveFragment)
		log.Debug("normalized url:", currentUrlStr)

		currentUrlByte := []byte(currentUrlStr)
		if !(runtimeConfig.Storage.CheckWalkedUrl(currentUrlByte) || runtimeConfig.Storage.CheckFetchedUrl(currentUrlByte) || runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte)) {
			// if CheckIgnore(currentUrlStr) {}
			// log.Info("enqueue fetch: ", currentUrlStr)
			// broker.Publish(kafka.NewMessage(currentUrlByte))

			// copied from fetchTask, TODO refactor
			// check fetchUrlPattern
			log.Debug("started check fetchUrlPattern,", currentUrlStr)
			if siteConfig.FetchUrlPattern.Match(currentUrlByte) {
				log.Debug("match fetch url pattern,", currentUrlStr)
				if len(siteConfig.FetchUrlMustNotContain) > 0 {
					if util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustNotContain) {
						log.Debug("hit FetchUrlMustNotContain, ignore,", currentUrlStr)
						continue
					}
				}
				if len(siteConfig.FetchUrlMustContain) > 0 {
					if !util.ContainStr(currentUrlStr, siteConfig.FetchUrlMustContain) {
						log.Debug("not hit FetchUrlMustContain, ignore,", currentUrlStr)
						continue
					}
				}
			} else {
				log.Debug("does not hit FetchUrlPattern, ignoring,", currentUrlStr)
				continue
			}

			if !runtimeConfig.Storage.CheckPendingFetchUrl(currentUrlByte) {
				log.Debug("log new pendingFetch url", currentUrlStr)
				runtimeConfig.Storage.LogPendingFetchUrl(runtimeConfig.PathConfig.PendingFetchLog, currentUrlStr)
				runtimeConfig.Storage.AddPendingFetchUrl(currentUrlByte)
			} else {
				log.Debug("hit new pendingFetch filter, ignore:", currentUrlStr)
			}
			// pendingUrls <- currentUrlByte
			// TODO pendingFetchFilter
			// bloomFilter.Add(currentUrlByte)
		} else {
			log.Debug("hit bloom filter, ignore:", currentUrlStr)
		}
		// bloomFilter.Add([]byte(filterUrl))

		log.Debug("exit links extract,", siteUrlStr)
	}

	// TODO handle ruled fetch pattern
	log.Info("all links within ", siteUrlStr, " are done")
}