func GetBoolConfig(configSection string, configKey string, defaultValue bool) bool { if loadingConfig == nil { log.Trace("loadingConfig is nil,just return") return defaultValue } //loading or initializing bloom filter value, error := loadingConfig.Bool(configSection, configKey) if error != nil { value = defaultValue } log.Trace("get config value,", configSection, ".", configKey, ":", value) return value }
func FetchFileWithOffset(runtimeConfig RuntimeConfig, path string, skipOffset int64) { var offset int64 = 0 time1, _ := util.FileMTime(path) log.Debug("start touch time:", time1) f, err := os.Open(path) if err != nil { log.Debug("error opening file,", path, " ", err) return } r := bufio.NewReader(f) s, e := util.Readln(r) offset = 0 log.Trace("new offset:", offset) for e == nil { offset = offset + 1 //TODO use byte offset instead of lines if offset > skipOffset { ParsedSavedFileLog(runtimeConfig, s) } runtimeConfig.Storage.PersistOffset(runtimeConfig.PathConfig.SavedFileLog+".offset", offset) s, e = util.Readln(r) //todo store offset } log.Trace("end offset:", offset, "vs ", skipOffset) waitUpdate: time2, _ := util.FileMTime(path) log.Trace("2nd touch time:", time2) if time2 > time1 { log.Trace("file has been changed,restart parse") FetchFileWithOffset(runtimeConfig, path, offset) } else { log.Trace("waiting file update", path) time.Sleep(10 * time.Millisecond) goto waitUpdate } }
func LoadTaskFromLocalFile(pendingFetchUrls chan []byte, runtimeConfig *RuntimeConfig, quit *chan bool, offsets *RoutingOffset) { log.Trace("LoadTaskFromLocalFile task started.") path := runtimeConfig.PathConfig.PendingFetchLog //touch local's file //read all of line //if hit the EOF,will wait 2s,and then reopen the file,and try again,may be check the time of last modified waitFile: if !util.CheckFileExists(path) { log.Trace("waiting file create", path) time.Sleep(10 * time.Millisecond) goto waitFile } var offset int64 = runtimeConfig.Storage.LoadOffset(runtimeConfig.PathConfig.PendingFetchLog + ".offset") FetchFileWithOffset2(*runtimeConfig, pendingFetchUrls, path, offset) }
func ParsedSavedFileLog2(runtimeConfig RuntimeConfig, pendingFetchUrls chan []byte, url string) { if url != "" { log.Trace("start parse filelog:", url) if runtimeConfig.Storage.CheckFetchedUrl([]byte(url)) { log.Debug("hit fetch filter ignore,", url) return } log.Debug("new task extracted from saved page:", url) pendingFetchUrls <- []byte(url) } }
func loadFileContent(fileName string) []byte { if util.CheckFileExists(fileName) { log.Trace("found fileName,start loading:", fileName) n, err := ioutil.ReadFile(fileName) if err != nil { log.Error("loadFile", err, ",", fileName) return nil } return n } return nil }
func main() { flag.StringVar(&seedUrl, "seed", "http://example.com", "the seed url,where everything starts") flag.StringVar(&logLevel, "log", "info", "setting log level,options:trace,debug,info,warn,error") flag.Parse() defer log.Flush() runtime.GOMAXPROCS(runtime.NumCPU()) log.SetInitLogging(logLevel) runtimeConfig.PathConfig = new(PathConfig) runtimeConfig.ClusterConfig = new(ClusterConfig) runtimeConfig.ClusterConfig.Name = config.GetStringConfig("cluster", "name", "gopa") // per cluster:data/gopa/ runtimeConfig.PathConfig.Home = config.GetStringConfig("path", "home", "cluster/"+runtimeConfig.ClusterConfig.Name+"/") runtimeConfig.PathConfig.Data = config.GetStringConfig("path", "data", "") if runtimeConfig.PathConfig.Data == "" { runtimeConfig.PathConfig.Data = runtimeConfig.PathConfig.Home + "/" + "data/" } runtimeConfig.PathConfig.Log = config.GetStringConfig("path", "log", "") if runtimeConfig.PathConfig.Log == "" { runtimeConfig.PathConfig.Log = runtimeConfig.PathConfig.Home + "/" + "log/" } runtimeConfig.PathConfig.WebData = config.GetStringConfig("path", "webdata", "") if runtimeConfig.PathConfig.WebData == "" { runtimeConfig.PathConfig.WebData = runtimeConfig.PathConfig.Data + "/" + "webdata/" } runtimeConfig.PathConfig.TaskData = config.GetStringConfig("path", "taskdata", "") if runtimeConfig.PathConfig.TaskData == "" { runtimeConfig.PathConfig.TaskData = runtimeConfig.PathConfig.Data + "/" + "taskdata/" } runtimeConfig.StoreWebPageTogether = config.GetBoolConfig("Global", "StoreWebPageTogether", true) runtimeConfig.TaskConfig = parseConfig() //set default logging logPath := runtimeConfig.PathConfig.Log + "/" + runtimeConfig.TaskConfig.Name + "/gopa.log" log.SetLogging(logLevel, logPath) runtimeConfig.ParseUrlsFromSavedFileLog = config.GetBoolConfig("Switch", "ParseUrlsFromSavedFileLog", true) runtimeConfig.LoadTemplatedFetchJob = config.GetBoolConfig("Switch", "LoadTemplatedFetchJob", true) runtimeConfig.LoadRuledFetchJob = config.GetBoolConfig("Switch", "LoadRuledFetchJob", false) runtimeConfig.LoadPendingFetchJobs = config.GetBoolConfig("Switch", "LoadPendingFetchJobs", true) runtimeConfig.HttpEnabled = config.GetBoolConfig("Switch", "HttpEnabled", true) runtimeConfig.ParseUrlsFromPreviousSavedPage = config.GetBoolConfig("Switch", "ParseUrlsFromPreviousSavedPage", false) runtimeConfig.ArrayStringSplitter = config.GetStringConfig("CrawlerRule", "ArrayStringSplitter", ",") runtimeConfig.GoProfEnabled = config.GetBoolConfig("CrawlerRule", "GoProfEnabled", false) runtimeConfig.WalkBloomFilterFileName = config.GetStringConfig("BloomFilter", "WalkBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/walk.bloomfilter") runtimeConfig.FetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "FetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/fetch.bloomfilter") runtimeConfig.ParseBloomFilterFileName = config.GetStringConfig("BloomFilter", "ParseBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/parse.bloomfilter") runtimeConfig.PendingFetchBloomFilterFileName = config.GetStringConfig("BloomFilter", "PendingFetchBloomFilterFileName", runtimeConfig.TaskConfig.TaskDataPath+"/filters/pending_fetch.bloomfilter") runtimeConfig.PathConfig.SavedFileLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_parse.files" runtimeConfig.PathConfig.PendingFetchLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/pending_fetch.urls" runtimeConfig.PathConfig.FetchFailedLog = runtimeConfig.TaskConfig.TaskDataPath + "/tasks/failed_fetch.urls" runtimeConfig.MaxGoRoutine = config.GetIntConfig("Global", "MaxGoRoutine", 2) if runtimeConfig.MaxGoRoutine < 2 { runtimeConfig.MaxGoRoutine = 2 } log.Debug("maxGoRoutine:", runtimeConfig.MaxGoRoutine) log.Debug("path.home:", runtimeConfig.PathConfig.Home) os.MkdirAll(runtimeConfig.PathConfig.Home, 0777) os.MkdirAll(runtimeConfig.PathConfig.Data, 0777) os.MkdirAll(runtimeConfig.PathConfig.Log, 0777) os.MkdirAll(runtimeConfig.PathConfig.WebData, 0777) os.MkdirAll(runtimeConfig.PathConfig.TaskData, 0777) os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath, 0777) os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/tasks/", 0777) os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/filters/", 0777) os.MkdirAll(runtimeConfig.TaskConfig.TaskDataPath+"/urls/", 0777) os.MkdirAll(runtimeConfig.TaskConfig.WebDataPath, 0777) runtimeConfig.RuledFetchConfig = new(RuledFetchConfig) runtimeConfig.RuledFetchConfig.UrlTemplate = config.GetStringConfig("RuledFetch", "UrlTemplate", "") runtimeConfig.RuledFetchConfig.From = config.GetIntConfig("RuledFetch", "From", 0) runtimeConfig.RuledFetchConfig.To = config.GetIntConfig("RuledFetch", "To", 10) runtimeConfig.RuledFetchConfig.Step = config.GetIntConfig("RuledFetch", "Step", 1) runtimeConfig.RuledFetchConfig.LinkExtractPattern = config.GetStringConfig("RuledFetch", "LinkExtractPattern", "") runtimeConfig.RuledFetchConfig.LinkTemplate = config.GetStringConfig("RuledFetch", "LinkTemplate", "") if seedUrl == "" || seedUrl == "http://example.com" { log.Error("no seed was given. type:\"gopa -h\" for help.") os.Exit(1) } log.Info("[gopa] " + runtimeConfig.Version + " is on.") runtimeConfig.Storage = &fsstore.FsStore{} // if(runtimeConfig.) runtimeConfig.Storage.InitWalkBloomFilter(runtimeConfig.WalkBloomFilterFileName) runtimeConfig.Storage.InitFetchBloomFilter(runtimeConfig.FetchBloomFilterFileName) runtimeConfig.Storage.InitParseBloomFilter(runtimeConfig.ParseBloomFilterFileName) runtimeConfig.Storage.InitPendingFetchBloomFilter(runtimeConfig.PendingFetchBloomFilterFileName) // atr:="AZaz" // btr:=[]byte(atr) // fmt.Println(btr) // // id:= getSeqStr([]byte("AA"),[]byte("ZZ"),false) // fmt.Println(id) //pprof serves if runtimeConfig.GoProfEnabled { go func() { log.Info(http.ListenAndServe("localhost:6060", nil)) log.Info("pprof server is up,http://localhost:6060/debug/pprof") }() } //http serves if runtimeConfig.HttpEnabled { go func() { httpServ.Start(runtimeConfig) }() } //adding default http protocol if !strings.HasPrefix(seedUrl, "http") { seedUrl = "http://" + seedUrl } maxGoRoutine := runtimeConfig.MaxGoRoutine fetchQuitChannels := make([]*chan bool, maxGoRoutine) //shutdownSignal signals for each go routing fetchTaskChannels := make([]*chan []byte, maxGoRoutine) //fetchTask channels fetchOffsets := make([]*RoutingOffset, maxGoRoutine) //kafka fetchOffsets parseQuitChannels := make([]*chan bool, 2) //shutdownSignal signals for each go routing // parseQuitChannels := make([]*chan bool, MaxGoRoutine) //shutdownSignal signals for each go routing parseOffsets := make([]*RoutingOffset, maxGoRoutine) //kafka fetchOffsets shutdownSignal := make(chan bool, 1) finalQuitSignal := make(chan bool, 1) //handle exit event exitEventChannel := make(chan os.Signal, 1) signal.Notify(exitEventChannel, syscall.SIGINT) signal.Notify(exitEventChannel, os.Interrupt) go func() { s := <-exitEventChannel log.Debug("got signal:", s) if s == os.Interrupt || s.(os.Signal) == syscall.SIGINT { log.Warn("got signal:os.Interrupt,saving data and exit") // defer os.Exit(0) runtimeConfig.Storage.PersistBloomFilter() //wait workers to exit log.Info("waiting workers exit") go shutdown(fetchOffsets, fetchQuitChannels, parseOffsets, parseQuitChannels, shutdownSignal) <-shutdownSignal log.Info("workers shutdown") finalQuitSignal <- true } }() //start fetcher for i := 0; i < maxGoRoutine; i++ { quitC := make(chan bool, 1) taskC := make(chan []byte) fetchQuitChannels[i] = &quitC fetchTaskChannels[i] = &taskC offset := new(RoutingOffset) // offset.Offset = initOffset(runtimeConfig, "fetch", i) offset.Shard = i fetchOffsets[i] = offset go task.FetchGo(runtimeConfig, &taskC, &quitC, offset) } c2 := make(chan bool, 1) parseQuitChannels[0] = &c2 offset2 := new(RoutingOffset) // offset2.Offset = initOffset(runtimeConfig, "parse", 0) offset2.Shard = 0 parseOffsets[0] = offset2 pendingFetchUrls := make(chan []byte) //fetch rule:all urls -> persisted to sotre -> fetched from store -> pushed to pendingFetchUrls -> redistributed to sharded goroutines -> fetch -> save webpage to store -> done //parse rule:url saved to store -> local path persisted to store -> fetched to pendingParseFiles -> redistributed to sharded goroutines -> parse -> clean urls -> enqueue to url store ->done //sending feed to task queue go func() { //notice seed will not been persisted log.Debug("sending feed to fetch queue,", seedUrl) pendingFetchUrls <- []byte(seedUrl) }() //start local saved file parser if runtimeConfig.ParseUrlsFromSavedFileLog { go task.ParseGo(pendingFetchUrls, runtimeConfig, &c2, offset2) } //redistribute pendingFetchUrls to sharded workers go func() { for { url := <-pendingFetchUrls if !runtimeConfig.Storage.CheckWalkedUrl(url) { if runtimeConfig.Storage.CheckFetchedUrl(url) { log.Warn("dont hit walk bloomfilter but hit fetch bloomfilter,also ignore,", string(url)) runtimeConfig.Storage.AddWalkedUrl(url) continue } randomShard := 0 if maxGoRoutine > 1 { randomShard = rand.Intn(maxGoRoutine - 1) } log.Debug("publish:", string(url), ",shard:", randomShard) runtimeConfig.Storage.AddWalkedUrl(url) *fetchTaskChannels[randomShard] <- url } else { log.Trace("hit walk or fetch bloomfilter,just ignore,", string(url)) } } }() //load predefined fetch jobs if runtimeConfig.LoadTemplatedFetchJob { go func() { if util.CheckFileExists(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt") { templates := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/template.txt") ids := util.ReadAllLines(runtimeConfig.TaskConfig.TaskDataPath + "/urls/id.txt") for _, id := range ids { for _, template := range templates { log.Trace("id:", id) log.Trace("template:", template) url := strings.Replace(template, "{id}", id, -1) log.Debug("new task from template:", url) pendingFetchUrls <- []byte(url) } } log.Info("templated download is done.") } }() } //fetch urls from saved pages if runtimeConfig.LoadPendingFetchJobs { c3 := make(chan bool, 1) parseQuitChannels[1] = &c3 offset3 := new(RoutingOffset) // offset3.Offset = initOffset(runtimeConfig, "fetch_from_saved", 0) offset3.Shard = 0 parseOffsets[1] = offset3 go task.LoadTaskFromLocalFile(pendingFetchUrls, &runtimeConfig, &c3, offset3) } //parse fetch failed jobs,and will ignore the walk-filter //TODO if runtimeConfig.LoadRuledFetchJob { log.Debug("start ruled fetch") go func() { if runtimeConfig.RuledFetchConfig.UrlTemplate != "" { for i := runtimeConfig.RuledFetchConfig.From; i <= runtimeConfig.RuledFetchConfig.To; i += runtimeConfig.RuledFetchConfig.Step { url := strings.Replace(runtimeConfig.RuledFetchConfig.UrlTemplate, "{id}", strconv.FormatInt(int64(i), 10), -1) log.Debug("add ruled url:", url) pendingFetchUrls <- []byte(url) } } else { log.Error("ruled template is empty,ignore") } }() } <-finalQuitSignal log.Info("[gopa] is down") }