func main() { log.Println("Running tocrawl tool, to generate crawling list") log.Println("NonCrawlHosts: ", gcse.NonCrawlHosts) log.Println("CrawlGithubUpdate: ", gcse.CrawlGithubUpdate) log.Println("CrawlByGodocApi: ", gcse.CrawlByGodocApi) // Load CrawlerDB cDB = gcse.LoadCrawlerDB() if gcse.CrawlGithubUpdate || gcse.CrawlByGodocApi { // load pkgUTs pkgUTs, err := loadPackageUpdateTimes( sophie.LocalFsPath(gcse.DocsDBPath.S())) if err != nil { log.Fatalf("loadPackageUpdateTimes failed: %v", err) } if gcse.CrawlGithubUpdate { touchByGithubUpdates(pkgUTs) } if gcse.CrawlByGodocApi { httpClient := gcse.GenHttpClient("") pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient) if err != nil { log.Fatalf("FetchAllPackagesInGodoc failed: %v", err) } log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs)) for _, pkg := range pkgs { cDB.AppendPackage(pkg, func(pkg string) bool { _, ok := pkgUTs[pkg] return ok }) } } syncDatabases() } log.Printf("Package DB: %d entries", cDB.PackageDB.Count()) log.Printf("Person DB: %d entries", cDB.PersonDB.Count()) pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl) kvPackage := kv.DirOutput(sophie.LocalFsPath( pathToCrawl.Join(gcse.FnPackage).S())) kvPackage.Clean() if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil { log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err) } kvPerson := kv.DirOutput(sophie.LocalFsPath( pathToCrawl.Join(gcse.FnPerson).S())) kvPerson.Clean() if err := generateCrawlEntries(cDB.PersonDB, func(id string) string { site, _ := gcse.ParsePersonId(id) return site }, kvPerson); err != nil { log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err) } }
// crawlPackages crawls the packages listed in fpToCrawlPkg with a map-only
// job, writes newly crawled docs to fpOutNewDocs, and sends the result (or a
// timeout error) back on the end channel.
func crawlPackages(httpClient doc.HttpClient, fpToCrawlPkg, fpOutNewDocs sophie.FsPath,
	end chan error) {
	time.AfterFunc(configs.CrawlerDuePerRun+time.Minute*10, func() {
		end <- errorsp.NewWithStacks("Crawling packages timeout!")
	})
	end <- func() error {
		outNewDocs := kv.DirOutput(fpOutNewDocs)
		outNewDocs.Clean()

		job := mr.MapOnlyJob{
			Source: []mr.Input{
				kv.DirInput(fpToCrawlPkg),
			},

			NewMapperF: func(src, part int) mr.OnlyMapper {
				return &PackageCrawler{
					part:       part,
					httpClient: httpClient,
				}
			},

			Dest: []mr.Output{
				outNewDocs,
			},
		}
		if err := job.Run(); err != nil {
			log.Printf("crawlPackages: job.Run failed: %v", err)
			return err
		}
		return nil
	}()
}
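// A minimal caller sketch (not in the original source; the function name
// runPackageCrawl and the exact directory layout are assumptions): it shows
// how the end channel of crawlPackages above might be consumed. The channel
// is buffered so a late timeout send cannot block the timer goroutine.
func runPackageCrawl() {
	httpClient := gcse.GenHttpClient("")

	fpDataRoot := sophie.LocalFsPath(configs.DataRoot.S())
	fpToCrawlPkg := fpDataRoot.Join(configs.FnToCrawl).Join(configs.FnPackage)
	fpNewDocs := fpDataRoot.Join(configs.FnCrawlerDB).Join(configs.FnNewDocs)

	end := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawlPkg, fpNewDocs, end)

	// The first value received is either the job result (nil on success)
	// or the timeout error sent by time.AfterFunc.
	if err := <-end; err != nil {
		log.Fatalf("crawlPackages failed: %v", err)
	}
}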
func main() { fmt.Println("Data conversion tool") fpRoot := sophie.LocalFsPath("./data") /* * Doc db */ if DocDBPath.Exists() { if DocDBPath.Join(gcse.KindDocDB+".gob").Exists() && !gcse.DataRoot.Join(fnNewDocDB).Exists() { src := DocDBPath.Join(gcse.KindDocDB + ".gob") dst := fpRoot.Join(fnNewDocDB) fmt.Println("Convert", src, "to", dst, "...") srcDB := gcse.PackedDocDB{MemDB: gcse.NewMemDB(DocDBPath, gcse.KindDocDB)} if err := srcDB.Load(); err != nil { log.Fatalf("srcDB.Load: %v", err) } fpDocs := fpRoot.Join(fnNewDocDB) dstDB := kv.DirOutput(fpDocs) c, err := dstDB.Collector(0) if err != nil { log.Fatalf("dstDB.Collector: %v", err) } count := 0 if err := srcDB.Iterate(func(key string, val interface{}) error { k := sophie.RawString(key) v := val.(gcse.DocInfo) if count < 10 { fmtp.Printfln(" key: %+v, value: %+v", k, v) } count++ return c.Collect(k, &v) }); err != nil { fpDocs.Remove() log.Fatalf("srcDB.Iterate: %v", err) } c.Close() fmtp.Printfln("Conversion sucess, %d entries collected.", count) } } }
func main() { log.Println("Merging new crawled docs back...") fpDataRoot := sophie.LocalFsPath(gcse.DataRoot.S()) fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB) outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated")) outDocsUpdated.Clean() var cntDeleted, cntUpdated, cntNewUnchange int64 job := mr.MrJob{ Source: []mr.Input{ kv.DirInput(fpDataRoot.Join(gcse.FnDocs)), // 0 kv.DirInput(fpCrawler.Join(gcse.FnNewDocs)), // 1 }, NewMapperF: func(src, part int) mr.Mapper { if src == 0 { return &mr.MapperStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewDocInfo, MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { pkg := key.(*sophie.RawString).String() di := val.(*gcse.DocInfo) act := gcse.NewDocAction{ Action: gcse.NDA_UPDATE, DocInfo: *di, } part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) return c.CollectTo(part, key, &act) }, } } return &mr.MapperStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewNewDocAction, MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { pkg := string(*key.(*sophie.RawString)) part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) return c.CollectTo(part, key, val) }, } }, Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")), NewReducerF: func(part int) mr.Reducer { return &mr.ReducerStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewNewDocAction, ReduceF: func(key sophie.SophieWriter, nextVal mr.SophierIterator, c []sophie.Collector) error { var act gcse.DocInfo isSet := false isUpdated := false for { val, err := nextVal() if err == sophie.EOF { break } if err != nil { return err } cur := val.(*gcse.NewDocAction) if cur.Action == gcse.NDA_DEL { // not collect out to delete it atomic.AddInt64(&cntDeleted, 1) return nil } if !isSet { isSet = true act = cur.DocInfo } else { if cur.LastUpdated.After(act.LastUpdated) { isUpdated = true act = cur.DocInfo } } } if isSet { if isUpdated { atomic.AddInt64(&cntUpdated, 1) } else { atomic.AddInt64(&cntNewUnchange, 1) } return c[0].Collect(key, &act) } else { return nil } }, } }, Dest: []mr.Output{ outDocsUpdated, }, } if err := job.Run(); err != nil { log.Fatalf("job.Run failed: %v", err) } log.Printf("Deleted: %v", cntDeleted) log.Printf("Updated: %v", cntUpdated) log.Printf("NewUnchange: %v", cntNewUnchange) pDocs := gcse.DataRoot.Join(gcse.FnDocs) pUpdated := gcse.DataRoot.Join("docs-updated") pTmp := gcse.DataRoot.Join("docs-tmp") pTmp.RemoveAll() if err := pDocs.Rename(pTmp); err != nil { log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err) } if err := pUpdated.Rename(pDocs); err != nil { log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err) } log.Println("Merging success...") }
func main() { log.Println("Running tocrawl tool, to generate crawling list") log.Println("NonCrawlHosts: ", configs.NonCrawlHosts) log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate) log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi) log.Printf("Using personal: %v", configs.CrawlerGithubPersonal) gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal) // Load CrawlerDB cDB = gcse.LoadCrawlerDB() if configs.CrawlGithubUpdate || configs.CrawlByGodocApi { // load pkgUTs pkgUTs, err := loadPackageUpdateTimes( sophie.LocalFsPath(configs.DocsDBPath().S())) if err != nil { log.Fatalf("loadPackageUpdateTimes failed: %v", err) } if configs.CrawlGithubUpdate { touchByGithubUpdates(pkgUTs) } if configs.CrawlByGodocApi { httpClient := gcse.GenHttpClient("") pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient) if err != nil { log.Fatalf("FetchAllPackagesInGodoc failed: %v", err) } gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs)) log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs)) now := time.Now() for _, pkg := range pkgs { cDB.AppendPackage(pkg, func(pkg string) bool { _, ok := pkgUTs[pkg] return ok }) site, path := utils.SplitPackage(pkg) if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil { log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err) } } } syncDatabases() } log.Printf("Package DB: %d entries", cDB.PackageDB.Count()) log.Printf("Person DB: %d entries", cDB.PersonDB.Count()) pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl) kvPackage := kv.DirOutput(sophie.LocalFsPath( pathToCrawl.Join(configs.FnPackage).S())) kvPackage.Clean() if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil { log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err) } kvPerson := kv.DirOutput(sophie.LocalFsPath( pathToCrawl.Join(configs.FnPerson).S())) kvPerson.Clean() if err := generateCrawlEntries(cDB.PersonDB, func(id string) string { site, _ := gcse.ParsePersonId(id) return site }, kvPerson); err != nil { log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err) } }
func TestMRFromFile(t *testing.T) {
	fmt.Println("TestMRFromFile starts")
	fpRoot := sophie.LocalFsPath(".")

	mrin := fpRoot.Join("mrin")
	mrin.Mkdir(0755)

	mrtmp := fpRoot.Join("tmp")

	/*
	 * Prepare input
	 */
	var inF *kv.Writer = nil
	index := 0
	lines := strings.Split(WORDS, "\n")
	for i, line := range lines {
		if i%3 == 0 {
			if inF != nil {
				assert.NoErrorf(t, "inF.Close: %v", inF.Close())
				index++
			}
			var err error
			inF, err = kv.NewWriter(mrin.Join(fmt.Sprintf("part-%05d", index)))
			assert.NoErrorf(t, "NewKVWriter: %v", err)
		}
		assert.NoErrorf(t, "inF.Collect", inF.Collect(sophie.RawString(line), sophie.Null{}))
	}
	if inF != nil {
		assert.NoErrorf(t, "inF.Close: %v", inF.Close())
	}

	mrout := fpRoot.Join("mrout")
	assert.NoErrorf(t, "Remove mrout: %v", mrout.Remove())

	/*
	 * MrJob
	 */
	var mapper WordCountMapper
	reducer := WordCountReducer{counts: make(map[string]int)}

	job := MrJob{
		Source: []Input{kv.DirInput(mrin)},

		NewMapperF: func(src, part int) Mapper {
			return &mapper
		},

		Sorter: NewFileSorter(mrtmp),

		NewReducerF: func(part int) Reducer {
			return &reducer
		},

		Dest: []Output{kv.DirOutput(mrout)},
	}

	assert.NoErrorf(t, "RunJob: %v", job.Run())

	/*
	 * Check result
	 */
	resIn := kv.DirInput(mrout)
	n, err := resIn.PartCount()
	assert.NoErrorf(t, "resIn.PartCount(): %v", err)

	var word sophie.RawString
	var cnt sophie.RawVInt
	actCnts := make(map[string]int)
	for i := 0; i < n; i++ {
		iter, err := resIn.Iterator(i)
		assert.NoErrorf(t, "resIn.Iterator: %v", err)
		for {
			err := iter.Next(&word, &cnt)
			if err == sophie.EOF {
				break
			}
			assert.NoErrorf(t, "iter.Next: %v", err)
			actCnts[string(word)] = int(cnt)
		}
	}

	expCnts := statWords(WORDS)
	// fmt.Println(expCnts)
	// fmt.Println(actCnts)

	assertMapEquals(t, actCnts, expCnts)
	fmt.Println("TestMRFromFile ends")
}
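// A hypothetical sketch of the statWords helper used in the test above (the
// real helper is defined elsewhere in the test file; statWordsSketch is an
// assumed name and splitting on whitespace is an assumption about the
// word-count semantics): it builds the expected word counts that the
// map-reduce output is compared against.
func statWordsSketch(text string) map[string]int {
	counts := make(map[string]int)
	for _, word := range strings.Fields(text) {
		counts[word]++
	}
	return counts
}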
func main() { log.Println("Merging new crawled docs back...") var nonStorePackage *regexp.Regexp if len(configs.NonStorePackageRegexps) > 0 { nonStorePackage = regexp.MustCompile( stringsp.FullJoin(configs.NonStorePackageRegexps, "(", ")|(", ")")) } fpDataRoot := sophie.LocalFsPath(configs.DataRoot.S()) fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB) outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated")) outDocsUpdated.Clean() var cntDeleted, cntUpdated, cntNew, cntUnchanged int64 job := mr.MrJob{ Source: []mr.Input{ kv.DirInput(fpDataRoot.Join(configs.FnDocs)), // 0 kv.DirInput(fpCrawler.Join(configs.FnNewDocs)), // 1 }, NewMapperF: func(src, part int) mr.Mapper { if src == 0 { // Mapper for docs return &mr.MapperStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewDocInfo, MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { pkg := key.(*sophie.RawString).String() di := val.(*gcse.DocInfo) act := gcse.NewDocAction{ Action: gcse.NDA_ORIGINAL, DocInfo: *di, } part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) return c.CollectTo(part, key, &act) }, } } // Mapper for new docs return &mr.MapperStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewNewDocAction, MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { pkg := string(*key.(*sophie.RawString)) part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) return c.CollectTo(part, key, val) }, } }, Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")), NewReducerF: func(part int) mr.Reducer { return &mr.ReducerStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewNewDocAction, ReduceF: func(key sophie.SophieWriter, nextVal mr.SophierIterator, c []sophie.Collector) error { if nonStorePackage != nil { pkg := string(*key.(*sophie.RawString)) if nonStorePackage.MatchString(pkg) { log.Printf("Ignoring non-store pkg: %s", pkg) return nil } } var act gcse.DocInfo isSet := false isUpdated := false hasOriginal := false for { val, err := nextVal() if errorsp.Cause(err) == io.EOF { break } if err != nil { return err } cur := val.(*gcse.NewDocAction) switch cur.Action { case gcse.NDA_DEL: // not collect out to delete it atomic.AddInt64(&cntDeleted, 1) return nil case gcse.NDA_ORIGINAL: hasOriginal = true } if !isSet { isSet = true act = cur.DocInfo } else { if cur.LastUpdated.After(act.LastUpdated) { isUpdated = true act = cur.DocInfo } } } if isSet { if isUpdated { atomic.AddInt64(&cntUpdated, 1) } else if hasOriginal { atomic.AddInt64(&cntUnchanged, 1) } else { atomic.AddInt64(&cntNew, 1) } return c[0].Collect(key, &act) } else { return nil } }, } }, Dest: []mr.Output{ outDocsUpdated, }, } if err := job.Run(); err != nil { log.Fatalf("job.Run failed: %v", err) } log.Printf("Deleted: %v", cntDeleted) log.Printf("Updated: %v", cntUpdated) log.Printf("New: %v", cntNew) log.Printf("Unchanged: %v", cntUnchanged) pDocs := configs.DataRoot.Join(configs.FnDocs) pUpdated := configs.DataRoot.Join("docs-updated") pTmp := configs.DataRoot.Join("docs-tmp") pTmp.RemoveAll() if err := pDocs.Rename(pTmp); err != nil { log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err) } if err := pUpdated.Rename(pDocs); err != nil { log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err) } log.Println("Merging success...") }