func doIndex() bool {
	idxSegm, err := gcse.IndexSegments.GenMaxSegment()
	if err != nil {
		log.Printf("GenMaxSegment failed: %v", err)
		return false
	}
	runtime.GC()
	gcse.DumpMemStats()

	log.Printf("Indexing to %v ...", idxSegm)
	fpDocDB := sophie.LocalFsPath(configs.DocsDBPath().S())
	ts, err := gcse.Index(kv.DirInput(fpDocDB), idxSegm.Join("").S())
	if err != nil {
		log.Printf("Indexing failed: %v", err)
		return false
	}
	if !func() bool {
		f, err := idxSegm.Join(gcse.IndexFn).Create()
		if err != nil {
			log.Printf("Create index file failed: %v", err)
			return false
		}
		defer f.Close()

		log.Printf("Saving index to %v ...", idxSegm)
		if err := ts.Save(f); err != nil {
			log.Printf("ts.Save failed: %v", err)
			return false
		}
		return true
	}() {
		return false
	}
	runtime.GC()
	gcse.DumpMemStats()

	storePath := idxSegm.Join(configs.FnStore)
	log.Printf("Saving store snapshot to %v", storePath)
	if err := store.SaveSnapshot(storePath.S()); err != nil {
		log.Printf("SaveSnapshot %v failed: %v", storePath, err)
	}

	if err := idxSegm.Done(); err != nil {
		log.Printf("segm.Done failed: %v", err)
		return false
	}

	log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount())
	gcse.AddBiValueAndProcess(bi.Average, "index.doc-count", ts.DocCount())

	ts = nil
	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()
	return true
}
func loadPackageUpdateTimes(fpDocs sophie.FsPath) (map[string]time.Time, error) {
	dir := kv.DirInput(fpDocs)
	cnt, err := dir.PartCount()
	if err != nil {
		return nil, err
	}

	pkgUTs := make(map[string]time.Time)

	var pkg sophie.RawString
	var info gcse.DocInfo
	for i := 0; i < cnt; i++ {
		it, err := dir.Iterator(i)
		if err != nil {
			return nil, err
		}
		for {
			if err := it.Next(&pkg, &info); err != nil {
				if err == sophie.EOF {
					break
				}
				return nil, err
			}
			pkgUTs[string(pkg)] = info.LastUpdated
		}
	}
	return pkgUTs, nil
}
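// A minimal usage sketch for loadPackageUpdateTimes. Assumptions: it is called from the
// same package, the imports (fmt, log, sophie) from the surrounding file are available,
// and a local "data/docs" key-value directory exists as in the dump tool below. The
// wrapper name printPackageUpdateTimes is hypothetical.
func printPackageUpdateTimes() {
	pkgUTs, err := loadPackageUpdateTimes(sophie.LocalFsPath("data/docs"))
	if err != nil {
		log.Fatalf("loadPackageUpdateTimes failed: %v", err)
	}
	for pkg, ut := range pkgUTs {
		fmt.Printf("%s last updated at %v\n", pkg, ut)
	}
}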
func main() {
	// path := "data/docs"
	path := "data/docs-updated"
	kvDir := kv.DirInput(sophie.LocalFsPath(path))
	cnt, err := kvDir.PartCount()
	if err != nil {
		log.Fatalf("kvDir.PartCount failed: %v", err)
	}

	totalEntries := 0
	for i := 0; i < cnt; i++ {
		it, err := kvDir.Iterator(i)
		if err != nil {
			log.Fatalf("kvDir.Iterator(%d) failed: %v", i, err)
		}
		var key sophie.RawString
		var val gcse.DocInfo
		for {
			if err := it.Next(&key, &val); err != nil {
				if err == sophie.EOF {
					break
				}
				log.Fatalf("it.Next failed: %v", err)
			}
			totalEntries++
		}
		it.Close()
	}
	fmtp.Printfln("Total %d files, %d entries.", cnt, totalEntries)
}
// crawl packages, send error back to end
func crawlPackages(httpClient doc.HttpClient, fpToCrawlPkg, fpOutNewDocs sophie.FsPath,
	end chan error) {
	time.AfterFunc(configs.CrawlerDuePerRun+time.Minute*10, func() {
		end <- errorsp.NewWithStacks("Crawling packages timeout!")
	})
	end <- func() error {
		outNewDocs := kv.DirOutput(fpOutNewDocs)
		outNewDocs.Clean()
		job := mr.MapOnlyJob{
			Source: []mr.Input{
				kv.DirInput(fpToCrawlPkg),
			},
			NewMapperF: func(src, part int) mr.OnlyMapper {
				return &PackageCrawler{
					part:       part,
					httpClient: httpClient,
				}
			},
			Dest: []mr.Output{
				outNewDocs,
			},
		}
		if err := job.Run(); err != nil {
			log.Printf("crawlPackages: job.Run failed: %v", err)
			return err
		}
		return nil
	}()
}
// crawl persons, send error back to end
func crawlPersons(httpClient doc.HttpClient, fpToCrawlPsn sophie.FsPath, end chan error) {
	time.AfterFunc(gcse.CrawlerDuePerRun+time.Minute*10, func() {
		end <- errors.New("Crawling persons timeout!")
	})
	end <- func() error {
		job := mr.MapOnlyJob{
			Source: []mr.Input{
				kv.DirInput(fpToCrawlPsn),
			},
			NewMapperF: func(src, part int) mr.OnlyMapper {
				return &PersonCrawler{
					part:       part,
					httpClient: httpClient,
				}
			},
		}
		if err := job.Run(); err != nil {
			log.Printf("crawlPersons: job.Run failed: %v", err)
			return err
		}
		return nil
	}()
}
func dumpDocs(keys []string) {
	path := "data/docs"
	kvDir := kv.DirInput(sophie.LocalFsPath(path))
	cnt, err := kvDir.PartCount()
	if err != nil {
		log.Fatalf("kvDir.PartCount() failed: %v", err)
	}

	parts := make(map[int]map[string]bool)
	for _, key := range keys {
		part := gcse.CalcPackagePartition(key, gcse.DOCS_PARTS)
		if parts[part] == nil {
			parts[part] = make(map[string]bool)
		}
		parts[part][key] = true
	}

	var key sophie.RawString
	var val gcse.DocInfo
	for part := 0; part < cnt; part++ {
		if len(keys) > 0 && parts[part] == nil {
			continue
		}
		it, err := kvDir.Iterator(part)
		if err != nil {
			log.Fatalf("kvDir.Iterator(%d) failed: %v", part, err)
		}
		func() {
			defer it.Close()
			for {
				if err := it.Next(&key, &val); err != nil {
					if err == sophie.EOF {
						break
					}
					log.Fatalf("it.Next failed: %v", err)
				}
				pkg := key.String()
				if len(keys) > 0 && !parts[part][pkg] {
					continue
				}
				fmtp.Printfln("%v -> %+v", key, val)
			}
		}()
	}
}
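// A hypothetical invocation sketch for dumpDocs: with no keys it walks every partition
// under data/docs; with keys it only visits the partitions those packages hash to. The
// wrapper name and the package path below are illustrative assumptions, not part of the
// original tool.
func dumpDocsExample() {
	// Dump every stored DocInfo entry.
	dumpDocs(nil)
	// Dump the entry for a single package.
	dumpDocs([]string{"github.com/daviddengcn/gcse"})
}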
func doIndex() bool {
	idxSegm, err := gcse.IndexSegments.GenMaxSegment()
	if err != nil {
		log.Printf("GenMaxSegment failed: %v", err)
		return false
	}
	runtime.GC()
	gcse.DumpMemStats()

	log.Printf("Indexing to %v ...", idxSegm)
	fpDocDB := sophie.LocalFsPath(gcse.DocsDBPath.S())
	ts, err := gcse.Index(kv.DirInput(fpDocDB))
	if err != nil {
		log.Printf("Indexing failed: %v", err)
		return false
	}

	f, err := idxSegm.Join(gcse.IndexFn).Create()
	if err != nil {
		log.Printf("Create index file failed: %v", err)
		return false
	}
	//defer f.Close()
	log.Printf("Saving index to %v ...", idxSegm)
	if err := ts.Save(f); err != nil {
		log.Printf("ts.Save failed: %v", err)
		return false
	}
	f.Close()
	f = nil
	runtime.GC()
	gcse.DumpMemStats()

	if err := idxSegm.Done(); err != nil {
		log.Printf("segm.Done failed: %v", err)
		return false
	}

	log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount())

	ts = nil
	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()
	return true
}
func main() {
	defer func() {
		tmpFn := villa.Path("/tmp/gddo")
		if err := tmpFn.RemoveAll(); err != nil {
			log.Printf("Delete %v failed: %v", tmpFn, err)
		}
	}()

	singlePackage := ""
	singleETag := ""
	flag.StringVar(&singlePackage, "pkg", singlePackage, "Crawling single package")
	flag.StringVar(&singleETag, "etag", singleETag, "ETag for single package crawling")
	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if singlePackage != "" {
		log.Printf("Crawling single package %s ...", singlePackage)
		p, err := gcse.CrawlPackage(httpClient, singlePackage, singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v", singlePackage, err)
		} else {
			fmtp.Printfln("Package %s: %+v", singlePackage, p)
		}
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: gcse.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(gcse.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(gcse.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(gcse.FnToCrawl)

	fpNewDocs := fpCrawler.Join(gcse.FnNewDocs)
	fpNewDocs.Remove()

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(gcse.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(gcse.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may have failed, package: %v, person: %v", errPkg, errPsn)
	}

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	syncDatabases()
	log.Println("crawler stopped...")
}
func main() { log.Println("Merging new crawled docs back...") fpDataRoot := sophie.LocalFsPath(gcse.DataRoot.S()) fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB) outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated")) outDocsUpdated.Clean() var cntDeleted, cntUpdated, cntNewUnchange int64 job := mr.MrJob{ Source: []mr.Input{ kv.DirInput(fpDataRoot.Join(gcse.FnDocs)), // 0 kv.DirInput(fpCrawler.Join(gcse.FnNewDocs)), // 1 }, NewMapperF: func(src, part int) mr.Mapper { if src == 0 { return &mr.MapperStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewDocInfo, MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { pkg := key.(*sophie.RawString).String() di := val.(*gcse.DocInfo) act := gcse.NewDocAction{ Action: gcse.NDA_UPDATE, DocInfo: *di, } part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) return c.CollectTo(part, key, &act) }, } } return &mr.MapperStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewNewDocAction, MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { pkg := string(*key.(*sophie.RawString)) part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) return c.CollectTo(part, key, val) }, } }, Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")), NewReducerF: func(part int) mr.Reducer { return &mr.ReducerStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewNewDocAction, ReduceF: func(key sophie.SophieWriter, nextVal mr.SophierIterator, c []sophie.Collector) error { var act gcse.DocInfo isSet := false isUpdated := false for { val, err := nextVal() if err == sophie.EOF { break } if err != nil { return err } cur := val.(*gcse.NewDocAction) if cur.Action == gcse.NDA_DEL { // not collect out to delete it atomic.AddInt64(&cntDeleted, 1) return nil } if !isSet { isSet = true act = cur.DocInfo } else { if cur.LastUpdated.After(act.LastUpdated) { isUpdated = true act = cur.DocInfo } } } if isSet { if isUpdated { atomic.AddInt64(&cntUpdated, 1) } else { atomic.AddInt64(&cntNewUnchange, 1) } return c[0].Collect(key, &act) } else { return nil } }, } }, Dest: []mr.Output{ outDocsUpdated, }, } if err := job.Run(); err != nil { log.Fatalf("job.Run failed: %v", err) } log.Printf("Deleted: %v", cntDeleted) log.Printf("Updated: %v", cntUpdated) log.Printf("NewUnchange: %v", cntNewUnchange) pDocs := gcse.DataRoot.Join(gcse.FnDocs) pUpdated := gcse.DataRoot.Join("docs-updated") pTmp := gcse.DataRoot.Join("docs-tmp") pTmp.RemoveAll() if err := pDocs.Rename(pTmp); err != nil { log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err) } if err := pUpdated.Rename(pDocs); err != nil { log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err) } log.Println("Merging success...") }
func TestMRFromFile(t *testing.T) {
	fmt.Println("TestMRFromFile starts")
	fpRoot := sophie.LocalFsPath(".")

	mrin := fpRoot.Join("mrin")
	mrin.Mkdir(0755)

	mrtmp := fpRoot.Join("tmp")

	/*
	 * Prepare input
	 */
	var inF *kv.Writer
	index := 0
	lines := strings.Split(WORDS, "\n")
	for i, line := range lines {
		if i%3 == 0 {
			if inF != nil {
				assert.NoErrorf(t, "inF.Close: %v", inF.Close())
				index++
			}
			var err error
			inF, err = kv.NewWriter(mrin.Join(fmt.Sprintf("part-%05d", index)))
			assert.NoErrorf(t, "NewKVWriter: %v", err)
		}

		assert.NoErrorf(t, "inF.Collect",
			inF.Collect(sophie.RawString(line), sophie.Null{}))
	}
	if inF != nil {
		assert.NoErrorf(t, "inF.Close: %v", inF.Close())
	}

	mrout := fpRoot.Join("mrout")
	assert.NoErrorf(t, "Remove mrout: %v", mrout.Remove())

	/*
	 * MrJob
	 */
	var mapper WordCountMapper
	reducer := WordCountReducer{counts: make(map[string]int)}

	job := MrJob{
		Source: []Input{kv.DirInput(mrin)},
		NewMapperF: func(src, part int) Mapper {
			return &mapper
		},

		Sorter: NewFileSorter(mrtmp),

		NewReducerF: func(part int) Reducer {
			return &reducer
		},
		Dest: []Output{kv.DirOutput(mrout)},
	}

	assert.NoErrorf(t, "RunJob: %v", job.Run())

	/*
	 * Check result
	 */
	resIn := kv.DirInput(mrout)
	n, err := resIn.PartCount()
	assert.NoErrorf(t, "resIn.PartCount(): %v", err)

	var word sophie.RawString
	var cnt sophie.RawVInt
	actCnts := make(map[string]int)
	for i := 0; i < n; i++ {
		iter, err := resIn.Iterator(i)
		assert.NoErrorf(t, "resIn.Iterator: %v", err)
		for {
			err := iter.Next(&word, &cnt)
			if err == sophie.EOF {
				break
			}
			assert.NoErrorf(t, "iter.Next: %v", err)
			actCnts[string(word)] = int(cnt)
		}
	}

	expCnts := statWords(WORDS)
	// fmt.Println(expCnts)
	// fmt.Println(actCnts)

	assertMapEquals(t, actCnts, expCnts)
	fmt.Println("TestMRFromFile ends")
}
func main() {
	runtime.GOMAXPROCS(2)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	if db, err := bh.Open(configs.DataRoot.Join("filecache.bolt").S(), 0644, nil); err == nil {
		log.Print("Using file cache!")
		gcse.GithubSpider.FileCache = spider.BoltFileCache{
			DB:         db,
			IncCounter: bi.Inc,
		}
	} else {
		log.Printf("Open file cache failed: %v", err)
	}

	cleanTempDir()
	defer cleanTempDir()

	singlePackage := flag.String("pkg", "", "Crawling a single package")
	singleETag := flag.String("etag", "", "ETag for the single package crawling")
	singlePerson := flag.String("person", "", "Crawling a single person")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if *singlePerson != "" {
		log.Printf("Crawling single person %s ...", *singlePerson)
		p, err := gcse.CrawlPerson(httpClient, *singlePerson)
		if err != nil {
			fmtp.Printfln("Crawling person %s failed: %v", *singlePerson, err)
		} else {
			fmtp.Printfln("Person %s: %+v", *singlePerson, p)
		}
	}
	if *singlePackage != "" {
		log.Printf("Crawling single package %s ...", *singlePackage)
		p, flds, err := gcse.CrawlPackage(httpClient, *singlePackage, *singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v, folders: %v", *singlePackage, err, flds)
		} else {
			fmtp.Printfln("Package %s: %+v, folders: %v", *singlePackage, p, flds)
		}
	}
	if *singlePackage != "" || *singlePerson != "" {
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: configs.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(configs.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(configs.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(configs.FnToCrawl)

	fpNewDocs := fpCrawler.Join(configs.FnNewDocs)
	fpNewDocs.Remove()

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(configs.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(configs.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd

	bi.Flush()
	bi.Process()
	syncDatabases()
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may have failed, package: %v, person: %v", errPkg, errPsn)
	}
	log.Println("crawler stopped...")
}
func main() { log.Println("Merging new crawled docs back...") var nonStorePackage *regexp.Regexp if len(configs.NonStorePackageRegexps) > 0 { nonStorePackage = regexp.MustCompile( stringsp.FullJoin(configs.NonStorePackageRegexps, "(", ")|(", ")")) } fpDataRoot := sophie.LocalFsPath(configs.DataRoot.S()) fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB) outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated")) outDocsUpdated.Clean() var cntDeleted, cntUpdated, cntNew, cntUnchanged int64 job := mr.MrJob{ Source: []mr.Input{ kv.DirInput(fpDataRoot.Join(configs.FnDocs)), // 0 kv.DirInput(fpCrawler.Join(configs.FnNewDocs)), // 1 }, NewMapperF: func(src, part int) mr.Mapper { if src == 0 { // Mapper for docs return &mr.MapperStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewDocInfo, MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { pkg := key.(*sophie.RawString).String() di := val.(*gcse.DocInfo) act := gcse.NewDocAction{ Action: gcse.NDA_ORIGINAL, DocInfo: *di, } part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) return c.CollectTo(part, key, &act) }, } } // Mapper for new docs return &mr.MapperStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewNewDocAction, MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { pkg := string(*key.(*sophie.RawString)) part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) return c.CollectTo(part, key, val) }, } }, Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")), NewReducerF: func(part int) mr.Reducer { return &mr.ReducerStruct{ NewKeyF: sophie.NewRawString, NewValF: gcse.NewNewDocAction, ReduceF: func(key sophie.SophieWriter, nextVal mr.SophierIterator, c []sophie.Collector) error { if nonStorePackage != nil { pkg := string(*key.(*sophie.RawString)) if nonStorePackage.MatchString(pkg) { log.Printf("Ignoring non-store pkg: %s", pkg) return nil } } var act gcse.DocInfo isSet := false isUpdated := false hasOriginal := false for { val, err := nextVal() if errorsp.Cause(err) == io.EOF { break } if err != nil { return err } cur := val.(*gcse.NewDocAction) switch cur.Action { case gcse.NDA_DEL: // not collect out to delete it atomic.AddInt64(&cntDeleted, 1) return nil case gcse.NDA_ORIGINAL: hasOriginal = true } if !isSet { isSet = true act = cur.DocInfo } else { if cur.LastUpdated.After(act.LastUpdated) { isUpdated = true act = cur.DocInfo } } } if isSet { if isUpdated { atomic.AddInt64(&cntUpdated, 1) } else if hasOriginal { atomic.AddInt64(&cntUnchanged, 1) } else { atomic.AddInt64(&cntNew, 1) } return c[0].Collect(key, &act) } else { return nil } }, } }, Dest: []mr.Output{ outDocsUpdated, }, } if err := job.Run(); err != nil { log.Fatalf("job.Run failed: %v", err) } log.Printf("Deleted: %v", cntDeleted) log.Printf("Updated: %v", cntUpdated) log.Printf("New: %v", cntNew) log.Printf("Unchanged: %v", cntUnchanged) pDocs := configs.DataRoot.Join(configs.FnDocs) pUpdated := configs.DataRoot.Join("docs-updated") pTmp := configs.DataRoot.Join("docs-tmp") pTmp.RemoveAll() if err := pDocs.Rename(pTmp); err != nil { log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err) } if err := pUpdated.Rename(pDocs); err != nil { log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err) } log.Println("Merging success...") }