func main() { log.Println("crawler started...") AppStopTime = time.Now().Add(30 * time.Minute) docDB = loadDocDB(gcse.DataRoot.Join(fnOldDocDB), DocDBPath) cPackageDB = gcse.NewMemDB(CrawlerDBPath, kindPackage) cPersonDB = gcse.NewMemDB(CrawlerDBPath, kindPerson) syncDatabases() go dumpingStatusLoop() var wg sync.WaitGroup wg.Add(1) go syncLoop(&wg) crawlEntriesLoop() // dump docDB if err := gcse.DBOutSegments.ClearUndones(); err != nil { log.Printf("DBOutSegments.ClearUndones failed: %v", err) } if err := dumpDB(); err != nil { log.Printf("dumpDB failed: %v", err) } wg.Wait() log.Println("crawler stopped...") }
func loadDocDB(oldDocDBPath, docDBPath villa.Path) (docDB gcse.PackedDocDB) { log.Printf("loadDocDB: old from %v, current from %v", oldDocDBPath, docDBPath) oldDocDB := gcse.NewMemDB(oldDocDBPath, gcse.KindDocDB) docDB = gcse.PackedDocDB{gcse.NewMemDB(docDBPath, gcse.KindDocDB)} all, put := 0, 0 if err := oldDocDB.Iterate(func(pkg string, data interface{}) error { all++ var info gcse.DocInfo if docDB.Get(pkg, &info) { return nil } docDB.Put(pkg, data.(gcse.DocInfo)) put++ return nil }); err != nil { log.Fatalf("oldDocDB.Iterate failed: %v", err) } log.Printf("All %d entries in old DocDB, %d put!", all, put) oldDocDB = nil runtime.GC() return docDB }
func doIndex(dbSegm gcse.Segment) bool { idxSegm, err := gcse.IndexSegments.GenMaxSegment() if err != nil { log.Printf("GenMaxSegment failed: %v", err) return false } runtime.GC() gcse.DumpMemStats() log.Printf("Reading docDB from %v ...", dbSegm) // read docDB docDB := gcse.PackedDocDB{gcse.NewMemDB(dbSegm.Join(""), gcse.KindDocDB)} log.Printf("Indexing to %v ...", idxSegm) ts, err := gcse.Index(docDB) if err != nil { log.Printf("Indexing failed: %v", err) return false } f, err := idxSegm.Join(gcse.IndexFn).Create() if err != nil { log.Printf("Create index file failed: %v", err) return false } defer f.Close() if err := ts.Save(f); err != nil { log.Printf("ts.Save failed: %v", err) return false } if err := idxSegm.Done(); err != nil { log.Printf("segm.Done failed: %v", err) return false } log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount()) docDB.MemDB, ts = nil, nil gcse.DumpMemStats() runtime.GC() gcse.DumpMemStats() if err := dbSegm.Remove(); err != nil { log.Printf("Delete segment %v failed: %v", dbSegm, err) } return true }
func main() { fmt.Println("Data conversion tool") fpRoot := sophie.LocalFsPath("./data") /* * Doc db */ if DocDBPath.Exists() { if DocDBPath.Join(gcse.KindDocDB+".gob").Exists() && !gcse.DataRoot.Join(fnNewDocDB).Exists() { src := DocDBPath.Join(gcse.KindDocDB + ".gob") dst := fpRoot.Join(fnNewDocDB) fmt.Println("Convert", src, "to", dst, "...") srcDB := gcse.PackedDocDB{MemDB: gcse.NewMemDB(DocDBPath, gcse.KindDocDB)} if err := srcDB.Load(); err != nil { log.Fatalf("srcDB.Load: %v", err) } fpDocs := fpRoot.Join(fnNewDocDB) dstDB := kv.DirOutput(fpDocs) c, err := dstDB.Collector(0) if err != nil { log.Fatalf("dstDB.Collector: %v", err) } count := 0 if err := srcDB.Iterate(func(key string, val interface{}) error { k := sophie.RawString(key) v := val.(gcse.DocInfo) if count < 10 { fmtp.Printfln(" key: %+v, value: %+v", k, v) } count++ return c.Collect(k, &v) }); err != nil { fpDocs.Remove() log.Fatalf("srcDB.Iterate: %v", err) } c.Close() fmtp.Printfln("Conversion sucess, %d entries collected.", count) } } }
func main() { docDB := gcse.NewMemDB(DocDBPath, gcse.KindDocDB) countAll, countReadme, countHasSents := 0, 0, 0 countSents := 0 f, err := villa.Path("exps/notfound.txt").Create() if err != nil { log.Fatal(err) } defer f.Close() log.Printf("Start processing ...") if err := docDB.Iterate(func(key string, val interface{}) error { countAll++ d := val.(gcse.DocInfo) if d.ReadmeData != "" { countReadme++ readme := gcse.ReadmeToText(d.ReadmeFn, d.ReadmeData) sents := gcse.ChooseImportantSentenses(readme, d.Name, d.Package) if len(sents) > 0 { countSents += len(sents) countHasSents++ } else { fmt.Fprintln(f, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$") fmt.Fprintf(f, "%s - %s - %s\n", d.Name, d.Package, d.ReadmeFn) fmt.Fprintf(f, "%s\n", readme) } } return nil }); err != nil { log.Fatalf("docDB.Iterate failed: %v", err) } log.Printf("%d documents processed.", countAll) log.Printf("%d have readme.", countReadme) log.Printf("%d found %d important sentenses.", countHasSents, countSents) }