Ejemplo n.º 1
0
// main is the crawler entry point.
//
// It records a stop time 30 minutes from now in AppStopTime, opens the doc,
// package and person databases, and synchronizes them once up front. It then
// starts the status-dumping and sync loops in the background and runs the
// entry crawling loop in the foreground. Once crawling returns, the doc DB is
// dumped and main waits for the sync loop before exiting.
func main() {
	log.Println("crawler started...")

	// Deadline read elsewhere in the program; main only sets it.
	const runFor = 30 * time.Minute
	AppStopTime = time.Now().Add(runFor)

	oldDocDBPath := gcse.DataRoot.Join(fnOldDocDB)
	docDB = loadDocDB(oldDocDBPath, DocDBPath)

	cPackageDB = gcse.NewMemDB(CrawlerDBPath, kindPackage)
	cPersonDB = gcse.NewMemDB(CrawlerDBPath, kindPerson)

	syncDatabases()

	// Background workers: the status loop is fire-and-forget, the sync loop is
	// tracked so main can wait for it before exiting.
	go dumpingStatusLoop()

	syncDone := new(sync.WaitGroup)
	syncDone.Add(1)
	go syncLoop(syncDone)

	crawlEntriesLoop()

	// Dump docDB. Failures here are logged but do not abort the shutdown.
	clearErr := gcse.DBOutSegments.ClearUndones()
	if clearErr != nil {
		log.Printf("DBOutSegments.ClearUndones failed: %v", clearErr)
	}

	dumpErr := dumpDB()
	if dumpErr != nil {
		log.Printf("dumpDB failed: %v", dumpErr)
	}

	syncDone.Wait()
	log.Println("crawler stopped...")
}
Ejemplo n.º 2
0
// loadDocDB opens the current doc DB at docDBPath and merges into it every
// entry of the old doc DB at oldDocDBPath whose key is not yet present in the
// current one. Entries that already exist in the current DB are left as is.
//
// The merged DB is returned. The number of old entries visited and the number
// actually copied are logged. An iteration failure, or an old entry whose
// value is not a gcse.DocInfo, terminates the process via log.Fatalf.
func loadDocDB(oldDocDBPath, docDBPath villa.Path) (docDB gcse.PackedDocDB) {
	log.Printf("loadDocDB: old from %v, current from %v", oldDocDBPath, docDBPath)
	oldDocDB := gcse.NewMemDB(oldDocDBPath, gcse.KindDocDB)
	// Keyed field so the literal stays valid if PackedDocDB ever gains fields
	// (unkeyed literals of imported structs are flagged by go vet).
	docDB = gcse.PackedDocDB{MemDB: gcse.NewMemDB(docDBPath, gcse.KindDocDB)}
	all, put := 0, 0
	if err := oldDocDB.Iterate(func(pkg string, data interface{}) error {
		all++
		var existing gcse.DocInfo
		if docDB.Get(pkg, &existing) {
			// The current DB wins over the old one.
			return nil
		}

		info, ok := data.(gcse.DocInfo)
		if !ok {
			// Fail with the offending key and type instead of a bare
			// type-assertion panic.
			log.Fatalf("oldDocDB.Iterate failed: unexpected value type %T for %q", data, pkg)
		}

		docDB.Put(pkg, info)
		put++
		return nil
	}); err != nil {
		log.Fatalf("oldDocDB.Iterate failed: %v", err)
	}

	log.Printf("All %d entries in old DocDB, %d put!", all, put)

	// Drop the only reference to the old DB and collect it right away so it
	// does not stay in memory next to the merged DB.
	oldDocDB = nil
	runtime.GC()

	return docDB
}
Ejemplo n.º 3
0
// doIndex builds a new index segment from the doc DB stored in dbSegm.
//
// It obtains a fresh segment from gcse.IndexSegments.GenMaxSegment, opens the
// doc DB found in dbSegm, indexes it with gcse.Index and saves the result to
// the gcse.IndexFn file inside the new segment. The index file is closed, and
// the close result checked, before the segment is marked as done, so a
// segment is never published with an unflushed or half-written index file.
// After a successful run dbSegm is removed; a removal failure is only logged.
//
// It returns true on success. Every failure is logged and reported as false.
func doIndex(dbSegm gcse.Segment) bool {
	idxSegm, err := gcse.IndexSegments.GenMaxSegment()
	if err != nil {
		log.Printf("GenMaxSegment failed: %v", err)
		return false
	}

	runtime.GC()
	gcse.DumpMemStats()
	log.Printf("Reading docDB from %v ...", dbSegm)
	// read docDB
	docDB := gcse.PackedDocDB{MemDB: gcse.NewMemDB(dbSegm.Join(""), gcse.KindDocDB)}

	log.Printf("Indexing to %v ...", idxSegm)

	ts, err := gcse.Index(docDB)
	if err != nil {
		log.Printf("Indexing failed: %v", err)
		return false
	}

	f, err := idxSegm.Join(gcse.IndexFn).Create()
	if err != nil {
		log.Printf("Create index file failed: %v", err)
		return false
	}
	if err := ts.Save(f); err != nil {
		f.Close() // best effort; the Save error is the one worth reporting
		log.Printf("ts.Save failed: %v", err)
		return false
	}
	// Close explicitly rather than with defer: a failed close can mean lost
	// data, and it must be detected before the segment is marked done and the
	// source segment is deleted.
	if err := f.Close(); err != nil {
		log.Printf("Close index file failed: %v", err)
		return false
	}

	if err := idxSegm.Done(); err != nil {
		log.Printf("segm.Done failed: %v", err)
		return false
	}

	log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount())

	// Release the DB and the index so the GC below can reclaim them; memory
	// stats are dumped before and after to show the effect.
	docDB.MemDB, ts = nil, nil
	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()

	if err := dbSegm.Remove(); err != nil {
		log.Printf("Delete segment %v failed: %v", dbSegm, err)
	}

	return true
}
Ejemplo n.º 4
0
// main converts the gob-encoded doc DB under DocDBPath into a kv directory
// output under ./data/<fnNewDocDB>.
//
// The conversion runs only when the source .gob file exists and the target
// does not exist yet. Every entry of the source DB is collected into the
// output; the first ten entries are also printed. If loading, iterating,
// collecting or closing the output fails, the process exits via log.Fatalf;
// for failures after the output has been opened, the partial output is removed
// first so a later run does not mistake it for a finished conversion.
func main() {
	fmt.Println("Data conversion tool")
	fpRoot := sophie.LocalFsPath("./data")
	/*
	 * Doc db
	 */
	if DocDBPath.Exists() {
		// NOTE(review): the existence check uses gcse.DataRoot while the output
		// is written under fpRoot ("./data") - confirm both refer to the same
		// directory.
		if DocDBPath.Join(gcse.KindDocDB+".gob").Exists() &&
			!gcse.DataRoot.Join(fnNewDocDB).Exists() {
			src := DocDBPath.Join(gcse.KindDocDB + ".gob")
			dst := fpRoot.Join(fnNewDocDB)
			fmt.Println("Convert", src, "to", dst, "...")

			srcDB := gcse.PackedDocDB{MemDB: gcse.NewMemDB(DocDBPath, gcse.KindDocDB)}
			if err := srcDB.Load(); err != nil {
				log.Fatalf("srcDB.Load: %v", err)
			}

			dstDB := kv.DirOutput(dst)
			c, err := dstDB.Collector(0)
			if err != nil {
				log.Fatalf("dstDB.Collector: %v", err)
			}

			count := 0
			if err := srcDB.Iterate(func(key string, val interface{}) error {
				k := sophie.RawString(key)
				v, ok := val.(gcse.DocInfo)
				if !ok {
					// Report a bad entry through the normal failure path
					// instead of panicking on the type assertion.
					return fmt.Errorf("unexpected value type %T for key %q", val, key)
				}

				// Print a small sample of what is being converted.
				if count < 10 {
					fmtp.Printfln("  key: %+v, value: %+v", k, v)
				}

				count++
				return c.Collect(k, &v)
			}); err != nil {
				dst.Remove() // best-effort cleanup; the process exits right after
				log.Fatalf("srcDB.Iterate: %v", err)
			}
			// A failed Close may mean the collected data was not fully written,
			// so it must not be reported as a successful conversion.
			if err := c.Close(); err != nil {
				dst.Remove() // best-effort cleanup; the process exits right after
				log.Fatalf("c.Close: %v", err)
			}

			fmtp.Printfln("Conversion sucess, %d entries collected.", count)
		}
	}
}
Ejemplo n.º 5
0
// main walks every entry of the doc DB and gathers statistics on how many
// documents carry a readme and for how many of those
// gcse.ChooseImportantSentenses picks at least one sentence.
//
// Documents that have a readme but yield no sentence are written (name,
// package, readme file name and the readme text) to exps/notfound.txt for
// inspection. The totals are logged at the end.
func main() {
	docDB := gcse.NewMemDB(DocDBPath, gcse.KindDocDB)

	var (
		nDocs, nReadme, nWithSents int
		nSents                     int
	)

	out, err := villa.Path("exps/notfound.txt").Create()
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	log.Printf("Start processing ...")

	// visit updates the counters for one entry and records readmes that
	// produced no sentences.
	visit := func(key string, val interface{}) error {
		nDocs++

		doc := val.(gcse.DocInfo)
		if doc.ReadmeData == "" {
			return nil
		}
		nReadme++

		text := gcse.ReadmeToText(doc.ReadmeFn, doc.ReadmeData)
		picked := gcse.ChooseImportantSentenses(text, doc.Name, doc.Package)
		if n := len(picked); n > 0 {
			nSents += n
			nWithSents++
			return nil
		}

		// Nothing was picked: dump the readme so it can be examined by hand.
		fmt.Fprintln(out, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
		fmt.Fprintf(out, "%s - %s - %s\n", doc.Name, doc.Package, doc.ReadmeFn)
		fmt.Fprintf(out, "%s\n", text)
		return nil
	}

	if err := docDB.Iterate(visit); err != nil {
		log.Fatalf("docDB.Iterate failed: %v", err)
	}

	log.Printf("%d documents processed.", nDocs)
	log.Printf("%d have readme.", nReadme)
	log.Printf("%d found %d important sentenses.", nWithSents, nSents)
}