Example #1
0
// main generates the lists of packages and persons to crawl.
//
// It optionally refreshes the crawler DB first — touching packages with
// GitHub updates and/or importing the full godoc.org package list — and
// then dumps crawl entries for packages and persons into two kv
// directories under DataRoot/FnToCrawl.
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", gcse.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", gcse.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", gcse.CrawlByGodocApi)
	// Load CrawlerDB into the package-level cDB.
	cDB = gcse.LoadCrawlerDB()

	if gcse.CrawlGithubUpdate || gcse.CrawlByGodocApi {
		// pkgUTs maps package path -> update time, loaded from the docs DB;
		// used below to tell whether a godoc package is already known.
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(gcse.DocsDBPath.S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if gcse.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if gcse.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			for _, pkg := range pkgs {
				// The callback reports whether the package is already present
				// in the docs DB (pkgUTs).
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
			}
		}
		// Persist crawler DB changes made above.
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)

	// Dump package crawl entries, grouped by the host of each package.
	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(gcse.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	// Dump person crawl entries, grouped by the site part of the person id.
	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(gcse.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
Example #2
0
// doIndex builds a new index segment from the docs database.
//
// It indexes all documents into a freshly generated maximum segment,
// saves the tokenized index and a store snapshot into that segment, and
// finally marks the segment done. Returns true on success; on any fatal
// failure it logs the reason and returns false. A failed store-snapshot
// save is only logged and does not abort indexing.
func doIndex() bool {
	idxSegm, err := gcse.IndexSegments.GenMaxSegment()
	if err != nil {
		log.Printf("GenMaxSegment failed: %v", err)
		return false
	}

	// Reclaim as much memory as possible before the memory-hungry indexing.
	runtime.GC()
	gcse.DumpMemStats()

	log.Printf("Indexing to %v ...", idxSegm)

	fpDocDB := sophie.LocalFsPath(configs.DocsDBPath().S())
	ts, err := gcse.Index(kv.DirInput(fpDocDB), idxSegm.Join("").S())
	if err != nil {
		log.Printf("Indexing failed: %v", err)
		return false
	}

	// The closure scopes the deferred f.Close() so the index file is
	// closed before the GC/snapshot work below runs.
	if !func() bool {
		f, err := idxSegm.Join(gcse.IndexFn).Create()
		if err != nil {
			log.Printf("Create index file failed: %v", err)
			return false
		}
		defer f.Close()

		log.Printf("Saving index to %v ...", idxSegm)
		if err := ts.Save(f); err != nil {
			log.Printf("ts.Save failed: %v", err)
			return false
		}
		return true
	}() {
		return false
	}
	runtime.GC()
	gcse.DumpMemStats()

	storePath := idxSegm.Join(configs.FnStore)
	log.Printf("Saving store snapshot to %v", storePath)
	if err := store.SaveSnapshot(storePath.S()); err != nil {
		// Snapshot failure is deliberately non-fatal: only logged.
		log.Printf("SaveSnapshot %v failed: %v", storePath, err)
	}

	if err := idxSegm.Done(); err != nil {
		log.Printf("segm.Done failed: %v", err)
		return false
	}

	log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount())
	gcse.AddBiValueAndProcess(bi.Average, "index.doc-count", ts.DocCount())

	// Drop the index reference so its memory can be reclaimed by the GC.
	ts = nil
	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()

	return true
}
Example #3
0
// main counts the total number of entries in the docs-updated kv
// database by iterating over every partition and prints the totals.
func main() {
	//	path := "data/docs"
	path := "data/docs-updated"
	kvDir := kv.DirInput(sophie.LocalFsPath(path))

	cnt, err := kvDir.PartCount()
	if err != nil {
		log.Fatalf("kvDir.PartCount failed: %v", err)
	}

	totalEntries := 0
	for i := 0; i < cnt; i++ {
		it, err := kvDir.Iterator(i)
		if err != nil {
			// Fixed message: this is Iterator, not Collector.
			log.Fatalf("kvDir.Iterator(%d) failed: %v", i, err)
		}

		var key sophie.RawString
		var val gcse.DocInfo
		for {
			if err := it.Next(&key, &val); err != nil {
				if err == sophie.EOF {
					break
				}
				log.Fatalf("it.Next failed: %v", err)
			}
			totalEntries++
		}

		it.Close()
	}

	fmtp.Printfln("Total %d files, %d entries.", cnt, totalEntries)
}
Example #4
0
// dumpDocs prints DocInfo entries from the docs kv database.
//
// When keys is non-empty, only the named packages are printed and only
// the partitions that can contain them are scanned; otherwise every
// entry in every partition is dumped.
func dumpDocs(keys []string) {
	path := "data/docs"
	kvDir := kv.DirInput(sophie.LocalFsPath(path))
	cnt, err := kvDir.PartCount()
	if err != nil {
		log.Fatalf("kvDir.PartCount() failed: %v", err)
	}

	// Group requested keys by the partition they hash to, so partitions
	// containing none of them can be skipped entirely.
	parts := make(map[int]map[string]bool)
	for _, key := range keys {
		part := gcse.CalcPackagePartition(key, gcse.DOCS_PARTS)
		if parts[part] == nil {
			parts[part] = make(map[string]bool)
		}

		parts[part][key] = true
	}

	var key sophie.RawString
	var val gcse.DocInfo
	for part := 0; part < cnt; part++ {
		if len(keys) > 0 && parts[part] == nil {
			continue
		}

		it, err := kvDir.Iterator(part)
		if err != nil {
			// Fixed message: this is Iterator, not Collector.
			log.Fatalf("kvDir.Iterator(%d) failed: %v", part, err)
		}

		func() {
			// The deferred Close is the only close now; the original also
			// called it.Close() explicitly at the end of this closure,
			// closing the iterator twice.
			defer it.Close()

			for {
				if err := it.Next(&key, &val); err != nil {
					if err == sophie.EOF {
						break
					}
					log.Fatalf("it.Next failed %v", err)
				}
				pkg := key.String()
				if len(keys) > 0 && !parts[part][pkg] {
					continue
				}
				fmtp.Printfln("%v -> %+v", key, val)
			}
		}()
	}
}
Example #5
0
// doIndex builds a new index segment from the docs database.
//
// It indexes all documents into a freshly generated maximum segment,
// saves the tokenized index there, and marks the segment done. Returns
// true on success; on failure it logs the reason and returns false.
func doIndex() bool {
	idxSegm, err := gcse.IndexSegments.GenMaxSegment()
	if err != nil {
		log.Printf("GenMaxSegment failed: %v", err)
		return false
	}

	// Reclaim as much memory as possible before the memory-hungry indexing.
	runtime.GC()
	gcse.DumpMemStats()

	log.Printf("Indexing to %v ...", idxSegm)

	fpDocDB := sophie.LocalFsPath(gcse.DocsDBPath.S())

	ts, err := gcse.Index(kv.DirInput(fpDocDB))
	if err != nil {
		log.Printf("Indexing failed: %v", err)
		return false
	}

	f, err := idxSegm.Join(gcse.IndexFn).Create()
	if err != nil {
		log.Printf("Create index file failed: %v", err)
		return false
	}
	// Note: Close is done explicitly below rather than deferred, so the
	// file handle is released (and f nil-ed) before the GC call.
	log.Printf("Saving index to %v ...", idxSegm)
	if err := ts.Save(f); err != nil {
		log.Printf("ts.Save failed: %v", err)
		return false
	}
	f.Close()
	f = nil
	runtime.GC()
	gcse.DumpMemStats()

	if err := idxSegm.Done(); err != nil {
		log.Printf("segm.Done failed: %v", err)
		return false
	}

	log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount())

	// Drop the index reference so its memory can be reclaimed by the GC.
	ts = nil
	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()

	return true
}
Example #6
0
// main converts the legacy gob-based doc DB into the kv-file format.
//
// The conversion only runs when the old DB exists and the new one does
// not; each entry is copied through a kv Collector, echoing the first
// few pairs for a sanity check.
func main() {
	fmt.Println("Data conversion tool")
	fpRoot := sophie.LocalFsPath("./data")
	/*
	 * Doc db
	 */
	if DocDBPath.Exists() {
		// NOTE(review): the existence check uses gcse.DataRoot while the
		// destination is built from fpRoot ("./data") — confirm these refer
		// to the same directory.
		if DocDBPath.Join(gcse.KindDocDB+".gob").Exists() &&
			!gcse.DataRoot.Join(fnNewDocDB).Exists() {
			src := DocDBPath.Join(gcse.KindDocDB + ".gob")
			dst := fpRoot.Join(fnNewDocDB)
			fmt.Println("Convert", src, "to", dst, "...")

			srcDB := gcse.PackedDocDB{MemDB: gcse.NewMemDB(DocDBPath, gcse.KindDocDB)}
			if err := srcDB.Load(); err != nil {
				log.Fatalf("srcDB.Load: %v", err)
			}

			fpDocs := fpRoot.Join(fnNewDocDB)
			dstDB := kv.DirOutput(fpDocs)
			c, err := dstDB.Collector(0)
			if err != nil {
				log.Fatalf("dstDB.Collector: %v", err)
			}

			count := 0
			if err := srcDB.Iterate(func(key string, val interface{}) error {
				k := sophie.RawString(key)
				v := val.(gcse.DocInfo)

				// Echo the first few entries for a visual sanity check.
				if count < 10 {
					fmtp.Printfln("  key: %+v, value: %+v", k, v)
				}

				count++
				return c.Collect(k, &v)
			}); err != nil {
				fpDocs.Remove()
				log.Fatalf("srcDB.Iterate: %v", err)
			}
			// A failed Close can mean unflushed data; don't report success
			// if it fails (the original ignored this error).
			if err := c.Close(); err != nil {
				log.Fatalf("c.Close: %v", err)
			}

			fmtp.Printfln("Conversion success, %d entries collected.", count)
		}
	}
}
Example #7
0
// TestBasic round-trips a few key/value pairs through a kv file on the
// local file system: it writes them with a Writer, reads them back with
// a Reader, and checks every pair comes back unchanged.
func TestBasic(t *testing.T) {
	fn := sophie.LocalFsPath("./test.kv")
	defer villa.Path(fn.Path).Remove()

	keys := []sophie.String{
		"abc", "def",
	}
	vals := []sophie.VInt{
		2, 2013,
	}

	// Write all pairs out.
	writer, err := NewWriter(fn)
	assert.NoErrorf(t, "NewWriter: %v", err)

	for idx, k := range keys {
		assert.NoErrorf(t, "Collect: %v", writer.Collect(k, vals[idx]))
	}
	assert.NoErrorf(t, "writer.Close()", writer.Close())

	// Read them back and compare pair-by-pair.
	reader, err := NewReader(fn)
	assert.NoErrorf(t, "NewReader: %v", err)

	var gotKey sophie.String
	var gotVal sophie.VInt
	for idx := 0; ; idx++ {
		err := reader.Next(&gotKey, &gotVal)
		if err == sophie.EOF {
			break
		}
		assert.NoErrorf(t, "reader.Next: %v", err)
		assert.Equals(t, fmt.Sprintf("key[%d]", idx), gotKey, keys[idx])
		assert.Equals(t, fmt.Sprintf("val[%d]", idx), gotVal, vals[idx])
	}

	assert.NoErrorf(t, "reader.Close()", reader.Close())
}
Example #8
0
// main merges newly crawled docs back into the main docs database.
//
// A two-input map/reduce job combines the existing docs DB (each entry
// mapped to an NDA_UPDATE action) with the crawler's new-docs DB; the
// reducer keeps the newest DocInfo per package and drops any package
// marked NDA_DEL. The merged output then replaces the old docs
// directory via renames.
func main() {
	log.Println("Merging new crawled docs back...")

	fpDataRoot := sophie.LocalFsPath(gcse.DataRoot.S())

	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated"))
	outDocsUpdated.Clean()

	// Counters updated atomically by the (possibly concurrent) reducers.
	var cntDeleted, cntUpdated, cntNewUnchange int64

	job := mr.MrJob{
		Source: []mr.Input{
			kv.DirInput(fpDataRoot.Join(gcse.FnDocs)),   // 0
			kv.DirInput(fpCrawler.Join(gcse.FnNewDocs)), // 1
		},

		NewMapperF: func(src, part int) mr.Mapper {
			if src == 0 {
				// Mapper for the existing docs DB: wrap each DocInfo into an
				// NDA_UPDATE action and route it to its package partition.
				return &mr.MapperStruct{
					NewKeyF: sophie.NewRawString,
					NewValF: gcse.NewDocInfo,
					MapF: func(key, val sophie.SophieWriter,
						c mr.PartCollector) error {

						pkg := key.(*sophie.RawString).String()
						di := val.(*gcse.DocInfo)
						act := gcse.NewDocAction{
							Action:  gcse.NDA_UPDATE,
							DocInfo: *di,
						}

						part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
						return c.CollectTo(part, key, &act)
					},
				}
			}

			// Mapper for new docs: values are already NewDocActions, just
			// route them to the same package partition as above.
			return &mr.MapperStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: gcse.NewNewDocAction,
				MapF: func(key, val sophie.SophieWriter,
					c mr.PartCollector) error {

					pkg := string(*key.(*sophie.RawString))
					part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
					return c.CollectTo(part, key, val)
				},
			}
		},

		Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")),

		NewReducerF: func(part int) mr.Reducer {
			return &mr.ReducerStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: gcse.NewNewDocAction,
				ReduceF: func(key sophie.SophieWriter,
					nextVal mr.SophierIterator, c []sophie.Collector) error {

					// Scan all actions for this package, keeping the one with
					// the latest LastUpdated; a delete wins unconditionally.
					var act gcse.DocInfo
					isSet := false
					isUpdated := false
					for {
						val, err := nextVal()
						if err == sophie.EOF {
							break
						}
						if err != nil {
							return err
						}

						cur := val.(*gcse.NewDocAction)
						if cur.Action == gcse.NDA_DEL {
							// not collect out to delete it
							atomic.AddInt64(&cntDeleted, 1)
							return nil
						}
						if !isSet {
							isSet = true
							act = cur.DocInfo
						} else {
							if cur.LastUpdated.After(act.LastUpdated) {
								isUpdated = true
								act = cur.DocInfo
							}
						}
					}

					if isSet {
						if isUpdated {
							atomic.AddInt64(&cntUpdated, 1)
						} else {
							atomic.AddInt64(&cntNewUnchange, 1)
						}
						return c[0].Collect(key, &act)
					} else {
						return nil
					}
				},
			}
		},

		Dest: []mr.Output{
			outDocsUpdated,
		},
	}

	if err := job.Run(); err != nil {
		log.Fatalf("job.Run failed: %v", err)
	}

	log.Printf("Deleted: %v", cntDeleted)
	log.Printf("Updated: %v", cntUpdated)
	log.Printf("NewUnchange: %v", cntNewUnchange)

	// Swap the merged output into place: docs -> docs-tmp, then
	// docs-updated -> docs. The old DB survives in docs-tmp.
	pDocs := gcse.DataRoot.Join(gcse.FnDocs)
	pUpdated := gcse.DataRoot.Join("docs-updated")
	pTmp := gcse.DataRoot.Join("docs-tmp")

	pTmp.RemoveAll()
	if err := pDocs.Rename(pTmp); err != nil {
		log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err)
	}
	if err := pUpdated.Rename(pDocs); err != nil {
		log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err)
	}

	log.Println("Merging success...")
}
Example #9
0
// main generates the lists of packages and persons to crawl.
//
// It optionally refreshes the crawler DB first — touching packages with
// GitHub updates and/or importing the full godoc.org package list
// (recording a history event per package) — and then dumps crawl
// entries for packages and persons into two kv directories under
// DataRoot/FnToCrawl.
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", configs.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	// Load CrawlerDB into the package-level cDB.
	cDB = gcse.LoadCrawlerDB()

	if configs.CrawlGithubUpdate || configs.CrawlByGodocApi {
		// pkgUTs maps package path -> update time, loaded from the docs DB;
		// used below to tell whether a godoc package is already known.
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(configs.DocsDBPath().S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if configs.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if configs.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs))
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			now := time.Now()
			for _, pkg := range pkgs {
				// The callback reports whether the package is already present
				// in the docs DB (pkgUTs).
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
				// Record a "godoc" history event; failures here are logged
				// but do not abort the run.
				site, path := utils.SplitPackage(pkg)
				if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil {
					log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err)
				}
			}
		}
		// Persist crawler DB changes made above.
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)

	// Dump package crawl entries, grouped by the host of each package.
	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	// Dump person crawl entries, grouped by the site part of the person id.
	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
Example #10
0
// TestMRFromFile runs a word-count map/reduce job whose input and
// output are kv files on the local file system, then checks the
// resulting counts against a directly computed expectation.
func TestMRFromFile(t *testing.T) {
	fmt.Println("TestMRFromFile starts")
	fpRoot := sophie.LocalFsPath(".")

	mrin := fpRoot.Join("mrin")
	mrin.Mkdir(0755)

	mrtmp := fpRoot.Join("tmp")

	/*
	 * Prepare input: split WORDS into lines and write them into kv part
	 * files, starting a new part file every 3 lines.
	 */
	var inF *kv.Writer = nil
	index := 0
	lines := strings.Split(WORDS, "\n")
	for i, line := range lines {
		if i%3 == 0 {
			// Close the current part file (if any) and open the next one.
			if inF != nil {
				assert.NoErrorf(t, "inF.Close: %v", inF.Close())
				index++
			}
			var err error
			inF, err = kv.NewWriter(mrin.Join(fmt.Sprintf("part-%05d", index)))
			assert.NoErrorf(t, "NewKVWriter: %v", err)
		}

		assert.NoErrorf(t, "inF.Collect",
			inF.Collect(sophie.RawString(line), sophie.Null{}))
	}
	if inF != nil {
		assert.NoErrorf(t, "inF.Close: %v", inF.Close())
	}

	// Start from a clean output directory.
	mrout := fpRoot.Join("mrout")
	assert.NoErrorf(t, "Remove mrout: %v", mrout.Remove())

	/*
	 * MrJob: word-count mapper/reducer over the kv input, sorted via
	 * temporary files, writing to the kv output directory.
	 */
	var mapper WordCountMapper
	reducer := WordCountReducer{counts: make(map[string]int)}

	job := MrJob{
		Source: []Input{kv.DirInput(mrin)},
		NewMapperF: func(src, part int) Mapper {
			return &mapper
		},

		Sorter: NewFileSorter(mrtmp),

		NewReducerF: func(part int) Reducer {
			return &reducer
		},
		Dest: []Output{kv.DirOutput(mrout)},
	}

	assert.NoErrorf(t, "RunJob: %v", job.Run())

	/*
	 * Check result: read every output partition back into a map of
	 * word -> count.
	 */
	resIn := kv.DirInput(mrout)
	n, err := resIn.PartCount()
	assert.NoErrorf(t, "resIn.PartCount(): %v", err)
	var word sophie.RawString
	var cnt sophie.RawVInt
	actCnts := make(map[string]int)
	for i := 0; i < n; i++ {
		iter, err := resIn.Iterator(i)
		assert.NoErrorf(t, "resIn.Iterator: %v", err)
		for {
			err := iter.Next(&word, &cnt)
			if err == sophie.EOF {
				break
			}
			assert.NoErrorf(t, "iter.Next: %v", err)
			actCnts[string(word)] = int(cnt)
		}
	}

	// Compare against counts computed directly from WORDS.
	expCnts := statWords(WORDS)
	// fmt.Println(expCnts)
	// fmt.Println(actCnts)

	assertMapEquals(t, actCnts, expCnts)
	fmt.Println("TestMRFromFile ends")
}
Example #11
0
// main merges newly crawled docs back into the main docs database.
//
// A two-input map/reduce job combines the existing docs DB (each entry
// mapped to an NDA_ORIGINAL action) with the crawler's new-docs DB; the
// reducer keeps the newest DocInfo per package, drops packages marked
// NDA_DEL, skips packages matching NonStorePackageRegexps, and counts
// deleted/updated/new/unchanged outcomes. The merged output then
// replaces the old docs directory via renames.
func main() {
	log.Println("Merging new crawled docs back...")

	// Optional filter: packages matching any configured regexp are not
	// stored at all.
	var nonStorePackage *regexp.Regexp
	if len(configs.NonStorePackageRegexps) > 0 {
		nonStorePackage = regexp.MustCompile(
			stringsp.FullJoin(configs.NonStorePackageRegexps, "(", ")|(", ")"))
	}

	fpDataRoot := sophie.LocalFsPath(configs.DataRoot.S())

	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated"))
	outDocsUpdated.Clean()

	// Counters updated atomically by the (possibly concurrent) reducers.
	var cntDeleted, cntUpdated, cntNew, cntUnchanged int64

	job := mr.MrJob{
		Source: []mr.Input{
			kv.DirInput(fpDataRoot.Join(configs.FnDocs)),   // 0
			kv.DirInput(fpCrawler.Join(configs.FnNewDocs)), // 1
		},

		NewMapperF: func(src, part int) mr.Mapper {
			if src == 0 {
				// Mapper for docs: wrap each DocInfo into an NDA_ORIGINAL
				// action and route it to its package partition.
				return &mr.MapperStruct{
					NewKeyF: sophie.NewRawString,
					NewValF: gcse.NewDocInfo,
					MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error {
						pkg := key.(*sophie.RawString).String()
						di := val.(*gcse.DocInfo)
						act := gcse.NewDocAction{
							Action:  gcse.NDA_ORIGINAL,
							DocInfo: *di,
						}
						part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
						return c.CollectTo(part, key, &act)
					},
				}
			}
			// Mapper for new docs: values are already NewDocActions, just
			// route them to the same package partition as above.
			return &mr.MapperStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: gcse.NewNewDocAction,
				MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error {
					pkg := string(*key.(*sophie.RawString))
					part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
					return c.CollectTo(part, key, val)
				},
			}
		},

		Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")),

		NewReducerF: func(part int) mr.Reducer {
			return &mr.ReducerStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: gcse.NewNewDocAction,
				ReduceF: func(key sophie.SophieWriter,
					nextVal mr.SophierIterator, c []sophie.Collector) error {

					// Drop packages matching the non-store filter entirely.
					if nonStorePackage != nil {
						pkg := string(*key.(*sophie.RawString))
						if nonStorePackage.MatchString(pkg) {
							log.Printf("Ignoring non-store pkg: %s", pkg)
							return nil
						}
					}

					// Scan all actions for this package, keeping the one with
					// the latest LastUpdated; a delete wins unconditionally.
					// hasOriginal distinguishes "unchanged" from "new".
					var act gcse.DocInfo
					isSet := false
					isUpdated := false
					hasOriginal := false
					for {
						val, err := nextVal()
						if errorsp.Cause(err) == io.EOF {
							break
						}
						if err != nil {
							return err
						}

						cur := val.(*gcse.NewDocAction)
						switch cur.Action {
						case gcse.NDA_DEL:
							// not collect out to delete it
							atomic.AddInt64(&cntDeleted, 1)
							return nil

						case gcse.NDA_ORIGINAL:
							hasOriginal = true
						}

						if !isSet {
							isSet = true
							act = cur.DocInfo
						} else {
							if cur.LastUpdated.After(act.LastUpdated) {
								isUpdated = true
								act = cur.DocInfo
							}
						}
					}

					if isSet {
						if isUpdated {
							atomic.AddInt64(&cntUpdated, 1)
						} else if hasOriginal {
							atomic.AddInt64(&cntUnchanged, 1)
						} else {
							atomic.AddInt64(&cntNew, 1)
						}
						return c[0].Collect(key, &act)
					} else {
						return nil
					}
				},
			}
		},

		Dest: []mr.Output{
			outDocsUpdated,
		},
	}

	if err := job.Run(); err != nil {
		log.Fatalf("job.Run failed: %v", err)
	}

	log.Printf("Deleted:   %v", cntDeleted)
	log.Printf("Updated:   %v", cntUpdated)
	log.Printf("New:       %v", cntNew)
	log.Printf("Unchanged: %v", cntUnchanged)

	// Swap the merged output into place: docs -> docs-tmp, then
	// docs-updated -> docs. The old DB survives in docs-tmp.
	pDocs := configs.DataRoot.Join(configs.FnDocs)
	pUpdated := configs.DataRoot.Join("docs-updated")
	pTmp := configs.DataRoot.Join("docs-tmp")

	pTmp.RemoveAll()
	if err := pDocs.Rename(pTmp); err != nil {
		log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err)
	}
	if err := pUpdated.Rename(pDocs); err != nil {
		log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err)
	}

	log.Println("Merging success...")
}