func TestReduceValues(t *testing.T) {
	/*
	 * The source has two parts with nothing in each, but at each map-end a
	 * pair of <"part", <part>> is collected. The reducer then checks that
	 * the key "part" arrives with two distinct values to reduce.
	 */
	job := MrJob{
		Source: []Input{
			&InputStruct{
				PartCountF: func() (int, error) {
					return 2, nil
				},
			},
		},

		NewMapperF: func(src, part int) Mapper {
			return &MapperStruct{
				MapEndF: func(c PartCollector) error {
					return c.CollectTo(0, sophie.RawString("part"), sophie.VInt(part))
				},
			}
		},

		NewReducerF: func(part int) Reducer {
			st := make(map[sophie.VInt]bool)
			return &ReducerStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: sophie.NewVInt,
				ReduceF: func(key sophie.SophieWriter, nextVal SophierIterator,
					c []sophie.Collector) error {
					keyStr := string(*key.(*sophie.RawString))
					if keyStr != "part" {
						return errors.New(`Key should be "part"`)
					}
					for {
						val, err := nextVal()
						if err == sophie.EOF {
							break
						}
						if err != nil {
							return err
						}
						part := *val.(*sophie.VInt)
						if st[part] {
							t.Errorf("Duplicated value: %v", part)
						}
						st[part] = true
					}
					return nil
				},
			}
		},
	}
	assert.NoErrorf(t, "job.Run failed: %v", job.Run())
}
func (iter *linesIter) Next(key, val sophie.SophieReader) error {
	if iter.pos >= len(iter.lines) {
		return sophie.EOF
	}
	*(key.(*sophie.RawString)) = sophie.RawString(iter.lines[iter.pos])
	iter.pos++
	return nil
}
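// A minimal sketch of the linesIter type that Next above is defined on,
// inferred from its pos/lines field accesses; the real definition is not part
// of this listing. Next ignores val, which matches inputs whose values are
// sophie.Null (as in TestMRFromFile below).
type linesIter struct {
	lines []string
	pos   int
}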
func generateCrawlEntries(db *gcse.MemDB, hostFromID func(id string) string, out kv.DirOutput) error {
	now := time.Now()
	groups := make(map[string]sophie.CollectCloser)
	count := 0
	if err := db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			// ent is the zero value when the assertion fails, so log val.
			log.Printf("Wrong entry: %+v", val)
			return nil
		}
		if ent.Version == gcse.CrawlerVersion && ent.ScheduleTime.After(now) {
			return nil
		}
		host := hostFromID(id)

		// check host black list
		if gcse.NonCrawlHosts.In(host) {
			return nil
		}

		c, ok := groups[host]
		if !ok {
			index := len(groups)
			var err error
			c, err = out.Collector(index)
			if err != nil {
				return err
			}
			groups[host] = c
		}

		if rand.Intn(10) == 0 {
			// randomly set Etag to empty to fetch stars
			ent.Etag = ""
		}
		count++
		return c.Collect(sophie.RawString(id), &ent)
	}); err != nil {
		return err
	}

	for _, c := range groups {
		c.Close()
	}
	log.Printf("%d entries to crawl for folder %v", count, out.Path)
	return nil
}
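// A hedged sketch of a callback that could be passed as hostFromID above,
// assuming package IDs look like "github.com/user/repo" so the host is the
// segment before the first slash. hostOfID is a hypothetical name; the real
// callback is not part of this listing.
func hostOfID(id string) string {
	if i := strings.Index(id, "/"); i >= 0 {
		return id[:i]
	}
	return id
}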
func (wcm *WordCountMapper) Map(key, val sophie.SophieWriter, c PartCollector) error {
	//fmt.Printf("WordCountMapper (%v, %v) ...\n", key, val)
	line := *(key.(*sophie.RawString))
	words := strings.Split(string(line), " ")
	for _, word := range words {
		if len(word) == 0 {
			continue
		}
		word = strings.ToLower(word)
		//fmt.Printf("CollectTo %v\n", word)
		// partition by the first byte of the word
		if err := c.CollectTo(int(word[0]), sophie.RawString(word), sophie.RawVInt(1)); err != nil {
			return err
		}
		// c.CollectTo(0, RawString(word), RawVInt(1))
	}
	return nil
}
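// A hedged sketch of a word-count reducer built on ReducerStruct, mirroring
// the NewKeyF/NewValF/ReduceF shape used in TestReduceValues above: it sums
// the RawVInt(1) counts emitted by WordCountMapper.Map and collects one total
// per word. This is not the concrete WordCountReducer used by TestMRFromFile
// below; the NewValF closure type and passing the RawVInt total by value (as
// RawString is passed elsewhere in this listing) are assumptions.
func newWordCountReducer() Reducer {
	return &ReducerStruct{
		NewKeyF: sophie.NewRawString,
		NewValF: func() sophie.Sophier { return new(sophie.RawVInt) },
		ReduceF: func(key sophie.SophieWriter, nextVal SophierIterator,
			c []sophie.Collector) error {
			var sum sophie.RawVInt
			for {
				val, err := nextVal()
				if err == sophie.EOF {
					break
				}
				if err != nil {
					return err
				}
				sum += *val.(*sophie.RawVInt)
			}
			return c[0].Collect(key, sum)
		},
	}
}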
func main() {
	fmt.Println("Data conversion tool")

	fpRoot := sophie.LocalFsPath("./data")

	/*
	 * Doc db
	 */
	if DocDBPath.Exists() {
		if DocDBPath.Join(gcse.KindDocDB+".gob").Exists() && !gcse.DataRoot.Join(fnNewDocDB).Exists() {
			src := DocDBPath.Join(gcse.KindDocDB + ".gob")
			dst := fpRoot.Join(fnNewDocDB)
			fmt.Println("Convert", src, "to", dst, "...")

			srcDB := gcse.PackedDocDB{MemDB: gcse.NewMemDB(DocDBPath, gcse.KindDocDB)}
			if err := srcDB.Load(); err != nil {
				log.Fatalf("srcDB.Load: %v", err)
			}

			dstDB := kv.DirOutput(dst)
			c, err := dstDB.Collector(0)
			if err != nil {
				log.Fatalf("dstDB.Collector: %v", err)
			}

			count := 0
			if err := srcDB.Iterate(func(key string, val interface{}) error {
				k := sophie.RawString(key)
				v := val.(gcse.DocInfo)
				if count < 10 {
					fmtp.Printfln("  key: %+v, value: %+v", k, v)
				}
				count++
				return c.Collect(k, &v)
			}); err != nil {
				dst.Remove()
				log.Fatalf("srcDB.Iterate: %v", err)
			}
			c.Close()

			fmtp.Printfln("Conversion success, %d entries collected.", count)
		}
	}
}
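// A hedged sketch of verifying the converted output by reading it back,
// mirroring the kv.DirInput iteration pattern used by TestMRFromFile below.
// verifyDocs is a hypothetical helper; it assumes gcse.DocInfo can be read
// back through the same sophie interfaces it was collected with.
func verifyDocs(in kv.DirInput) error {
	n, err := in.PartCount()
	if err != nil {
		return err
	}
	var key sophie.RawString
	var val gcse.DocInfo
	for i := 0; i < n; i++ {
		it, err := in.Iterator(i)
		if err != nil {
			return err
		}
		for {
			if err := it.Next(&key, &val); err != nil {
				if err == sophie.EOF {
					break
				}
				return err
			}
		}
	}
	return nil
}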
func TestIndex(t *testing.T) {
	docs := []DocInfo{
		{
			Package: "github.com/daviddengcn/gcse",
			Name:    "gcse",
			TestImports: []string{
				"github.com/daviddengcn/go-villa",
				"github.com/daviddengcn/gcse",
			},
		},
		{
			Package: "github.com/daviddengcn/gcse/indexer",
			Name:    "main",
			Imports: []string{
				"github.com/daviddengcn/gcse",
				"github.com/daviddengcn/go-villa",
				"github.com/daviddengcn/gcse/indexer",
			},
		},
		{
			Package: "github.com/daviddengcn/go-villa",
			Name:    "villa",
		},
	}
	ts, err := Index(&mr.InputStruct{
		PartCountF: func() (int, error) {
			return 1, nil
		},
		IteratorF: func(int) (sophie.IterateCloser, error) {
			index := 0
			return &sophie.IterateCloserStruct{
				NextF: func(key, val sophie.SophieReader) error {
					if index >= len(docs) {
						return sophie.EOF
					}
					*key.(*sophie.RawString) = sophie.RawString(docs[index].Package)
					*val.(*DocInfo) = docs[index]
					val.(*DocInfo).Imports = append([]string{}, docs[index].Imports...)
					val.(*DocInfo).TestImports = append([]string{}, docs[index].TestImports...)

					index++
					return nil
				},
			}, nil
		},
	})
	if err != nil {
		t.Error(err)
		return
	}
	numDocs := ts.DocCount()
	assert.Equal(t, "DocCount", numDocs, 3)

	var pkgs []string
	if err := ts.Search(map[string]villa.StrSet{IndexTextField: nil},
		func(docID int32, data interface{}) error {
			hit := data.(HitInfo)
			pkgs = append(pkgs, hit.Package)
			return nil
		},
	); err != nil {
		t.Error(err)
		return
	}
	assert.StringEqual(t, "all", pkgs, []string{
		"github.com/daviddengcn/gcse",
		"github.com/daviddengcn/go-villa",
		"github.com/daviddengcn/gcse/indexer",
	})

	var gcseInfo HitInfo
	if err := ts.Search(map[string]villa.StrSet{
		IndexPkgField: villa.NewStrSet("github.com/daviddengcn/gcse"),
	}, func(docID int32, data interface{}) error {
		gcseInfo = data.(HitInfo)
		return nil
	}); err != nil {
		t.Errorf("ts.Search: %v", err)
		return
	}
	assert.StringEqual(t, "gcseInfo.Imported", gcseInfo.Imported,
		[]string{"github.com/daviddengcn/gcse/indexer"})
	assert.StringEqual(t, "gcseInfo.TestImports", gcseInfo.TestImports,
		[]string{"github.com/daviddengcn/go-villa"})

	var indexerInfo HitInfo
	if err := ts.Search(map[string]villa.StrSet{
		IndexPkgField: villa.NewStrSet("github.com/daviddengcn/gcse/indexer"),
	}, func(docID int32, data interface{}) error {
		indexerInfo = data.(HitInfo)
		return nil
	}); err != nil {
		t.Errorf("ts.Search: %v", err)
		return
	}
	assert.StringEqual(t, "indexerInfo.Imported", indexerInfo.Imported, []string{})
	assert.StringEqual(t, "indexerInfo.Imports", indexerInfo.Imports, []string{})

	if err := ts.Search(map[string]villa.StrSet{
		IndexPkgField: villa.NewStrSet("github.com/daviddengcn/go-villa"),
	}, func(docID int32, data interface{}) error {
		// the villa hit is stored in gcseInfo for the TestImported check below
		gcseInfo = data.(HitInfo)
		return nil
	}); err != nil {
		t.Errorf("ts.Search: %v", err)
		return
	}
	assert.StringEqual(t, "indexerInfo.Imported",
		fmt.Sprintf("%+v", indexerInfo.Imported), "[]")
	assert.StringEqual(t, "gcseInfo.TestImported", gcseInfo.TestImported,
		[]string{"github.com/daviddengcn/gcse"})
}
func TestIndex(t *testing.T) {
	const (
		package0 = "github.com/daviddengcn/gcse"
		package1 = "github.com/daviddengcn/gcse/indexer"
		package2 = "github.com/daviddengcn/go-villa"
	)
	docs := []DocInfo{
		{
			Package: package0,
			Name:    "gcse",
			TestImports: []string{
				package2,
				package0,
			},
		},
		{
			Package: package1,
			Name:    "main",
			Imports: []string{
				package0,
				package2,
				package1,
			},
		},
		{
			Package: package2,
			Name:    "villa",
		},
	}
	ts, err := Index(&mr.InputStruct{
		PartCountF: func() (int, error) {
			return 1, nil
		},
		IteratorF: func(int) (sophie.IterateCloser, error) {
			index := 0
			return &sophie.IterateCloserStruct{
				NextF: func(key, val sophie.SophieReader) error {
					if index >= len(docs) {
						return io.EOF
					}
					*key.(*sophie.RawString) = sophie.RawString(docs[index].Package)
					*val.(*DocInfo) = docs[index]
					val.(*DocInfo).Imports = append([]string{}, docs[index].Imports...)
					val.(*DocInfo).TestImports = append([]string{}, docs[index].TestImports...)

					index++
					return nil
				},
			}, nil
		},
	}, "./tmp")
	assert.NoErrorOrDie(t, err)

	hitsArr, err := index.OpenConstArray(path.Join("./tmp", HitsArrFn))
	assert.NoErrorOrDie(t, err)
	for _, doc := range docs {
		idx := -1
		ts.Search(index.SingleFieldQuery(IndexPkgField, doc.Package),
			func(docID int32, data interface{}) error {
				idx = int(docID)
				return nil
			})
		d, err := hitsArr.GetGob(idx)
		assert.NoError(t, err)
		assert.Equal(t, "d.Package", d.(HitInfo).Package, doc.Package)
	}
	numDocs := ts.DocCount()
	assert.Equal(t, "DocCount", numDocs, 3)

	var pkgs []string
	if err := ts.Search(map[string]stringsp.Set{IndexTextField: nil},
		func(docID int32, data interface{}) error {
			hit := data.(HitInfo)
			pkgs = append(pkgs, hit.Package)
			return nil
		},
	); err != nil {
		t.Error(err)
		return
	}
	assert.StringEqual(t, "all", pkgs, []string{
		"github.com/daviddengcn/gcse",
		"github.com/daviddengcn/go-villa",
		"github.com/daviddengcn/gcse/indexer",
	})

	var gcseInfo HitInfo
	if err := ts.Search(map[string]stringsp.Set{
		IndexPkgField: stringsp.NewSet("github.com/daviddengcn/gcse"),
	}, func(docID int32, data interface{}) error {
		gcseInfo = data.(HitInfo)
		return nil
	}); err != nil {
		t.Errorf("ts.Search: %v", err)
		return
	}
	assert.Equal(t, "gcseInfo.Imported", gcseInfo.Imported, []string(nil))
	assert.Equal(t, "gcseInfo.ImportedLen", gcseInfo.ImportedLen, 1)
	assert.Equal(t, "gcseInfo.TestImports", gcseInfo.TestImports,
		[]string{"github.com/daviddengcn/go-villa"})

	var indexerInfo HitInfo
	if err := ts.Search(map[string]stringsp.Set{
		IndexPkgField: stringsp.NewSet("github.com/daviddengcn/gcse/indexer"),
	}, func(docID int32, data interface{}) error {
		indexerInfo = data.(HitInfo)
		return nil
	}); err != nil {
		t.Errorf("ts.Search: %v", err)
		return
	}
	assert.StringEqual(t, "indexerInfo.Imported", indexerInfo.Imported, []string{})
	assert.StringEqual(t, "indexerInfo.Imports", indexerInfo.Imports, []string{})

	if err := ts.Search(map[string]stringsp.Set{
		IndexPkgField: stringsp.NewSet("github.com/daviddengcn/go-villa"),
	}, func(docID int32, data interface{}) error {
		// the villa hit is stored in gcseInfo for the TestImported checks below
		gcseInfo = data.(HitInfo)
		return nil
	}); err != nil {
		t.Errorf("ts.Search: %v", err)
		return
	}
	assert.Equal(t, "indexerInfo.Imported", indexerInfo.Imported, []string(nil))
	assert.Equal(t, "gcseInfo.TestImportedLen", gcseInfo.TestImportedLen, 1)
	assert.Equal(t, "gcseInfo.TestImported", gcseInfo.TestImported, []string(nil))
}
func generateCrawlEntries(db *gcse.MemDB, hostFromID func(id string) string, out kv.DirOutput) error {
	now := time.Now()
	type idAndCrawlingEntry struct {
		id  string
		ent *gcse.CrawlingEntry
	}
	groups := make(map[string][]idAndCrawlingEntry)
	count := 0
	type nameAndAges struct {
		maxName     string
		maxAge      time.Duration
		sumAgeHours float64
		cnt         int
	}
	ages := make(map[string]nameAndAges)
	if err := db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			// ent is the zero value when the assertion fails, so log val.
			log.Printf("Wrong entry: %+v", val)
			return nil
		}
		if ent.Version == gcse.CrawlerVersion && ent.ScheduleTime.After(now) {
			return nil
		}
		host := hostFromID(id)

		// check host black list
		if configs.NonCrawlHosts.Contain(host) {
			return nil
		}
		if rand.Intn(10) == 0 {
			// randomly set Etag to empty to fetch stars
			ent.Etag = ""
		}
		groups[host] = append(groups[host], idAndCrawlingEntry{id, &ent})

		age := now.Sub(ent.ScheduleTime)
		na := ages[host]
		if age > na.maxAge {
			na.maxName, na.maxAge = id, age
		}
		na.sumAgeHours += age.Hours()
		na.cnt++
		ages[host] = na

		count++
		return nil
	}); err != nil {
		return errorsp.WithStacks(err)
	}

	index := 0
	for _, g := range groups {
		sortp.SortF(len(g), func(i, j int) bool {
			return g[i].ent.ScheduleTime.Before(g[j].ent.ScheduleTime)
		}, func(i, j int) {
			g[i], g[j] = g[j], g[i]
		})
		if err := func(index int, ies []idAndCrawlingEntry) error {
			c, err := out.Collector(index)
			if err != nil {
				return err
			}
			defer c.Close()

			for _, ie := range ies {
				if err := c.Collect(sophie.RawString(ie.id), ie.ent); err != nil {
					return err
				}
			}
			return nil
		}(index, g); err != nil {
			log.Printf("Saving ents failed: %v", err)
		}
		index++
	}

	for host, na := range ages {
		aveAge := time.Duration(na.sumAgeHours / float64(na.cnt) * float64(time.Hour))
		log.Printf("%s age: max -> %v(%s), ave -> %v", host, na.maxAge, na.maxName, aveAge)
		if host == "github.com" && strings.Contains(out.Path, configs.FnPackage) {
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.hours", int(na.maxAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.days", int(na.maxAge/timep.Day))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.hours", int(aveAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.days", int(aveAge/timep.Day))
		}
	}
	log.Printf("%d entries to crawl for folder %v", count, out.Path)
	return nil
}
func TestMRFromFile(t *testing.T) {
	fmt.Println("TestMRFromFile starts")
	fpRoot := sophie.LocalFsPath(".")

	mrin := fpRoot.Join("mrin")
	mrin.Mkdir(0755)

	mrtmp := fpRoot.Join("tmp")

	/*
	 * Prepare input
	 */
	var inF *kv.Writer
	index := 0
	lines := strings.Split(WORDS, "\n")
	for i, line := range lines {
		if i%3 == 0 {
			if inF != nil {
				assert.NoErrorf(t, "inF.Close: %v", inF.Close())
				index++
			}
			var err error
			inF, err = kv.NewWriter(mrin.Join(fmt.Sprintf("part-%05d", index)))
			assert.NoErrorf(t, "NewKVWriter: %v", err)
		}
		assert.NoErrorf(t, "inF.Collect: %v",
			inF.Collect(sophie.RawString(line), sophie.Null{}))
	}
	if inF != nil {
		assert.NoErrorf(t, "inF.Close: %v", inF.Close())
	}

	mrout := fpRoot.Join("mrout")
	assert.NoErrorf(t, "Remove mrout: %v", mrout.Remove())

	/*
	 * MrJob
	 */
	var mapper WordCountMapper
	reducer := WordCountReducer{counts: make(map[string]int)}

	job := MrJob{
		Source: []Input{kv.DirInput(mrin)},
		NewMapperF: func(src, part int) Mapper {
			return &mapper
		},

		Sorter: NewFileSorter(mrtmp),

		NewReducerF: func(part int) Reducer {
			return &reducer
		},
		Dest: []Output{kv.DirOutput(mrout)},
	}

	assert.NoErrorf(t, "RunJob: %v", job.Run())

	/*
	 * Check result
	 */
	resIn := kv.DirInput(mrout)
	n, err := resIn.PartCount()
	assert.NoErrorf(t, "resIn.PartCount(): %v", err)
	var word sophie.RawString
	var cnt sophie.RawVInt
	actCnts := make(map[string]int)
	for i := 0; i < n; i++ {
		iter, err := resIn.Iterator(i)
		assert.NoErrorf(t, "resIn.Iterator: %v", err)
		for {
			err := iter.Next(&word, &cnt)
			if err == sophie.EOF {
				break
			}
			assert.NoErrorf(t, "iter.Next: %v", err)
			actCnts[string(word)] = int(cnt)
		}
	}

	expCnts := statWords(WORDS)
	// fmt.Println(expCnts)
	// fmt.Println(actCnts)

	assertMapEquals(t, actCnts, expCnts)
	fmt.Println("TestMRFromFile ends")
}
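// Minimal sketches of the statWords and assertMapEquals helpers the test
// above relies on but which are not part of this listing. statWords tallies
// words the same way WordCountMapper does (split lines on single spaces,
// lowercase, skip empties); both implementations are assumptions about
// helpers defined elsewhere.
func statWords(text string) map[string]int {
	counts := make(map[string]int)
	for _, line := range strings.Split(text, "\n") {
		for _, word := range strings.Split(line, " ") {
			if word == "" {
				continue
			}
			counts[strings.ToLower(word)]++
		}
	}
	return counts
}

func assertMapEquals(t *testing.T, act, exp map[string]int) {
	assert.Equal(t, "len", len(act), len(exp))
	for k, v := range exp {
		assert.Equal(t, k, act[k], v)
	}
}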
// OnlyMapper.Map
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector) error {
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
		return mr.EOM
	}

	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// if gcse.CrawlerVersion is larger than Version, Etag is ignored.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s\n", pc.part, pkg, ent.Etag)

	p, flds, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
	for _, fld := range flds {
		if spider.LikeGoSubFolder(fld.Name) {
			appendNewPackage(pkg+"/"+fld.Path, "parent")
		}
	}
	site, path := utils.SplitPackage(pkg)
	if err != nil && errorsp.Cause(err) != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Invalid),
				"AppendPackageEvent %v %v failed", site, path)
			bi.AddValue(bi.Sum, "crawler.package.wrong-package", 1)
			// a wrong path
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Remove wrong package %s", pc.part, pkg)
		} else {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Failed),
				"AppendPackageEvent %v %v failed", site, path)
			bi.Inc("crawler.package.failed")
			if strings.HasPrefix(pkg, "github.com/") {
				bi.Inc("crawler.package.failed.github")
			}
			pc.failCount++
			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)
			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
					return mr.EOM
				}
				log.Printf("[Part %d] Last ten crawling packages failed, sleep for a while...(current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}

	utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Success),
		"AppendPackageEvent %v %v failed", site, path)
	pc.failCount = 0
	if errorsp.Cause(err) == gcse.ErrPackageNotModifed {
		// TODO crawling stars for unchanged project
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		bi.AddValue(bi.Sum, "crawler.package.not-modified", 1)
		return nil
	}
	bi.AddValue(bi.Sum, "crawler.package.success", 1)
	if strings.HasPrefix(pkg, "github.com/") {
		bi.AddValue(bi.Sum, "crawler.package.success.github", 1)
	}
	log.Printf("[Part %d] Crawled package %s success!", pc.part, pkg)

	var pkgInfo *stpb.PackageInfo
	if err := store.UpdatePackage(site, path, func(pi *stpb.PackageInfo) error {
		fillPackageInfo(p, pi)
		pkgInfo = pi
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %v %v failed: %v", site, path, err)
	}
	saveRelatedInfo(pkgInfo)

	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	if !strings.HasPrefix(pkg, "github.com/") {
		// github.com throttling is done within the GithubSpider.
		time.Sleep(10 * time.Second)
	}
	return nil
}
// OnlyMapper.Map
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector) error {
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
		return mr.EOM
	}

	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// if gcse.CrawlerVersion is larger than Version, Etag is ignored.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s\n", pc.part, pkg, ent.Etag)

	p, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
	if err != nil && err != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			// a wrong path
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Remove wrong package %s", pc.part, pkg)
		} else {
			pc.failCount++
			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)
			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
					return mr.EOM
				}
				log.Printf("[Part %d] Last ten crawling packages failed, sleep for a while...(current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}

	pc.failCount = 0
	if err == gcse.ErrPackageNotModifed {
		// TODO crawling stars for unchanged project
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		return nil
	}

	log.Printf("[Part %d] Crawled package %s success!", pc.part, pkg)

	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	time.Sleep(10 * time.Second)
	return nil
}