Example #1
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", gcse.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", gcse.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", gcse.CrawlByGodocApi)
	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	if gcse.CrawlGithubUpdate || gcse.CrawlByGodocApi {
		// load package update times (pkgUTs)
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(gcse.DocsDBPath.S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if gcse.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if gcse.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			for _, pkg := range pkgs {
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(gcse.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(gcse.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
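The cDB.AppendPackage calls above pass a membership callback so the crawler DB can tell whether a package's docs are already known without depending on pkgUTs directly. A minimal, self-contained sketch of that callback pattern, with a hypothetical crawlerDB standing in for gcse.CrawlerDB (the real AppendPackage may schedule known packages differently rather than skip them):

package main

import (
	"fmt"
	"time"
)

// crawlerDB is a hypothetical stand-in for gcse.CrawlerDB.
type crawlerDB struct {
	packages map[string]bool
}

// AppendPackage records pkg; inDocs reports whether the package already
// has crawled docs, keeping that knowledge out of the DB itself.
func (db *crawlerDB) AppendPackage(pkg string, inDocs func(string) bool) {
	if inDocs(pkg) {
		return // assumed semantics: already known, nothing to add
	}
	db.packages[pkg] = true
}

func main() {
	pkgUTs := map[string]time.Time{"github.com/a/b": time.Now()}
	db := &crawlerDB{packages: make(map[string]bool)}
	for _, pkg := range []string{"github.com/a/b", "github.com/c/d"} {
		db.AppendPackage(pkg, func(p string) bool {
			_, ok := pkgUTs[p]
			return ok
		})
	}
	fmt.Println(db.packages) // map[github.com/c/d:true]
}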
Example #2
// crawlPackages crawls the packages listed under fpToCrawlPkg, writing new
// docs to fpOutNewDocs and sending the result (or a timeout error) back on end.
func crawlPackages(httpClient doc.HttpClient, fpToCrawlPkg,
	fpOutNewDocs sophie.FsPath, end chan error) {

	time.AfterFunc(configs.CrawlerDuePerRun+time.Minute*10, func() {
		end <- errorsp.NewWithStacks("Crawling packages timeout!")
	})
	end <- func() error {
		outNewDocs := kv.DirOutput(fpOutNewDocs)
		outNewDocs.Clean()
		job := mr.MapOnlyJob{
			Source: []mr.Input{
				kv.DirInput(fpToCrawlPkg),
			},
			NewMapperF: func(src, part int) mr.OnlyMapper {
				return &PackageCrawler{
					part:       part,
					httpClient: httpClient,
				}
			},
			Dest: []mr.Output{
				outNewDocs,
			},
		}
		if err := job.Run(); err != nil {
			log.Printf("crawlPackages: job.Run failed: %v", err)
			return err
		}
		return nil
	}()
}
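crawlPackages races a timer against the real work for the end channel, and the caller acts on whichever error arrives first. A self-contained sketch of the same pattern; runWithDeadline and runJob are illustrative names, not part of the gcse API. The sketch buffers the channel so whichever sender loses the race does not block forever; the original's channel discipline depends on its caller.

package main

import (
	"errors"
	"fmt"
	"time"
)

// runWithDeadline returns runJob's error, or a timeout error if the
// deadline fires first.
func runWithDeadline(due time.Duration, runJob func() error) error {
	end := make(chan error, 2) // buffered: the losing send must not block
	time.AfterFunc(due, func() {
		end <- errors.New("job timeout")
	})
	go func() {
		end <- runJob()
	}()
	return <-end // first result wins
}

func main() {
	err := runWithDeadline(50*time.Millisecond, func() error {
		time.Sleep(time.Second) // simulate a slow crawl
		return nil
	})
	fmt.Println(err) // job timeout
}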
Example #3
func main() {
	fmt.Println("Data conversion tool")
	fpRoot := sophie.LocalFsPath("./data")
	/*
	 * Doc db
	 */
	if DocDBPath.Exists() {
		if DocDBPath.Join(gcse.KindDocDB+".gob").Exists() &&
			!gcse.DataRoot.Join(fnNewDocDB).Exists() {
			src := DocDBPath.Join(gcse.KindDocDB + ".gob")
			dst := fpRoot.Join(fnNewDocDB)
			fmt.Println("Convert", src, "to", dst, "...")

			srcDB := gcse.PackedDocDB{MemDB: gcse.NewMemDB(DocDBPath, gcse.KindDocDB)}
			if err := srcDB.Load(); err != nil {
				log.Fatalf("srcDB.Load: %v", err)
			}

			fpDocs := fpRoot.Join(fnNewDocDB)
			dstDB := kv.DirOutput(fpDocs)
			c, err := dstDB.Collector(0)
			if err != nil {
				log.Fatalf("dstDB.Collector: %v", err)
			}

			count := 0
			if err := srcDB.Iterate(func(key string, val interface{}) error {
				k := sophie.RawString(key)
				v := val.(gcse.DocInfo)

				if count < 10 {
					fmtp.Printfln("  key: %+v, value: %+v", k, v)
				}

				count++
				return c.Collect(k, &v)
			}); err != nil {
				fpDocs.Remove()
				log.Fatalf("srcDB.Iterate: %v", err)
			}
			if err := c.Close(); err != nil {
				log.Fatalf("c.Close: %v", err)
			}

			fmtp.Printfln("Conversion sucess, %d entries collected.", count)
		}
	}
}
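The loop above streams entries from the source DB into a collector and removes the partial output (fpDocs.Remove()) when iteration fails midway. A minimal sketch of the same convert-with-rollback pattern, using encoding/gob and a plain map as stand-ins for the gcse MemDB and sophie kv formats:

package main

import (
	"encoding/gob"
	"log"
	"os"
)

// convert streams src into a gob file at dstPath, deleting the partial
// file if any entry fails to encode.
func convert(src map[string]string, dstPath string) error {
	f, err := os.Create(dstPath)
	if err != nil {
		return err
	}
	enc := gob.NewEncoder(f)
	for k, v := range src {
		if err := enc.Encode([2]string{k, v}); err != nil {
			f.Close()
			os.Remove(dstPath) // roll back the partial output
			return err
		}
	}
	return f.Close()
}

func main() {
	src := map[string]string{"pkg/a": "doc a", "pkg/b": "doc b"}
	if err := convert(src, "docs.gob"); err != nil {
		log.Fatalf("convert: %v", err)
	}
	log.Printf("Conversion success, %d entries collected.", len(src))
}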
Example #4
func main() {
	log.Println("Merging new crawled docs back...")

	fpDataRoot := sophie.LocalFsPath(gcse.DataRoot.S())

	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated"))
	outDocsUpdated.Clean()

	var cntDeleted, cntUpdated, cntNewUnchange int64

	job := mr.MrJob{
		Source: []mr.Input{
			kv.DirInput(fpDataRoot.Join(gcse.FnDocs)),   // 0
			kv.DirInput(fpCrawler.Join(gcse.FnNewDocs)), // 1
		},

		NewMapperF: func(src, part int) mr.Mapper {
			if src == 0 {
				return &mr.MapperStruct{
					NewKeyF: sophie.NewRawString,
					NewValF: gcse.NewDocInfo,
					MapF: func(key, val sophie.SophieWriter,
						c mr.PartCollector) error {

						pkg := key.(*sophie.RawString).String()
						di := val.(*gcse.DocInfo)
						act := gcse.NewDocAction{
							Action:  gcse.NDA_UPDATE,
							DocInfo: *di,
						}

						part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
						return c.CollectTo(part, key, &act)
					},
				}
			}

			return &mr.MapperStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: gcse.NewNewDocAction,
				MapF: func(key, val sophie.SophieWriter,
					c mr.PartCollector) error {

					pkg := string(*key.(*sophie.RawString))
					part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
					return c.CollectTo(part, key, val)
				},
			}
		},

		Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")),

		NewReducerF: func(part int) mr.Reducer {
			return &mr.ReducerStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: gcse.NewNewDocAction,
				ReduceF: func(key sophie.SophieWriter,
					nextVal mr.SophierIterator, c []sophie.Collector) error {

					var act gcse.DocInfo
					isSet := false
					isUpdated := false
					for {
						val, err := nextVal()
						if err == sophie.EOF {
							break
						}
						if err != nil {
							return err
						}

						cur := val.(*gcse.NewDocAction)
						if cur.Action == gcse.NDA_DEL {
							// by not collecting it out, the entry is effectively deleted
							atomic.AddInt64(&cntDeleted, 1)
							return nil
						}
						if !isSet {
							isSet = true
							act = cur.DocInfo
						} else if cur.LastUpdated.After(act.LastUpdated) {
							isUpdated = true
							act = cur.DocInfo
						}
					}

					if isSet {
						if isUpdated {
							atomic.AddInt64(&cntUpdated, 1)
						} else {
							atomic.AddInt64(&cntNewUnchange, 1)
						}
						return c[0].Collect(key, &act)
					} else {
						return nil
					}
				},
			}
		},

		Dest: []mr.Output{
			outDocsUpdated,
		},
	}

	if err := job.Run(); err != nil {
		log.Fatalf("job.Run failed: %v", err)
	}

	log.Printf("Deleted: %v", cntDeleted)
	log.Printf("Updated: %v", cntUpdated)
	log.Printf("NewUnchange: %v", cntNewUnchange)

	pDocs := gcse.DataRoot.Join(gcse.FnDocs)
	pUpdated := gcse.DataRoot.Join("docs-updated")
	pTmp := gcse.DataRoot.Join("docs-tmp")

	pTmp.RemoveAll()
	if err := pDocs.Rename(pTmp); err != nil {
		log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err)
	}
	if err := pUpdated.Rename(pDocs); err != nil {
		log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err)
	}

	log.Println("Merging success...")
}
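The rename dance at the end parks the old docs tree at a tmp path before moving the merged output into place, so a crash between the two renames still leaves a recoverable copy. A sketch of that swap with plain os.Rename and illustrative paths:

package main

import (
	"log"
	"os"
)

// swapInUpdated replaces docs with updated, keeping the old docs at tmp.
func swapInUpdated(docs, updated, tmp string) error {
	if err := os.RemoveAll(tmp); err != nil {
		return err
	}
	if err := os.Rename(docs, tmp); err != nil {
		return err
	}
	return os.Rename(updated, docs)
}

func main() {
	// throwaway dirs so the sketch runs standalone
	os.MkdirAll("data/docs", 0755)
	os.MkdirAll("data/docs-updated", 0755)
	if err := swapInUpdated("data/docs", "data/docs-updated", "data/docs-tmp"); err != nil {
		log.Fatalf("swap failed: %v", err)
	}
	log.Println("swap complete; old docs kept at data/docs-tmp")
}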
Example #5
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", configs.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	if configs.CrawlGithubUpdate || configs.CrawlByGodocApi {
		// load package update times (pkgUTs)
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(configs.DocsDBPath().S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if configs.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if configs.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs))
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			now := time.Now()
			for _, pkg := range pkgs {
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
				site, path := utils.SplitPackage(pkg)
				if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil {
					log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err)
				}
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
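Example #5 additionally records a per-package history event, splitting each import path into site and path with utils.SplitPackage. A standard-library sketch of that split, assuming the site is simply the first path segment (the real utils.SplitPackage may differ):

package main

import (
	"fmt"
	"strings"
)

// splitPackage separates an import path into its host site and the rest,
// e.g. "github.com/daviddengcn/gcse" -> ("github.com", "daviddengcn/gcse").
func splitPackage(pkg string) (site, path string) {
	site, path, _ = strings.Cut(pkg, "/")
	return site, path
}

func main() {
	site, path := splitPackage("github.com/daviddengcn/gcse")
	fmt.Println(site, path)
}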
Example #6
func TestMRFromFile(t *testing.T) {
	fmt.Println("TestMRFromFile starts")
	fpRoot := sophie.LocalFsPath(".")

	mrin := fpRoot.Join("mrin")
	mrin.Mkdir(0755)

	mrtmp := fpRoot.Join("tmp")

	/*
	 * Prepare input
	 */
	var inF *kv.Writer
	index := 0
	lines := strings.Split(WORDS, "\n")
	for i, line := range lines {
		if i%3 == 0 {
			if inF != nil {
				assert.NoErrorf(t, "inF.Close: %v", inF.Close())
				index++
			}
			var err error
			inF, err = kv.NewWriter(mrin.Join(fmt.Sprintf("part-%05d", index)))
			assert.NoErrorf(t, "NewKVWriter: %v", err)
		}

		assert.NoErrorf(t, "inF.Collect",
			inF.Collect(sophie.RawString(line), sophie.Null{}))
	}
	if inF != nil {
		assert.NoErrorf(t, "inF.Close: %v", inF.Close())
	}

	mrout := fpRoot.Join("mrout")
	assert.NoErrorf(t, "Remove mrout: %v", mrout.Remove())

	/*
	 * MrJob
	 */
	var mapper WordCountMapper
	reducer := WordCountReducer{counts: make(map[string]int)}

	job := MrJob{
		Source: []Input{kv.DirInput(mrin)},
		NewMapperF: func(src, part int) Mapper {
			return &mapper
		},

		Sorter: NewFileSorter(mrtmp),

		NewReducerF: func(part int) Reducer {
			return &reducer
		},
		Dest: []Output{kv.DirOutput(mrout)},
	}

	assert.NoErrorf(t, "RunJob: %v", job.Run())

	/*
	 * Check result
	 */
	resIn := kv.DirInput(mrout)
	n, err := resIn.PartCount()
	assert.NoErrorf(t, "resIn.PartCount(): %v", err)
	var word sophie.RawString
	var cnt sophie.RawVInt
	actCnts := make(map[string]int)
	for i := 0; i < n; i++ {
		iter, err := resIn.Iterator(i)
		assert.NoErrorf(t, "resIn.Iterator: %v", err)
		for {
			err := iter.Next(&word, &cnt)
			if err == sophie.EOF {
				break
			}
			assert.NoErrorf(t, "iter.Next: %v", err)
			actCnts[string(word)] = int(cnt)
		}
	}

	expCnts := statWords(WORDS)
	// fmt.Println(expCnts)
	// fmt.Println(actCnts)

	assertMapEquals(t, actCnts, expCnts)
	fmt.Println("TestMRFromFile ends")
}
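The test computes expected counts with statWords, which is not shown here. Since the input writer collects one word per line, an implementation consistent with that usage counts line occurrences; this is a guess at the helper, not the sophie test code itself:

package main

import (
	"fmt"
	"strings"
)

// statWords counts how often each non-empty line occurs; the test input
// puts one word per line, so lines and words coincide.
func statWords(text string) map[string]int {
	counts := make(map[string]int)
	for _, line := range strings.Split(text, "\n") {
		if line != "" {
			counts[line]++
		}
	}
	return counts
}

func main() {
	fmt.Println(statWords("a\nb\na")) // map[a:2 b:1]
}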
Example #7
func main() {
	log.Println("Merging new crawled docs back...")

	var nonStorePackage *regexp.Regexp
	if len(configs.NonStorePackageRegexps) > 0 {
		nonStorePackage = regexp.MustCompile(
			stringsp.FullJoin(configs.NonStorePackageRegexps, "(", ")|(", ")"))
	}

	fpDataRoot := sophie.LocalFsPath(configs.DataRoot.S())

	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated"))
	outDocsUpdated.Clean()

	var cntDeleted, cntUpdated, cntNew, cntUnchanged int64

	job := mr.MrJob{
		Source: []mr.Input{
			kv.DirInput(fpDataRoot.Join(configs.FnDocs)),   // 0
			kv.DirInput(fpCrawler.Join(configs.FnNewDocs)), // 1
		},

		NewMapperF: func(src, part int) mr.Mapper {
			if src == 0 {
				// Mapper for docs
				return &mr.MapperStruct{
					NewKeyF: sophie.NewRawString,
					NewValF: gcse.NewDocInfo,
					MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error {
						pkg := key.(*sophie.RawString).String()
						di := val.(*gcse.DocInfo)
						act := gcse.NewDocAction{
							Action:  gcse.NDA_ORIGINAL,
							DocInfo: *di,
						}
						part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
						return c.CollectTo(part, key, &act)
					},
				}
			}
			// Mapper for new docs
			return &mr.MapperStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: gcse.NewNewDocAction,
				MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error {
					pkg := string(*key.(*sophie.RawString))
					part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
					return c.CollectTo(part, key, val)
				},
			}
		},

		Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")),

		NewReducerF: func(part int) mr.Reducer {
			return &mr.ReducerStruct{
				NewKeyF: sophie.NewRawString,
				NewValF: gcse.NewNewDocAction,
				ReduceF: func(key sophie.SophieWriter,
					nextVal mr.SophierIterator, c []sophie.Collector) error {

					if nonStorePackage != nil {
						pkg := string(*key.(*sophie.RawString))
						if nonStorePackage.MatchString(pkg) {
							log.Printf("Ignoring non-store pkg: %s", pkg)
							return nil
						}
					}

					var act gcse.DocInfo
					isSet := false
					isUpdated := false
					hasOriginal := false
					for {
						val, err := nextVal()
						if errorsp.Cause(err) == io.EOF {
							break
						}
						if err != nil {
							return err
						}

						cur := val.(*gcse.NewDocAction)
						switch cur.Action {
						case gcse.NDA_DEL:
							// by not collecting it out, the entry is effectively deleted
							atomic.AddInt64(&cntDeleted, 1)
							return nil

						case gcse.NDA_ORIGINAL:
							hasOriginal = true
						}

						if !isSet {
							isSet = true
							act = cur.DocInfo
						} else if cur.LastUpdated.After(act.LastUpdated) {
							isUpdated = true
							act = cur.DocInfo
						}
					}

					if isSet {
						if isUpdated {
							atomic.AddInt64(&cntUpdated, 1)
						} else if hasOriginal {
							atomic.AddInt64(&cntUnchanged, 1)
						} else {
							atomic.AddInt64(&cntNew, 1)
						}
						return c[0].Collect(key, &act)
					} else {
						return nil
					}
				},
			}
		},

		Dest: []mr.Output{
			outDocsUpdated,
		},
	}

	if err := job.Run(); err != nil {
		log.Fatalf("job.Run failed: %v", err)
	}

	log.Printf("Deleted:   %v", cntDeleted)
	log.Printf("Updated:   %v", cntUpdated)
	log.Printf("New:       %v", cntNew)
	log.Printf("Unchanged: %v", cntUnchanged)

	pDocs := configs.DataRoot.Join(configs.FnDocs)
	pUpdated := configs.DataRoot.Join("docs-updated")
	pTmp := configs.DataRoot.Join("docs-tmp")

	pTmp.RemoveAll()
	if err := pDocs.Rename(pTmp); err != nil {
		log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err)
	}
	if err := pUpdated.Rename(pDocs); err != nil {
		log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err)
	}

	log.Println("Merging success...")
}
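Example #7 builds its non-store filter by joining configs.NonStorePackageRegexps into a single alternation via stringsp.FullJoin(regexps, "(", ")|(", ")"). An equivalent sketch using only the standard library:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// unionRegexp wraps each pattern in a group and joins them with "|",
// e.g. ["^a/", "^b/"] -> (^a/)|(^b/).
func unionRegexp(patterns []string) *regexp.Regexp {
	if len(patterns) == 0 {
		return nil
	}
	return regexp.MustCompile("(" + strings.Join(patterns, ")|(") + ")")
}

func main() {
	nonStore := unionRegexp([]string{`^github\.com/a/`, `^github\.com/b/`})
	fmt.Println(nonStore.MatchString("github.com/a/pkg")) // true
	fmt.Println(nonStore.MatchString("github.com/c/pkg")) // false
}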