Example #1
0
func (db *searcherDB) PackageCrawlHistory(pkg string) *sppb.HistoryInfo {
	site, path := utils.SplitPackage(pkg)
	info, err := store.ReadPackageHistoryOf(db.storeDB, site, path)
	if err != nil {
		log.Printf("ReadPackageHistoryOf %s %s failed: %v", site, path, err)
		return nil
	}
	return info
}
Example #2
0
func doFill() error {
	cDB := gcse.LoadCrawlerDB()
	return cDB.PackageDB.Iterate(func(pkg string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			log.Printf("Wrong entry, ignored: %+v", ent)
			return nil
		}
		site, path := utils.SplitPackage(pkg)
		return store.AppendPackageEvent(site, path, "unknown", ent.ScheduleTime.Add(-10*timep.Day), sppb.HistoryEvent_Action_None)
	})
}
Example #3
0
func appendNewPackage(pkg, foundWay string) {
	cDB.AppendPackage(pkg, allDocsPkgs.Contain)

	site, path := utils.SplitPackage(pkg)
	if err := store.UpdatePackage(site, path, func(*stpb.PackageInfo) error {
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %s %s failed: %v", site, path, err)
	}
	if err := store.AppendPackageEvent(site, path, foundWay, time.Now(), sppb.HistoryEvent_Action_None); err != nil {
		log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err)
	}
}
Example #4
0
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", configs.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	if configs.CrawlGithubUpdate || configs.CrawlByGodocApi {
		// load pkgUTs
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(configs.DocsDBPath().S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if configs.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if configs.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs))
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			now := time.Now()
			for _, pkg := range pkgs {
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
				site, path := utils.SplitPackage(pkg)
				if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil {
					log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err)
				}
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
Example #5
0
// OnlyMapper.Map
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector) error {
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM",
			pc.part, key)
		return mr.EOM
	}
	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// if gcse.CrawlerVersion is larger than Version, Etag is ignored.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s\n", pc.part, pkg, ent.Etag)

	p, flds, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
	for _, fld := range flds {
		if spider.LikeGoSubFolder(fld.Name) {
			appendNewPackage(pkg+"/"+fld.Path, "parent")
		}
	}
	site, path := utils.SplitPackage(pkg)
	if err != nil && errorsp.Cause(err) != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Invalid), "AppendPackageEvent %v %v failed", site, path)
			bi.AddValue(bi.Sum, "crawler.package.wrong-package", 1)
			// a wrong path
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Remove wrong package %s", pc.part, pkg)
		} else {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Failed), "AppendPackageEvent %v %v failed", site, path)
			bi.Inc("crawler.package.failed")
			if strings.HasPrefix(pkg, "github.com/") {
				bi.Inc("crawler.package.failed.github")
			}
			pc.failCount++

			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)

			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM",
						pc.part, key)
					return mr.EOM
				}

				log.Printf("[Part %d] Last ten crawling packages failed, sleep for a while...(current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}
	utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Success), "AppendPackageEvent %v %v failed", site, path)
	pc.failCount = 0
	if errorsp.Cause(err) == gcse.ErrPackageNotModifed {
		// TODO crawling stars for unchanged project
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		bi.AddValue(bi.Sum, "crawler.package.not-modified", 1)
		return nil
	}
	bi.AddValue(bi.Sum, "crawler.package.success", 1)
	if strings.HasPrefix(pkg, "github.com/") {
		bi.AddValue(bi.Sum, "crawler.package.success.github", 1)
	}
	log.Printf("[Part %d] Crawled package %s success!", pc.part, pkg)

	var pkgInfo *stpb.PackageInfo
	if err := store.UpdatePackage(site, path, func(pi *stpb.PackageInfo) error {
		fillPackageInfo(p, pi)
		pkgInfo = pi
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %v %v failed: %v", site, path, err)
	}
	saveRelatedInfo(pkgInfo)

	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	if !strings.HasPrefix(pkg, "github.com/") {
		// github.com throttling is done within the GithubSpider.
		time.Sleep(10 * time.Second)
	}
	return nil
}