Example #1
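// CrawlPerson fetches the profile and package list of the person identified
// by id ("<site>/<user>"). Unknown sites yield (nil, nil).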
func CrawlPerson(httpClient doc.HttpClient, id string) (*Person, error) {
	site, user := ParsePersonId(id)
	switch site {
	case "github.com":
		u, err := GithubSpider.ReadUser(user)
		if err != nil {
			return nil, errorsp.WithStacksAndMessage(err, "ReadUser %s failed", id)
		}
		p := &Person{Id: id}
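		// Record each repository as a package and refresh its cached RepoInfo in the store.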
		for name, ri := range u.Repos {
			path := user + "/" + name
			p.Packages = append(p.Packages, "github.com/"+path)
			if err := store.UpdatePackage(site, path, func(info *stpb.PackageInfo) error {
				info.RepoInfo = ri
				return nil
			}); err != nil {
				log.Printf("UpdatePackage %v %v failed: %v", site, path, err)
			} else {
				log.Printf("UpdatePackage %v %v success", site, path)
			}
		}
		return p, nil
	case "bitbucket.org":
		p, err := doc.GetBitbucketPerson(httpClient, map[string]string{"owner": user})
		if err != nil {
			return nil, errorsp.WithStacks(err)
		}
		return &Person{
			Id:       id,
			Packages: p.Projects,
		}, nil
	}
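	// Unknown site: nothing to crawl.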
	return nil, nil
}
Example #2
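// CrawlRepoInfo returns the repository information for site/user/name,
// serving it from the store cache when fresh and otherwise fetching it live.
// It returns nil when the information cannot be obtained.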
func CrawlRepoInfo(site, user, name string) *sppb.RepoInfo {
	// Check cache in store.
	path := user + "/" + name
	p, err := store.ReadPackage(site, path)
	if err != nil {
		log.Printf("ReadPackage %v %v failed: %v", site, path, err)
	} else {
		if p.RepoInfo != nil && store.RepoInfoAge(p.RepoInfo) < maxRepoInfoAge {
			log.Printf("Repo cache of %s/%s hit", site, path)
			bi.Inc("crawler.repocache.hit")
			return p.RepoInfo
		}
	}
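	// Cache missing or stale: fetch live. Only GitHub is supported here.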
	bi.Inc("crawler.repocache.miss")
	ri, err := GithubSpider.ReadRepository(user, name)
	if err != nil {
		if errorsp.Cause(err) == github.ErrInvalidRepository {
			// The repository no longer exists; drop the stale store entry.
			if err := store.DeletePackage(site, path); err != nil {
				log.Printf("DeletePackage %v %v failed: %v", site, path, err)
			}
		} else {
			log.Printf("ReadRepository %v/%v failed: %v", user, name, err)
		}
		return nil
	}
	if err := store.UpdatePackage(site, path, func(info *stpb.PackageInfo) error {
		info.RepoInfo = ri
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %v %v failed: %v", site, path, err)
	} else {
		log.Printf("UpdatePackage %s %s success", site, path)
	}
	return ri
}
Example #3
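// appendNewPackage registers a newly discovered package for crawling and
// records how it was found (foundWay).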
func appendNewPackage(pkg, foundWay string) {
	cDB.AppendPackage(pkg, allDocsPkgs.Contain)

	site, path := utils.SplitPackage(pkg)
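	// Touch the store entry so the package exists even before it is crawled.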
	if err := store.UpdatePackage(site, path, func(*stpb.PackageInfo) error {
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %s %s failed: %v", site, path, err)
	}
	if err := store.AppendPackageEvent(site, path, foundWay, time.Now(), sppb.HistoryEvent_Action_None); err != nil {
		log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err)
	}
}
Example #4
// Map implements mr.OnlyMapper. It crawls one package, records the result in
// the store, and emits a NewDocAction for the indexer.
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector) error {
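	// Stop gracefully once the app's scheduled stop time has passed.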
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM",
			pc.part, key)
		return mr.EOM
	}
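	// key is the package import path; val carries the crawling metadata.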
	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// The crawler version is newer than this entry's; drop the Etag so the
		// package is fully re-crawled.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s\n", pc.part, pkg, ent.Etag)

	p, flds, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
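	// Sub-folders may be returned even when crawling failed; register any that look like Go packages.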
	for _, fld := range flds {
		if spider.LikeGoSubFolder(fld.Name) {
			appendNewPackage(pkg+"/"+fld.Path, "parent")
		}
	}
	site, path := utils.SplitPackage(pkg)
	if err != nil && errorsp.Cause(err) != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Invalid), "AppendPackageEvent %v %v failed", site, path)
			bi.AddValue(bi.Sum, "crawler.package.wrong-package", 1)
			// The package path is invalid: remove it from the index and the crawl database.
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Remove wrong package %s", pc.part, pkg)
		} else {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Failed), "AppendPackageEvent %v %v failed", site, path)
			bi.Inc("crawler.package.failed")
			if strings.HasPrefix(pkg, "github.com/") {
				bi.Inc("crawler.package.failed.github")
			}
			pc.failCount++

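			// Retry the failed package after half a day.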
			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)

			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM",
						pc.part, key)
					return mr.EOM
				}

				log.Printf("[Part %d] Last ten crawling packages failed, sleep for a while...(current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}
	utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Success), "AppendPackageEvent %v %v failed", site, path)
	pc.failCount = 0
	if errorsp.Cause(err) == gcse.ErrPackageNotModifed {
		// TODO crawling stars for unchanged project
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		bi.AddValue(bi.Sum, "crawler.package.not-modified", 1)
		return nil
	}
	bi.AddValue(bi.Sum, "crawler.package.success", 1)
	if strings.HasPrefix(pkg, "github.com/") {
		bi.AddValue(bi.Sum, "crawler.package.success.github", 1)
	}
	log.Printf("[Part %d] Crawled package %s success!", pc.part, pkg)

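	// Persist the crawled data and keep a reference for related-info processing.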
	var pkgInfo *stpb.PackageInfo
	if err := store.UpdatePackage(site, path, func(pi *stpb.PackageInfo) error {
		fillPackageInfo(p, pi)
		pkgInfo = pi
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %v %v failed: %v", site, path, err)
	}
	if pkgInfo != nil {
		saveRelatedInfo(pkgInfo)
	}

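	// Notify the indexer to update this package's document.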
	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	if !strings.HasPrefix(pkg, "github.com/") {
		// github.com throttling is done within the GithubSpider.
		time.Sleep(10 * time.Second)
	}
	return nil
}