func fillPackageInfo(p *gcse.Package, pi *stpb.PackageInfo) { pi.Package = p.Package pi.Name = p.Name pi.Synopsis = p.Synopsis pi.Description = p.Doc pi.Author = gcse.AuthorOfPackage(p.Package) pi.ProjectUrl = p.ProjectURL pi.Stars = int32(p.StarCount) pi.ReadmeFn = p.ReadmeFn pi.ReadmeData = p.ReadmeData pi.Exported = p.Exported pi.References = p.References pi.Imports = nil for _, imp := range p.Imports { if doc.IsValidRemotePath(imp) { pi.Imports = append(pi.Imports, imp) } } pi.TestImports = nil for _, imp := range p.TestImports { if doc.IsValidRemotePath(imp) { pi.TestImports = append(pi.TestImports, imp) } } }
func packageToDoc(p *gcse.Package) gcse.DocInfo { // copy Package as a DocInfo d := gcse.DocInfo{ Package: p.Package, Name: p.Name, Synopsis: p.Synopsis, Description: p.Doc, LastUpdated: time.Now(), Author: gcse.AuthorOfPackage(p.Package), ProjectURL: p.ProjectURL, StarCount: p.StarCount, ReadmeFn: p.ReadmeFn, ReadmeData: p.ReadmeData, Exported: p.Exported, } d.Imports = nil for _, imp := range p.Imports { if doc.IsValidRemotePath(imp) { d.Imports = append(d.Imports, imp) } } d.TestImports = nil for _, imp := range p.TestImports { if doc.IsValidRemotePath(imp) { d.TestImports = append(d.TestImports, imp) } } // append new authors if strings.HasPrefix(d.Package, "github.com/") { cDB.AppendPerson("github.com", d.Author) } else if strings.HasPrefix(d.Package, "bitbucket.org/") { cDB.AppendPerson("bitbucket.org", d.Author) } for _, imp := range d.Imports { appendPackage(imp) } for _, imp := range d.TestImports { appendPackage(imp) } log.Printf("[pushPackage] References: %v", p.References) for _, ref := range p.References { appendPackage(ref) } schedulePackageNextCrawl(d.Package, p.Etag) return d }
func filterPackages(pkgs []string) (res []string) { for _, pkg := range pkgs { pkg = gcse.TrimPackageName(pkg) if !doc.IsValidRemotePath(pkg) { continue } res = append(res, pkg) } return }
func pushPackage(c appengine.Context, p *gcc.Package) (succ bool) { // copy Package as a DocInfo d := DocInfo{ Name: p.Name, Package: p.ImportPath, Synopsis: p.Synopsis, Description: p.Doc, LastUpdated: time.Now(), Author: authorOfPackage(p.ImportPath), ProjectURL: p.ProjectURL, StarCount: p.StarCount, ReadmeFn: p.ReadmeFn, ReadmeData: p.ReadmeData, } d.Imports = nil for _, imp := range p.Imports { if doc.IsValidRemotePath(imp) { d.Imports = append(d.Imports, imp) } } // save DocInfo into fetchedDoc DB ddb := NewDocDB(c, kindFetchedDoc) err := ddb.Put(d.Package, &d) if err != nil { c.Errorf("ddb.Put(%s) failed: %v", err) return false } // append new authors if strings.HasPrefix(d.Package, "github.com/") { appendPerson(c, "github.com", d.Author) } else if strings.HasPrefix(d.Package, "bitbucket.org/") { appendPerson(c, "bitbucket.org", d.Author) } for _, imp := range d.Imports { appendPackage(c, imp) } log.Printf("[crawlPackage] References: %v", p.References) for _, ref := range p.References { appendPackage(c, ref) } schedulePackage(c, d.Package, time.Now().Add(DefaultPackageAge).Add( time.Duration(rand.Int63n(int64(DefaultPackageAge)/10)- int64(DefaultPackageAge)/5))) return true }
// touchPackage forces a package to update if it was not crawled before a // specific time. func touchPackage(pkg string, crawledBefore time.Time, pkgUTs map[string]time.Time) { pkg = strings.TrimSpace(pkg) if !doc.IsValidRemotePath(pkg) { //log.Printf(" [touchPackage] Not a valid remote path: %s", pkg) return } ut, ok := pkgUTs[pkg] if ok && ut.After(crawledBefore) { return } // set Etag to "" to force updating cDB.PushToCrawlPackage(pkg) }
// AppendPackage appends a package. If the package did not exist in either // PackageDB or Docs, schedule it (immediately). func (cdb *CrawlerDB) AppendPackage(pkg string, inDocs func(pkg string) bool) { pkg = TrimPackageName(pkg) if !doc.IsValidRemotePath(pkg) { return } var ent CrawlingEntry if cdb.PackageDB.Get(pkg, &ent) { if ent.ScheduleTime.Before(time.Now()) || inDocs(pkg) { return } // if the docs is missing in Docs, schedule it earlier log.Printf("Scheduling a package with missing docs: %v", pkg) } else { log.Printf("Scheduling new package: %v", pkg) } cdb.SchedulePackage(pkg, time.Now(), "") }
// reschedule if last crawl time is later than crawledBefore func touchPackage(pkg string, crawledBefore time.Time) bool { pkg = strings.TrimSpace(pkg) if !doc.IsValidRemotePath(pkg) { //log.Printf(" [touchPackage] Not a valid remote path: %s", pkg) return false } var ent gcse.DocInfo if docDB.Get(pkg, &ent) { if ent.LastUpdated.After(crawledBefore) { //log.Printf(" [touchPackage] no need to update: %s", pkg) return false } } // set Etag to "" to force updating return schedulePackage(pkg, time.Now(), "") == nil }
// AppendPackage appends a package. If the package did not exist in either // PackageDB or Docs, shedulet it (immediately). func (cdb *CrawlerDB) AppendPackage(pkg string, inDocs func(pkg string) bool) { pkg = TrimPackageName(pkg) if !doc.IsValidRemotePath(pkg) { return } var ent CrawlingEntry exists := cdb.PackageDB.Get(pkg, &ent) if exists { if inDocs(pkg) { return } // if the docs is missing in Docs, still schedule it now } // if the package doesn't exist in docDB, Etag is discarded cdb.SchedulePackage(pkg, time.Now(), "") }
// returns true if a new package is appended to the crawling list func appendPackage(c appengine.Context, pkg string) bool { if !doc.IsValidRemotePath(pkg) { // log.Printf(" [appendPackage] Not a valid remote path: %s", pkg) return false } ddb := NewCachedDocDB(c, kindCrawlerPackage) var ent CrawlingEntry err, exists := ddb.Get(pkg, &ent) if exists { // already scheduled log.Printf(" [appendPackage] Package %s was scheduled to %v", pkg, ent.ScheduleTime) return false } if err != nil { log.Printf(" [appendPackage] Get(crawler, %s) failed: %v", pkg, err) return false } return schedulePackage(c, pkg, time.Now()) == nil }
func appendPackage(pkg string) bool { pkg = strings.TrimFunc(strings.TrimSpace(pkg), func(r rune) bool { return r > rune(128) }) if !doc.IsValidRemotePath(pkg) { //log.Printf(" [appendPackage] Not a valid remote path: %s", pkg) return false } var ent CrawlingEntry exists := cPackageDB.Get(pkg, &ent) if exists { var di gcse.DocInfo exists := docDB.Get(pkg, &di) if exists { // already scheduled // log.Printf(" [appendPackage] Package %s was scheduled to %v", pkg, ent.ScheduleTime) return false } } // if the package doesn't exist in docDB, Etag is discarded return schedulePackage(pkg, time.Now(), "") == nil }