// put marks stale but still actively used packages as active, then stores the
// package documentation in the database.
func put(pdoc *doc.Package, nextCrawl time.Time) error {
	if pdoc.Status == gosrc.NoRecentCommits && isActivePkg(pdoc.ImportPath, gosrc.NoRecentCommits) {
		pdoc.Status = gosrc.Active
	}
	if err := db.Put(pdoc, nextCrawl, false); err != nil {
		return fmt.Errorf("ERROR db.Put(%q): %v", pdoc.ImportPath, err)
	}
	return nil
}
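// put relies on isActivePkg, which is not part of this section. The sketch
// below shows one plausible shape for such a helper, assuming the database
// exposes an ImporterCount method and gosrc exposes a directory-status type;
// the name, signature, and logic are assumptions for illustration, not the
// actual implementation.
func isActivePkgSketch(pkg string, status gosrc.DirectoryStatus) bool {
	switch status {
	case gosrc.Active:
		return true
	case gosrc.NoRecentCommits:
		// A package with no recent commits can still count as active
		// when other packages import it.
		n, err := db.ImporterCount(pkg)
		if err != nil {
			log.Printf("ERROR db.ImporterCount(%q): %v", pkg, err)
		}
		return n > 0
	}
	return false
}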
// Put adds the package documentation to the database.
func (db *Database) Put(pdoc *doc.Package, nextCrawl time.Time, hide bool) error {
	c := db.Pool.Get()
	defer c.Close()

	score := 0.0
	if !hide {
		score = documentScore(pdoc)
	}
	terms := documentTerms(pdoc, score)

	var gobBuf bytes.Buffer
	if err := gob.NewEncoder(&gobBuf).Encode(pdoc); err != nil {
		return err
	}

	// Truncate large documents.
	if gobBuf.Len() > 200000 {
		pdocNew := *pdoc
		pdoc = &pdocNew
		pdoc.Truncated = true
		pdoc.Vars = nil
		pdoc.Funcs = nil
		pdoc.Types = nil
		pdoc.Consts = nil
		pdoc.Examples = nil
		gobBuf.Reset()
		if err := gob.NewEncoder(&gobBuf).Encode(pdoc); err != nil {
			return err
		}
	}

	gobBytes, err := snappy.Encode(nil, gobBuf.Bytes())
	if err != nil {
		return err
	}

	// Classify the entry: "p" package, "c" command, "d" directory with no Go files.
	kind := "p"
	switch {
	case pdoc.Name == "":
		kind = "d"
	case pdoc.IsCmd:
		kind = "c"
	}

	t := int64(0)
	if !nextCrawl.IsZero() {
		t = nextCrawl.Unix()
	}

	_, err = putScript.Do(c, pdoc.ImportPath, pdoc.Synopsis, score, gobBytes,
		strings.Join(terms, " "), pdoc.Etag, kind, t)
	if err != nil {
		return err
	}

	if nextCrawl.IsZero() {
		// Skip crawling related packages if this is not a full save.
		return nil
	}

	// Collect the import paths of related packages to queue for crawling.
	paths := make(map[string]bool)
	for _, p := range pdoc.Imports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	for _, p := range pdoc.TestImports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	for _, p := range pdoc.XTestImports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	if pdoc.ImportPath != pdoc.ProjectRoot && pdoc.ProjectRoot != "" {
		paths[pdoc.ProjectRoot] = true
	}
	for _, p := range pdoc.Subdirectories {
		paths[pdoc.ImportPath+"/"+p] = true
	}

	args := make([]interface{}, 0, len(paths))
	for p := range paths {
		args = append(args, p)
	}
	_, err = addCrawlScript.Do(c, args...)
	return err
}
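// Put stores each document as a snappy-compressed gob blob, so the read path
// has to invert both steps: decompress, then gob-decode. A minimal sketch of
// that round trip under those assumptions; the helper name is hypothetical,
// not the actual read-path implementation.
func decodeDocSketch(blob []byte) (*doc.Package, error) {
	raw, err := snappy.Decode(nil, blob) // undo snappy compression
	if err != nil {
		return nil, err
	}
	var pdoc doc.Package
	if err := gob.NewDecoder(bytes.NewReader(raw)).Decode(&pdoc); err != nil {
		return nil, err
	}
	return &pdoc, nil
}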
// crawlDoc fetches the package documentation from the VCS and updates the database.
func crawlDoc(source string, importPath string, pdoc *doc.Package, hasSubdirs bool, nextCrawl time.Time) (*doc.Package, error) {
	message := []interface{}{source}
	defer func() {
		message = append(message, importPath)
		log.Println(message...)
	}()

	if !nextCrawl.IsZero() {
		d := time.Since(nextCrawl) / time.Hour
		if d > 0 {
			message = append(message, "late:", int64(d))
		}
	}

	etag := ""
	if pdoc != nil {
		etag = pdoc.Etag
		message = append(message, "etag:", etag)
	}

	start := time.Now()
	var err error
	if strings.HasPrefix(importPath, "code.google.com/p/go.") {
		// Old import path for a Go sub-repository.
		pdoc = nil
		err = gosrc.NotFoundError{
			Message:  "old Go sub-repo",
			Redirect: "golang.org/x/" + importPath[len("code.google.com/p/go."):],
		}
	} else if blocked, e := db.IsBlocked(importPath); blocked && e == nil {
		pdoc = nil
		err = gosrc.NotFoundError{Message: "blocked."}
	} else if testdataPat.MatchString(importPath) {
		pdoc = nil
		err = gosrc.NotFoundError{Message: "testdata."}
	} else {
		var pdocNew *doc.Package
		pdocNew, err = doc.Get(httpClient, importPath, etag)
		message = append(message, "fetch:", int64(time.Since(start)/time.Millisecond))
		if err == nil && pdocNew.Name == "" && !hasSubdirs {
			for _, e := range pdocNew.Errors {
				message = append(message, "err:", e)
			}
			pdoc = nil
			err = gosrc.NotFoundError{Message: "no Go files or subdirs"}
		} else if _, ok := err.(gosrc.NotModifiedError); !ok {
			pdoc = pdocNew
		}
	}

	nextCrawl = start.Add(*maxAge)
	switch {
	case strings.HasPrefix(importPath, "github.com/") || (pdoc != nil && len(pdoc.Errors) > 0):
		nextCrawl = start.Add(*maxAge * 7)
	case strings.HasPrefix(importPath, "gist.github.com/"):
		// Don't spend time on gists. It's a silly thing to do.
		nextCrawl = start.Add(*maxAge * 30)
	}

	if err == nil {
		message = append(message, "put:", pdoc.Etag)
		if err := put(pdoc, nextCrawl); err != nil {
			log.Println(err)
		}
		return pdoc, nil
	} else if e, ok := err.(gosrc.NotModifiedError); ok {
		if pdoc.Status == gosrc.Active && !isActivePkg(importPath, e.Status) {
			if e.Status == gosrc.NoRecentCommits {
				e.Status = gosrc.Inactive
			}
			message = append(message, "archive", e)
			pdoc.Status = e.Status
			if err := db.Put(pdoc, nextCrawl, false); err != nil {
				log.Printf("ERROR db.Put(%q): %v", importPath, err)
			}
		} else {
			// Touch the package without updating and move on to the next one.
			message = append(message, "touch")
			if err := db.SetNextCrawl(importPath, nextCrawl); err != nil {
				log.Printf("ERROR db.SetNextCrawl(%q): %v", importPath, err)
			}
		}
		return pdoc, nil
	} else if e, ok := err.(gosrc.NotFoundError); ok {
		message = append(message, "notfound:", e)
		if err := db.Delete(importPath); err != nil {
			log.Printf("ERROR db.Delete(%q): %v", importPath, err)
		}
		return nil, e
	} else {
		message = append(message, "ERROR:", err)
		return nil, err
	}
}
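// crawlDoc rejects import paths matched by testdataPat, which is defined
// outside this section. The definition below is a plausible stand-in, not the
// actual pattern: it matches any path containing a testdata element.
var testdataPatSketch = regexp.MustCompile(`(^|/)testdata(/|$)`)

// For example, testdataPatSketch.MatchString("github.com/user/repo/testdata/x")
// reports true, so such a path would be recorded as not found.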
// Put adds the package documentation to the database.
func (db *Database) Put(pdoc *doc.Package, nextCrawl time.Time, hide bool) error {
	c := db.Pool.Get()
	defer c.Close()

	score := 0.0
	if !hide {
		score = documentScore(pdoc)
	}
	terms := documentTerms(pdoc, score)

	var gobBuf bytes.Buffer
	if err := gob.NewEncoder(&gobBuf).Encode(pdoc); err != nil {
		return err
	}

	gobBytes := snappy.Encode(nil, gobBuf.Bytes())

	// Truncate large documents.
	if len(gobBytes) > 400000 {
		pdocNew := *pdoc
		pdoc = &pdocNew
		pdoc.Truncated = true
		pdoc.Vars = nil
		pdoc.Funcs = nil
		pdoc.Types = nil
		pdoc.Consts = nil
		pdoc.Examples = nil
		gobBuf.Reset()
		if err := gob.NewEncoder(&gobBuf).Encode(pdoc); err != nil {
			return err
		}
		gobBytes = snappy.Encode(nil, gobBuf.Bytes())
	}

	// Classify the entry: "p" package, "c" command, "d" directory with no Go files.
	kind := "p"
	switch {
	case pdoc.Name == "":
		kind = "d"
	case pdoc.IsCmd:
		kind = "c"
	}

	t := int64(0)
	if !nextCrawl.IsZero() {
		t = nextCrawl.Unix()
	}

	// Get the old version of the package to extract its imports.
	// If the package does not exist, both old and err will be nil.
	old, _, err := db.getDoc(c, pdoc.ImportPath)
	if err != nil {
		return err
	}

	_, err = putScript.Do(c, pdoc.ImportPath, pdoc.Synopsis, score, gobBytes,
		strings.Join(terms, " "), pdoc.Etag, kind, t)
	if err != nil {
		return err
	}

	id, n, err := pkgIDAndImportCount(c, pdoc.ImportPath)
	if err != nil {
		return err
	}

	ctx := bgCtx()
	if score > 0 {
		if err := PutIndex(ctx, pdoc, id, score, n); err != nil {
			log.Printf("Cannot put %q in index: %v", pdoc.ImportPath, err)
		}
		if old != nil {
			if err := updateImportsIndex(c, ctx, old, pdoc); err != nil {
				return err
			}
		}
	} else {
		if err := deleteIndex(ctx, id); err != nil {
			return err
		}
	}

	if nextCrawl.IsZero() {
		// Skip crawling related packages if this is not a full save.
		return nil
	}

	// Collect the import paths of related packages to queue for crawling.
	paths := make(map[string]bool)
	for _, p := range pdoc.Imports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	for _, p := range pdoc.TestImports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	for _, p := range pdoc.XTestImports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	if pdoc.ImportPath != pdoc.ProjectRoot && pdoc.ProjectRoot != "" {
		paths[pdoc.ProjectRoot] = true
	}
	for _, p := range pdoc.Subdirectories {
		paths[pdoc.ImportPath+"/"+p] = true
	}

	args := make([]interface{}, 0, len(paths))
	for p := range paths {
		args = append(args, p)
	}
	_, err = addCrawlScript.Do(c, args...)
	return err
}
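// Both versions of Put copy the document (pdocNew := *pdoc) before clearing
// fields, so truncation never mutates the caller's struct. A standalone
// illustration of that shallow-copy pattern; the helper name is hypothetical.
func truncateForStorageSketch(pdoc *doc.Package) *doc.Package {
	pdocNew := *pdoc // shallow copy; the caller's fields stay intact
	pdocNew.Truncated = true
	pdocNew.Vars = nil
	pdocNew.Funcs = nil
	pdocNew.Types = nil
	pdocNew.Consts = nil
	pdocNew.Examples = nil
	return &pdocNew
}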
// Reindex gets all the packages in the database and puts them into the search
// index. It updates the search index with the path, synopsis, score, and
// import count of every package in the database.
func (db *Database) Reindex(ctx context.Context) error {
	c := db.Pool.Get()
	defer c.Close()

	idx, err := search.Open("packages")
	if err != nil {
		return fmt.Errorf("database: failed to open packages: %v", err)
	}
	npkgs := 0
	for {
		// Get 200 packages from the nextCrawl set each time. Use npkgs as a cursor
		// to store the position we have actually indexed. Retry from the cursor
		// position if we receive a timeout error from App Engine.
		values, err := redis.Values(c.Do(
			"SORT", "nextCrawl",
			"LIMIT", strconv.Itoa(npkgs), "200",
			"GET", "pkg:*->path",
			"GET", "pkg:*->synopsis",
			"GET", "pkg:*->score",
		))
		if err != nil {
			return err
		}
		if len(values) == 0 {
			break // all done
		}

		// The Search API supports puts in batches of up to 200 documents, but
		// the Go version of this API does not support batching yet.
		// TODO(shantuo): Put packages in batch operations.
		for ; len(values) > 0; npkgs++ {
			var pdoc doc.Package
			var score float64
			values, err = redis.Scan(values, &pdoc.ImportPath, &pdoc.Synopsis, &score)
			if err != nil {
				return err
			}
			// There is some corrupted data in our current database that causes
			// an error when putting the package into the search index, which
			// only supports UTF-8 encoding.
			if !utf8.ValidString(pdoc.Synopsis) {
				pdoc.Synopsis = ""
			}
			id, n, err := pkgIDAndImportCount(c, pdoc.ImportPath)
			if err != nil {
				return err
			}
			if _, err := idx.Put(ctx, id, &Package{
				Path:        pdoc.ImportPath,
				Synopsis:    pdoc.Synopsis,
				Score:       score,
				ImportCount: n,
			}); err != nil {
				if appengine.IsTimeoutError(err) {
					log.Printf("App Engine timeout: %v. Continuing...", err)
					break
				}
				return fmt.Errorf("failed to put index %s: %v", id, err)
			}
		}
	}
	log.Printf("%d packages are reindexed", npkgs)
	return nil
}
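// Reindex pages through the nextCrawl set with SORT ... LIMIT offset count,
// reusing the number of packages processed so far as the offset. The sketch
// below isolates that pagination pattern; the handle callback is hypothetical
// and stands in for the indexing work done above.
func sortPaginateSketch(c redis.Conn, handle func(path, synopsis string, score float64) error) error {
	offset := 0
	for {
		values, err := redis.Values(c.Do(
			"SORT", "nextCrawl",
			"LIMIT", strconv.Itoa(offset), "200",
			"GET", "pkg:*->path",
			"GET", "pkg:*->synopsis",
			"GET", "pkg:*->score",
		))
		if err != nil {
			return err
		}
		if len(values) == 0 {
			return nil // all rows consumed
		}
		for ; len(values) > 0; offset++ {
			var path, synopsis string
			var score float64
			values, err = redis.Scan(values, &path, &synopsis, &score)
			if err != nil {
				return err
			}
			if err := handle(path, synopsis, score); err != nil {
				return err
			}
		}
	}
}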