Example #1
File: crawl.go Project: golang/gddo
// put stores the package documentation in the database, first restoring
// Active status for packages that are still considered active despite
// having no recent commits.
func put(pdoc *doc.Package, nextCrawl time.Time) error {
	if pdoc.Status == gosrc.NoRecentCommits &&
		isActivePkg(pdoc.ImportPath, gosrc.NoRecentCommits) {
		pdoc.Status = gosrc.Active
	}
	if err := db.Put(pdoc, nextCrawl, false); err != nil {
		return fmt.Errorf("ERROR db.Put(%q): %v", pdoc.ImportPath, err)
	}
	return nil
}
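For a sense of the guard at the top of put, here is a small, self-contained sketch of the same reclassification rule; the string constants and the isActivePkg stub are hypothetical stand-ins for the gosrc status values and the helper defined elsewhere in gddo.

package main

import "fmt"

// Hypothetical stand-ins for the gosrc status constants.
const (
	active          = "Active"
	noRecentCommits = "NoRecentCommits"
)

// isActivePkg is a stub; in gddo this consults other signals about the
// package (stubbed here with an arbitrary rule).
func isActivePkg(importPath, status string) bool {
	return importPath == "golang.org/x/tools"
}

// reclassify mirrors put's guard: a package marked NoRecentCommits is
// restored to Active if it is still considered an active package.
func reclassify(importPath, status string) string {
	if status == noRecentCommits && isActivePkg(importPath, noRecentCommits) {
		return active
	}
	return status
}

func main() {
	fmt.Println(reclassify("golang.org/x/tools", noRecentCommits)) // Active
	fmt.Println(reclassify("example.com/dead", noRecentCommits))   // NoRecentCommits
}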
Example #2
// Put adds the package documentation to the database.
func (db *Database) Put(pdoc *doc.Package, nextCrawl time.Time, hide bool) error {
	c := db.Pool.Get()
	defer c.Close()

	score := 0.0
	if !hide {
		score = documentScore(pdoc)
	}
	terms := documentTerms(pdoc, score)

	var gobBuf bytes.Buffer
	if err := gob.NewEncoder(&gobBuf).Encode(pdoc); err != nil {
		return err
	}

	// Truncate large documents.
	if gobBuf.Len() > 200000 {
		pdocNew := *pdoc
		pdoc = &pdocNew
		pdoc.Truncated = true
		pdoc.Vars = nil
		pdoc.Funcs = nil
		pdoc.Types = nil
		pdoc.Consts = nil
		pdoc.Examples = nil
		gobBuf.Reset()
		if err := gob.NewEncoder(&gobBuf).Encode(pdoc); err != nil {
			return err
		}
	}

	// Note: this snapshot uses the older snappy-go API, in which Encode
	// also returns an error; the golang/snappy Encode used in Example #4
	// returns only the compressed bytes.
	gobBytes, err := snappy.Encode(nil, gobBuf.Bytes())
	if err != nil {
		return err
	}

	kind := "p"
	switch {
	case pdoc.Name == "":
		kind = "d"
	case pdoc.IsCmd:
		kind = "c"
	}

	t := int64(0)
	if !nextCrawl.IsZero() {
		t = nextCrawl.Unix()
	}

	_, err = putScript.Do(c, pdoc.ImportPath, pdoc.Synopsis, score, gobBytes, strings.Join(terms, " "), pdoc.Etag, kind, t)
	if err != nil {
		return err
	}

	if nextCrawl.IsZero() {
		// Skip crawling related packages if this is not a full save.
		return nil
	}

	paths := make(map[string]bool)
	for _, p := range pdoc.Imports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	for _, p := range pdoc.TestImports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	for _, p := range pdoc.XTestImports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	if pdoc.ImportPath != pdoc.ProjectRoot && pdoc.ProjectRoot != "" {
		paths[pdoc.ProjectRoot] = true
	}
	for _, p := range pdoc.Subdirectories {
		paths[pdoc.ImportPath+"/"+p] = true
	}

	args := make([]interface{}, 0, len(paths))
	for p := range paths {
		args = append(args, p)
	}
	_, err = addCrawlScript.Do(c, args...)
	return err
}
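The heart of this method is the encode-then-truncate dance: gob-serialize the document, and if the payload is too large, copy the struct, drop the heavy fields, and re-encode before compressing. Below is a minimal, self-contained sketch of the same pattern with a hypothetical Doc type and size limit; it uses the newer github.com/golang/snappy API (as in Example #4), whose Encode returns no error.

package main

import (
	"bytes"
	"encoding/gob"
	"fmt"

	"github.com/golang/snappy"
)

// Doc is a hypothetical stand-in for doc.Package.
type Doc struct {
	ImportPath string
	Truncated  bool
	Funcs      []string
}

// encode gob-serializes d, dropping the heavy fields and re-encoding if
// the result exceeds limit, then snappy-compresses the bytes.
func encode(d *Doc, limit int) ([]byte, error) {
	var buf bytes.Buffer
	if err := gob.NewEncoder(&buf).Encode(d); err != nil {
		return nil, err
	}
	if buf.Len() > limit {
		// Copy before mutating so the caller's value is untouched,
		// just as Put does with pdocNew := *pdoc.
		dd := *d
		dd.Truncated = true
		dd.Funcs = nil
		buf.Reset()
		if err := gob.NewEncoder(&buf).Encode(&dd); err != nil {
			return nil, err
		}
	}
	return snappy.Encode(nil, buf.Bytes()), nil
}

func main() {
	d := &Doc{ImportPath: "example.com/pkg", Funcs: make([]string, 10000)}
	b, err := encode(d, 1000)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d compressed bytes, original untouched: %v\n", len(b), !d.Truncated)
}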
Example #3
File: crawl.go Project: golang/gddo
// crawlDoc fetches the package documentation from the VCS and updates the database.
func crawlDoc(source string, importPath string, pdoc *doc.Package, hasSubdirs bool, nextCrawl time.Time) (*doc.Package, error) {
	// Collect log values and emit them as a single line when the crawl returns.
	message := []interface{}{source}
	defer func() {
		message = append(message, importPath)
		log.Println(message...)
	}()

	if !nextCrawl.IsZero() {
		d := time.Since(nextCrawl) / time.Hour
		if d > 0 {
			message = append(message, "late:", int64(d))
		}
	}

	etag := ""
	if pdoc != nil {
		etag = pdoc.Etag
		message = append(message, "etag:", etag)
	}

	start := time.Now()
	var err error
	if strings.HasPrefix(importPath, "code.google.com/p/go.") {
		// Old import path for Go sub-repository.
		pdoc = nil
		err = gosrc.NotFoundError{Message: "old Go sub-repo", Redirect: "golang.org/x/" + importPath[len("code.google.com/p/go."):]}
	} else if blocked, e := db.IsBlocked(importPath); blocked && e == nil {
		pdoc = nil
		err = gosrc.NotFoundError{Message: "blocked."}
	} else if testdataPat.MatchString(importPath) {
		pdoc = nil
		err = gosrc.NotFoundError{Message: "testdata."}
	} else {
		var pdocNew *doc.Package
		pdocNew, err = doc.Get(httpClient, importPath, etag)
		message = append(message, "fetch:", int64(time.Since(start)/time.Millisecond))
		if err == nil && pdocNew.Name == "" && !hasSubdirs {
			for _, e := range pdocNew.Errors {
				message = append(message, "err:", e)
			}
			pdoc = nil
			err = gosrc.NotFoundError{Message: "no Go files or subdirs"}
		} else if _, ok := err.(gosrc.NotModifiedError); !ok {
			pdoc = pdocNew
		}
	}

	nextCrawl = start.Add(*maxAge)
	switch {
	case strings.HasPrefix(importPath, "github.com/") || (pdoc != nil && len(pdoc.Errors) > 0):
		nextCrawl = start.Add(*maxAge * 7)
	case strings.HasPrefix(importPath, "gist.github.com/"):
		// Don't spend time on gists. It's a silly thing to do.
		nextCrawl = start.Add(*maxAge * 30)
	}

	if err == nil {
		message = append(message, "put:", pdoc.Etag)
		if err := put(pdoc, nextCrawl); err != nil {
			log.Println(err)
		}
		return pdoc, nil
	} else if e, ok := err.(gosrc.NotModifiedError); ok {
		if pdoc.Status == gosrc.Active && !isActivePkg(importPath, e.Status) {
			if e.Status == gosrc.NoRecentCommits {
				e.Status = gosrc.Inactive
			}
			message = append(message, "archive", e)
			pdoc.Status = e.Status
			if err := db.Put(pdoc, nextCrawl, false); err != nil {
				log.Printf("ERROR db.Put(%q): %v", importPath, err)
			}
		} else {
			// Touch the package without updating and move on to next one.
			message = append(message, "touch")
			if err := db.SetNextCrawl(importPath, nextCrawl); err != nil {
				log.Printf("ERROR db.SetNextCrawl(%q): %v", importPath, err)
			}
		}
		return pdoc, nil
	} else if e, ok := err.(gosrc.NotFoundError); ok {
		message = append(message, "notfound:", e)
		if err := db.Delete(importPath); err != nil {
			log.Printf("ERROR db.Delete(%q): %v", importPath, err)
		}
		return nil, e
	} else {
		message = append(message, "ERROR:", err)
		return nil, err
	}
}
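The tail of crawlDoc dispatches on the concrete error type: nil means a full save, gosrc.NotModifiedError means touch (or archive), gosrc.NotFoundError means delete, and anything else is treated as transient. A minimal sketch of the same dispatch pattern, with hypothetical error types standing in for the gosrc ones:

package main

import "fmt"

// Hypothetical stand-ins for gosrc.NotModifiedError and gosrc.NotFoundError.
type notModifiedError struct{ status int }

func (e notModifiedError) Error() string { return "not modified" }

type notFoundError struct{ msg string }

func (e notFoundError) Error() string { return "not found: " + e.msg }

// handle mirrors crawlDoc's dispatch over fetch outcomes.
func handle(err error) string {
	switch e := err.(type) {
	case nil:
		return "put" // full save
	case notModifiedError:
		return fmt.Sprintf("touch (status %d)", e.status) // refresh crawl time only
	case notFoundError:
		return "delete: " + e.Error() // drop the package
	default:
		return "error: " + e.Error() // transient; keep the package
	}
}

func main() {
	fmt.Println(handle(nil))
	fmt.Println(handle(notModifiedError{status: 2}))
	fmt.Println(handle(notFoundError{msg: "blocked"}))
}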
Example #4
// Put adds the package documentation to the database.
func (db *Database) Put(pdoc *doc.Package, nextCrawl time.Time, hide bool) error {
	c := db.Pool.Get()
	defer c.Close()

	score := 0.0
	if !hide {
		score = documentScore(pdoc)
	}
	terms := documentTerms(pdoc, score)

	var gobBuf bytes.Buffer
	if err := gob.NewEncoder(&gobBuf).Encode(pdoc); err != nil {
		return err
	}

	gobBytes := snappy.Encode(nil, gobBuf.Bytes())

	// Truncate large documents.
	if len(gobBytes) > 400000 {
		pdocNew := *pdoc
		pdoc = &pdocNew
		pdoc.Truncated = true
		pdoc.Vars = nil
		pdoc.Funcs = nil
		pdoc.Types = nil
		pdoc.Consts = nil
		pdoc.Examples = nil
		gobBuf.Reset()
		if err := gob.NewEncoder(&gobBuf).Encode(pdoc); err != nil {
			return err
		}
		gobBytes = snappy.Encode(nil, gobBuf.Bytes())
	}

	kind := "p"
	switch {
	case pdoc.Name == "":
		kind = "d"
	case pdoc.IsCmd:
		kind = "c"
	}

	t := int64(0)
	if !nextCrawl.IsZero() {
		t = nextCrawl.Unix()
	}

	// Get the old version of the package to extract its imports.
	// If the package does not exist, both old and err will be nil.
	old, _, err := db.getDoc(c, pdoc.ImportPath)
	if err != nil {
		return err
	}

	_, err = putScript.Do(c, pdoc.ImportPath, pdoc.Synopsis, score, gobBytes, strings.Join(terms, " "), pdoc.Etag, kind, t)
	if err != nil {
		return err
	}

	id, n, err := pkgIDAndImportCount(c, pdoc.ImportPath)
	if err != nil {
		return err
	}
	ctx := bgCtx()

	if score > 0 {
		if err := PutIndex(ctx, pdoc, id, score, n); err != nil {
			log.Printf("Cannot put %q in index: %v", pdoc.ImportPath, err)
		}

		if old != nil {
			if err := updateImportsIndex(c, ctx, old, pdoc); err != nil {
				return err
			}
		}
	} else {
		if err := deleteIndex(ctx, id); err != nil {
			return err
		}
	}

	if nextCrawl.IsZero() {
		// Skip crawling related packages if this is not a full save.
		return nil
	}

	paths := make(map[string]bool)
	for _, p := range pdoc.Imports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	for _, p := range pdoc.TestImports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	for _, p := range pdoc.XTestImports {
		if gosrc.IsValidRemotePath(p) {
			paths[p] = true
		}
	}
	if pdoc.ImportPath != pdoc.ProjectRoot && pdoc.ProjectRoot != "" {
		paths[pdoc.ProjectRoot] = true
	}
	for _, p := range pdoc.Subdirectories {
		paths[pdoc.ImportPath+"/"+p] = true
	}

	args := make([]interface{}, 0, len(paths))
	for p := range paths {
		args = append(args, p)
	}
	_, err = addCrawlScript.Do(c, args...)
	return err
}
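Both versions of Put end by assembling the crawl queue with a map[string]bool used as a set, so a path appearing in several import lists is queued once. A self-contained sketch of that set-union pattern; isValidRemotePath is a hypothetical stand-in for gosrc.IsValidRemotePath.

package main

import (
	"fmt"
	"strings"
)

// isValidRemotePath is a hypothetical stand-in for gosrc.IsValidRemotePath.
func isValidRemotePath(p string) bool {
	return strings.Contains(p, ".") && strings.Contains(p, "/")
}

func main() {
	imports := []string{"github.com/a/b", "fmt", "github.com/a/b"}
	testImports := []string{"github.com/c/d"}

	// A map used as a set: duplicate paths across the lists collapse to one key.
	paths := make(map[string]bool)
	for _, group := range [][]string{imports, testImports} {
		for _, p := range group {
			if isValidRemotePath(p) {
				paths[p] = true
			}
		}
	}

	// Flatten for a variadic call such as addCrawlScript.Do(c, args...).
	args := make([]interface{}, 0, len(paths))
	for p := range paths {
		args = append(args, p)
	}
	fmt.Println(len(args), "unique crawl candidates")
}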
Example #5
// Reindex gets all the packages in the database and puts them into the search
// index, updating the index with the path, synopsis, score, and import count
// of every package in the database.
func (db *Database) Reindex(ctx context.Context) error {
	c := db.Pool.Get()
	defer c.Close()

	idx, err := search.Open("packages")
	if err != nil {
		return fmt.Errorf("database: failed to open packages: %v", err)
	}
	npkgs := 0
	for {
		// Fetch 200 packages from the nextCrawl set at a time, using npkgs as a
		// cursor for the position we have actually indexed. If App Engine returns
		// a timeout error, retry from the cursor position.
		values, err := redis.Values(c.Do(
			"SORT", "nextCrawl",
			"LIMIT", strconv.Itoa(npkgs), "200",
			"GET", "pkg:*->path",
			"GET", "pkg:*->synopsis",
			"GET", "pkg:*->score",
		))
		if err != nil {
			return err
		}
		if len(values) == 0 {
			break // all done
		}

		// The Search API supports putting documents in batches of up to 200, but
		// the Go client for this API does not support batching yet.
		// TODO(shantuo): Put packages in batch operations.
		for ; len(values) > 0; npkgs++ {
			var pdoc doc.Package
			var score float64
			values, err = redis.Scan(values, &pdoc.ImportPath, &pdoc.Synopsis, &score)
			if err != nil {
				return err
			}
			// There is some corrupted data in our current database that causes an
			// error when putting the package into the search index, which only
			// supports UTF-8 encoding.
			if !utf8.ValidString(pdoc.Synopsis) {
				pdoc.Synopsis = ""
			}
			id, n, err := pkgIDAndImportCount(c, pdoc.ImportPath)
			if err != nil {
				return err
			}
			if _, err := idx.Put(ctx, id, &Package{
				Path:        pdoc.ImportPath,
				Synopsis:    pdoc.Synopsis,
				Score:       score,
				ImportCount: n,
			}); err != nil {
				if appengine.IsTimeoutError(err) {
					log.Printf("App Engine timeout: %v. Continue...", err)
					break
				}
				return fmt.Errorf("Failed to put index %s: %v", id, err)
			}
		}
	}
	log.Printf("%d packages are reindexed", npkgs)
	return nil
}
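The pagination idiom in Reindex is worth isolating: SORT ... LIMIT offset count acts as the cursor, and the cursor advances only past documents actually indexed, so a retryable failure re-enters the loop at the same offset. A minimal redigo sketch of that cursor loop; reindexPaths, conn, and the index callback are hypothetical, the key layout mirrors the code above, and the import path assumes github.com/gomodule/redigo (gddo's original code used the older github.com/garyburd/redigo path).

import (
	"strconv"

	"github.com/gomodule/redigo/redis"
)

// reindexPaths pages through the nextCrawl set 200 entries at a time,
// resuming from the cursor offset; index is the per-document callback.
func reindexPaths(conn redis.Conn, index func(path string) error) error {
	offset := 0
	for {
		values, err := redis.Values(conn.Do(
			"SORT", "nextCrawl",
			"LIMIT", strconv.Itoa(offset), "200",
			"GET", "pkg:*->path",
		))
		if err != nil {
			return err
		}
		if len(values) == 0 {
			return nil // exhausted the set
		}
		// Advance the cursor only past entries actually handled, so a
		// retryable failure can resume at the same offset.
		for ; len(values) > 0; offset++ {
			var path string
			if values, err = redis.Scan(values, &path); err != nil {
				return err
			}
			if err := index(path); err != nil {
				return err
			}
		}
	}
}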