// ReqThrottledRoundTripper wraps another RoundTripper rt, // throttling all requests to the specified request rate. func ReqThrottledRoundTripper(rt http.RoundTripper, rate int64) http.RoundTripper { freq := time.Duration(1e9 / rate) bucket := tb.NewBucket(rate, freq) return roundTripperFunc(func(r *http.Request) (*http.Response, error) { got := bucket.Take(1) for got != 1 { got = bucket.Take(1) time.Sleep(freq) } return rt.RoundTrip(r) }) }
// ByteThrottledRoundTripper wraps another RoundTripper rt, // throttling all requests to the specified byte rate. func ByteThrottledRoundTripper(rt http.RoundTripper, rate int64) http.RoundTripper { freq := time.Duration(1 * time.Millisecond) bucket := tb.NewBucket(rate, freq) return roundTripperFunc(func(r *http.Request) (*http.Response, error) { got := bucket.Take(r.ContentLength) for got < r.ContentLength { got += bucket.Take(r.ContentLength - got) time.Sleep(freq) } return rt.RoundTrip(r) }) }
// SendVarReadFirstLinger is a refined version of SendAllReadFirstLinger. It // works in the same way but reduces the requests to all clusters under // certain circumstances. If maxKeysPerSecond is exceeded by this read // strategy, it will stop performing SendAll and revert to SendOne. SendOne // has two issues: no repairs will ever be made, and if the chosen cluster is // slow or unusable, the read will be delayed or fail. The first issue can be // ignored, because the baseline SendAll reads provide a basis for repairs. To // solve the second issue, we promote any SendOne to a SendAll if no results // are returned by thresholdLatency. // // To never perform an initial SendAll, set maxKeysPerSecond to 0. To always // perform an initial SendAll, set maxKeysPerSecond to a negative value. func SendVarReadFirstLinger(maxKeysPerSecond int, thresholdLatency time.Duration) func(*Farm) Selecter { permitter := permitter(allowAllPermitter{}) if maxKeysPerSecond >= 0 { permitter = tokenBucketPermitter{tb.NewBucket(int64(maxKeysPerSecond), -1)} } permitter.canHas(0) return func(farm *Farm) Selecter { return sendVarReadFirstLinger{ Farm: farm, permitter: permitter, thresholdLatency: thresholdLatency, } } }
// Start the token bucket if necessary func startTokenBucket() { currLimitMu.Lock() currLimit := bwLimit.LimitAt(time.Now()) currLimitMu.Unlock() if currLimit.bandwidth > 0 { tokenBucket = tb.NewBucket(int64(currLimit.bandwidth), 100*time.Millisecond) Log(nil, "Starting bandwidth limiter at %vBytes/s", &currLimit.bandwidth) // Start the SIGUSR2 signal handler to toggle bandwidth. // This function does nothing in windows systems. startSignalHandler() } }
// RateLimited wraps a repair strategy with rate limit. Repair requests that // would cause the instantaneous number of elements (score-members) per second // to exceed the passed limit are dropped. // // RateLimited keeps read strategies responsive, while bounding the load // applied to your infrastructure. func RateLimited(maxElementsPerSecond int, repairStrategy RepairStrategy) RepairStrategy { return func(clusters []cluster.Cluster, instr instrumentation.RepairInstrumentation) coreRepairStrategy { permits := permitter(allowAllPermitter{}) if maxElementsPerSecond >= 0 { permits = tokenBucketPermitter{tb.NewBucket(int64(maxElementsPerSecond), -1)} } return func(kms []common.KeyMember) { if n := len(kms); !permits.canHas(int64(n)) { log.Printf("RateLimited repairs: element rate exceeded; repair request discarded") instr.RepairDiscarded(n) return } repairStrategy(clusters, instr)(kms) } } }
// startTokenTicker creates a ticker to update the bandwidth limiter every minute. func startTokenTicker() { // If the timetable has a single entry or was not specified, we don't need // a ticker to update the bandwidth. if len(bwLimit) <= 1 { return } ticker := time.NewTicker(time.Minute) go func() { for range ticker.C { limitNow := bwLimit.LimitAt(time.Now()) currLimitMu.Lock() if currLimit.bandwidth != limitNow.bandwidth { tokenBucketMu.Lock() if tokenBucket != nil { err := tokenBucket.Close() if err != nil { Log(nil, "Error closing token bucket: %v", err) } } // Set new bandwidth. If unlimited, set tokenbucket to nil. if limitNow.bandwidth > 0 { tokenBucket = tb.NewBucket(int64(limitNow.bandwidth), 100*time.Millisecond) Log(nil, "Scheduled bandwidth change. Limit set to %vBytes/s", &limitNow.bandwidth) } else { tokenBucket = nil Log(nil, "Scheduled bandwidth change. Bandwidth limits disabled") } currLimit = limitNow tokenBucketMu.Unlock() } currLimitMu.Unlock() } }() }
// Start the token bucket if necessary func startTokenBucket() { if bwLimit > 0 { tokenBucket = tb.NewBucket(int64(bwLimit), 100*time.Millisecond) Log(nil, "Starting bandwidth limiter at %vBytes/s", &bwLimit) } }
// SendVarReadFirstLinger is a refined version of SendAllReadFirstLinger. It
// works in the same way but reduces the requests to all clusters under
// certain circumstances. If maxKeysPerSecond is exceeded by this read
// strategy, it will stop performing SendAll and revert to SendOne. SendOne
// has two issues: no repairs will ever be made, and if the chosen cluster is
// slow or unusable, the read will be delayed or fail. The first issue can be
// ignored, because the baseline SendAll reads provide a basis for repairs. To
// solve the second issue, we promote any SendOne to a SendAll if no results
// are returned by thresholdLatency.
//
// To never perform an initial SendAll, set maxKeysPerSecond to 0. To always
// perform an initial SendAll, set maxKeysPerSecond to a negative value.
//
// The returned read strategy proceeds in three phases:
//
//  1. Scatter: send the Select to all clusters, or to a single randomly
//     chosen cluster when the permitter denies a SendAll for len(keys) keys.
//  2. Gather: block until every key has at least one non-error response,
//     until the elements channel is closed, or — in the SendOne case — until
//     thresholdLatency elapses, at which point the remaining keys are sent to
//     the clusters that were not used yet.
//  3. Linger: after returning, keep draining outstanding responses in a
//     background goroutine and hand any detected differences to the farm's
//     repair strategy.
//
// NOTE(review): when SendAll is not permitted and thresholdLatency is
// negative, no timeout is armed. If the single chosen cluster then reports an
// error, the gather loop presumably never terminates, because the WaitGroup
// is still waiting for the unused clusters — confirm whether negative
// thresholds are meant to be supported.
func SendVarReadFirstLinger(maxKeysPerSecond int, thresholdLatency time.Duration) func(*Farm) coreReadStrategy {
	// Negative limits keep the allow-all permitter; otherwise a token bucket
	// sized to maxKeysPerSecond decides whether a SendAll is allowed.
	permits := permitter(allowAllPermitter{})
	if maxKeysPerSecond >= 0 {
		permits = tokenBucketPermitter{tb.NewBucket(int64(maxKeysPerSecond), -1)}
	}
	// NOTE(review): result discarded; presumably primes the permitter — confirm.
	permits.canHas(0)
	return func(farm *Farm) coreReadStrategy {
		return func(keys []string, offset, limit int) (map[string][]common.KeyScoreMember, error) {
			began := time.Now()
			// Instrumentation is reported off the request path.
			go func() {
				farm.instrumentation.SelectCall()
				farm.instrumentation.SelectKeys(len(keys))
			}()

			// We'll combine all response elements into a single channel. When
			// all clusters have finished sending elements there, close it, so
			// we can have nice range semantics in our linger phase.
			elements := make(chan cluster.Element)
			wg := sync.WaitGroup{}
			wg.Add(len(farm.clusters))
			go func() {
				// Note that we need a wg.Done signal for every cluster, even
				// if we didn't actually send to it!
				wg.Wait()
				close(elements)
			}()

			// Depending on maySendAll, pick either one random cluster or all
			// of them.
			var clustersUsed, clustersNotUsed []cluster.Cluster
			maySendAll := permits.canHas(int64(len(keys)))
			if maySendAll {
				clustersUsed = farm.clusters
				clustersNotUsed = []cluster.Cluster{}
			} else {
				// Split farm.clusters into the one chosen cluster and a fresh
				// slice holding all the others.
				i := rand.Intn(len(farm.clusters))
				clustersUsed = farm.clusters[i : i+1]
				clustersNotUsed = make([]cluster.Cluster, 0, len(farm.clusters)-1)
				clustersNotUsed = append(clustersNotUsed, farm.clusters[:i]...)
				clustersNotUsed = append(clustersNotUsed, farm.clusters[i+1:]...)
			}

			blockingBegan := time.Now()
			go farm.instrumentation.SelectSendTo(len(clustersUsed))
			scatterSelects(clustersUsed, keys, offset, limit, &wg, elements)

			// remainingKeys keeps track of all keys for which we haven't
			// received any non-error responses yet.
			remainingKeys := make(map[string]bool, len(keys))
			for _, key := range keys {
				remainingKeys[key] = true
			}

			// If we are not permitted to SendAll, we need a timeout (after
			// which we will SendAll nevertheless). A nil timeout channel
			// never fires, which disables the promotion case below.
			var timeout <-chan time.Time
			if !maySendAll && thresholdLatency >= 0 {
				timeout = time.After(thresholdLatency)
			}

			responses := map[string][]tupleSet{}
			var firstResponseDuration time.Duration
			retrieved := 0
		loop:
			for {
				select {
				case e, ok := <-elements:
					if !ok {
						break loop // elements already closed, all Selects done.
					}
					retrieved += len(e.KeyScoreMembers)
					if e.Error != nil {
						go farm.instrumentation.SelectPartialError()
						continue
						// It might appear tempting to immediately send a
						// Select to the unusedClusters once we run into an
						// error. However, it's probably better to wait until
						// thresholdLatency has passed (which should be a
						// short duration anyway and might have already
						// happened...) to gather all the keys for which we
						// need a SendAll first and then do them all in one
						// big Select.
					}
					// Only the first successful element sets this duration.
					if firstResponseDuration == 0 {
						firstResponseDuration = time.Since(blockingBegan)
					}
					responses[e.Key] = append(responses[e.Key], makeSet(e.KeyScoreMembers))
					delete(remainingKeys, e.Key)

				case <-timeout:
					// Promote to SendAll for remaining keys.
					go farm.instrumentation.SelectSendAllPromotion()
					maySendAll = true
					remainingKeysSlice := make([]string, 0, len(remainingKeys))
					for k := range remainingKeys {
						remainingKeysSlice = append(remainingKeysSlice, k)
					}
					go farm.instrumentation.SelectSendTo(len(clustersNotUsed))
					scatterSelects(clustersNotUsed, remainingKeysSlice, offset, limit, &wg, elements)
					// From here on every cluster has been sent to.
					clustersUsed = farm.clusters
					clustersNotUsed = []cluster.Cluster{}
				}
				if len(remainingKeys) == 0 {
					// We got enough results to return our results.
					break loop
				}
			}
			blockingDuration := time.Since(blockingBegan)

			// returned is filled in below; the deferred closure reads it only
			// when this function returns.
			returned := 0
			defer func() {
				go func() {
					duration := time.Since(began)
					farm.instrumentation.SelectDuration(duration)
					farm.instrumentation.SelectFirstResponseDuration(firstResponseDuration)
					farm.instrumentation.SelectBlockingDuration(blockingDuration)
					farm.instrumentation.SelectOverheadDuration(duration - blockingDuration)
					farm.instrumentation.SelectRetrieved(retrieved)
					farm.instrumentation.SelectReturned(returned)
				}()
			}()

			// If we are here, we either got at least one result for each key,
			// or all Select calls have finished but we still did not get at
			// least one result for each key. In either case, it's time to
			// return results.
			if len(responses) == 0 && len(remainingKeys) > 0 {
				// All Selects returned an error.
				return map[string][]common.KeyScoreMember{}, fmt.Errorf("complete failure")
			}

			// Merge the per-cluster sets of every key: the union (limited to
			// limit entries) is returned to the caller, the difference is
			// collected for repairs.
			response := map[string][]common.KeyScoreMember{}
			repairs := keyMemberSet{}
			for key, tupleSets := range responses {
				union, difference := unionDifference(tupleSets)
				a := union.orderedLimitedSlice(limit)
				response[key] = a
				returned += len(a)
				repairs.addMany(difference)
			}

			sentAllButIncomplete := len(remainingKeys) > 0
			sentOneGotEverything := !maySendAll

			if sentAllButIncomplete {
				// We already got all results but they are incomplete because
				// of errors. Partial results are still better than nothing,
				// so issue repairs as needed and return the partial results.
				if len(repairs) > 0 {
					go farm.repairStrategy(repairs.slice())
				}
				return response, nil
			}

			if sentOneGotEverything {
				// The WaitGroup expects len(farm.clusters) Done signals, but
				// so far we've only given 1. Give the rest.
				for _ = range clustersNotUsed {
					wg.Done()
				}
				return response, nil
			}

			// If we are here, we *might* still have Selects running. So start
			// a goroutine to "linger" and collect the remaining responses for
			// repairs before returning the results we have so far.
			go func() {
				lingeringRetrievals := 0
				// Runs until elements is closed, i.e. until the WaitGroup has
				// received a Done for every cluster.
				for e := range elements {
					lingeringRetrievals += len(e.KeyScoreMembers)
					if e.Error != nil {
						go farm.instrumentation.SelectPartialError()
						continue
					}
					responses[e.Key] = append(responses[e.Key], makeSet(e.KeyScoreMembers))
				}
				// Recompute the differences over the complete set of responses.
				for _, tupleSets := range responses {
					_, difference := unionDifference(tupleSets)
					repairs.addMany(difference)
				}
				if len(repairs) > 0 {
					go func() {
						farm.instrumentation.SelectRepairNeeded(len(repairs))
						farm.repairStrategy(repairs.slice())
					}()
				}
				farm.instrumentation.SelectRetrieved(lingeringRetrievals) // additive
			}()
			return response, nil
		}
	}
}
// NewThrottledWriter is an io.Writer wrapping another io.Writer with // byte rate throttling, flushing block bytes at a time. func NewThrottledWriter(rate, block int64, w io.Writer) io.Writer { return &throttledWriter{rate, block, w, tb.NewBucket(rate, -1)} }
func main() { var ( redisInstances = flag.String("redis.instances", "", "Semicolon-separated list of comma-separated lists of Redis instances") redisConnectTimeout = flag.Duration("redis.connect.timeout", 3*time.Second, "Redis connect timeout") redisReadTimeout = flag.Duration("redis.read.timeout", 3*time.Second, "Redis read timeout") redisWriteTimeout = flag.Duration("redis.write.timeout", 3*time.Second, "Redis write timeout") redisMCPI = flag.Int("redis.mcpi", 2, "Max connections per Redis instance") redisHash = flag.String("redis.hash", "murmur3", "Redis hash function: murmur3, fnv, fnva") selectGap = flag.Duration("select.gap", 0*time.Millisecond, "delay between pipeline read invocations when Selecting over multiple keys") maxSize = flag.Int("max.size", 10000, "Maximum number of events per key") batchSize = flag.Int("batch.size", 100, "keys to select per request") maxKeysPerSecond = flag.Int64("max.keys.per.second", 1000, "max keys per second to walk") scanLogInterval = flag.Duration("scan.log.interval", 5*time.Second, "how often to report scan rates in log") once = flag.Bool("once", false, "walk entire keyspace once and exit (default false, walk forever)") statsdAddress = flag.String("statsd.address", "", "Statsd address (blank to disable)") statsdSampleRate = flag.Float64("statsd.sample.rate", 0.1, "Statsd sample rate for normal metrics") statsdBucketPrefix = flag.String("statsd.bucket.prefix", "myservice.", "Statsd bucket key prefix, including trailing period") prometheusNamespace = flag.String("prometheus.namespace", "roshiwalker", "Prometheus key namespace, excluding trailing punctuation") prometheusMaxSummaryAge = flag.Duration("prometheus.max.summary.age", 10*time.Second, "Prometheus max age for instantaneous histogram data") httpAddress = flag.String("http.address", ":6060", "HTTP listen address (profiling/metrics endpoints only)") ) flag.Parse() log.SetOutput(os.Stdout) log.SetFlags(log.Lmicroseconds) // Validate integer arguments. 
if *maxKeysPerSecond < int64(*batchSize) { log.Fatal("max keys per second should be bigger than batch size") } // Set up instrumentation. statter := g2s.Noop() if *statsdAddress != "" { var err error statter, err = g2s.Dial("udp", *statsdAddress) if err != nil { log.Fatal(err) } } prometheusInstr := prometheus.New(*prometheusNamespace, *prometheusMaxSummaryAge) prometheusInstr.Install("/metrics", http.DefaultServeMux) instr := instrumentation.NewMultiInstrumentation( statsd.New(statter, float32(*statsdSampleRate), *statsdBucketPrefix), prometheusInstr, ) // Parse hash function. var hashFunc func(string) uint32 switch strings.ToLower(*redisHash) { case "murmur3": hashFunc = pool.Murmur3 case "fnv": hashFunc = pool.FNV case "fnva": hashFunc = pool.FNVa default: log.Fatalf("unknown hash %q", *redisHash) } // Set up the clusters. clusters, err := farm.ParseFarmString( *redisInstances, *redisConnectTimeout, *redisReadTimeout, *redisWriteTimeout, *redisMCPI, hashFunc, *maxSize, *selectGap, instr, ) if err != nil { log.Fatal(err) } // HTTP server for profiling. go func() { log.Print(http.ListenAndServe(*httpAddress, nil)) }() // Set up our rate limiter. Remember: it's per-key, not per-request. var ( freq = time.Duration(1/(*maxKeysPerSecond)) * time.Second bucket = tb.NewBucket(*maxKeysPerSecond, freq) ) // Build the farm. var ( readStrategy = farm.SendAllReadAll repairStrategy = farm.AllRepairs // blocking writeQuorum = len(clusters) // 100% dst = farm.New(clusters, writeQuorum, readStrategy, repairStrategy, instr) ) // Perform the walk. defer func(t time.Time) { log.Printf("total walk complete, %s", time.Since(t)) }(time.Now()) for { src := scan(clusters, *batchSize, *scanLogInterval) // new key set walkOnce(dst, bucket, src, *maxSize, instr) if *once { break } } }
func main() { var ( redisInstances = flag.String("redis.instances", "", "Semicolon-separated list of comma-separated lists of Redis instances") redisConnectTimeout = flag.Duration("redis.connect.timeout", 3*time.Second, "Redis connect timeout") redisReadTimeout = flag.Duration("redis.read.timeout", 3*time.Second, "Redis read timeout") redisWriteTimeout = flag.Duration("redis.write.timeout", 3*time.Second, "Redis write timeout") redisMCPI = flag.Int("redis.mcpi", 2, "Max connections per Redis instance") redisHash = flag.String("redis.hash", "murmur3", "Redis hash function: murmur3, fnv, fnva") maxSize = flag.Int("max.size", 10000, "Maximum number of events per key") batchSize = flag.Int("batch.size", 100, "keys to select per request") maxKeysPerSecond = flag.Int64("max.keys.per.second", 1000, "max keys per second to walk") scanLogInterval = flag.Duration("scan.log.interval", 5*time.Second, "how often to report scan rates in log") once = flag.Bool("once", false, "walk entire keyspace once and exit (default false, walk forever)") statsdAddress = flag.String("statsd.address", "", "Statsd address (blank to disable)") statsdSampleRate = flag.Float64("statsd.sample.rate", 0.1, "Statsd sample rate for normal metrics") statsdBucketPrefix = flag.String("statsd.bucket.prefix", "myservice.", "Statsd bucket key prefix, including trailing period") httpAddress = flag.String("http.address", ":6060", "HTTP listen address (profiling endpoints only)") ) flag.Parse() log.SetFlags(log.Lmicroseconds) // Validate integer arguments. if *maxKeysPerSecond < int64(*batchSize) { log.Fatal("max keys per second should be bigger than batch size") } // Set up statsd instrumentation, if it's specified. stats := g2s.Noop() if *statsdAddress != "" { var err error stats, err = g2s.Dial("udp", *statsdAddress) if err != nil { log.Fatal(err) } } instr := statsd.New(stats, float32(*statsdSampleRate), *statsdBucketPrefix) // Parse hash function. 
var hashFunc func(string) uint32 switch strings.ToLower(*redisHash) { case "murmur3": hashFunc = pool.Murmur3 case "fnv": hashFunc = pool.FNV case "fnva": hashFunc = pool.FNVa default: log.Fatalf("unknown hash '%s'", *redisHash) } // Set up the clusters. clusters, err := makeClusters( *redisInstances, *redisConnectTimeout, *redisReadTimeout, *redisWriteTimeout, *redisMCPI, hashFunc, *maxSize, instr, ) if err != nil { log.Fatal(err) } // HTTP server for profiling go func() { log.Print(http.ListenAndServe(*httpAddress, nil)) }() // Set up our rate limiter. Remember: it's per-key, not per-request. freq := time.Duration(1/(*maxKeysPerSecond)) * time.Second bucket := tb.NewBucket(*maxKeysPerSecond, freq) // Build the farm readStrategy := farm.SendAllReadAll repairStrategy := farm.AllRepairs // blocking dst := farm.New(clusters, len(clusters), readStrategy, repairStrategy, instr) // Perform the walk begin := time.Now() for { src := scan(clusters, *batchSize, *scanLogInterval) // new key set walkOnce(dst, bucket, src, *maxSize, instr) if *once { break } } log.Printf("walk complete in %s", time.Since(begin)) }