// Etcd is a configuration stored in etcd. It will be reloaded as soon // as it changes. func Etcd(path string, endpoints []string) Source { updates := make(chan pair, 1) req := make(chan context.Context) go func() { var c client.Client for i := 0; true; i++ { var err error c, err = client.New(client.Config{Endpoints: endpoints}) if err != nil { log.Errorf("configuration: cannot connect to etcd: %v", err) updates <- pair{err: err} time.Sleep(timeutil.Backoff(1*time.Second, 60*time.Second, i)) continue } break } log.V(2).Infof("configuration: connected to etcd") kapi := client.NewKeysAPI(c) r, err := kapi.Get(<-req, path, nil) if err != nil { updates <- pair{err: err} } else { updates <- pair{data: []byte(r.Node.Value)} } w := kapi.Watcher(path, nil) for i := 0; true; i++ { ctx := <-req r, err := w.Next(ctx) if err != nil { updates <- pair{err: err} time.Sleep(timeutil.Backoff(1*time.Second, 60*time.Second, i)) continue } updates <- pair{data: []byte(r.Node.Value)} } }() return func(ctx context.Context) (data []byte, err error) { req <- ctx p := <-updates return p.data, p.err } }
// performRequests does a request and returns the duration of the // shortest refresh interval from all handled resources. // // If there's an error, it will be logged, and the returned interval // will be increasing exponentially (basing on the passed retry // number). The returned nextRetryNumber should be used in the next // call to performRequests. func (client *Client) performRequests(retryNumber int) (interval time.Duration, nextRetryNumber int) { // Creates new GetCapacityRequest in := &pb.GetCapacityRequest{ClientId: proto.String(client.id)} // Adds all resources in this client's resource registry to the // request. for id, resource := range client.resources { in.Resource = append(in.Resource, &pb.ResourceRequest{ Priority: proto.Int64(resource.priority), ResourceId: proto.String(id), Wants: proto.Float64(resource.Wants()), Has: resource.lease, }) } if retryNumber > 0 { log.Infof("GetCapacity: retry number %v: %v", retryNumber, in) } out, err := client.getCapacity(in) if err != nil { log.Errorf("GetCapacityRequest: %v", err) // Expired resources only need to be handled if the // RPC failed: otherwise the client has gotten a // refreshed lease. for _, res := range client.resources { if res.expires().Before(time.Now()) { res.lease = nil // FIXME(ryszard): This probably should be the safe // capacity instead. res.capacity <- 0.0 } } return timeutil.Backoff(minBackoff, maxBackoff, retryNumber), retryNumber + 1 } for _, pr := range out.Response { res, ok := client.resources[pr.GetResourceId()] if !ok { log.Errorf("response for non-existing resource: %q", pr.GetResourceId()) continue } oldCapacity := float64(-1) if res.lease != nil { oldCapacity = res.lease.GetCapacity() } res.lease = pr.GetGets() // Only send a message down the channel if the capacity has changed. if res.lease.GetCapacity() != oldCapacity { // res.capacity is a buffered channel, so if no one is // receiving on the other side this will send messages // over it until it reaches its size, and then will // start dropping them. select { case res.capacity <- res.lease.GetCapacity(): default: } } } // Finds the minimal refresh interval. interval = veryLongTime for _, res := range client.resources { if refresh := time.Duration(res.lease.GetRefreshInterval()) * time.Second; refresh < interval { interval = refresh } } // Applies the --minimum_refresh_interval_secs flag. if interval < client.conn.Opts.MinimumRefreshInterval { log.Infof("overriding interval %v with %v", interval, client.conn.Opts.MinimumRefreshInterval) interval = client.conn.Opts.MinimumRefreshInterval } return interval, 0 }
// runMasterAware is a wrapper for RPCs that may receive a response informing // of a changed mastership, in which case it will reconnect and retry. func (connection *Connection) runMasterAware(callback func() (HasMastership, error)) (interface{}, error) { var ( err error out HasMastership retries int ) for { // Does the exponential backoff sleep. if retries > 0 { t := timeutil.Backoff(minBackoff, maxBackoff, retries) log.Infof("retry sleep number %d: %v", retries, t) time.Sleep(t) } retries++ // We goto here when we want to retry the loop without sleeping. RetryNoSleep: // If there is no current client connection, connect to the original target. // If that fails, retry. if connection.conn == nil { if err := connection.connect(connection.addr); err != nil { // The connection failed. Retry. continue } } // Calls the callback function that performs an RPC on the master. out, err = callback() // If an error happened we are going to close the connection to the // server. The next iteration will open it again. if err != nil { connection.Close() continue } // There was no RPC error. Now there can be two cases. Either the server // we talked to was the master, and it processes the request, or it was // not the master, in which case it tells us who the master is (if it // knows). The indicator for this is the presence of the mastership // field in the response. mastership := out.GetMastership() // If there was no mastership field in the response the server we talked // to was the master and has processed the request. If that is the case // we can return the response. if mastership == nil { return out, nil } // If there was a mastership message we check it for presence of the // master_bns field. If there is none then the server does not know // who the master is. In that case we need to retry. if mastership.MasterAddress == nil { log.Warningf("%v is not the master, and does not know who the master is", connection.currentMaster) continue } newMaster := mastership.GetMasterAddress() // This should not happen, because if the server does not know who the master is // it should signify that through the absence of the master_bns field, but why // not check it. if newMaster == "" { log.Errorf("Unexpected error: %v", connection.currentMaster) continue } // The server we talked to told us who the master is. Connect to it. connection.connect(newMaster) goto RetryNoSleep } log.Error("runMasterAware failed to complete") return nil, err }
// performRequests does a request and returns the duration of the // shortest refresh interval from all handled resources. // // If there's an error, it will be logged, and the returned interval // will be increasing exponentially (basing on the passed retry // number). The returned nextRetryNumber should be used in the next // call to performRequests. func (server *Server) performRequests(ctx context.Context, retryNumber int) (time.Duration, int) { // Creates new GetServerCapacityRequest. in := &pb.GetServerCapacityRequest{ServerId: proto.String(server.ID)} server.mu.RLock() // Adds all resources in this client's resource registry to the request. for id, resource := range server.resources { status := resource.Status() // For now we do not take into account clients with different // priorities. That is why we form only one PriorityBandAggregate proto. // Also, compose request only for the resource whose wants capacity > 0, // because it makes no sense to ask for zero capacity. if status.SumWants > 0 { in.Resource = append(in.Resource, &pb.ServerCapacityResourceRequest{ ResourceId: proto.String(id), // TODO(rushanny): fill optional Has field which is of type Lease. Wants: []*pb.PriorityBandAggregate{ { // TODO(rushanny): replace defaultPriority with some client's priority. Priority: proto.Int64(int64(defaultPriority)), NumClients: proto.Int64(status.Count), Wants: proto.Float64(status.SumWants), }, }, }) } } // If there is no actual resources that we could ask for, just send a default request // just to check a lower-level server's availability. if len(server.resources) == 0 { in.Resource = append(in.Resource, defaultServerCapacityResourceRequest) } server.mu.RUnlock() if retryNumber > 0 { log.Infof("GetServerCapacity: retry number %v: %v\n", retryNumber, in) } out, err := server.getCapacityRPC(ctx, in) if err != nil { log.Errorf("GetServerCapacityRequest: %v", err) return timeutil.Backoff(minBackoff, maxBackoff, retryNumber), retryNumber + 1 } // Find the minimal refresh interval. interval := veryLongTime var templates []*pb.ResourceTemplate expiryTimes := make(map[string]*time.Time, 0) for _, pr := range out.Response { _, ok := server.resources[pr.GetResourceId()] if !ok { log.Errorf("response for non-existing resource: %q", pr.GetResourceId()) continue } // Refresh an expiry time for the resource. expiryTime := time.Unix(pr.GetGets().GetExpiryTime(), 0) expiryTimes[pr.GetResourceId()] = &expiryTime // Add a new resource configuration. templates = append(templates, &pb.ResourceTemplate{ IdentifierGlob: proto.String(pr.GetResourceId()), Capacity: proto.Float64(pr.GetGets().GetCapacity()), SafeCapacity: proto.Float64(pr.GetSafeCapacity()), Algorithm: pr.GetAlgorithm(), }) // Find the minimum refresh interval. if refresh := time.Duration(pr.GetGets().GetRefreshInterval()) * time.Second; refresh < interval { interval = refresh } } // Append the default template for * resource. It should be the last one in templates. templates = append(templates, proto.Clone(defaultResourceTemplate).(*pb.ResourceTemplate)) // Load a new configuration for the resources. if err := server.LoadConfig(ctx, &pb.ResourceRepository{ Resources: templates, }, expiryTimes); err != nil { log.Errorf("server.LoadConfig: %v", err) return timeutil.Backoff(minBackoff, maxBackoff, retryNumber), retryNumber + 1 } // Applies the --minimum_refresh_interval_secs flag. // Or if interval was set to veryLongTime and not updated, set it to minimum refresh interval. if interval < server.conn.Opts.MinimumRefreshInterval || interval == veryLongTime { log.Infof("overriding interval %v with %v", interval, server.conn.Opts.MinimumRefreshInterval) interval = server.conn.Opts.MinimumRefreshInterval } return interval, 0 }