// OnRPC should be called on each RPC that we want to track.
func (tracker *FleetTracker) OnRPC(rpcEvent *RPCEvent) error {
	tracker.lock.RLock()
	servingTracker, ok := tracker.services[rpcEvent.ServingID]
	tracker.lock.RUnlock()
	if !ok {
		// Handling new service.
		tracker.lock.Lock()
		// Make sure nothing changed when we switched to writing.
		_, ok := tracker.services[rpcEvent.ServingID]
		if ok {
			tracker.lock.Unlock()
			// Something changed while we switched locks. Try again.
			return tracker.OnRPC(rpcEvent)
		}

		codeDir := dockerutil.CodeDirPath(
			rpcEvent.Environment, rpcEvent.Service, rpcEvent.CodeVersion)
		leverConfig, err := core.ReadLeverConfig(codeDir)
		if err != nil {
			tracker.lock.Unlock()
			return err
		}
		servingTracker = NewLoadTracker(
			rpcEvent.ServingID, leverConfig.MaxInstanceLoad,
			leverConfig.MinInstances, rpcEvent.SessionID)
		tracker.services[rpcEvent.ServingID] = servingTracker
		go tracker.monitorServiceResource(rpcEvent.SessionID)
		tracker.lock.Unlock()
	}

	delta := servingTracker.OnRPC(rpcEvent.RpcNanos)
	if delta == 0 {
		return nil
	} else if delta > 0 {
		return tracker.scaleUp(delta, rpcEvent)
	} else {
		return tracker.scaleDown(-delta, rpcEvent)
	}
}
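// The lock dance in OnRPC above is a double-checked locking pattern: take the
// read lock for the common case and escalate to the write lock only when the
// entry is missing, re-checking after escalation because another goroutine may
// have raced in between the RUnlock and the Lock. A minimal, self-contained
// sketch of the same idiom (the registry/entry names here are illustrative,
// not part of the actual fleettracker types):
//
//	type entry struct{ /* ... */ }
//
//	type registry struct {
//		lock    sync.RWMutex
//		entries map[string]*entry
//	}
//
//	func (r *registry) getOrCreate(key string) *entry {
//		r.lock.RLock()
//		e, ok := r.entries[key]
//		r.lock.RUnlock()
//		if ok {
//			return e
//		}
//		r.lock.Lock()
//		defer r.lock.Unlock()
//		// Re-check: another goroutine may have created the entry
//		// while we held no lock at all.
//		if e, ok := r.entries[key]; ok {
//			return e
//		}
//		e = new(entry)
//		r.entries[key] = e
//		return e
//	}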
// scaleUp starts delta new instances for the service the RPC belongs to.
func (tracker *FleetTracker) scaleUp(delta int, rpcEvent *RPCEvent) error {
	logger.WithFields(
		"leverEnv", rpcEvent.Environment,
		"leverService", rpcEvent.Service,
		"codeVersion", rpcEvent.CodeVersion,
		"servingID", rpcEvent.ServingID,
		"deltaInstances", delta,
	).Info("Scaling up")

	// Read the entry point from the config.
	codeDir := dockerutil.CodeDirPath(
		rpcEvent.Environment, rpcEvent.Service, rpcEvent.CodeVersion)
	leverConfig, err := core.ReadLeverConfig(codeDir)
	if err != nil {
		return err
	}

	// Spin up.
	hadErrors := false
	for i := 0; i < delta; i++ {
		instanceID := leverutil.RandomID()
		containerID, node, err := dockerutil.StartDockerContainer(
			tracker.docker, rpcEvent.Environment, rpcEvent.Service,
			instanceID, rpcEvent.CodeVersion, rpcEvent.IsAdmin, leverConfig)
		if err != nil {
			logger.WithFields(
				"err", err,
				"leverEnv", rpcEvent.Environment,
				"leverService", rpcEvent.Service,
				"codeVersion", rpcEvent.CodeVersion,
				"servingID", rpcEvent.ServingID,
			).Error("Error starting docker container")
			hadErrors = true
			continue
		}

		err = hostman.InitializeInstance(
			tracker.grpcPool,
			&hostman.InstanceInfo{
				Environment:       rpcEvent.Environment,
				Service:           rpcEvent.Service,
				InstanceID:        instanceID,
				ContainerID:       containerID,
				ServingID:         rpcEvent.ServingID,
				LevInstResourceID: "",
				LevInstSessionID:  "",
			}, node)
		if err != nil {
			logger.WithFields(
				"err", err,
				"leverEnv", rpcEvent.Environment,
				"leverService", rpcEvent.Service,
				"codeVersion", rpcEvent.CodeVersion,
				"servingID", rpcEvent.ServingID,
				"node", node,
				"leverInstanceID", instanceID,
			).Error("Failed to initialize instance remotely")
			hadErrors = true
			continue
		}
	}
	if hadErrors {
		return fmt.Errorf("there were errors during scale up")
	}
	return nil
}
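// scaleUp starts instances sequentially and aggregates failures into a single
// error at the end, so one bad container does not abort the rest of the batch.
// If start-up latency ever mattered for large deltas, the loop could be
// parallelized; a hypothetical sketch (assuming StartDockerContainer and
// InitializeInstance are safe to call concurrently, which the code above does
// not establish, and where startOneInstance is a hypothetical helper wrapping
// the container-start and remote-initialization steps from the loop above):
//
//	var wg sync.WaitGroup
//	var errCount int64
//	for i := 0; i < delta; i++ {
//		wg.Add(1)
//		go func() {
//			defer wg.Done()
//			if err := startOneInstance(rpcEvent, leverConfig); err != nil {
//				atomic.AddInt64(&errCount, 1)
//			}
//		}()
//	}
//	wg.Wait()
//	if errCount > 0 {
//		return fmt.Errorf(
//			"%d of %d instances failed to start", errCount, delta)
//	}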
// newInstance constructs the very first instance of a service, registering it
// under a resource entry so that concurrent callers reuse it.
func (finder *Finder) newInstance(env, service string, version int64) (
	levInstTarget *LevInstTarget, targetNode string,
	levInstResource *scale.Resource, success bool, err error) {
	// Note: This process only takes place for the very first instance within
	//       a service. The non-first instances do not have a resource entry
	//       and they are found via service lookup through DNS.
	levInstResourceID := makeLevInstResourceID(env, service, version)
	levInstResource, success, err = scale.ConstructResource(
		leverutil.ServiceFlag.Get(), levInstResourceID,
		InstanceConstructTimeoutFlag.Get())
	if err != nil {
		finder.logger.WithFields("err", err).Error(
			"Failed to construct new instance")
		return nil, "", nil, false, err
	}
	if !success {
		// Failed to get the lock. Someone else constructed the instance.
		levInstTarget = new(LevInstTarget)
		err = json.Unmarshal(
			[]byte(levInstResource.GetTarget()), levInstTarget)
		if err != nil {
			finder.logger.WithFields("err", err).Panic(
				"Failed to decode json")
		}
		finder.logger.WithFields(
			"leverEnv", env,
			"leverService", service,
			"leverInstanceID", levInstTarget.InstanceID,
		).Debug("Reusing instance")
		return levInstTarget, levInstResource.GetTargetNode(),
			levInstResource, false, nil
	}

	// Read the entry point from the config.
	codeDir := dockerutil.CodeDirPath(env, service, version)
	leverConfig, err := core.ReadLeverConfig(codeDir)
	if err != nil {
		finder.logger.WithFields("err", err).Error(
			"Failed to read lever.json")
		return nil, "", nil, false, err
	}

	// TODO: If somehow the first RPC fails and the service is no longer
	//       contacted afterwards, then the container remains hanging forever.
	//       Need a way to kill it / make it expire after a while for this
	//       situation.
	//       Idea: Poll docker every ~30s for instances that we are not
	//       handling and register those.
	instanceID := leverutil.RandomID() // TODO: Collisions possible.
	leverURL := &core.LeverURL{
		Environment: env,
		Service:     service,
	}
	isAdmin := core.IsAdmin(leverURL)
	containerID, node, err := dockerutil.StartDockerContainer(
		finder.docker, env, service, instanceID, version, isAdmin,
		leverConfig)
	if err != nil {
		finder.logger.WithFields(
			"err", err,
			"leverEnv", env,
			"leverService", service,
			"leverInstanceID", instanceID,
		).Error("Unable to start container")
		return nil, "", nil, false, err
	}

	hostAddr, err := GetHostAddrOnNode(node)
	if err != nil {
		finder.logger.WithFields("err", err).Error(
			"Failed to get Host addr on node")
		return nil, "", nil, false, err
	}

	finder.logger.WithFields(
		"leverEnv", env,
		"leverService", service,
		"leverInstanceID", instanceID,
		"containerID", containerID,
		"node", node,
	).Debug("Creating new instance")

	levInstTarget = &LevInstTarget{
		HostAddr:    hostAddr,
		InstanceID:  instanceID,
		ContainerID: containerID,
	}
	target, err := json.Marshal(levInstTarget)
	if err != nil {
		finder.logger.WithFields("err", err).Panic("Failed to encode json")
	}

	err = levInstResource.DoneConstructing(
		string(target), node, 2*hostman.InstanceExpiryTimeFlag.Get())
	if err != nil {
		finder.logger.WithFields("err", err).Error(
			"Failed to register new instance")
		return nil, "", nil, false, err
	}

	return levInstTarget, node, levInstResource, true, nil
}
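// newInstance relies on scale.ConstructResource acting as a distributed
// construct-once lock: exactly one caller wins the right to build the instance
// and publishes its target via DoneConstructing; every other caller gets
// success == false and deserializes the already-published target instead.
// From a caller's perspective the contract looks roughly like this
// (an illustrative sketch, not an actual finder call site):
//
//	target, node, resource, constructed, err :=
//		finder.newInstance(env, service, version)
//	if err != nil {
//		return err // Neither constructed nor found an existing instance.
//	}
//	if constructed {
//		// We own the fresh instance. The resource entry stays registered
//		// for the expiry passed to DoneConstructing.
//	} else {
//		// Another caller constructed it first; target and node point at
//		// that caller's instance.
//	}
//	_ = resource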