// EnsureInfrastructureInitialized verifies that all the necessary
// infrastructure for the instance described by info is registered, managed
// and ready to be used. It is idempotent: if the instance is already
// managed, the existing registration is reused.
//
// On success it returns the environment network IP, this host's own IP on
// that network, the instance's RPC address and a keep-alive callback that
// refreshes the instance's resource/session expiry.
//
// NOTE(review): getEnvironment appears to return with entry.envLock held
// (the deferred Unlock below has no visible matching Lock) — confirm that
// contract before changing locking here.
func (manager *Manager) EnsureInfrastructureInitialized(
	info *InstanceInfo) (
	envNetworkIP string, ownIP string, instanceAddr string,
	keepAlive KeepAliveFun, err error) {
	entry, err := manager.getEnvironment(info.Environment)
	if err != nil {
		return "", "", "", nil, err
	}
	// Released when this function returns; guards entry.leverInstances.
	defer entry.envLock.Unlock()
	instance, ok := entry.leverInstances[info.InstanceID]
	if ok {
		// Instance already managed. Hand back the existing instance's
		// address and a keep-alive bound to it.
		manager.logger.WithFields("leverInstanceID", info.InstanceID).Debug(
			"Instance already managed")
		keepAlive = func(
			resourceName, levResResourceID, levResSessionID string) {
			instance.KeepAlive(resourceName, levResResourceID, levResSessionID)
		}
		return entry.networkIP, entry.ownIP, instance.InstanceAddr(),
			keepAlive, nil
	}

	// Instance is new.
	manager.logger.WithFields("leverInstanceID", info.InstanceID).Debug(
		"Instance is new")
	if info.ContainerID == "" {
		// We don't have the container ID. Cannot initialize instance.
		return "", "", "", nil, fmt.Errorf("Need container ID")
	}

	// Connect the container to the env network bridge so we can talk to it
	// (forward RPCs).
	instanceIPv4, err := dockerutil.ConnectToDockerEnvNetwork(
		manager.docker, info.ContainerID, entry.networkID)
	if err != nil {
		manager.logger.WithFields(
			"err", err,
			"leverEnv", info.Environment,
			"containerID", info.ContainerID,
		).Error("Error connecting instance to env network")
		// Best-effort cleanup: the container is unusable without network.
		removeErr := dockerutil.RemoveDockerContainer(
			manager.docker, info.ContainerID)
		if removeErr != nil {
			manager.logger.WithFields(
				"containerID", info.ContainerID,
				"err", removeErr,
			).Error("Error trying to remove container after previous error")
		}
		return "", "", "", nil, err
	}
	instanceAddr = instanceIPv4 + ":" + core.InstanceListenPortFlag.Get()

	leverURL := &core.LeverURL{
		Environment: info.Environment,
		Service:     info.Service,
	}
	if core.IsAdmin(leverURL) {
		// Admin environment. Also connect it to the regional network.
		// (err here intentionally shadows the named return; failures are
		// returned explicitly below.)
		_, err := dockerutil.ConnectToDockerEnvNetwork(
			manager.docker, info.ContainerID, RegionalNetworkFlag.Get())
		if err != nil {
			manager.logger.WithFields(
				"err", err,
				"leverEnv", info.Environment,
				"containerID", info.ContainerID,
			).Error("Error connecting admin instance to regional network")
			removeErr := dockerutil.RemoveDockerContainer(
				manager.docker, info.ContainerID)
			if removeErr != nil {
				manager.logger.WithFields(
					"containerID", info.ContainerID,
					"err", removeErr,
				).Error(
					"Error trying to remove container after previous error")
			}
			return "", "", "", nil, err
		}
	}

	// Add the instance ID to the local serving ID map.
	manager.servingIDsLock.Lock()
	_, ok = manager.servingIDs[info.ServingID]
	if !ok {
		manager.servingIDs[info.ServingID] = make(map[string]struct{})
	}
	manager.servingIDs[info.ServingID][info.InstanceID] = struct{}{}
	manager.servingIDsLock.Unlock()

	// Start managing instance. The closure is invoked when the instance
	// shuts down, to deregister it.
	instance = NewLeverInstance(
		info, instanceAddr, manager.proxyInAddr, manager.grpcPool,
		manager.docker,
		func(instanceID string, err error) {
			manager.logger.WithFields("leverInstanceID", instanceID).Debug(
				"Instance closed")
			manager.onInstanceClose(
				entry, info.Environment, instanceID, info.ServingID, err)
		})
	entry.leverInstances[info.InstanceID] = instance
	keepAlive = func(resourceName, levResResourceID, levResSessionID string) {
		instance.KeepAlive(resourceName, levResResourceID, levResSessionID)
	}
	return entry.networkIP, entry.ownIP, instanceAddr, keepAlive, nil
}
func (proxy *LeverProxy) handleInStream(stream *http2stream.HTTP2Stream) { headers := stream.GetHeaders() err := expectHeaders( headers, "lever-url", "x-lever-src-env", "x-lever-dest-instance-id", "x-lever-dest-container-id", "x-lever-serving-id", "x-lever-code-version", "x-lever-inst-resource-id", "x-lever-inst-session-id", "x-lever-res-resource-id", "x-lever-res-session-id", ) if err != nil { proxy.inLogger.WithFields("err", err).Error("") stream.Write(&http2stream.MsgError{Err: err}) return } leverURL, err := core.ParseLeverURL(headers["lever-url"][0]) if err != nil { proxy.inLogger.WithFields( "err", err, "leverURL", headers["lever-url"][0]).Error( "Unable to parse Lever URL") } if !core.IsInternalEnvironment(leverURL.Environment) { err = fmt.Errorf("Cannot route to dest env") proxy.inLogger.WithFields( "err", err, "leverEnv", leverURL.Environment, ).Error("") stream.Write(&http2stream.MsgError{Err: err}) return } srcEnv := headers["x-lever-src-env"][0] instanceID := headers["x-lever-dest-instance-id"][0] containerID := headers["x-lever-dest-container-id"][0] servingID := headers["x-lever-serving-id"][0] codeVersionInt, err := strconv.Atoi(headers["x-lever-code-version"][0]) if err != nil { proxy.inLogger.WithFields("err", err).Error("Cannot parse code version") stream.Write(&http2stream.MsgError{Err: err}) return } codeVersion := int64(codeVersionInt) levInstResourceID := headers["x-lever-inst-resource-id"][0] levInstSessionID := headers["x-lever-inst-session-id"][0] levResResourceID := headers["x-lever-res-resource-id"][0] levResSessionID := headers["x-lever-res-session-id"][0] if instanceID == "" { instanceID, err = proxy.manager.RandomInstaceID(servingID) if err != nil { proxy.inLogger.WithFields( "err", err, "leverEnv", leverURL.Environment, ).Error("Could not find an instanceID for provided servingID") stream.Write(&http2stream.MsgError{Err: err}) return } } streamID := leverutil.RandomID() proxy.inLogger.WithFields( "leverURL", leverURL.String(), 
"srcEnv", srcEnv, "leverInstanceID", instanceID, "containerID", containerID, "servingID", servingID, "levInstSessionID", levInstSessionID, "levInstResourceID", levInstResourceID, "streamID", streamID, ).Debug("Receiving stream") streamLogger := proxy.inLogger.WithFields("streamID", streamID) if !core.IsInternalEnvironment(leverURL.Environment) { err = fmt.Errorf("Environment not routable internally") streamLogger.WithFields( "err", err, "leverEnv", leverURL.Environment, ).Error("") stream.Write(&http2stream.MsgError{Err: err}) return } _, ownIP, instanceAddr, keepAliveFun, err := proxy.manager.EnsureInfrastructureInitialized(&hostman.InstanceInfo{ Environment: leverURL.Environment, Service: leverURL.Service, InstanceID: instanceID, ContainerID: containerID, ServingID: servingID, LevInstResourceID: levInstResourceID, LevInstSessionID: levInstSessionID, }) if err != nil { streamLogger.WithFields("err", err).Error( "Error initializing instance") stream.Write(&http2stream.MsgError{Err: err}) return } err = proxy.serveOut(leverURL.Environment, ownIP) if err != nil { streamLogger.WithFields("err", err).Error( "Error listening on env network") stream.Write(&http2stream.MsgError{Err: err}) return } destStreamI, err := leverutil.ExpBackoff( func() (clientStream interface{}, err error, finalErr error) { clientStream, err = proxy.client.NewStream(instanceAddr) if err != nil { if strings.Contains(err.Error(), "connection refused") && proxy.manager.IsInstanceAlive(servingID, instanceID) { // Retry. return nil, err, nil } if err == leverutil.ErrNotYetConstructed || err == leverutil.ErrWasDestructed { // Retry. 
return nil, err, nil } return nil, nil, err } return clientStream, nil, nil }, 10*time.Millisecond, 15*time.Second) if err != nil { streamLogger.WithFields( "err", err, "instanceAddr", instanceAddr, ).Error("Error trying to create client stream to dest") stream.Write(&http2stream.MsgError{Err: err}) return } destStream := destStreamI.(*http2stream.HTTP2Stream) addHeaders := make(map[string][]string) if srcEnv != "" { addHeaders["x-lever-src-env"] = []string{srcEnv} } addHeaders["x-lever-internal-rpc-gateway"] = []string{ ownIP + ":" + EnvOutListenPortFlag.Get()} startTime := time.Now() firstHeaders := true stream.ProxyTo( destStream, func(msg http2stream.MsgItem) []http2stream.MsgItem { proxy.client.KeepAlive(instanceAddr) keepAliveFun( leverURL.Resource, levResResourceID, levResSessionID) return proxy.filterTo(&firstHeaders, addHeaders, msg) }, func(msg http2stream.MsgItem) []http2stream.MsgItem { proxy.client.KeepAlive(instanceAddr) keepAliveFun( leverURL.Resource, levResResourceID, levResSessionID) return noFilter(msg) }) // Wait for RPC to finish. <-destStream.Closed() rpcNanos := uint64(time.Now().Sub(startTime).Nanoseconds()) streamLogger.WithFields("rpcNanos", rpcNanos).Debug("RPC nanos") // Send RPC stats to fleettracker. // TODO: For services with high load, we should not send info on every // RPC. They should be batched and sent say... every ~50ms // (well below tracker tick interval). err = fleettracker.OnRPC(proxy.grpcPool, &fleettracker.RPCEvent{ Environment: leverURL.Environment, Service: leverURL.Service, ServingID: servingID, CodeVersion: codeVersion, IsAdmin: core.IsAdmin(leverURL), RpcNanos: rpcNanos, }) if err != nil { streamLogger.WithFields("err", err).Error( "Failed to send RPC stats to fleettracker") } }
func (finder *Finder) newInstance( env, service string, version int64) ( levInstTarget *LevInstTarget, targetNode string, levInstResource *scale.Resource, success bool, err error) { // Note: This process only takes place for the very first instance within // a service. The non-first instances do not have a resource entry // and they are found via service lookup through DNS. levInstResourceID := makeLevInstResourceID(env, service, version) levInstResource, success, err = scale.ConstructResource( leverutil.ServiceFlag.Get(), levInstResourceID, InstanceConstructTimeoutFlag.Get()) if err != nil { finder.logger.WithFields("err", err).Error( "Failed to construct new instance") return nil, "", nil, false, err } if !success { // Failed to get lock. Someone else constructed instance. levInstTarget = new(LevInstTarget) err = json.Unmarshal([]byte(levInstResource.GetTarget()), levInstTarget) if err != nil { finder.logger.WithFields("err", err).Panic("Failed to decode json") } finder.logger.WithFields( "leverEnv", env, "leverService", service, "leverInstanceID", levInstTarget.InstanceID, ).Debug("Reusing instance") return levInstTarget, levInstResource.GetTargetNode(), levInstResource, false, nil } // Read the entry point from the config. codeDir := dockerutil.CodeDirPath(env, service, version) leverConfig, err := core.ReadLeverConfig(codeDir) if err != nil { finder.logger.WithFields("err", err).Error( "Failed to read lever.json") return nil, "", nil, false, err } // TODO: If somehow first RPC fails and service no longer // contacted afterwards, then the container remains hanging // forever. Need a way to kill it / make it expire after a // while for this situation. // Idea: Poll docker every ~30s for instances that we are not // handling and register those. instanceID := leverutil.RandomID() // TODO: Collisions possible. 
leverURL := &core.LeverURL{ Environment: env, Service: service, } isAdmin := core.IsAdmin(leverURL) containerID, node, err := dockerutil.StartDockerContainer( finder.docker, env, service, instanceID, version, isAdmin, leverConfig) if err != nil { finder.logger.WithFields( "err", err, "leverEnv", env, "leverService", service, "leverInstanceID", instanceID, ).Error("Unable to start container") return nil, "", nil, false, err } hostAddr, err := GetHostAddrOnNode(node) if err != nil { finder.logger.WithFields("err", err).Error( "Failed to get Host addr on node") return nil, "", nil, false, err } finder.logger.WithFields( "leverEnv", env, "leverService", service, "leverInstanceID", instanceID, "containerID", containerID, "node", node, ).Debug("Creating new instance") levInstTarget = &LevInstTarget{ HostAddr: hostAddr, InstanceID: instanceID, ContainerID: containerID, } target, err := json.Marshal(levInstTarget) if err != nil { finder.logger.WithFields("err", err).Panic("Failed to encode json") } err = levInstResource.DoneConstructing( string(target), node, 2*hostman.InstanceExpiryTimeFlag.Get()) if err != nil { finder.logger.WithFields("err", err).Error( "Failed to register new instance") return nil, "", nil, false, err } return levInstTarget, node, levInstResource, true, nil }