func (c *Controller) kickOffHealthChecks(healthExit chan struct{}) { client, err := node.NewLBClient(c.options.ServicedEndpoint) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err) return } defer client.Close() var healthChecks map[string]domain.HealthCheck instanceID, err := strconv.Atoi(c.options.Service.InstanceID) if err != nil { glog.Errorf("Invalid instance from instanceID:%s", c.options.Service.InstanceID) return } err = client.GetHealthCheck(node.HealthCheckRequest{ c.options.Service.ID, instanceID}, &healthChecks) if err != nil { glog.Errorf("Error getting health checks: %s", err) return } for key, mapping := range healthChecks { glog.Infof("Kicking off health check %s.", key) glog.Infof("Setting up health check: %s", mapping.Script) timeout := mapping.Timeout if timeout == 0 { timeout = time.Second * 30 } go c.handleHealthCheck(key, mapping.Script, mapping.Interval, timeout, healthExit) } return }
// sendLogMessage sends a log message to the host agent func sendLogMessage(lbClientPort string, serviceLogInfo node.ServiceLogInfo) error { client, err := node.NewLBClient(lbClientPort) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err) return err } defer client.Close() return client.SendLogMessage(serviceLogInfo, nil) }
// getAgentHostID retrieves the agent's host id func getAgentHostID(lbClientPort string) (string, error) { client, err := node.NewLBClient(lbClientPort) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err) return "", err } defer client.Close() var hostID string err = client.GetHostID(&hostID) if err != nil { glog.Errorf("Error getting host id, error: %s", err) return "", err } glog.V(1).Infof("getAgentHostID: %s", hostID) return hostID, nil }
// getServiceTenantID retrieves a service's tenantID func getServiceTenantID(lbClientPort string, serviceID string) (string, error) { client, err := node.NewLBClient(lbClientPort) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err) return "", err } defer client.Close() var tenantID string err = client.GetTenantId(serviceID, &tenantID) if err != nil { glog.Errorf("Error getting service %s's tenantID, error: %s", serviceID, err) return "", err } glog.V(1).Infof("getServiceTenantID: service id=%s: %s", serviceID, tenantID) return tenantID, nil }
// getAgentZkInfo retrieves the agent's zookeeper dsn func getAgentZkInfo(lbClientPort string) (node.ZkInfo, error) { var zkInfo node.ZkInfo client, err := node.NewLBClient(lbClientPort) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err) return zkInfo, err } defer client.Close() err = client.GetZkInfo(&zkInfo) if err != nil { glog.Errorf("Error getting zookeeper dsn/poolID, error: %s", err) return zkInfo, err } glog.V(1).Infof("GetZkInfo: %+v", zkInfo) return zkInfo, nil }
// rpcHealthCheck returns a channel that will close when it not longer possible // to ping the RPC server func (c *Controller) rpcHealthCheck() (chan struct{}, error) { gone := make(chan struct{}) client, err := node.NewLBClient(c.options.ServicedEndpoint) if err != nil { return nil, err } go func() { var ts time.Time for { err := client.Ping(time.Minute, &ts) if err != nil { close(gone) return } } }() return gone, nil }
// getService retrieves a service func getService(lbClientPort string, serviceID string, instanceID int) (*service.Service, error) { client, err := node.NewLBClient(lbClientPort) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err) return nil, err } defer client.Close() var svc service.Service err = client.GetServiceInstance(node.ServiceInstanceRequest{serviceID, instanceID}, &svc) if err != nil { glog.Errorf("Error getting service %s error: %s", serviceID, err) return nil, err } glog.V(1).Infof("getService: service id=%s: %+v", serviceID, svc) return &svc, nil }
// getServiceBindMounts retrieves a service's bindmounts func getServiceBindMounts(lbClientPort string, serviceID string) (map[string]string, error) { client, err := node.NewLBClient(lbClientPort) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err) return nil, err } defer client.Close() var bindmounts map[string]string err = client.GetServiceBindMounts(serviceID, &bindmounts) if err != nil { if strings.HasPrefix(err.Error(), "rpc: can't find service") { glog.Errorf("`serviced service shell` is available only when running serviced in agent mode") return nil, err } glog.Errorf("Error getting service %s's bindmounts, error: %s", serviceID, err) return nil, err } glog.V(1).Infof("getServiceBindMounts: service id=%s: %s", serviceID, bindmounts) return bindmounts, nil }
func registerExportedEndpoints(c *Controller, closing chan struct{}) <-chan struct{} { for { err := c.registerExportedEndpoints() if err == nil { return c.watchregistry() } client, err2 := node.NewLBClient(c.options.ServicedEndpoint) if err2 != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err2) } else { client.SendLogMessage(node.ServiceLogInfo{ServiceID: c.options.Service.ID, Message: fmt.Sprintf("error registering exported endpoints: %s", err)}, nil) client.Close() } select { case <-time.After(time.Second): case <-closing: return nil } glog.Errorf("could not register exported expoints: %s", err) } }
func (c *Controller) handleControlCenterImports(rpcdead chan struct{}) error { // this function is currently needed to handle special control center imports // from GetServiceEndpoints() that does not exist in endpoints from getServiceState // get service endpoints client, err := node.NewLBClient(c.options.ServicedEndpoint) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err) return err } defer client.Close() // TODO: instead of getting all endpoints, via GetServiceEndpoints(), create a new call // that returns only special "controlplane" imported endpoints // Note: GetServiceEndpoints has been modified to return only special controlplane endpoints. // We should rename it and clean up the filtering code below. epchan := make(chan map[string][]dao.ApplicationEndpoint) timeout := make(chan struct{}) go func(c *node.LBClient, svcid string, epc chan map[string][]dao.ApplicationEndpoint, timeout chan struct{}) { var endpoints map[string][]dao.ApplicationEndpoint RetryGetServiceEndpoints: for { err = c.GetServiceEndpoints(svcid, &endpoints) if err != nil { select { case <-time.After(1 * time.Second): glog.V(3).Info("Couldn't retrieve service endpoints, trying again") continue RetryGetServiceEndpoints case <-timeout: glog.V(3).Info("Timed out trying to retrieve service endpoints") return } } break } // deal with the race between the one minute timeout in handleControlCenterImports() and the // call to GetServiceEndpoint() - the timeout may happen between GetServiceEndpoint() completing // and sending the result via the epc channel. 
select { case _, ok := <-epc: if ok { panic("should never receive anything on the endpoints channel") } glog.V(3).Info("Endpoint channel closed, giving up") return default: epc <- endpoints } }(client, c.options.Service.ID, epchan, timeout) var endpoints map[string][]dao.ApplicationEndpoint select { case <-time.After(1 * time.Minute): close(epchan) timeout <- struct{}{} client.SendLogMessage(node.ServiceLogInfo{ServiceID: c.options.Service.ID, Message: "unable to retrieve service endpoints"}, nil) return ErrNoServiceEndpoints case <-rpcdead: close(epchan) timeout <- struct{}{} return fmt.Errorf("RPC Service has gone away") case endpoints = <-epchan: glog.Infof("Got service endpoints for %s: %+v", c.options.Service.ID, endpoints) } // convert keys set by GetServiceEndpoints to tenantID_endpointID tmp := make(map[string][]dao.ApplicationEndpoint) for key, endpointList := range endpoints { if len(endpointList) <= 0 { glog.Warningf("ignoring key: %s with empty endpointList", key) continue } tenantEndpointID := registry.TenantEndpointKey(c.tenantID, endpointList[0].Application) glog.Infof("changing key from %s to %s: %+v", key, tenantEndpointID, endpointList[0]) tmp[tenantEndpointID] = endpoints[key] } endpoints = tmp cc_endpoint_purpose := "import" // Punting on control center dynamic imports for now for key, endpointList := range endpoints { // ignore endpoints that are not special controlplane imports ignorePrefix := fmt.Sprintf("%s_controlplane", c.tenantID) if !strings.HasPrefix(key, ignorePrefix) { continue } // set proxy addresses c.setProxyAddresses(key, endpointList, endpointList[0].VirtualAddress, cc_endpoint_purpose) // add/replace entries in importedEndpoints instanceIDStr := fmt.Sprintf("%d", endpointList[0].InstanceID) setImportedEndpoint(&c.importedEndpoints, c.tenantID, endpointList[0].Application, instanceIDStr, endpointList[0].VirtualAddress, cc_endpoint_purpose, endpointList[0].ContainerPort) // TODO: agent needs to register controlplane and 
controlplane_consumer // but don't do that here in the container code } return nil }
// handleHealthCheck runs one health-check script in a loop until exitChannel
// is closed. Every `interval` it writes nothing new — the script was
// materialized once, up front, into an executable temp file — and executes
// it via `sh -c <path>` in its own process group. The pass/fail result of
// each run is reported to the agent with client.LogHealthCheck. A run that
// exceeds `timeout` is killed (whole process group, SIGTERM with a 10s
// grace) and reported as "failed".
func (c *Controller) handleHealthCheck(name string, script string, interval, timeout time.Duration, exitChannel chan struct{}) {
	client, err := node.NewLBClient(c.options.ServicedEndpoint)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err)
		return
	}
	defer client.Close()

	// Write the check script to a temp file and make it executable.
	// The file is removed when this goroutine returns.
	scriptFile, err := ioutil.TempFile("", name)
	if err != nil {
		glog.Errorf("Error creating temporary file for health check %s: %s", name, err)
		return
	}
	defer scriptFile.Close()
	defer os.Remove(scriptFile.Name())
	err = ioutil.WriteFile(scriptFile.Name(), []byte(script), os.FileMode(0777))
	if err != nil {
		glog.Errorf("Error writing script for health check %s: %s", name, err)
		return
	}
	// Close now so the content is flushed before the script is executed;
	// the deferred Close above then becomes a harmless no-op.
	scriptFile.Close()
	// WriteFile's mode only applies on creation; TempFile already created
	// the file, so chmod explicitly to make it executable.
	err = os.Chmod(scriptFile.Name(), os.FileMode(0777))
	if err != nil {
		glog.Errorf("Error setting script executable for health check %s: %s", name, err)
		return
	}

	var unused int // sink for the unused RPC reply value
	sigtermTimeout := time.Second * 10
	for {
		select {
		case <-time.After(interval):
			exited := make(chan error, 1) // buffered so the Wait goroutine never leaks
			// Setpgid puts the script in its own process group so the whole
			// group can be killed; Pdeathsig terminates it if we die first.
			sysProcAttr := &syscall.SysProcAttr{Setpgid: true, Pdeathsig: syscall.SIGTERM}
			cmd := exec.Command("sh", "-c", scriptFile.Name())
			cmd.SysProcAttr = sysProcAttr
			if err := cmd.Start(); err != nil {
				glog.Errorf("Could not run cmd %v: %s", cmd, err)
				// break leaves only the select; the outer loop retries
				// on the next interval tick
				break
			}
			go func(c *exec.Cmd) {
				exited <- c.Wait()
			}(cmd)
			select {
			case err := <-exited:
				// Script finished on its own: exit status 0 = passed.
				if err == nil {
					glog.V(4).Infof("Health check %s succeeded.", name)
					client.LogHealthCheck(domain.HealthCheckResult{c.options.Service.ID, c.options.Service.InstanceID, name, time.Now().String(), "passed"}, &unused)
				} else {
					glog.Warningf("Health check %s failed.", name)
					client.LogHealthCheck(domain.HealthCheckResult{c.options.Service.ID, c.options.Service.InstanceID, name, time.Now().String(), "failed"}, &unused)
				}
			case <-exitChannel:
				// Controller is shutting down mid-run: kill the script's
				// process group and stop this checker.
				proc.KillGroup(cmd.Process.Pid, sigtermTimeout)
				return
			case <-time.After(timeout):
				// Run exceeded its allotted time: kill it and report failure.
				proc.KillGroup(cmd.Process.Pid, sigtermTimeout)
				glog.Warningf("Health check %s timeout.", name)
				client.LogHealthCheck(domain.HealthCheckResult{c.options.Service.ID, c.options.Service.InstanceID, name, time.Now().String(), "failed"}, &unused)
			}
		case <-exitChannel:
			return
		}
	}
}
// Run executes the controller's main loop and block until the service exits // according to it's restart policy or Close() is called. func (c *Controller) Run() (err error) { defer c.shutdown() sigc := make(chan os.Signal, 1) signal.Notify(sigc, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) env := os.Environ() env = append(env, "CONTROLPLANE=1") env = append(env, fmt.Sprintf("CONTROLPLANE_CONSUMER_URL=http://localhost%s/api/metrics/store", c.options.Metric.Address)) env = append(env, fmt.Sprintf("CONTROLPLANE_HOST_ID=%s", c.hostID)) env = append(env, fmt.Sprintf("CONTROLPLANE_TENANT_ID=%s", c.tenantID)) env = append(env, fmt.Sprintf("CONTROLPLANE_INSTANCE_ID=%s", c.options.Service.InstanceID)) env = append(env, fmt.Sprintf("CONTROLPLANE_SERVICED_ID=%s", c.options.Service.ID)) if err := writeEnvFile(env); err != nil { return err } args := []string{"-c", "exec " + strings.Join(c.options.Service.Command, " ")} startService := func() (*subprocess.Instance, chan error) { service, serviceExited, _ := subprocess.New(time.Second*10, env, "/bin/sh", args...) 
return service, serviceExited } sendSignal := func(service *subprocess.Instance, sig os.Signal) bool { switch { case c.PIDFile != "": c.forwardSignal(sig) case service != nil: service.Notify(sig) default: return false } return true } rpcDead, err := c.rpcHealthCheck() if err != nil { glog.Error("Could not setup RPC ping check: %s", err) return err } storageDead, err := c.storageHealthCheck() if err != nil { glog.Errorf("Could not set up storage check: %s", err) return err } prereqsPassed := make(chan bool) var startAfter <-chan time.Time var exitAfter <-chan time.Time var service *subprocess.Instance = nil serviceExited := make(chan error, 1) c.watchRemotePorts() if err := c.handleControlCenterImports(rpcDead); err != nil { glog.Error("Could not setup Control Center specific imports: ", err) return err } go c.checkPrereqs(prereqsPassed, rpcDead) go c.reapZombies(rpcDead) healthExit := make(chan struct{}) defer close(healthExit) c.kickOffHealthChecks(healthExit) doRegisterEndpoints := true exited := false var shutdownService = func(service *subprocess.Instance, sig os.Signal) { c.options.Service.Autorestart = false if sendSignal(service, sig) { sigc = nil prereqsPassed = nil startAfter = nil rpcDead = nil exitAfter = time.After(time.Second * 30) close(healthExit) } else { c.exitStatus = 1 exited = true } } var reregister <-chan struct{} for !exited { select { case sig := <-sigc: glog.Infof("Notifying subprocess of signal %v", sig) shutdownService(service, sig) case <-exitAfter: glog.Infof("Killing unresponsive subprocess") sendSignal(service, syscall.SIGKILL) c.exitStatus = 1 exited = true case <-prereqsPassed: startAfter = time.After(time.Millisecond * 1) prereqsPassed = nil case exitError := <-serviceExited: if !c.options.Service.Autorestart { exitStatus, _ := utils.GetExitStatus(exitError) if c.options.Logforwarder.Enabled { time.Sleep(c.options.Logforwarder.SettleTime) } glog.Infof("Service Exited with status:%d due to %+v", exitStatus, exitError) //set loop to 
end exited = true //exit with exit code, defer so that other cleanup can happen c.exitStatus = exitStatus } else { glog.Infof("Restarting service process in 10 seconds.") service = nil startAfter = time.After(time.Second * 10) } case <-startAfter: glog.Infof("Starting service process.") service, serviceExited = startService() if doRegisterEndpoints { reregister = registerExportedEndpoints(c, rpcDead) doRegisterEndpoints = false } startAfter = nil case <-reregister: reregister = registerExportedEndpoints(c, rpcDead) case <-rpcDead: glog.Infof("RPC Server has gone away, cleaning up") shutdownService(service, syscall.SIGTERM) case <-storageDead: glog.Infof("Distributed storage for service %s has gone away; shutting down", c.options.Service.ID) shutdownService(service, syscall.SIGTERM) } } // Signal to health check registry that this instance is giving up the ghost. client, err := node.NewLBClient(c.options.ServicedEndpoint) if err != nil { glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err) return nil } defer client.Close() c.Close() var unused int client.LogHealthCheck(domain.HealthCheckResult{c.options.Service.ID, c.options.Service.InstanceID, "__instance_shutdown", time.Now().String(), "passed"}, &unused) return nil }