Exemplo n.º 1
0
// kickOffHealthChecks asks the serviced endpoint for the health checks defined
// for this service instance and starts one handleHealthCheck goroutine per
// check. healthExit is passed through unchanged to every goroutine. Any
// failure (client creation, non-numeric instance id, RPC error) is logged and
// the function returns without starting anything.
func (c *Controller) kickOffHealthChecks(healthExit chan struct{}) {
	lb, err := node.NewLBClient(c.options.ServicedEndpoint)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err)
		return
	}
	defer lb.Close()

	// The RPC request wants the instance id as an integer.
	instanceID, err := strconv.Atoi(c.options.Service.InstanceID)
	if err != nil {
		glog.Errorf("Invalid instance from instanceID:%s", c.options.Service.InstanceID)
		return
	}

	var checks map[string]domain.HealthCheck
	request := node.HealthCheckRequest{c.options.Service.ID, instanceID}
	if err = lb.GetHealthCheck(request, &checks); err != nil {
		glog.Errorf("Error getting health checks: %s", err)
		return
	}

	for name, check := range checks {
		glog.Infof("Kicking off health check %s.", name)
		glog.Infof("Setting up health check: %s", check.Script)

		// A zero timeout means "not configured"; fall back to 30 seconds.
		limit := check.Timeout
		if limit == 0 {
			limit = 30 * time.Second
		}
		go c.handleHealthCheck(name, check.Script, check.Interval, limit, healthExit)
	}
}
Exemplo n.º 2
0
// sendLogMessage opens a client to the endpoint given by lbClientPort, sends
// serviceLogInfo through it and closes the client again. It returns the error
// from creating the client or from the send itself.
func sendLogMessage(lbClientPort string, serviceLogInfo node.ServiceLogInfo) error {
	lb, dialErr := node.NewLBClient(lbClientPort)
	if dialErr != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, dialErr)
		return dialErr
	}
	defer lb.Close()

	// The reply argument is not used by this call, hence nil.
	return lb.SendLogMessage(serviceLogInfo, nil)
}
Exemplo n.º 3
0
// getAgentHostID asks the agent reachable at lbClientPort for its host id.
// On any failure the error is logged and returned with an empty id.
func getAgentHostID(lbClientPort string) (string, error) {
	lb, err := node.NewLBClient(lbClientPort)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err)
		return "", err
	}
	defer lb.Close()

	var id string
	if err := lb.GetHostID(&id); err != nil {
		glog.Errorf("Error getting host id, error: %s", err)
		return "", err
	}

	glog.V(1).Infof("getAgentHostID: %s", id)
	return id, nil
}
Exemplo n.º 4
0
// getServiceTenantID looks up the tenant id of serviceID through the agent
// reachable at lbClientPort. On any failure the error is logged and returned
// with an empty tenant id.
func getServiceTenantID(lbClientPort string, serviceID string) (string, error) {
	lb, err := node.NewLBClient(lbClientPort)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err)
		return "", err
	}
	defer lb.Close()

	var tenant string
	if err := lb.GetTenantId(serviceID, &tenant); err != nil {
		glog.Errorf("Error getting service %s's tenantID, error: %s", serviceID, err)
		return "", err
	}

	glog.V(1).Infof("getServiceTenantID: service id=%s: %s", serviceID, tenant)
	return tenant, nil
}
Exemplo n.º 5
0
// getAgentZkInfo fetches the agent's node.ZkInfo (logged below as the
// zookeeper dsn/poolID) through the agent reachable at lbClientPort. On
// failure the error is logged and returned together with whatever the
// ZkInfo value holds at that point.
func getAgentZkInfo(lbClientPort string) (node.ZkInfo, error) {
	var info node.ZkInfo

	lb, err := node.NewLBClient(lbClientPort)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err)
		return info, err
	}
	defer lb.Close()

	if err = lb.GetZkInfo(&info); err != nil {
		glog.Errorf("Error getting zookeeper dsn/poolID, error: %s", err)
		return info, err
	}

	glog.V(1).Infof("GetZkInfo: %+v", info)
	return info, nil
}
Exemplo n.º 6
0
// rpcHealthCheck returns a channel that is closed when it is no longer
// possible to ping the RPC server at c.options.ServicedEndpoint.
//
// A background goroutine pings the server in a loop, passing time.Minute to
// each Ping call. The first failed ping closes the returned channel, releases
// the client and ends the goroutine. An error is returned only when the client
// cannot be created, in which case the channel is nil.
func (c *Controller) rpcHealthCheck() (chan struct{}, error) {
	client, err := node.NewLBClient(c.options.ServicedEndpoint)
	if err != nil {
		return nil, err
	}

	gone := make(chan struct{})
	go func() {
		// The goroutine owns the client; release it once pinging stops so
		// the connection is not leaked.
		defer client.Close()

		var ts time.Time
		for {
			if err := client.Ping(time.Minute, &ts); err != nil {
				close(gone)
				return
			}
		}
	}()
	return gone, nil
}
Exemplo n.º 7
0
// getService fetches the definition of instance instanceID of service
// serviceID through the agent reachable at lbClientPort. On any failure the
// error is logged and returned with a nil service.
func getService(lbClientPort string, serviceID string, instanceID int) (*service.Service, error) {
	lb, err := node.NewLBClient(lbClientPort)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err)
		return nil, err
	}
	defer lb.Close()

	result := new(service.Service)
	request := node.ServiceInstanceRequest{serviceID, instanceID}
	if err = lb.GetServiceInstance(request, result); err != nil {
		glog.Errorf("Error getting service %s  error: %s", serviceID, err)
		return nil, err
	}

	glog.V(1).Infof("getService: service id=%s: %+v", serviceID, *result)
	return result, nil
}
Exemplo n.º 8
0
// getServiceBindMounts fetches the bind mounts of serviceID through the agent
// reachable at lbClientPort. On any failure the error is logged and returned
// with a nil map. An RPC error whose text starts with "rpc: can't find
// service" gets a dedicated log message instead of the generic one.
func getServiceBindMounts(lbClientPort string, serviceID string) (map[string]string, error) {
	lb, err := node.NewLBClient(lbClientPort)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", lbClientPort, err)
		return nil, err
	}
	defer lb.Close()

	var mounts map[string]string
	if err = lb.GetServiceBindMounts(serviceID, &mounts); err != nil {
		// Both failure flavours return the same error; only the log differs.
		if strings.HasPrefix(err.Error(), "rpc: can't find service") {
			glog.Errorf("`serviced service shell` is available only when running serviced in agent mode")
		} else {
			glog.Errorf("Error getting service %s's bindmounts, error: %s", serviceID, err)
		}
		return nil, err
	}

	glog.V(1).Infof("getServiceBindMounts: service id=%s: %s", serviceID, mounts)
	return mounts, nil
}
Exemplo n.º 9
0
// registerExportedEndpoints keeps calling c.registerExportedEndpoints until it
// succeeds, then returns the channel produced by c.watchregistry().
//
// After every failed attempt the error is forwarded as a service log message
// on a best-effort basis, the loop waits one second, and the error is logged
// locally. If closing fires during that wait the function gives up and
// returns nil.
func registerExportedEndpoints(c *Controller, closing chan struct{}) <-chan struct{} {
	for {
		regErr := c.registerExportedEndpoints()
		if regErr == nil {
			return c.watchregistry()
		}

		// Best effort: report the failure through a throw-away client.
		lb, dialErr := node.NewLBClient(c.options.ServicedEndpoint)
		if dialErr == nil {
			report := node.ServiceLogInfo{
				ServiceID: c.options.Service.ID,
				Message:   fmt.Sprintf("error registering exported endpoints: %s", regErr),
			}
			lb.SendLogMessage(report, nil)
			lb.Close()
		} else {
			glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, dialErr)
		}

		retry := time.After(time.Second)
		select {
		case <-retry:
		case <-closing:
			return nil
		}
		glog.Errorf("could not register exported expoints: %s", regErr)
	}
}
Exemplo n.º 10
0
// handleControlCenterImports retrieves the service endpoints for this service
// via GetServiceEndpoints, re-keys them by tenant/application and, for every
// key starting with "<tenantID>_controlplane", sets the proxy addresses and
// records the endpoint in c.importedEndpoints.
//
// The retrieval is retried once per second in a background goroutine. The
// function returns ErrNoServiceEndpoints if no result arrives within one
// minute, and an error if rpcdead fires first. In both of those cases it
// tells the goroutine to stop and waits for it to finish, so the client is
// never used after it has been closed.
func (c *Controller) handleControlCenterImports(rpcdead chan struct{}) error {
	// this function is currently needed to handle special control center imports
	// from GetServiceEndpoints() that does not exist in endpoints from getServiceState

	// get service endpoints
	client, err := node.NewLBClient(c.options.ServicedEndpoint)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err)
		return err
	}
	defer client.Close()

	// TODO: instead of getting all endpoints, via GetServiceEndpoints(), create a new call
	//       that returns only special "controlplane" imported endpoints
	//	Note: GetServiceEndpoints has been modified to return only special controlplane endpoints.
	//		We should rename it and clean up the filtering code below.

	// epchan is buffered so the goroutine can always deliver its single result
	// without blocking, even if nobody is listening any more. It is never
	// closed, so a late send cannot panic.
	epchan := make(chan map[string][]dao.ApplicationEndpoint, 1)
	// giveUp is closed (never sent on) to cancel the goroutine; closing cannot
	// block regardless of what the goroutine is doing at that moment.
	giveUp := make(chan struct{})
	// finished is closed by the goroutine when it exits.
	finished := make(chan struct{})

	go func(lb *node.LBClient, svcid string) {
		defer close(finished)

		var fetched map[string][]dao.ApplicationEndpoint
		for {
			// Keep the error local: the enclosing function's err must not
			// be written from this goroutine.
			fetchErr := lb.GetServiceEndpoints(svcid, &fetched)
			if fetchErr == nil {
				break
			}
			select {
			case <-time.After(1 * time.Second):
				glog.V(3).Info("Couldn't retrieve service endpoints, trying again")
			case <-giveUp:
				glog.V(3).Info("Timed out trying to retrieve service endpoints")
				return
			}
		}
		epchan <- fetched
	}(client, c.options.Service.ID)

	// abandon cancels the retrieval and waits until the goroutine has stopped
	// touching the client.
	abandon := func() {
		close(giveUp)
		<-finished
	}

	var endpoints map[string][]dao.ApplicationEndpoint
	select {
	case <-time.After(1 * time.Minute):
		abandon()
		client.SendLogMessage(node.ServiceLogInfo{ServiceID: c.options.Service.ID, Message: "unable to retrieve service endpoints"}, nil)
		return ErrNoServiceEndpoints
	case <-rpcdead:
		abandon()
		return fmt.Errorf("RPC Service has gone away")
	case endpoints = <-epchan:
		glog.Infof("Got service endpoints for %s: %+v", c.options.Service.ID, endpoints)
	}

	// convert keys set by GetServiceEndpoints to tenantID_endpointID
	rekeyed := make(map[string][]dao.ApplicationEndpoint)
	for key, endpointList := range endpoints {
		if len(endpointList) == 0 {
			glog.Warningf("ignoring key: %s with empty endpointList", key)
			continue
		}

		tenantEndpointID := registry.TenantEndpointKey(c.tenantID, endpointList[0].Application)
		glog.Infof("changing key from %s to %s: %+v", key, tenantEndpointID, endpointList[0])
		rekeyed[tenantEndpointID] = endpointList
	}
	endpoints = rekeyed

	ccEndpointPurpose := "import" // Punting on control center dynamic imports for now

	// Only keys with this prefix are special controlplane imports.
	controlplanePrefix := fmt.Sprintf("%s_controlplane", c.tenantID)

	for key, endpointList := range endpoints {
		// ignore endpoints that are not special controlplane imports
		if !strings.HasPrefix(key, controlplanePrefix) {
			continue
		}

		// Every list in the re-keyed map is non-empty, so [0] is safe.
		first := endpointList[0]

		// set proxy addresses
		c.setProxyAddresses(key, endpointList, first.VirtualAddress, ccEndpointPurpose)

		// add/replace entries in importedEndpoints
		instanceIDStr := fmt.Sprintf("%d", first.InstanceID)
		setImportedEndpoint(&c.importedEndpoints, c.tenantID,
			first.Application, instanceIDStr,
			first.VirtualAddress, ccEndpointPurpose,
			first.ContainerPort)

		// TODO: agent needs to register controlplane and controlplane_consumer
		//       but don't do that here in the container code
	}

	return nil
}
Exemplo n.º 11
0
// handleHealthCheck runs a single named health check until exitChannel fires.
//
// The script text is written to an executable temporary file, which is removed
// when the function returns. After every interval the file is run through
// "sh -c" in its own process group. The outcome is reported via
// LogHealthCheck: "passed" when the command exits without error, "failed" when
// it exits with an error or does not finish within timeout. A command that
// times out, or is still running when exitChannel fires, has its process
// group killed.
func (c *Controller) handleHealthCheck(name string, script string, interval, timeout time.Duration, exitChannel chan struct{}) {
	lb, err := node.NewLBClient(c.options.ServicedEndpoint)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err)
		return
	}
	defer lb.Close()

	tmp, err := ioutil.TempFile("", name)
	if err != nil {
		glog.Errorf("Error creating temporary file for health check %s: %s", name, err)
		return
	}
	scriptPath := tmp.Name()
	defer tmp.Close()
	defer os.Remove(scriptPath)

	if err = ioutil.WriteFile(scriptPath, []byte(script), os.FileMode(0777)); err != nil {
		glog.Errorf("Error writing script for health check %s: %s", name, err)
		return
	}
	// Close the handle before the script is executed.
	tmp.Close()
	// The file already existed when it was written, so the mode is set
	// explicitly here.
	if err = os.Chmod(scriptPath, os.FileMode(0777)); err != nil {
		glog.Errorf("Error setting script executable for health check %s: %s", name, err)
		return
	}

	var unused int
	killGrace := 10 * time.Second

	for {
		select {
		case <-exitChannel:
			return
		case <-time.After(interval):
		}

		check := exec.Command("sh", "-c", scriptPath)
		check.SysProcAttr = &syscall.SysProcAttr{Setpgid: true, Pdeathsig: syscall.SIGTERM}
		if startErr := check.Start(); startErr != nil {
			glog.Errorf("Could not run cmd %v: %s", check, startErr)
			continue
		}

		// Buffered so the waiter goroutine can finish even when nobody
		// reads the result (timeout or exit).
		finished := make(chan error, 1)
		go func() { finished <- check.Wait() }()

		select {
		case runErr := <-finished:
			if runErr != nil {
				glog.Warningf("Health check %s failed.", name)
				lb.LogHealthCheck(domain.HealthCheckResult{c.options.Service.ID, c.options.Service.InstanceID, name, time.Now().String(), "failed"}, &unused)
			} else {
				glog.V(4).Infof("Health check %s succeeded.", name)
				lb.LogHealthCheck(domain.HealthCheckResult{c.options.Service.ID, c.options.Service.InstanceID, name, time.Now().String(), "passed"}, &unused)
			}
		case <-exitChannel:
			proc.KillGroup(check.Process.Pid, killGrace)
			return
		case <-time.After(timeout):
			proc.KillGroup(check.Process.Pid, killGrace)
			glog.Warningf("Health check %s timeout.", name)
			lb.LogHealthCheck(domain.HealthCheckResult{c.options.Service.ID, c.options.Service.InstanceID, name, time.Now().String(), "failed"}, &unused)
		}
	}
}
Exemplo n.º 12
0
// Run executes the controller's main loop and blocks until the service exits
// according to its restart policy or Close() is called.
//
// It writes the CONTROLPLANE_* environment to the env file, sets up the RPC
// and storage health checks, handles the Control Center imports, starts the
// prerequisite checker, zombie reaper and health checks, and then multiplexes
// over OS signals, service exit, (re)start timers, endpoint re-registration
// and the "RPC gone" / "storage gone" notifications. The exit status of the
// service is stored in c.exitStatus. Setup failures are returned as errors;
// once the main loop has run the function returns nil.
func (c *Controller) Run() (err error) {
	defer c.shutdown()
	sigc := make(chan os.Signal, 1)
	signal.Notify(sigc,
		syscall.SIGINT,
		syscall.SIGTERM,
		syscall.SIGQUIT)

	env := os.Environ()
	env = append(env, "CONTROLPLANE=1")
	env = append(env, fmt.Sprintf("CONTROLPLANE_CONSUMER_URL=http://localhost%s/api/metrics/store", c.options.Metric.Address))
	env = append(env, fmt.Sprintf("CONTROLPLANE_HOST_ID=%s", c.hostID))
	env = append(env, fmt.Sprintf("CONTROLPLANE_TENANT_ID=%s", c.tenantID))
	env = append(env, fmt.Sprintf("CONTROLPLANE_INSTANCE_ID=%s", c.options.Service.InstanceID))
	env = append(env, fmt.Sprintf("CONTROLPLANE_SERVICED_ID=%s", c.options.Service.ID))

	if err := writeEnvFile(env); err != nil {
		return err
	}

	// The service command is run through /bin/sh; "exec" replaces the shell
	// with the command.
	args := []string{"-c", "exec " + strings.Join(c.options.Service.Command, " ")}

	startService := func() (*subprocess.Instance, chan error) {
		// NOTE(review): the error returned by subprocess.New is discarded
		// here, as in the original code — confirm whether a failed start
		// needs handling.
		service, serviceExited, _ := subprocess.New(time.Second*10, env, "/bin/sh", args...)
		return service, serviceExited
	}

	// sendSignal delivers sig via the PID file when one is configured,
	// otherwise directly to the subprocess. It reports false when there is
	// nothing to signal.
	sendSignal := func(service *subprocess.Instance, sig os.Signal) bool {
		switch {
		case c.PIDFile != "":
			c.forwardSignal(sig)
		case service != nil:
			service.Notify(sig)
		default:
			return false
		}
		return true
	}

	rpcDead, err := c.rpcHealthCheck()
	if err != nil {
		glog.Errorf("Could not setup RPC ping check: %s", err)
		return err
	}

	storageDead, err := c.storageHealthCheck()
	if err != nil {
		glog.Errorf("Could not set up storage check: %s", err)
		return err
	}

	prereqsPassed := make(chan bool)
	var startAfter <-chan time.Time
	var exitAfter <-chan time.Time
	var service *subprocess.Instance
	serviceExited := make(chan error, 1)
	c.watchRemotePorts()
	if err := c.handleControlCenterImports(rpcDead); err != nil {
		glog.Error("Could not setup Control Center specific imports: ", err)
		return err
	}
	go c.checkPrereqs(prereqsPassed, rpcDead)
	go c.reapZombies(rpcDead)

	// healthExit stops the health check goroutines when it is closed. It can
	// be closed from shutdownService (possibly more than once) and again on
	// return, and closing a closed channel panics, so every close goes
	// through this guard. All callers run on this goroutine, so a plain flag
	// is sufficient.
	healthExit := make(chan struct{})
	healthExitClosed := false
	closeHealthExit := func() {
		if !healthExitClosed {
			healthExitClosed = true
			close(healthExit)
		}
	}
	defer closeHealthExit()
	c.kickOffHealthChecks(healthExit)
	doRegisterEndpoints := true
	exited := false

	// shutdownService disables autorestart and signals the service. When the
	// signal was delivered it disables the other event sources (a nil channel
	// never fires in a select) and arms a 30 second kill timer; otherwise it
	// ends the main loop with exit status 1.
	var shutdownService = func(service *subprocess.Instance, sig os.Signal) {
		c.options.Service.Autorestart = false
		if sendSignal(service, sig) {
			sigc = nil
			prereqsPassed = nil
			startAfter = nil
			rpcDead = nil
			exitAfter = time.After(time.Second * 30)
			closeHealthExit()
		} else {
			c.exitStatus = 1
			exited = true
		}
	}

	var reregister <-chan struct{}

	for !exited {
		select {
		case sig := <-sigc:
			glog.Infof("Notifying subprocess of signal %v", sig)
			shutdownService(service, sig)

		case <-exitAfter:
			glog.Infof("Killing unresponsive subprocess")
			sendSignal(service, syscall.SIGKILL)
			c.exitStatus = 1
			exited = true

		case <-prereqsPassed:
			startAfter = time.After(time.Millisecond * 1)
			prereqsPassed = nil

		case exitError := <-serviceExited:
			if !c.options.Service.Autorestart {
				exitStatus, _ := utils.GetExitStatus(exitError)
				if c.options.Logforwarder.Enabled {
					time.Sleep(c.options.Logforwarder.SettleTime)
				}
				glog.Infof("Service Exited with status:%d due to %+v", exitStatus, exitError)
				//set loop to end
				exited = true
				//exit with exit code, defer so that other cleanup can happen
				c.exitStatus = exitStatus

			} else {
				glog.Infof("Restarting service process in 10 seconds.")
				service = nil
				startAfter = time.After(time.Second * 10)
			}

		case <-startAfter:
			glog.Infof("Starting service process.")
			service, serviceExited = startService()
			if doRegisterEndpoints {
				reregister = registerExportedEndpoints(c, rpcDead)
				doRegisterEndpoints = false
			}
			startAfter = nil
		case <-reregister:
			reregister = registerExportedEndpoints(c, rpcDead)
		case <-rpcDead:
			glog.Infof("RPC Server has gone away, cleaning up")
			shutdownService(service, syscall.SIGTERM)
		case <-storageDead:
			glog.Infof("Distributed storage for service %s has gone away; shutting down", c.options.Service.ID)
			shutdownService(service, syscall.SIGTERM)
		}
	}
	// Signal to health check registry that this instance is giving up the ghost.
	client, err := node.NewLBClient(c.options.ServicedEndpoint)
	if err != nil {
		glog.Errorf("Could not create a client to endpoint: %s, %s", c.options.ServicedEndpoint, err)
		return nil
	}
	defer client.Close()
	c.Close()
	var unused int
	client.LogHealthCheck(domain.HealthCheckResult{c.options.Service.ID, c.options.Service.InstanceID, "__instance_shutdown", time.Now().String(), "passed"}, &unused)
	return nil
}