// EnsureCheck is used to create a check or updates it's state func (s *StateStore) EnsureCheck(index uint64, check *structs.HealthCheck) error { // Ensure we have a status if check.Status == "" { check.Status = structs.HealthUnknown } // Start the txn tx, err := s.tables.StartTxn(false) if err != nil { panic(fmt.Errorf("Failed to start txn: %v", err)) } defer tx.Abort() // Ensure the node exists res, err := s.nodeTable.GetTxn(tx, "id", check.Node) if err != nil { return err } if len(res) == 0 { return fmt.Errorf("Missing node registration") } // Ensure the service exists if specified if check.ServiceID != "" { res, err = s.serviceTable.GetTxn(tx, "id", check.Node, check.ServiceID) if err != nil { return err } if len(res) == 0 { return fmt.Errorf("Missing service registration") } // Ensure we set the correct service srv := res[0].(*structs.ServiceNode) check.ServiceName = srv.ServiceName } // Invalidate any sessions if status is critical if check.Status == structs.HealthCritical { err := s.invalidateCheck(index, tx, check.Node, check.CheckID) if err != nil { return err } } // Ensure the check is set if err := s.checkTable.InsertTxn(tx, check); err != nil { return err } if err := s.checkTable.SetLastIndexTxn(tx, index); err != nil { return err } tx.Defer(func() { s.watch[s.checkTable].Notify() }) return tx.Commit() }
// AddCheck is used to add a health check to the agent. // This entry is persistent and the agent will make a best effort to // ensure it is registered. The Check may include a CheckType which // is used to automatically update the check status func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist bool, token string) error { if check.CheckID == "" { return fmt.Errorf("CheckID missing") } if chkType != nil && !chkType.Valid() { return fmt.Errorf("Check type is not valid") } if check.ServiceID != "" { svc, ok := a.state.Services()[check.ServiceID] if !ok { return fmt.Errorf("ServiceID %q does not exist", check.ServiceID) } check.ServiceName = svc.Service } a.checkLock.Lock() defer a.checkLock.Unlock() // Check if already registered if chkType != nil { if chkType.IsTTL() { if existing, ok := a.checkTTLs[check.CheckID]; ok { existing.Stop() } ttl := &CheckTTL{ Notify: &a.state, CheckID: check.CheckID, TTL: chkType.TTL, Logger: a.logger, } // Restore persisted state, if any if err := a.loadCheckState(check); err != nil { a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s", check.CheckID, err) } ttl.Start() a.checkTTLs[check.CheckID] = ttl } else if chkType.IsHTTP() { if existing, ok := a.checkHTTPs[check.CheckID]; ok { existing.Stop() } if chkType.Interval < MinInterval { a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", check.CheckID, MinInterval)) chkType.Interval = MinInterval } http := &CheckHTTP{ Notify: &a.state, CheckID: check.CheckID, HTTP: chkType.HTTP, Interval: chkType.Interval, Timeout: chkType.Timeout, Logger: a.logger, } http.Start() a.checkHTTPs[check.CheckID] = http } else { if existing, ok := a.checkMonitors[check.CheckID]; ok { existing.Stop() } if chkType.Interval < MinInterval { a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", check.CheckID, MinInterval)) chkType.Interval = MinInterval } monitor := &CheckMonitor{ Notify: &a.state, CheckID: check.CheckID, Script: chkType.Script, Interval: chkType.Interval, Logger: a.logger, } monitor.Start() a.checkMonitors[check.CheckID] = monitor } } // Add to the local state for anti-entropy a.state.AddCheck(check, token) // Persist the check if persist { return a.persistCheck(check, chkType) } return nil }
// AddCheck is used to add a health check to the agent. // This entry is persistent and the agent will make a best effort to // ensure it is registered. The Check may include a CheckType which // is used to automatically update the check status func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist bool, token string) error { if check.CheckID == "" { return fmt.Errorf("CheckID missing") } if chkType != nil && !chkType.Valid() { return fmt.Errorf("Check type is not valid") } if check.ServiceID != "" { svc, ok := a.state.Services()[check.ServiceID] if !ok { return fmt.Errorf("ServiceID %q does not exist", check.ServiceID) } check.ServiceName = svc.Service } a.checkLock.Lock() defer a.checkLock.Unlock() // Check if already registered if chkType != nil { if chkType.IsTTL() { if existing, ok := a.checkTTLs[check.CheckID]; ok { existing.Stop() } ttl := &CheckTTL{ Notify: &a.state, CheckID: check.CheckID, TTL: chkType.TTL, Logger: a.logger, } // Restore persisted state, if any if err := a.loadCheckState(check); err != nil { a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s", check.CheckID, err) } ttl.Start() a.checkTTLs[check.CheckID] = ttl } else if chkType.IsHTTP() { if existing, ok := a.checkHTTPs[check.CheckID]; ok { existing.Stop() } if chkType.Interval < MinInterval { a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", check.CheckID, MinInterval)) chkType.Interval = MinInterval } http := &CheckHTTP{ Notify: &a.state, CheckID: check.CheckID, HTTP: chkType.HTTP, Interval: chkType.Interval, Timeout: chkType.Timeout, Logger: a.logger, } http.Start() a.checkHTTPs[check.CheckID] = http } else if chkType.IsTCP() { if existing, ok := a.checkTCPs[check.CheckID]; ok { existing.Stop() } if chkType.Interval < MinInterval { a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", check.CheckID, MinInterval)) chkType.Interval = MinInterval } tcp := &CheckTCP{ Notify: &a.state, CheckID: check.CheckID, TCP: chkType.TCP, Interval: chkType.Interval, Timeout: chkType.Timeout, Logger: a.logger, } tcp.Start() a.checkTCPs[check.CheckID] = tcp } else if chkType.IsDocker() { if existing, ok := a.checkDockers[check.CheckID]; ok { existing.Stop() } if chkType.Interval < MinInterval { a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", check.CheckID, MinInterval)) chkType.Interval = MinInterval } dockerCheck := &CheckDocker{ Notify: &a.state, CheckID: check.CheckID, DockerContainerID: chkType.DockerContainerID, Shell: chkType.Shell, Script: chkType.Script, Interval: chkType.Interval, Logger: a.logger, } if err := dockerCheck.Init(); err != nil { return err } dockerCheck.Start() a.checkDockers[check.CheckID] = dockerCheck } else if chkType.IsMonitor() { if existing, ok := a.checkMonitors[check.CheckID]; ok { existing.Stop() } if chkType.Interval < MinInterval { a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", check.CheckID, MinInterval)) chkType.Interval = MinInterval } monitor := &CheckMonitor{ Notify: &a.state, CheckID: check.CheckID, Script: chkType.Script, Interval: chkType.Interval, Timeout: chkType.Timeout, Logger: a.logger, ReapLock: &a.reapLock, } monitor.Start() a.checkMonitors[check.CheckID] = monitor } else { return fmt.Errorf("Check type is not valid") } if chkType.DeregisterCriticalServiceAfter > 0 { timeout := chkType.DeregisterCriticalServiceAfter if timeout < a.config.CheckDeregisterIntervalMin { timeout = a.config.CheckDeregisterIntervalMin a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has deregister interval below minimum of %v", check.CheckID, a.config.CheckDeregisterIntervalMin)) } a.checkReapAfter[check.CheckID] = timeout } else { delete(a.checkReapAfter, check.CheckID) } } // Add to the local state for anti-entropy a.state.AddCheck(check, token) // Persist the check if persist && !a.config.DevMode { return a.persistCheck(check, chkType) } return nil }
// ensureCheckTransaction is used as the inner method to handle inserting // a health check into the state store. It ensures safety against inserting // checks with no matching node or service. func (s *StateStore) ensureCheckTxn(tx *memdb.Txn, idx uint64, watches *DumbWatchManager, hc *structs.HealthCheck) error { // Check if we have an existing health check existing, err := tx.First("checks", "id", hc.Node, string(hc.CheckID)) if err != nil { return fmt.Errorf("failed health check lookup: %s", err) } // Set the indexes if existing != nil { hc.CreateIndex = existing.(*structs.HealthCheck).CreateIndex hc.ModifyIndex = idx } else { hc.CreateIndex = idx hc.ModifyIndex = idx } // Use the default check status if none was provided if hc.Status == "" { hc.Status = structs.HealthCritical } // Get the node node, err := tx.First("nodes", "id", hc.Node) if err != nil { return fmt.Errorf("failed node lookup: %s", err) } if node == nil { return ErrMissingNode } // If the check is associated with a service, check that we have // a registration for the service. if hc.ServiceID != "" { service, err := tx.First("services", "id", hc.Node, hc.ServiceID) if err != nil { return fmt.Errorf("failed service lookup: %s", err) } if service == nil { return ErrMissingService } // Copy in the service name hc.ServiceName = service.(*structs.ServiceNode).ServiceName } // Delete any sessions for this check if the health is critical. if hc.Status == structs.HealthCritical { mappings, err := tx.Get("session_checks", "node_check", hc.Node, string(hc.CheckID)) if err != nil { return fmt.Errorf("failed session checks lookup: %s", err) } var ids []string for mapping := mappings.Next(); mapping != nil; mapping = mappings.Next() { ids = append(ids, mapping.(*sessionCheck).Session) } // Delete the session in a separate loop so we don't trash the // iterator. watches := NewDumbWatchManager(s.tableWatches) for _, id := range ids { if err := s.deleteSessionTxn(tx, idx, watches, id); err != nil { return fmt.Errorf("failed deleting session: %s", err) } } tx.Defer(func() { watches.Notify() }) } // Persist the check registration in the db. if err := tx.Insert("checks", hc); err != nil { return fmt.Errorf("failed inserting check: %s", err) } if err := tx.Insert("index", &IndexEntry{"checks", idx}); err != nil { return fmt.Errorf("failed updating index: %s", err) } watches.Arm("checks") return nil }