// loadCheckState is used to restore the persisted state of a check. func (a *Agent) loadCheckState(check *structs.HealthCheck) error { // Try to read the persisted state for this check file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID)) buf, err := ioutil.ReadFile(file) if err != nil { if os.IsNotExist(err) { return nil } return fmt.Errorf("failed reading file %q: %s", file, err) } // Decode the state data var p persistedCheckState if err := json.Unmarshal(buf, &p); err != nil { return fmt.Errorf("failed decoding check state: %s", err) } // Check if the state has expired if time.Now().Unix() >= p.Expires { a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID) return a.purgeCheckState(check.CheckID) } // Restore the fields from the state check.Output = p.Output check.Status = p.Status return nil }
// AddCheck is used to add a health check to the local state. // This entry is persistent and the agent will make a best effort to // ensure it is registered func (l *localState) AddCheck(check *structs.HealthCheck, token string) { // Set the node name check.Node = l.config.NodeName l.Lock() defer l.Unlock() l.checks[check.CheckID] = check l.checkStatus[check.CheckID] = syncStatus{} l.checkTokens[check.CheckID] = token l.changeMade() }
func TestAgentAntiEntropy_Checks(t *testing.T) { conf := nextConfig() dir, agent := makeAgent(t, conf) defer os.RemoveAll(dir) defer agent.Shutdown() testutil.WaitForLeader(t, agent.RPC, "dc1") // Register info args := &structs.RegisterRequest{ Datacenter: "dc1", Node: agent.config.NodeName, Address: "127.0.0.1", } // Exists both, same (noop) var out struct{} chk1 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "mysql", Name: "mysql", Status: structs.HealthPassing, } agent.state.AddCheck(chk1, "") args.Check = chk1 if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Exists both, different (update) chk2 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "redis", Name: "redis", Status: structs.HealthPassing, } agent.state.AddCheck(chk2, "") chk2_mod := new(structs.HealthCheck) *chk2_mod = *chk2 chk2_mod.Status = structs.HealthCritical args.Check = chk2_mod if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Exists local (create) chk3 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "web", Name: "web", Status: structs.HealthPassing, } agent.state.AddCheck(chk3, "") // Exists remote (delete) chk4 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "lb", Name: "lb", Status: structs.HealthPassing, } args.Check = chk4 if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Exists local, in sync, remote missing (create) chk5 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "cache", Name: "cache", Status: structs.HealthPassing, } agent.state.AddCheck(chk5, "") agent.state.checkStatus["cache"] = syncStatus{inSync: true} // Trigger anti-entropy run and wait agent.StartSync() time.Sleep(200 * time.Millisecond) // Verify that we are in sync req := structs.NodeSpecificRequest{ Datacenter: "dc1", Node: agent.config.NodeName, } var checks structs.IndexedHealthChecks if err := agent.RPC("Health.NodeChecks", &req, &checks); err != nil { t.Fatalf("err: %v", err) } // We should have 5 checks (serf included) if len(checks.HealthChecks) != 5 { t.Fatalf("bad: %v", checks) } // All the checks should match for _, chk := range checks.HealthChecks { switch chk.CheckID { case "mysql": if !reflect.DeepEqual(chk, chk1) { t.Fatalf("bad: %v %v", chk, chk1) } case "redis": if !reflect.DeepEqual(chk, chk2) { t.Fatalf("bad: %v %v", chk, chk2) } case "web": if !reflect.DeepEqual(chk, chk3) { t.Fatalf("bad: %v %v", chk, chk3) } case "cache": if !reflect.DeepEqual(chk, chk5) { t.Fatalf("bad: %v %v", chk, chk5) } case "serfHealth": // ignore default: t.Fatalf("unexpected check: %v", chk) } } // Check the local state if len(agent.state.checks) != 4 { t.Fatalf("bad: %v", agent.state.checks) } if len(agent.state.checkStatus) != 4 { t.Fatalf("bad: %v", agent.state.checkStatus) } for name, status := range agent.state.checkStatus { if !status.inSync { t.Fatalf("should be in sync: %v %v", name, status) } } }
// AddCheck is used to add a health check to the agent. // This entry is persistent and the agent will make a best effort to // ensure it is registered. The Check may include a CheckType which // is used to automatically update the check status func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist bool, token string) error { if check.CheckID == "" { return fmt.Errorf("CheckID missing") } if chkType != nil && !chkType.Valid() { return fmt.Errorf("Check type is not valid") } if check.ServiceID != "" { svc, ok := a.state.Services()[check.ServiceID] if !ok { return fmt.Errorf("ServiceID %q does not exist", check.ServiceID) } check.ServiceName = svc.Service } a.checkLock.Lock() defer a.checkLock.Unlock() // Check if already registered if chkType != nil { if chkType.IsTTL() { if existing, ok := a.checkTTLs[check.CheckID]; ok { existing.Stop() } ttl := &CheckTTL{ Notify: &a.state, CheckID: check.CheckID, TTL: chkType.TTL, Logger: a.logger, } // Restore persisted state, if any if err := a.loadCheckState(check); err != nil { a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s", check.CheckID, err) } ttl.Start() a.checkTTLs[check.CheckID] = ttl } else if chkType.IsHTTP() { if existing, ok := a.checkHTTPs[check.CheckID]; ok { existing.Stop() } if chkType.Interval < MinInterval { a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", check.CheckID, MinInterval)) chkType.Interval = MinInterval } http := &CheckHTTP{ Notify: &a.state, CheckID: check.CheckID, HTTP: chkType.HTTP, Interval: chkType.Interval, Timeout: chkType.Timeout, Logger: a.logger, } http.Start() a.checkHTTPs[check.CheckID] = http } else { if existing, ok := a.checkMonitors[check.CheckID]; ok { existing.Stop() } if chkType.Interval < MinInterval { a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", check.CheckID, MinInterval)) chkType.Interval = MinInterval } monitor := &CheckMonitor{ Notify: &a.state, CheckID: check.CheckID, Script: chkType.Script, Interval: chkType.Interval, Logger: a.logger, } monitor.Start() a.checkMonitors[check.CheckID] = monitor } } // Add to the local state for anti-entropy a.state.AddCheck(check, token) // Persist the check if persist { return a.persistCheck(check, chkType) } return nil }
// setSyncState does a read of the server state, and updates // the local syncStatus as appropriate func (l *localState) setSyncState() error { req := structs.NodeSpecificRequest{ Datacenter: l.config.Datacenter, Node: l.config.NodeName, QueryOptions: structs.QueryOptions{Token: l.config.ACLToken}, } var out1 structs.IndexedNodeServices var out2 structs.IndexedHealthChecks if e := l.iface.RPC("Catalog.NodeServices", &req, &out1); e != nil { return e } if err := l.iface.RPC("Health.NodeChecks", &req, &out2); err != nil { return err } checks := out2.HealthChecks l.Lock() defer l.Unlock() services := make(map[string]*structs.NodeService) if out1.NodeServices != nil { services = out1.NodeServices.Services } for id, _ := range l.services { // If the local service doesn't exist remotely, then sync it if _, ok := services[id]; !ok { l.serviceStatus[id] = syncStatus{inSync: false} } } for id, service := range services { // If we don't have the service locally, deregister it existing, ok := l.services[id] if !ok { l.serviceStatus[id] = syncStatus{remoteDelete: true} continue } // If our definition is different, we need to update it equal := reflect.DeepEqual(existing, service) l.serviceStatus[id] = syncStatus{inSync: equal} } for id, _ := range l.checks { // Sync any check which doesn't exist on the remote side found := false for _, check := range checks { if check.CheckID == id { found = true break } } if !found { l.checkStatus[id] = syncStatus{inSync: false} } } for _, check := range checks { // If we don't have the check locally, deregister it id := check.CheckID existing, ok := l.checks[id] if !ok { // The Serf check is created automatically, and does not // need to be registered if id == consul.SerfCheckID { continue } l.checkStatus[id] = syncStatus{remoteDelete: true} continue } // If our definition is different, we need to update it var equal bool if l.config.CheckUpdateInterval == 0 { equal = reflect.DeepEqual(existing, check) } else { eCopy := new(structs.HealthCheck) *eCopy = *existing eCopy.Output = "" check.Output = "" equal = reflect.DeepEqual(eCopy, check) } // Update the status l.checkStatus[id] = syncStatus{inSync: equal} } return nil }