// loadCheckState is used to restore the persisted state of a check. func (a *Agent) loadCheckState(check *structs.HealthCheck) error { // Try to read the persisted state for this check file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID)) buf, err := ioutil.ReadFile(file) if err != nil { if os.IsNotExist(err) { return nil } return fmt.Errorf("failed reading file %q: %s", file, err) } // Decode the state data var p persistedCheckState if err := json.Unmarshal(buf, &p); err != nil { return fmt.Errorf("failed decoding check state: %s", err) } // Check if the state has expired if time.Now().Unix() >= p.Expires { a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID) return a.purgeCheckState(check.CheckID) } // Restore the fields from the state check.Output = p.Output check.Status = p.Status return nil }
// EnsureCheck is used to create a check or updates it's state func (s *StateStore) EnsureCheck(index uint64, check *structs.HealthCheck) error { // Ensure we have a status if check.Status == "" { check.Status = structs.HealthUnknown } // Start the txn tx, err := s.tables.StartTxn(false) if err != nil { panic(fmt.Errorf("Failed to start txn: %v", err)) } defer tx.Abort() // Ensure the node exists res, err := s.nodeTable.GetTxn(tx, "id", check.Node) if err != nil { return err } if len(res) == 0 { return fmt.Errorf("Missing node registration") } // Ensure the service exists if specified if check.ServiceID != "" { res, err = s.serviceTable.GetTxn(tx, "id", check.Node, check.ServiceID) if err != nil { return err } if len(res) == 0 { return fmt.Errorf("Missing service registration") } // Ensure we set the correct service srv := res[0].(*structs.ServiceNode) check.ServiceName = srv.ServiceName } // Invalidate any sessions if status is critical if check.Status == structs.HealthCritical { err := s.invalidateCheck(index, tx, check.Node, check.CheckID) if err != nil { return err } } // Ensure the check is set if err := s.checkTable.InsertTxn(tx, check); err != nil { return err } if err := s.checkTable.SetLastIndexTxn(tx, index); err != nil { return err } tx.Defer(func() { s.watch[s.checkTable].Notify() }) return tx.Commit() }
func TestAgentAntiEntropy_Checks(t *testing.T) { conf := nextConfig() dir, agent := makeAgent(t, conf) defer os.RemoveAll(dir) defer agent.Shutdown() testutil.WaitForLeader(t, agent.RPC, "dc1") // Register info args := &structs.RegisterRequest{ Datacenter: "dc1", Node: agent.config.NodeName, Address: "127.0.0.1", } // Exists both, same (noop) var out struct{} chk1 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "mysql", Name: "mysql", Status: structs.HealthPassing, } agent.state.AddCheck(chk1, "") args.Check = chk1 if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Exists both, different (update) chk2 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "redis", Name: "redis", Status: structs.HealthPassing, } agent.state.AddCheck(chk2, "") chk2_mod := new(structs.HealthCheck) *chk2_mod = *chk2 chk2_mod.Status = structs.HealthCritical args.Check = chk2_mod if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Exists local (create) chk3 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "web", Name: "web", Status: structs.HealthPassing, } agent.state.AddCheck(chk3, "") // Exists remote (delete) chk4 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "lb", Name: "lb", Status: structs.HealthPassing, } args.Check = chk4 if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Exists local, in sync, remote missing (create) chk5 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "cache", Name: "cache", Status: structs.HealthPassing, } agent.state.AddCheck(chk5, "") agent.state.checkStatus["cache"] = syncStatus{inSync: true} // Trigger anti-entropy run and wait agent.StartSync() time.Sleep(200 * time.Millisecond) // Verify that we are in sync req := structs.NodeSpecificRequest{ Datacenter: "dc1", Node: agent.config.NodeName, } var checks structs.IndexedHealthChecks if err := agent.RPC("Health.NodeChecks", &req, &checks); err != nil { t.Fatalf("err: %v", err) } // We should have 5 checks (serf included) if len(checks.HealthChecks) != 5 { t.Fatalf("bad: %v", checks) } // All the checks should match for _, chk := range checks.HealthChecks { chk.CreateIndex, chk.ModifyIndex = 0, 0 switch chk.CheckID { case "mysql": if !reflect.DeepEqual(chk, chk1) { t.Fatalf("bad: %v %v", chk, chk1) } case "redis": if !reflect.DeepEqual(chk, chk2) { t.Fatalf("bad: %v %v", chk, chk2) } case "web": if !reflect.DeepEqual(chk, chk3) { t.Fatalf("bad: %v %v", chk, chk3) } case "cache": if !reflect.DeepEqual(chk, chk5) { t.Fatalf("bad: %v %v", chk, chk5) } case "serfHealth": // ignore default: t.Fatalf("unexpected check: %v", chk) } } // Check the local state if len(agent.state.checks) != 4 { t.Fatalf("bad: %v", agent.state.checks) } if len(agent.state.checkStatus) != 4 { t.Fatalf("bad: %v", agent.state.checkStatus) } for name, status := range agent.state.checkStatus { if !status.inSync { t.Fatalf("should be in sync: %v %v", name, status) } } // Make sure we sent along our tagged addresses when we synced. { req := structs.NodeSpecificRequest{ Datacenter: "dc1", Node: agent.config.NodeName, } var services structs.IndexedNodeServices if err := agent.RPC("Catalog.NodeServices", &req, &services); err != nil { t.Fatalf("err: %v", err) } addrs := services.NodeServices.Node.TaggedAddresses if len(addrs) == 0 || !reflect.DeepEqual(addrs, conf.TaggedAddresses) { t.Fatalf("bad: %v", addrs) } } }
// ensureCheckTransaction is used as the inner method to handle inserting // a health check into the state store. It ensures safety against inserting // checks with no matching node or service. func (s *StateStore) ensureCheckTxn(tx *memdb.Txn, idx uint64, watches *DumbWatchManager, hc *structs.HealthCheck) error { // Check if we have an existing health check existing, err := tx.First("checks", "id", hc.Node, string(hc.CheckID)) if err != nil { return fmt.Errorf("failed health check lookup: %s", err) } // Set the indexes if existing != nil { hc.CreateIndex = existing.(*structs.HealthCheck).CreateIndex hc.ModifyIndex = idx } else { hc.CreateIndex = idx hc.ModifyIndex = idx } // Use the default check status if none was provided if hc.Status == "" { hc.Status = structs.HealthCritical } // Get the node node, err := tx.First("nodes", "id", hc.Node) if err != nil { return fmt.Errorf("failed node lookup: %s", err) } if node == nil { return ErrMissingNode } // If the check is associated with a service, check that we have // a registration for the service. if hc.ServiceID != "" { service, err := tx.First("services", "id", hc.Node, hc.ServiceID) if err != nil { return fmt.Errorf("failed service lookup: %s", err) } if service == nil { return ErrMissingService } // Copy in the service name hc.ServiceName = service.(*structs.ServiceNode).ServiceName } // Delete any sessions for this check if the health is critical. if hc.Status == structs.HealthCritical { mappings, err := tx.Get("session_checks", "node_check", hc.Node, string(hc.CheckID)) if err != nil { return fmt.Errorf("failed session checks lookup: %s", err) } var ids []string for mapping := mappings.Next(); mapping != nil; mapping = mappings.Next() { ids = append(ids, mapping.(*sessionCheck).Session) } // Delete the session in a separate loop so we don't trash the // iterator. watches := NewDumbWatchManager(s.tableWatches) for _, id := range ids { if err := s.deleteSessionTxn(tx, idx, watches, id); err != nil { return fmt.Errorf("failed deleting session: %s", err) } } tx.Defer(func() { watches.Notify() }) } // Persist the check registration in the db. if err := tx.Insert("checks", hc); err != nil { return fmt.Errorf("failed inserting check: %s", err) } if err := tx.Insert("index", &IndexEntry{"checks", idx}); err != nil { return fmt.Errorf("failed updating index: %s", err) } watches.Arm("checks") return nil }
func TestAgentAntiEntropy_Checks(t *testing.T) { conf := nextConfig() dir, agent := makeAgent(t, conf) defer os.RemoveAll(dir) defer agent.Shutdown() // Wait for a leader time.Sleep(100 * time.Millisecond) // Register info args := &structs.RegisterRequest{ Datacenter: "dc1", Node: agent.config.NodeName, Address: "127.0.0.1", } var out struct{} // Exists both, same (noop) chk1 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "mysql", Name: "mysql", Status: structs.HealthPassing, } agent.state.AddCheck(chk1) args.Check = chk1 if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Exists both, different (update) chk2 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "redis", Name: "redis", Status: structs.HealthPassing, } agent.state.AddCheck(chk2) chk2_mod := new(structs.HealthCheck) *chk2_mod = *chk2 chk2_mod.Status = structs.HealthUnknown args.Check = chk2_mod if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Exists local (create) chk3 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "web", Name: "web", Status: structs.HealthPassing, } agent.state.AddCheck(chk3) // Exists remote (delete) chk4 := &structs.HealthCheck{ Node: agent.config.NodeName, CheckID: "lb", Name: "lb", Status: structs.HealthPassing, } args.Check = chk4 if err := agent.RPC("Catalog.Register", args, &out); err != nil { t.Fatalf("err: %v", err) } // Trigger anti-entropy run and wait agent.StartSync() time.Sleep(100 * time.Millisecond) // Verify that we are in sync req := structs.NodeSpecificRequest{ Datacenter: "dc1", Node: agent.config.NodeName, } var checks structs.IndexedHealthChecks if err := agent.RPC("Health.NodeChecks", &req, &checks); err != nil { t.Fatalf("err: %v", err) } // We should have 4 services (serf included) if len(checks.HealthChecks) != 4 { t.Fatalf("bad: %v", checks) } // All the checks should match for _, chk := range checks.HealthChecks { switch chk.CheckID { case "mysql": if !reflect.DeepEqual(chk, chk1) { t.Fatalf("bad: %v %v", chk, chk1) } case "redis": if !reflect.DeepEqual(chk, chk2) { t.Fatalf("bad: %v %v", chk, chk2) } case "web": if !reflect.DeepEqual(chk, chk3) { t.Fatalf("bad: %v %v", chk, chk3) } case "serfHealth": // ignore default: t.Fatalf("unexpected check: %v", chk) } } // Check the local state if len(agent.state.checks) != 3 { t.Fatalf("bad: %v", agent.state.checks) } if len(agent.state.checkStatus) != 3 { t.Fatalf("bad: %v", agent.state.checkStatus) } for name, status := range agent.state.checkStatus { if !status.inSync { t.Fatalf("should be in sync: %v %v", name, status) } } }