func (h *NodeHealer) shouldHealNode(node provision.Node) (bool, error) { conf := healerConfig() var configEntry NodeHealerConfig err := conf.Load(node.Metadata()[poolMetadataName], &configEntry) if err != nil { return false, err } queryPart, err := h.queryPartForConfig([]provision.Node{node}, configEntry) if err != nil { return false, err } if queryPart == nil { return false, nil } coll, err := nodeDataCollection() if err != nil { return false, errors.Wrap(err, "unable to get node data collection") } defer coll.Close() count, err := coll.Find(queryPart).Count() if err != nil { return false, errors.Wrap(err, "unable to find nodes to heal") } return count > 0, nil }
func (h *NodeHealer) UpdateNodeData(node provision.Node, checks []provision.NodeCheckResult) error { isSuccess := true for _, c := range checks { isSuccess = c.Successful if !isSuccess { break } } now := time.Now().UTC() toInsert := NodeStatusData{ LastUpdate: now, } if isSuccess { toInsert.LastSuccess = now } coll, err := nodeDataCollection() if err != nil { return err } defer coll.Close() _, err = coll.UpsertId(node.Address(), bson.M{ "$set": toInsert, "$push": bson.M{ "checks": bson.D([]bson.DocElem{ {Name: "$each", Value: []NodeChecks{{Time: now, Checks: checks}}}, {Name: "$slice", Value: -10}, }), }, }) return err }
func (h *NodeHealer) RemoveNode(node provision.Node) error { coll, err := nodeDataCollection() if err != nil { return errors.Wrap(err, "unable to get node data collection") } defer coll.Close() err = coll.RemoveId(node.Address()) if err != nil && err != mgo.ErrNotFound { return err } return nil }
// UpdateNodeStatus updates the status of the given node and its units, // returning a map which units were found during the update. func UpdateNodeStatus(nodeData provision.NodeStatusData) ([]UpdateUnitsResult, error) { provisioners, err := provision.Registry() if err != nil { return nil, err } var node provision.Node for _, p := range provisioners { if nodeProv, ok := p.(provision.NodeProvisioner); ok { node, err = nodeProv.NodeForNodeData(nodeData) if err == nil { break } if errors.Cause(err) != provision.ErrNodeNotFound { return nil, err } } } if node == nil { return nil, provision.ErrNodeNotFound } if healer.HealerInstance != nil { err = healer.HealerInstance.UpdateNodeData(node, nodeData.Checks) if err != nil { log.Errorf("unable to set node status in healer: %s", err) } } unitProv, ok := node.Provisioner().(provision.UnitStatusProvisioner) if !ok { return []UpdateUnitsResult{}, nil } result := make([]UpdateUnitsResult, len(nodeData.Units)) for i, unitData := range nodeData.Units { unit := provision.Unit{ID: unitData.ID, Name: unitData.Name} err = unitProv.SetUnitStatus(unit, unitData.Status) _, isNotFound := err.(*provision.UnitNotFoundError) if err != nil && !isNotFound { return nil, err } result[i] = UpdateUnitsResult{ID: unitData.ID, Found: !isNotFound} } return result, nil }
func (h *NodeHealer) healNode(node provision.Node) (*provision.NodeSpec, error) { failingAddr := node.Address() // Copy metadata to ensure underlying data structure is not modified. newNodeMetadata := map[string]string{} for k, v := range node.Metadata() { newNodeMetadata[k] = v } failingHost := net.URLToHost(failingAddr) healthNode, isHealthNode := node.(provision.NodeHealthChecker) failures := 0 if isHealthNode { failures = healthNode.FailureCount() } machine, err := iaas.CreateMachineForIaaS(newNodeMetadata["iaas"], newNodeMetadata) if err != nil { if isHealthNode { healthNode.ResetFailures() } return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error creating new machine", failures, failingHost) } err = node.Provisioner().UpdateNode(provision.UpdateNodeOptions{ Address: failingAddr, Disable: true, }) if err != nil { machine.Destroy() return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error unregistering old node", failures, failingHost) } newAddr := machine.FormatNodeAddress() log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr) createOpts := provision.AddNodeOptions{ Address: newAddr, Metadata: newNodeMetadata, WaitTO: h.waitTimeNewMachine, CaCert: machine.CaCert, ClientCert: machine.ClientCert, ClientKey: machine.ClientKey, } err = node.Provisioner().AddNode(createOpts) if err != nil { if isHealthNode { healthNode.ResetFailures() } node.Provisioner().UpdateNode(provision.UpdateNodeOptions{Address: failingAddr, Enable: true}) machine.Destroy() return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error registering new node", failures, failingHost) } nodeSpec := provision.NodeToSpec(node) nodeSpec.Address = newAddr nodeSpec.Metadata = newNodeMetadata var buf bytes.Buffer err = node.Provisioner().RemoveNode(provision.RemoveNodeOptions{ Address: failingAddr, Rebalance: true, Writer: &buf, }) if err != nil { log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err, buf.String()) } failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata()["iaas-id"], failingHost) if err != nil { return &nodeSpec, errors.Wrapf(err, "Unable to find failing machine %s in IaaS", failingHost) } err = failingMachine.Destroy() if err != nil { return &nodeSpec, errors.Wrapf(err, "Unable to destroy machine %s from IaaS", failingHost) } log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address) return &nodeSpec, nil }
func (h *NodeHealer) tryHealingNode(node provision.Node, reason string, lastCheck *NodeChecks) error { _, hasIaas := node.Metadata()["iaas"] if !hasIaas { log.Debugf("node %q doesn't have IaaS information, healing (%s) won't run on it.", node.Address(), reason) return nil } poolName := node.Metadata()[poolMetadataName] evt, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: event.TargetTypeNode, Value: node.Address()}, InternalKind: "healer", CustomData: NodeHealerCustomData{ Node: provision.NodeToSpec(node), Reason: reason, LastCheck: lastCheck, }, Allowed: event.Allowed(permission.PermPoolReadEvents, permission.Context(permission.CtxPool, poolName)), }) if err != nil { if _, ok := err.(event.ErrEventLocked); ok { // Healing in progress. return nil } return errors.Wrap(err, "Error trying to insert node healing event, healing aborted") } var createdNode *provision.NodeSpec var evtErr error defer func() { var updateErr error if evtErr == nil && createdNode == nil { updateErr = evt.Abort() } else { updateErr = evt.DoneCustomData(evtErr, createdNode) } if updateErr != nil { log.Errorf("error trying to update healing event: %s", updateErr) } }() _, err = node.Provisioner().GetNode(node.Address()) if err != nil { if err == provision.ErrNodeNotFound { return nil } evtErr = errors.Wrap(err, "unable to check if node still exists") return evtErr } shouldHeal, err := h.shouldHealNode(node) if err != nil { evtErr = errors.Wrap(err, "unable to check if node should be healed") return evtErr } if !shouldHeal { return nil } log.Errorf("initiating healing process for node %q due to: %s", node.Address(), reason) createdNode, evtErr = h.healNode(node) return evtErr }