Example #1
0
// tryHealingNode registers an internal "healer" event for node and, when the
// node still exists and shouldHealNode agrees, replaces it through healNode.
//
// It returns nil without healing when the node carries no "iaas" metadata,
// when the event is already locked (another healing is in progress), when the
// provisioner no longer knows the node, or when shouldHealNode reports false.
// Any other failure is returned wrapped with context.
func (h *NodeHealer) tryHealingNode(node provision.Node, reason string, lastCheck *NodeChecks) error {
	if _, ok := node.Metadata()["iaas"]; !ok {
		log.Debugf("node %q doesn't have IaaS information, healing (%s) won't run on it.", node.Address(), reason)
		return nil
	}
	poolName := node.Metadata()[poolMetadataName]
	healEvt, err := event.NewInternal(&event.Opts{
		Target:       event.Target{Type: event.TargetTypeNode, Value: node.Address()},
		InternalKind: "healer",
		CustomData: NodeHealerCustomData{
			Node:      provision.NodeToSpec(node),
			Reason:    reason,
			LastCheck: lastCheck,
		},
		Allowed: event.Allowed(permission.PermPoolReadEvents, permission.Context(permission.CtxPool, poolName)),
	})
	if _, locked := err.(event.ErrEventLocked); locked {
		// Healing in progress.
		return nil
	}
	if err != nil {
		return errors.Wrap(err, "Error trying to insert node healing event, healing aborted")
	}
	// Both variables are read by the deferred closure below, so every exit
	// path after this point must set them before returning.
	var (
		createdNode *provision.NodeSpec
		evtErr      error
	)
	defer func() {
		// Finish the event with the outcome when something was attempted;
		// abort it when there is neither an error nor a new node to record.
		var updateErr error
		if evtErr != nil || createdNode != nil {
			updateErr = healEvt.DoneCustomData(evtErr, createdNode)
		} else {
			updateErr = healEvt.Abort()
		}
		if updateErr != nil {
			log.Errorf("error trying to update healing event: %s", updateErr)
		}
	}()
	_, err = node.Provisioner().GetNode(node.Address())
	switch {
	case err == provision.ErrNodeNotFound:
		// Node is gone already; nothing to heal.
		return nil
	case err != nil:
		evtErr = errors.Wrap(err, "unable to check if node still exists")
		return evtErr
	}
	heal, err := h.shouldHealNode(node)
	if err != nil {
		evtErr = errors.Wrap(err, "unable to check if node should be healed")
		return evtErr
	}
	if !heal {
		return nil
	}
	log.Errorf("initiating healing process for node %q due to: %s", node.Address(), reason)
	createdNode, evtErr = h.healNode(node)
	return evtErr
}
Example #2
0
// healNode replaces the failing node with a newly created IaaS machine.
//
// The sequence is: create a machine from a copy of the node's metadata,
// disable the failing node, register the new machine as a node (passing
// h.waitTimeNewMachine as WaitTO), remove the failing node with rebalancing,
// and finally destroy the failing machine in the IaaS.
//
// If creating the machine, disabling the old node or registering the new node
// fails, the changes made so far are rolled back on a best-effort basis and a
// nil spec is returned with the error. Once the new node is registered its
// spec is always returned, together with an error when the failing machine
// could not be found or destroyed afterwards.
func (h *NodeHealer) healNode(node provision.Node) (*provision.NodeSpec, error) {
	failingAddr := node.Address()
	// Copy metadata to ensure underlying data structure is not modified.
	newNodeMetadata := map[string]string{}
	for k, v := range node.Metadata() {
		newNodeMetadata[k] = v
	}
	failingHost := net.URLToHost(failingAddr)
	healthNode, isHealthNode := node.(provision.NodeHealthChecker)
	failures := 0
	if isHealthNode {
		failures = healthNode.FailureCount()
	}
	machine, err := iaas.CreateMachineForIaaS(newNodeMetadata["iaas"], newNodeMetadata)
	if err != nil {
		if isHealthNode {
			healthNode.ResetFailures()
		}
		return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error creating new machine", failures, failingHost)
	}
	err = node.Provisioner().UpdateNode(provision.UpdateNodeOptions{
		Address: failingAddr,
		Disable: true,
	})
	if err != nil {
		// Rollback is best effort: a failure here must not mask the original
		// error, but it is logged so the leftover machine can be tracked down.
		if destroyErr := machine.Destroy(); destroyErr != nil {
			log.Errorf("Unable to destroy new machine %q after failing to disable node %s: %s", machine.Address, failingHost, destroyErr)
		}
		return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error unregistering old node", failures, failingHost)
	}
	newAddr := machine.FormatNodeAddress()
	log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr)
	createOpts := provision.AddNodeOptions{
		Address:    newAddr,
		Metadata:   newNodeMetadata,
		WaitTO:     h.waitTimeNewMachine,
		CaCert:     machine.CaCert,
		ClientCert: machine.ClientCert,
		ClientKey:  machine.ClientKey,
	}
	err = node.Provisioner().AddNode(createOpts)
	if err != nil {
		if isHealthNode {
			healthNode.ResetFailures()
		}
		// Undo the earlier disable and drop the new machine. Both steps are
		// best effort; their failures are logged instead of being discarded.
		enableErr := node.Provisioner().UpdateNode(provision.UpdateNodeOptions{Address: failingAddr, Enable: true})
		if enableErr != nil {
			log.Errorf("Unable to re-enable node %s after failing to register new node: %s", failingHost, enableErr)
		}
		if destroyErr := machine.Destroy(); destroyErr != nil {
			log.Errorf("Unable to destroy new machine %q after failing to register it as a node: %s", machine.Address, destroyErr)
		}
		return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error registering new node", failures, failingHost)
	}
	nodeSpec := provision.NodeToSpec(node)
	nodeSpec.Address = newAddr
	nodeSpec.Metadata = newNodeMetadata
	var buf bytes.Buffer
	err = node.Provisioner().RemoveNode(provision.RemoveNodeOptions{
		Address:   failingAddr,
		Rebalance: true,
		Writer:    &buf,
	})
	if err != nil {
		// A failed removal is only logged; healing continues with the
		// destruction of the failing machine.
		log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err, buf.String())
	}
	// Look up the old machine using the node's original metadata, not the
	// copy that was handed to CreateMachineForIaaS.
	failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata()["iaas-id"], failingHost)
	if err != nil {
		return &nodeSpec, errors.Wrapf(err, "Unable to find failing machine %s in IaaS", failingHost)
	}
	err = failingMachine.Destroy()
	if err != nil {
		return &nodeSpec, errors.Wrapf(err, "Unable to destroy machine %s from IaaS", failingHost)
	}
	log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address)
	return &nodeSpec, nil
}