Beispiel #1
0
// shouldHealNode reports whether the stored status data for node matches the
// healing criteria configured for it.
//
// The healer configuration entry is loaded using the value stored under
// poolMetadataName in the node's metadata and converted into a query by
// queryPartForConfig. When that query is nil the result is (false, nil) and
// the node data collection is never opened. Otherwise the node should be
// healed when at least one document in the collection matches the query.
func (h *NodeHealer) shouldHealNode(node provision.Node) (bool, error) {
	cfg := healerConfig()
	var entry NodeHealerConfig
	if err := cfg.Load(node.Metadata()[poolMetadataName], &entry); err != nil {
		return false, err
	}
	query, err := h.queryPartForConfig([]provision.Node{node}, entry)
	if err != nil {
		return false, err
	}
	if query == nil {
		return false, nil
	}
	nodeColl, err := nodeDataCollection()
	if err != nil {
		return false, errors.Wrap(err, "unable to get node data collection")
	}
	defer nodeColl.Close()
	matches, err := nodeColl.Find(query).Count()
	if err != nil {
		return false, errors.Wrap(err, "unable to find nodes to heal")
	}
	return matches > 0, nil
}
Beispiel #2
0
// UpdateNodeData records the outcome of a round of checks for node in the
// node data collection, keyed by the node's address.
//
// LastUpdate is always set to the current UTC time. LastSuccess is set to the
// same instant only when every entry in checks is successful (an empty or nil
// checks slice counts as success). The checks are appended to the document's
// "checks" array together with their timestamp, using a "$slice" of -10 on
// the push.
func (h *NodeHealer) UpdateNodeData(node provision.Node, checks []provision.NodeCheckResult) error {
	allSuccessful := true
	for i := range checks {
		if !checks[i].Successful {
			allSuccessful = false
			break
		}
	}
	now := time.Now().UTC()
	status := NodeStatusData{LastUpdate: now}
	if allSuccessful {
		status.LastSuccess = now
	}
	nodeColl, err := nodeDataCollection()
	if err != nil {
		return err
	}
	defer nodeColl.Close()
	pushChecks := bson.D{
		{Name: "$each", Value: []NodeChecks{{Time: now, Checks: checks}}},
		{Name: "$slice", Value: -10},
	}
	update := bson.M{
		"$set":  status,
		"$push": bson.M{"checks": pushChecks},
	}
	_, err = nodeColl.UpsertId(node.Address(), update)
	return err
}
Beispiel #3
0
// RemoveNode deletes the healer's stored data for node, identified by the
// node's address. A missing document (mgo.ErrNotFound) is not treated as an
// error; any other removal failure is returned unchanged.
func (h *NodeHealer) RemoveNode(node provision.Node) error {
	nodeColl, err := nodeDataCollection()
	if err != nil {
		return errors.Wrap(err, "unable to get node data collection")
	}
	defer nodeColl.Close()
	switch err = nodeColl.RemoveId(node.Address()); err {
	case nil, mgo.ErrNotFound:
		return nil
	default:
		return err
	}
}
Beispiel #4
0
// UpdateNodeStatus updates the status of the given node and its units,
// returning one UpdateUnitsResult per entry in nodeData.Units, in the same
// order, reporting whether each unit was found during the update.
//
// The node is looked up by asking every registered provisioner that
// implements provision.NodeProvisioner, stopping at the first one that
// resolves nodeData without error. provision.ErrNodeNotFound is returned when
// no provisioner knows the node; any other lookup error aborts immediately.
//
// When a healer instance is configured, the node's check results are
// forwarded to it; a failure there is only logged. If the node's provisioner
// does not implement provision.UnitStatusProvisioner an empty, non-nil slice
// is returned. A unit reported as not found by the provisioner is marked
// Found: false; any other error setting a unit status is returned and the
// partial results are discarded.
func UpdateNodeStatus(nodeData provision.NodeStatusData) ([]UpdateUnitsResult, error) {
	provisioners, err := provision.Registry()
	if err != nil {
		return nil, err
	}
	var node provision.Node
	for _, p := range provisioners {
		nodeProv, ok := p.(provision.NodeProvisioner)
		if !ok {
			continue
		}
		candidate, findErr := nodeProv.NodeForNodeData(nodeData)
		if findErr == nil {
			// Only keep the value from a successful lookup: a value returned
			// together with an error is meaningless and must not defeat the
			// nil check below.
			node = candidate
			break
		}
		if errors.Cause(findErr) != provision.ErrNodeNotFound {
			return nil, findErr
		}
	}
	if node == nil {
		return nil, provision.ErrNodeNotFound
	}
	if healer.HealerInstance != nil {
		err = healer.HealerInstance.UpdateNodeData(node, nodeData.Checks)
		if err != nil {
			// Healer bookkeeping is best-effort and must not block the
			// unit status update.
			log.Errorf("unable to set node status in healer: %s", err)
		}
	}
	unitProv, ok := node.Provisioner().(provision.UnitStatusProvisioner)
	if !ok {
		return []UpdateUnitsResult{}, nil
	}
	result := make([]UpdateUnitsResult, len(nodeData.Units))
	for i, unitData := range nodeData.Units {
		unit := provision.Unit{ID: unitData.ID, Name: unitData.Name}
		err = unitProv.SetUnitStatus(unit, unitData.Status)
		// The comma-ok assertion is false for a nil err, so a successful
		// update yields Found: true.
		_, isNotFound := err.(*provision.UnitNotFoundError)
		if err != nil && !isNotFound {
			return nil, err
		}
		result[i] = UpdateUnitsResult{ID: unitData.ID, Found: !isNotFound}
	}
	return result, nil
}
Beispiel #5
0
// healNode replaces a failing node with a freshly created machine.
//
// Steps, in order:
//  1. Create a new machine in the IaaS named by the "iaas" metadata entry,
//     using a copy of the node's metadata.
//  2. Disable the failing node in its provisioner.
//  3. Register the new machine as a node, waiting up to h.waitTimeNewMachine.
//  4. Remove the failing node with Rebalance enabled; a failure here is only
//     logged.
//  5. Look up and destroy the failing machine in the IaaS.
//
// If step 1 or 3 fails, the failure counter of nodes implementing
// provision.NodeHealthChecker is reset. If step 2 or 3 fails, the new machine
// is destroyed, and for step 3 the failing node is re-enabled as well; errors
// from these rollback actions are logged, since the original failure is the
// one returned.
//
// The returned spec describes the new node. It is nil when healing failed
// before the new node was registered. It is non-nil together with an error
// when the new node is in place but the failing machine could not be found
// or destroyed.
func (h *NodeHealer) healNode(node provision.Node) (*provision.NodeSpec, error) {
	failingAddr := node.Address()
	// Copy metadata to ensure underlying data structure is not modified.
	newNodeMetadata := map[string]string{}
	for k, v := range node.Metadata() {
		newNodeMetadata[k] = v
	}
	failingHost := net.URLToHost(failingAddr)
	healthNode, isHealthNode := node.(provision.NodeHealthChecker)
	failures := 0
	if isHealthNode {
		failures = healthNode.FailureCount()
	}
	machine, err := iaas.CreateMachineForIaaS(newNodeMetadata["iaas"], newNodeMetadata)
	if err != nil {
		if isHealthNode {
			healthNode.ResetFailures()
		}
		return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error creating new machine", failures, failingHost)
	}
	err = node.Provisioner().UpdateNode(provision.UpdateNodeOptions{
		Address: failingAddr,
		Disable: true,
	})
	if err != nil {
		// Roll back the machine creation; surface a rollback failure in the
		// logs so the orphaned machine can be tracked down.
		if destroyErr := machine.Destroy(); destroyErr != nil {
			log.Errorf("Unable to destroy machine %q created while healing node %s: %s", machine.Address, failingHost, destroyErr)
		}
		return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error unregistering old node", failures, failingHost)
	}
	newAddr := machine.FormatNodeAddress()
	log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr)
	createOpts := provision.AddNodeOptions{
		Address:    newAddr,
		Metadata:   newNodeMetadata,
		WaitTO:     h.waitTimeNewMachine,
		CaCert:     machine.CaCert,
		ClientCert: machine.ClientCert,
		ClientKey:  machine.ClientKey,
	}
	err = node.Provisioner().AddNode(createOpts)
	if err != nil {
		if isHealthNode {
			healthNode.ResetFailures()
		}
		// Undo the earlier disable and drop the new machine. Both are
		// best-effort, but a failure leaves the cluster in an inconsistent
		// state and must at least be logged.
		enableErr := node.Provisioner().UpdateNode(provision.UpdateNodeOptions{Address: failingAddr, Enable: true})
		if enableErr != nil {
			log.Errorf("Unable to re-enable node %s after failed healing: %s", failingHost, enableErr)
		}
		if destroyErr := machine.Destroy(); destroyErr != nil {
			log.Errorf("Unable to destroy machine %q created while healing node %s: %s", machine.Address, failingHost, destroyErr)
		}
		return nil, errors.Wrapf(err, "Can't auto-heal after %d failures for node %s: error registering new node", failures, failingHost)
	}
	nodeSpec := provision.NodeToSpec(node)
	nodeSpec.Address = newAddr
	nodeSpec.Metadata = newNodeMetadata
	var buf bytes.Buffer
	err = node.Provisioner().RemoveNode(provision.RemoveNodeOptions{
		Address:   failingAddr,
		Rebalance: true,
		Writer:    &buf,
	})
	if err != nil {
		// The new node is already registered, so carry on and still try to
		// get rid of the failing machine.
		log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err, buf.String())
	}
	// The lookup uses the original node's metadata, not the copy handed to
	// the IaaS when the new machine was created.
	failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata()["iaas-id"], failingHost)
	if err != nil {
		return &nodeSpec, errors.Wrapf(err, "Unable to find failing machine %s in IaaS", failingHost)
	}
	err = failingMachine.Destroy()
	if err != nil {
		return &nodeSpec, errors.Wrapf(err, "Unable to destroy machine %s from IaaS", failingHost)
	}
	log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address)
	return &nodeSpec, nil
}
Beispiel #6
0
// tryHealingNode attempts to heal node for the given reason, tracking the
// attempt with an internal "healer" event targeting the node's address.
//
// Nothing is done (and nil is returned) when:
//   - the node's metadata has no "iaas" entry;
//   - the event cannot be created because it is locked, meaning a healing is
//     already in progress;
//   - the provisioner no longer knows the node;
//   - shouldHealNode reports that healing is not needed.
//
// Once the event exists it is always finalised on return: it is aborted when
// no error was recorded and no node was created, and otherwise completed with
// the recorded error and the created node spec. A failure to finalise the
// event is only logged.
func (h *NodeHealer) tryHealingNode(node provision.Node, reason string, lastCheck *NodeChecks) error {
	if _, ok := node.Metadata()["iaas"]; !ok {
		log.Debugf("node %q doesn't have IaaS information, healing (%s) won't run on it.", node.Address(), reason)
		return nil
	}
	pool := node.Metadata()[poolMetadataName]
	opts := &event.Opts{
		Target:       event.Target{Type: event.TargetTypeNode, Value: node.Address()},
		InternalKind: "healer",
		CustomData: NodeHealerCustomData{
			Node:      provision.NodeToSpec(node),
			Reason:    reason,
			LastCheck: lastCheck,
		},
		Allowed: event.Allowed(permission.PermPoolReadEvents, permission.Context(permission.CtxPool, pool)),
	}
	healEvt, err := event.NewInternal(opts)
	if err != nil {
		if _, locked := err.(event.ErrEventLocked); locked {
			// Healing in progress.
			return nil
		}
		return errors.Wrap(err, "Error trying to insert node healing event, healing aborted")
	}
	var (
		newNode *provision.NodeSpec
		healErr error
	)
	defer func() {
		var finishErr error
		if healErr != nil || newNode != nil {
			finishErr = healEvt.DoneCustomData(healErr, newNode)
		} else {
			// Nothing happened worth recording.
			finishErr = healEvt.Abort()
		}
		if finishErr != nil {
			log.Errorf("error trying to update healing event: %s", finishErr)
		}
	}()
	if _, err = node.Provisioner().GetNode(node.Address()); err != nil {
		if err == provision.ErrNodeNotFound {
			return nil
		}
		healErr = errors.Wrap(err, "unable to check if node still exists")
		return healErr
	}
	needsHealing, err := h.shouldHealNode(node)
	if err != nil {
		healErr = errors.Wrap(err, "unable to check if node should be healed")
		return healErr
	}
	if !needsHealing {
		return nil
	}
	log.Errorf("initiating healing process for node %q due to: %s", node.Address(), reason)
	newNode, healErr = h.healNode(node)
	return healErr
}