func (h *NodeHealer) HandleError(node *cluster.Node) time.Duration { h.Lock() if h.locks[node.Address] == nil { h.locks[node.Address] = &sync.Mutex{} } h.Unlock() h.locks[node.Address].Lock() defer h.locks[node.Address].Unlock() failures := node.FailureCount() if failures < h.failuresBeforeHealing { log.Debugf("%d failures detected in node %q, waiting for more failures before healing.", failures, node.Address) return h.disabledTime } if !node.HasSuccess() { log.Debugf("Node %q has never been successfully reached, healing won't run on it.", node.Address) return h.disabledTime } _, hasIaas := node.Metadata["iaas"] if !hasIaas { log.Debugf("Node %q doesn't have IaaS information, healing won't run on it.", node.Address) return h.disabledTime } healingCounter, err := healingCountFor("node", node.Address, consecutiveHealingsTimeframe) if err != nil { log.Errorf("Node healing: couldn't verify number of previous healings for %s: %s", node.Address, err.Error()) return h.disabledTime } if healingCounter > consecutiveHealingsLimitInTimeframe { log.Errorf("Node healing: number of healings for node %s in the last %d minutes exceeds limit of %d: %d", node.Address, consecutiveHealingsTimeframe/time.Minute, consecutiveHealingsLimitInTimeframe, healingCounter) return h.disabledTime } log.Errorf("Initiating healing process for node %q after %d failures.", node.Address, failures) evt, err := NewHealingEvent(*node) if err != nil { log.Errorf("Error trying to insert healing event: %s", err.Error()) return h.disabledTime } createdNode, err := h.healNode(node) if err != nil { log.Errorf("Error healing: %s", err.Error()) } err = evt.Update(createdNode, err) if err != nil { log.Errorf("Error trying to update healing event: %s", err.Error()) } if createdNode.Address != "" { return 0 } return h.disabledTime }
func (h *NodeHealer) HandleError(node *cluster.Node) time.Duration { h.wg.Add(1) defer h.wg.Done() failures := node.FailureCount() if failures < h.failuresBeforeHealing { log.Debugf("%d failures detected in node %q, waiting for more failures before healing.", failures, node.Address) return h.disabledTime } if !node.HasSuccess() { log.Debugf("Node %q has never been successfully reached, healing won't run on it.", node.Address) return h.disabledTime } err := h.tryHealingNode(node, fmt.Sprintf("%d consecutive failures", failures), nil) if err != nil { log.Errorf("[node healer handle error] %s", err) } return h.disabledTime }