func (h *nodeHealer) healNode(node *cluster.Node) (cluster.Node, error) { emptyNode := cluster.Node{} failingAddr := node.Address nodeMetadata := node.CleanMetadata() failingHost := urlToHost(failingAddr) failures := node.FailureCount() machine, err := iaas.CreateMachineForIaaS(nodeMetadata["iaas"], nodeMetadata) if err != nil { node.ResetFailures() return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error creating new machine: %s", failures, failingHost, err.Error()) } err = h.provisioner.getCluster().Unregister(failingAddr) if err != nil { machine.Destroy() return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error unregistering old node: %s", failures, failingHost, err.Error()) } newAddr := machine.FormatNodeAddress() log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr) createdNode, err := h.provisioner.getCluster().WaitAndRegister(newAddr, nodeMetadata, h.waitTimeNewMachine) if err != nil { node.ResetFailures() h.provisioner.getCluster().Register(failingAddr, nodeMetadata) machine.Destroy() return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error registering new node: %s", failures, failingHost, err.Error()) } var buf bytes.Buffer err = h.provisioner.moveContainers(failingHost, "", &buf) if err != nil { log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err.Error(), buf.String()) } failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata["iaas-id"], failingHost) if err != nil { return createdNode, fmt.Errorf("Unable to find failing machine %s in IaaS: %s", failingHost, err.Error()) } err = failingMachine.Destroy() if err != nil { return createdNode, fmt.Errorf("Unable to destroy machine %s from IaaS: %s", failingHost, err.Error()) } log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address) return createdNode, nil }
func (h *NodeHealer) healNode(node *cluster.Node) (cluster.Node, error) { emptyNode := cluster.Node{} failingAddr := node.Address nodeMetadata := node.CleanMetadata() failingHost := net.URLToHost(failingAddr) failures := node.FailureCount() machine, err := iaas.CreateMachineForIaaS(nodeMetadata["iaas"], nodeMetadata) if err != nil { node.ResetFailures() return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error creating new machine: %s", failures, failingHost, err.Error()) } err = h.provisioner.Cluster().Unregister(failingAddr) if err != nil { machine.Destroy() return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error unregistering old node: %s", failures, failingHost, err.Error()) } newAddr := machine.FormatNodeAddress() log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr) createdNode := cluster.Node{ Address: newAddr, Metadata: nodeMetadata, CreationStatus: cluster.NodeCreationStatusPending, } err = h.provisioner.Cluster().Register(createdNode) if err != nil { node.ResetFailures() h.provisioner.Cluster().Register(cluster.Node{Address: failingAddr, Metadata: nodeMetadata}) machine.Destroy() return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error registering new node: %s", failures, failingHost, err.Error()) } q, err := queue.Queue() if err != nil { return emptyNode, err } jobParams := monsterqueue.JobParams{ "endpoint": createdNode.Address, "machine": machine.Id, "metadata": createdNode.Metadata, } job, err := q.EnqueueWait(bs.QueueTaskName, jobParams, h.waitTimeNewMachine) if err == nil { _, err = job.Result() } if err != nil { node.ResetFailures() h.provisioner.Cluster().Register(cluster.Node{Address: failingAddr, Metadata: nodeMetadata}) return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error waiting for the bs task: %s", failures, failingHost, err.Error()) } var buf bytes.Buffer err = h.provisioner.MoveContainers(failingHost, "", &buf) if err != nil { log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err.Error(), buf.String()) } failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata["iaas-id"], failingHost) if err != nil { return createdNode, fmt.Errorf("Unable to find failing machine %s in IaaS: %s", failingHost, err.Error()) } err = failingMachine.Destroy() if err != nil { return createdNode, fmt.Errorf("Unable to destroy machine %s from IaaS: %s", failingHost, err.Error()) } log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address) return createdNode, nil }