func (t *runBs) Run(job monsterqueue.Job) { params := job.Parameters() dockerEndpoint := params["endpoint"].(string) node := cluster.Node{Address: dockerEndpoint} err := t.waitDocker(dockerEndpoint) if err != nil { job.Error(err) return } node.CreationStatus = cluster.NodeCreationStatusCreated rawMetadata := params["metadata"].(monsterqueue.JobParams) metadata := make(map[string]string, len(rawMetadata)) for key, value := range rawMetadata { metadata[key] = value.(string) } err = createContainer(dockerEndpoint, metadata["pool"], t.provisioner, true) if err != nil { t.provisioner.Cluster().UpdateNode(node) job.Error(err) return } node.Metadata = map[string]string{"LastSuccess": time.Now().Format(time.RFC3339)} _, err = t.provisioner.Cluster().UpdateNode(node) if err != nil { job.Error(err) return } job.Success(nil) }
func (t *runBs) Run(job monsterqueue.Job) { params := job.Parameters() dockerEndpoint := params["endpoint"].(string) machineID := params["machine"].(string) node := cluster.Node{Address: dockerEndpoint} err := t.waitDocker(dockerEndpoint) if err != nil { job.Error(err) t.destroyMachine(machineID) return } rawMetadata := params["metadata"].(monsterqueue.JobParams) metadata := make(map[string]string, len(rawMetadata)) for key, value := range rawMetadata { metadata[key] = value.(string) } err = CreateContainer(dockerEndpoint, metadata["pool"], t.provisioner, true) if err != nil { node.CreationStatus = cluster.NodeCreationStatusError node.Metadata = map[string]string{"creationError": err.Error()} t.provisioner.Cluster().UpdateNode(node) job.Error(err) t.destroyMachine(machineID) return } node.CreationStatus = cluster.NodeCreationStatusCreated _, err = t.provisioner.Cluster().UpdateNode(node) if err != nil { job.Error(err) t.destroyMachine(machineID) return } job.Success(nil) }
func (p *dockerProvisioner) AddNode(opts provision.AddNodeOptions) error {
    node := cluster.Node{
        Address:        opts.Address,
        Metadata:       opts.Metadata,
        CreationStatus: cluster.NodeCreationStatusPending,
        CaCert:         opts.CaCert,
        ClientCert:     opts.ClientCert,
        ClientKey:      opts.ClientKey,
    }
    if len(opts.CaCert) == 0 && len(p.caCert) > 0 {
        node.CaCert = p.caCert
        node.ClientCert = p.clientCert
        node.ClientKey = p.clientKey
    }
    err := p.Cluster().Register(node)
    if err != nil {
        return err
    }
    q, err := queue.Queue()
    if err != nil {
        return err
    }
    jobParams := monsterqueue.JobParams{"endpoint": opts.Address, "metadata": opts.Metadata}
    var job monsterqueue.Job
    if opts.WaitTO != 0 {
        job, err = q.EnqueueWait(internalNodeContainer.QueueTaskName, jobParams, opts.WaitTO)
    } else {
        _, err = q.Enqueue(internalNodeContainer.QueueTaskName, jobParams)
    }
    if err == nil && job != nil {
        _, err = job.Result()
    }
    return err
}
func updateNodeHandler(w http.ResponseWriter, r *http.Request, t auth.Token) error { params, err := unmarshal(r.Body) if err != nil { return err } address, _ := params["address"] if address == "" { return &errors.HTTP{Code: http.StatusBadRequest, Message: "address is required"} } nodes, err := mainDockerProvisioner.Cluster().UnfilteredNodes() if err != nil { return err } var oldNode *cluster.Node for i := range nodes { if nodes[i].Address == address { oldNode = &nodes[i] break } } oldPool, _ := oldNode.Metadata["pool"] allowedOldPool := permission.Check(t, permission.PermNodeUpdate, permission.Context(permission.CtxPool, oldPool), ) if !allowedOldPool { return permission.ErrUnauthorized } newPool, ok := params["pool"] if ok { allowedNewPool := permission.Check(t, permission.PermNodeUpdate, permission.Context(permission.CtxPool, newPool), ) if !allowedNewPool { return permission.ErrUnauthorized } } delete(params, "address") node := cluster.Node{Address: address, Metadata: params} disabled, _ := strconv.ParseBool(r.URL.Query().Get("disabled")) enabled, _ := strconv.ParseBool(r.URL.Query().Get("enabled")) if disabled && enabled { return &errors.HTTP{ Code: http.StatusBadRequest, Message: "You can't make a node enable and disable at the same time.", } } if disabled { node.CreationStatus = cluster.NodeCreationStatusDisabled } if enabled { node.CreationStatus = cluster.NodeStatusReady } _, err = mainDockerProvisioner.Cluster().UpdateNode(node) return err }
func cleanMetadata(n *cluster.Node) map[string]string {
    // iaas-id is ignored because it wasn't created in previous tsuru versions
    // and having nodes with and without it would cause unbalanced metadata
    // errors.
    ignoredMetadata := []string{"iaas-id"}
    metadata := n.CleanMetadata()
    for _, val := range ignoredMetadata {
        delete(metadata, val)
    }
    return metadata
}
func (h *NodeHealer) HandleError(node *cluster.Node) time.Duration {
    h.Lock()
    if h.locks[node.Address] == nil {
        h.locks[node.Address] = &sync.Mutex{}
    }
    h.Unlock()
    h.locks[node.Address].Lock()
    defer h.locks[node.Address].Unlock()
    failures := node.FailureCount()
    if failures < h.failuresBeforeHealing {
        log.Debugf("%d failures detected in node %q, waiting for more failures before healing.", failures, node.Address)
        return h.disabledTime
    }
    if !node.HasSuccess() {
        log.Debugf("Node %q has never been successfully reached, healing won't run on it.", node.Address)
        return h.disabledTime
    }
    _, hasIaas := node.Metadata["iaas"]
    if !hasIaas {
        log.Debugf("Node %q doesn't have IaaS information, healing won't run on it.", node.Address)
        return h.disabledTime
    }
    healingCounter, err := healingCountFor("node", node.Address, consecutiveHealingsTimeframe)
    if err != nil {
        log.Errorf("Node healing: couldn't verify number of previous healings for %s: %s", node.Address, err.Error())
        return h.disabledTime
    }
    if healingCounter > consecutiveHealingsLimitInTimeframe {
        log.Errorf("Node healing: number of healings for node %s in the last %d minutes exceeds limit of %d: %d",
            node.Address, consecutiveHealingsTimeframe/time.Minute, consecutiveHealingsLimitInTimeframe, healingCounter)
        return h.disabledTime
    }
    log.Errorf("Initiating healing process for node %q after %d failures.", node.Address, failures)
    evt, err := NewHealingEvent(*node)
    if err != nil {
        log.Errorf("Error trying to insert healing event: %s", err.Error())
        return h.disabledTime
    }
    createdNode, err := h.healNode(node)
    if err != nil {
        log.Errorf("Error healing: %s", err.Error())
    }
    err = evt.Update(createdNode, err)
    if err != nil {
        log.Errorf("Error trying to update healing event: %s", err.Error())
    }
    if createdNode.Address != "" {
        return 0
    }
    return h.disabledTime
}
func (p *dockerProvisioner) UpdateNode(opts provision.UpdateNodeOptions) error {
    node := cluster.Node{Address: opts.Address, Metadata: opts.Metadata}
    if opts.Disable {
        node.CreationStatus = cluster.NodeCreationStatusDisabled
    }
    if opts.Enable {
        node.CreationStatus = cluster.NodeCreationStatusCreated
    }
    // Use the receiver's cluster instead of the package-level provisioner.
    _, err := p.Cluster().UpdateNode(node)
    if err == clusterStorage.ErrNoSuchNode {
        return provision.ErrNodeNotFound
    }
    return err
}
// create pulls the node-container image, creates the container on the given
// node with tsuru metadata labels (recreating it when relaunch is set), and
// starts it, tolerating an already-running container.
func create(c *nodecontainer.NodeContainerConfig, node *cluster.Node, poolName string, p DockerProvisioner, relaunch bool) error {
    client, err := node.Client()
    if err != nil {
        return err
    }
    c.Config.Image, err = pullImage(c, client, p, poolName)
    if err != nil {
        return err
    }
    c.Config.Env = append([]string{"DOCKER_ENDPOINT=" + node.Address}, c.Config.Env...)
    if c.Config.Labels == nil {
        c.Config.Labels = map[string]string{}
    }
    c.Config.Labels["tsuru.nodecontainer"] = strconv.FormatBool(true)
    c.Config.Labels["tsuru.node.pool"] = poolName
    c.Config.Labels["tsuru.node.address"] = node.Address
    c.Config.Labels["tsuru.node.provisioner"] = p.GetName()
    opts := docker.CreateContainerOptions{
        Name:       c.Name,
        HostConfig: &c.HostConfig,
        Config:     &c.Config,
    }
    _, err = client.CreateContainer(opts)
    if err != nil {
        if err != docker.ErrContainerAlreadyExists {
            return err
        }
        if relaunch {
            multiErr := tsuruErrors.NewMultiError()
            err = tryRemovingOld(client, opts.Name)
            if err != nil {
                multiErr.Add(errors.Wrapf(err, "unable to remove old node-container"))
            }
            _, err = client.CreateContainer(opts)
            if err != nil {
                multiErr.Add(errors.Wrapf(err, "unable to create new node-container"))
                return multiErr
            }
        }
    }
    err = client.StartContainer(c.Name, nil)
    if _, ok := err.(*docker.ContainerAlreadyRunning); !ok {
        return err
    }
    return nil
}
func updateNodeHandler(w http.ResponseWriter, r *http.Request, t auth.Token) error { params, err := unmarshal(r.Body) if err != nil { return err } address, _ := params["address"] if address == "" { return &errors.HTTP{Code: http.StatusBadRequest, Message: "address is required"} } delete(params, "address") node := cluster.Node{Address: address, Metadata: params} disabled, _ := strconv.ParseBool(r.URL.Query().Get("disabled")) if disabled { node.CreationStatus = cluster.NodeCreationStatusDisabled } _, err = mainDockerProvisioner.Cluster().UpdateNode(node) return err }
func (h *NodeHealer) HandleError(node *cluster.Node) time.Duration {
    h.wg.Add(1)
    defer h.wg.Done()
    failures := node.FailureCount()
    if failures < h.failuresBeforeHealing {
        log.Debugf("%d failures detected in node %q, waiting for more failures before healing.", failures, node.Address)
        return h.disabledTime
    }
    if !node.HasSuccess() {
        log.Debugf("Node %q has never been successfully reached, healing won't run on it.", node.Address)
        return h.disabledTime
    }
    err := h.tryHealingNode(node, fmt.Sprintf("%d consecutive failures", failures), nil)
    if err != nil {
        log.Errorf("[node healer handle error] %s", err)
    }
    return h.disabledTime
}
func testStorageStoreUpdateNode(storage cluster.Storage, t *testing.T) {
    node1 := cluster.Node{Address: "my-addr-1", Metadata: map[string]string{"abc": "def", "x": "y"}}
    defer storage.RemoveNode("my-addr-1")
    err := storage.StoreNode(node1)
    assertIsNil(err, t)
    delete(node1.Metadata, "x")
    node1.Metadata["ahoy"] = "foo"
    err = storage.UpdateNode(node1)
    assertIsNil(err, t)
    nd, err := storage.RetrieveNode("my-addr-1")
    assertIsNil(err, t)
    if !reflect.DeepEqual(nd, node1) {
        t.Errorf("unexpected node, expected: %#v, got: %#v", node1, nd)
    }
    node1.Address = "my-addr-xxxxxx"
    err = storage.UpdateNode(node1)
    if err != cstorage.ErrNoSuchNode {
        t.Errorf("Expected ErrNoSuchNode got: %#v", err)
    }
}
func TestUpdateNodeDoesNotExist(t *testing.T) {
    mongo, err := mongodb.Mongodb("mongodb://localhost:27017", "test-docker-node-update")
    if err != nil {
        t.Fatal(err)
    }
    clu, err := cluster.New(nil, mongo)
    if err != nil {
        t.Fatal(err)
    }
    node := cluster.Node{Address: "http://localhost:4243"}
    err = clu.Register(node)
    if err != nil {
        t.Fatal(err)
    }
    defer clu.Unregister("http://localhost:4243")
    nodeUpd := cluster.Node{Address: "http://localhost:4223"}
    nodeUpd.Metadata = map[string]string{"k1": "v1", "k2": "v2"}
    _, err = clu.UpdateNode(nodeUpd)
    if err != storage.ErrNoSuchNode {
        t.Error("Expected: No such node in storage, got: ", err)
    }
}
func (h *nodeHealer) healNode(node *cluster.Node) (cluster.Node, error) {
    emptyNode := cluster.Node{}
    failingAddr := node.Address
    nodeMetadata := node.CleanMetadata()
    failingHost := urlToHost(failingAddr)
    failures := node.FailureCount()
    machine, err := iaas.CreateMachineForIaaS(nodeMetadata["iaas"], nodeMetadata)
    if err != nil {
        node.ResetFailures()
        return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error creating new machine: %s", failures, failingHost, err.Error())
    }
    err = h.provisioner.getCluster().Unregister(failingAddr)
    if err != nil {
        machine.Destroy()
        return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error unregistering old node: %s", failures, failingHost, err.Error())
    }
    newAddr := machine.FormatNodeAddress()
    log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr)
    createdNode, err := h.provisioner.getCluster().WaitAndRegister(newAddr, nodeMetadata, h.waitTimeNewMachine)
    if err != nil {
        node.ResetFailures()
        h.provisioner.getCluster().Register(failingAddr, nodeMetadata)
        machine.Destroy()
        return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error registering new node: %s", failures, failingHost, err.Error())
    }
    var buf bytes.Buffer
    err = h.provisioner.moveContainers(failingHost, "", &buf)
    if err != nil {
        log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err.Error(), buf.String())
    }
    failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata["iaas-id"], failingHost)
    if err != nil {
        return createdNode, fmt.Errorf("Unable to find failing machine %s in IaaS: %s", failingHost, err.Error())
    }
    err = failingMachine.Destroy()
    if err != nil {
        return createdNode, fmt.Errorf("Unable to destroy machine %s from IaaS: %s", failingHost, err.Error())
    }
    log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address)
    return createdNode, nil
}
func (h *Healer) HandleError(node cluster.Node) time.Duration {
    defaultWait := 1 * time.Minute
    failures := node.FailureCount()
    if failures < 5 {
        return defaultWait
    }
    failingAddr := node.Address
    failingHost := urlToHost(failingAddr)
    containers, err := listContainersByHost(failingHost)
    if err != nil {
        log.Errorf("Error in cluster healer, trying to list containers: %s", err.Error())
        return defaultWait
    }
    // Empty host, let's just try again in the future.
    if len(containers) == 0 {
        return defaultWait
    }
    iaasName, hasIaas := node.Metadata["iaas"]
    if !hasIaas {
        log.Errorf("Can't auto-heal after %d failures for node %s: no IaaS information.", failures, failingHost)
        return defaultWait
    }
    machine, err := iaas.CreateMachineForIaaS(iaasName, node.Metadata)
    if err != nil {
        log.Errorf("Can't auto-heal after %d failures for node %s: error creating new machine: %s", failures, failingHost, err.Error())
        return defaultWait
    }
    newAddr, err := machine.FormatNodeAddress()
    if err != nil {
        log.Errorf("Can't auto-heal after %d failures for node %s: error formatting address: %s", failures, failingHost, err.Error())
        machine.Destroy()
        return defaultWait
    }
    cluster := dockerCluster()
    err = cluster.Unregister(failingAddr)
    if err != nil {
        log.Errorf("Can't auto-heal after %d failures for node %s: error unregistering old node: %s", failures, failingHost, err.Error())
        return defaultWait
    }
    err = cluster.WaitAndRegister(newAddr, node.Metadata, 2*time.Minute)
    if err != nil {
        log.Errorf("Can't auto-heal after %d failures for node %s: error registering new node: %s", failures, failingHost, err.Error())
        machine.Destroy()
        return defaultWait
    }
    var buf bytes.Buffer
    encoder := json.NewEncoder(&buf)
    err = moveContainers(failingHost, machine.Address, encoder)
    if err != nil {
        log.Errorf("Unable to move containers from: %s to: %s - %s", failingHost, machine.Address, err.Error())
        return 0
    }
    failingMachine, err := iaas.FindMachineByAddress(failingHost)
    if err != nil {
        log.Errorf("Unable to find failing machine %s in IaaS", failingHost)
        return 0
    }
    err = failingMachine.Destroy()
    if err != nil {
        log.Errorf("Unable to destroy machine %s from IaaS", failingHost)
    }
    return 0
}
func (h *NodeHealer) healNode(node *cluster.Node) (cluster.Node, error) {
    emptyNode := cluster.Node{}
    failingAddr := node.Address
    nodeMetadata := node.CleanMetadata()
    failingHost := net.URLToHost(failingAddr)
    failures := node.FailureCount()
    machine, err := iaas.CreateMachineForIaaS(nodeMetadata["iaas"], nodeMetadata)
    if err != nil {
        node.ResetFailures()
        return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error creating new machine: %s", failures, failingHost, err.Error())
    }
    err = h.provisioner.Cluster().Unregister(failingAddr)
    if err != nil {
        machine.Destroy()
        return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error unregistering old node: %s", failures, failingHost, err.Error())
    }
    newAddr := machine.FormatNodeAddress()
    log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr)
    createdNode := cluster.Node{
        Address:        newAddr,
        Metadata:       nodeMetadata,
        CreationStatus: cluster.NodeCreationStatusPending,
    }
    err = h.provisioner.Cluster().Register(createdNode)
    if err != nil {
        node.ResetFailures()
        h.provisioner.Cluster().Register(cluster.Node{Address: failingAddr, Metadata: nodeMetadata})
        machine.Destroy()
        return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error registering new node: %s", failures, failingHost, err.Error())
    }
    q, err := queue.Queue()
    if err != nil {
        return emptyNode, err
    }
    jobParams := monsterqueue.JobParams{
        "endpoint": createdNode.Address,
        "machine":  machine.Id,
        "metadata": createdNode.Metadata,
    }
    job, err := q.EnqueueWait(bs.QueueTaskName, jobParams, h.waitTimeNewMachine)
    if err == nil {
        _, err = job.Result()
    }
    if err != nil {
        node.ResetFailures()
        h.provisioner.Cluster().Register(cluster.Node{Address: failingAddr, Metadata: nodeMetadata})
        return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error waiting for the bs task: %s", failures, failingHost, err.Error())
    }
    var buf bytes.Buffer
    err = h.provisioner.MoveContainers(failingHost, "", &buf)
    if err != nil {
        log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err.Error(), buf.String())
    }
    failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata["iaas-id"], failingHost)
    if err != nil {
        return createdNode, fmt.Errorf("Unable to find failing machine %s in IaaS: %s", failingHost, err.Error())
    }
    err = failingMachine.Destroy()
    if err != nil {
        return createdNode, fmt.Errorf("Unable to destroy machine %s from IaaS: %s", failingHost, err.Error())
    }
    log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address)
    return createdNode, nil
}