Example #1
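A queue task that waits for the Docker daemon on a newly added node, starts the node container, and records the node's creation status and a LastSuccess timestamp via UpdateNode.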
func (t *runBs) Run(job monsterqueue.Job) {
	params := job.Parameters()
	dockerEndpoint := params["endpoint"].(string)
	node := cluster.Node{Address: dockerEndpoint}
	err := t.waitDocker(dockerEndpoint)
	if err != nil {
		job.Error(err)
		return
	}
	node.CreationStatus = cluster.NodeCreationStatusCreated
	rawMetadata := params["metadata"].(monsterqueue.JobParams)
	metadata := make(map[string]string, len(rawMetadata))
	for key, value := range rawMetadata {
		metadata[key] = value.(string)
	}
	err = createContainer(dockerEndpoint, metadata["pool"], t.provisioner, true)
	if err != nil {
		t.provisioner.Cluster().UpdateNode(node)
		job.Error(err)
		return
	}
	node.Metadata = map[string]string{"LastSuccess": time.Now().Format(time.RFC3339)}
	_, err = t.provisioner.Cluster().UpdateNode(node)
	if err != nil {
		job.Error(err)
		return
	}
	job.Success(nil)
}
Example #2
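A variant of the same task that also knows the IaaS machine ID: on any failure it destroys the machine, and container-creation errors are persisted in the node metadata as creationError.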
func (t *runBs) Run(job monsterqueue.Job) {
	params := job.Parameters()
	dockerEndpoint := params["endpoint"].(string)
	machineID := params["machine"].(string)
	node := cluster.Node{Address: dockerEndpoint}
	err := t.waitDocker(dockerEndpoint)
	if err != nil {
		job.Error(err)
		t.destroyMachine(machineID)
		return
	}
	rawMetadata := params["metadata"].(monsterqueue.JobParams)
	metadata := make(map[string]string, len(rawMetadata))
	for key, value := range rawMetadata {
		metadata[key] = value.(string)
	}
	err = CreateContainer(dockerEndpoint, metadata["pool"], t.provisioner, true)
	if err != nil {
		node.CreationStatus = cluster.NodeCreationStatusError
		node.Metadata = map[string]string{"creationError": err.Error()}
		t.provisioner.Cluster().UpdateNode(node)
		job.Error(err)
		t.destroyMachine(machineID)
		return
	}
	node.CreationStatus = cluster.NodeCreationStatusCreated
	_, err = t.provisioner.Cluster().UpdateNode(node)
	if err != nil {
		job.Error(err)
		t.destroyMachine(machineID)
		return
	}
	job.Success(nil)
}
Example #3
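AddNode registers the node with a pending creation status (inheriting the provisioner's TLS material when none is supplied) and enqueues the node-container task, optionally blocking on its result when a wait timeout is set.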
func (p *dockerProvisioner) AddNode(opts provision.AddNodeOptions) error {
	node := cluster.Node{
		Address:        opts.Address,
		Metadata:       opts.Metadata,
		CreationStatus: cluster.NodeCreationStatusPending,
		CaCert:         opts.CaCert,
		ClientCert:     opts.ClientCert,
		ClientKey:      opts.ClientKey,
	}
	if len(opts.CaCert) == 0 && len(p.caCert) > 0 {
		node.CaCert = p.caCert
		node.ClientCert = p.clientCert
		node.ClientKey = p.clientKey
	}
	err := p.Cluster().Register(node)
	if err != nil {
		return err
	}
	q, err := queue.Queue()
	if err != nil {
		return err
	}
	jobParams := monsterqueue.JobParams{"endpoint": opts.Address, "metadata": opts.Metadata}
	var job monsterqueue.Job
	if opts.WaitTO != 0 {
		job, err = q.EnqueueWait(internalNodeContainer.QueueTaskName, jobParams, opts.WaitTO)
	} else {
		_, err = q.Enqueue(internalNodeContainer.QueueTaskName, jobParams)
	}
	if err == nil && job != nil {
		_, err = job.Result()
	}
	return err
}
Example #4
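An HTTP handler that updates a node's metadata and enabled/disabled state, checking node-update permission against both the node's current pool and, when supplied, the new pool.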
func updateNodeHandler(w http.ResponseWriter, r *http.Request, t auth.Token) error {
	params, err := unmarshal(r.Body)
	if err != nil {
		return err
	}
	address := params["address"]
	if address == "" {
		return &errors.HTTP{Code: http.StatusBadRequest, Message: "address is required"}
	}
	nodes, err := mainDockerProvisioner.Cluster().UnfilteredNodes()
	if err != nil {
		return err
	}
	var oldNode *cluster.Node
	for i := range nodes {
		if nodes[i].Address == address {
			oldNode = &nodes[i]
			break
		}
	}
	if oldNode == nil {
		return &errors.HTTP{Code: http.StatusNotFound, Message: "node not found"}
	}
	oldPool := oldNode.Metadata["pool"]
	allowedOldPool := permission.Check(t, permission.PermNodeUpdate,
		permission.Context(permission.CtxPool, oldPool),
	)
	if !allowedOldPool {
		return permission.ErrUnauthorized
	}
	newPool, ok := params["pool"]
	if ok {
		allowedNewPool := permission.Check(t, permission.PermNodeUpdate,
			permission.Context(permission.CtxPool, newPool),
		)
		if !allowedNewPool {
			return permission.ErrUnauthorized
		}
	}
	delete(params, "address")
	node := cluster.Node{Address: address, Metadata: params}
	disabled, _ := strconv.ParseBool(r.URL.Query().Get("disabled"))
	enabled, _ := strconv.ParseBool(r.URL.Query().Get("enabled"))
	if disabled && enabled {
		return &errors.HTTP{
			Code:    http.StatusBadRequest,
			Message: "You can't make a node enable and disable at the same time.",
		}
	}
	if disabled {
		node.CreationStatus = cluster.NodeCreationStatusDisabled
	}
	if enabled {
		node.CreationStatus = cluster.NodeStatusReady
	}
	_, err = mainDockerProvisioner.Cluster().UpdateNode(node)
	return err
}
Example #5
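cleanMetadata drops keys such as iaas-id that older tsuru versions never wrote, so that mixed node generations don't trigger unbalanced-metadata errors.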
func cleanMetadata(n *cluster.Node) map[string]string {
	// iaas-id is ignored because it wasn't created in previous tsuru versions
	// and having nodes with and without it would cause unbalanced metadata
	// errors.
	ignoredMetadata := []string{"iaas-id"}
	metadata := n.CleanMetadata()
	for _, val := range ignoredMetadata {
		delete(metadata, val)
	}
	return metadata
}
Example #6
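HandleError serializes healing per node address and only heals after enough consecutive failures, a prior successful contact, IaaS metadata on the node, and a healing count below the configured limit for the timeframe.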
func (h *NodeHealer) HandleError(node *cluster.Node) time.Duration {
	h.Lock()
	if h.locks[node.Address] == nil {
		h.locks[node.Address] = &sync.Mutex{}
	}
	h.Unlock()
	h.locks[node.Address].Lock()
	defer h.locks[node.Address].Unlock()
	failures := node.FailureCount()
	if failures < h.failuresBeforeHealing {
		log.Debugf("%d failures detected in node %q, waiting for more failures before healing.", failures, node.Address)
		return h.disabledTime
	}
	if !node.HasSuccess() {
		log.Debugf("Node %q has never been successfully reached, healing won't run on it.", node.Address)
		return h.disabledTime
	}
	_, hasIaas := node.Metadata["iaas"]
	if !hasIaas {
		log.Debugf("Node %q doesn't have IaaS information, healing won't run on it.", node.Address)
		return h.disabledTime
	}
	healingCounter, err := healingCountFor("node", node.Address, consecutiveHealingsTimeframe)
	if err != nil {
		log.Errorf("Node healing: couldn't verify number of previous healings for %s: %s", node.Address, err.Error())
		return h.disabledTime
	}
	if healingCounter > consecutiveHealingsLimitInTimeframe {
		log.Errorf("Node healing: number of healings for node %s in the last %d minutes exceeds limit of %d: %d",
			node.Address, consecutiveHealingsTimeframe/time.Minute, consecutiveHealingsLimitInTimeframe, healingCounter)
		return h.disabledTime
	}
	log.Errorf("Initiating healing process for node %q after %d failures.", node.Address, failures)
	evt, err := NewHealingEvent(*node)
	if err != nil {
		log.Errorf("Error trying to insert healing event: %s", err.Error())
		return h.disabledTime
	}
	createdNode, err := h.healNode(node)
	if err != nil {
		log.Errorf("Error healing: %s", err.Error())
	}
	err = evt.Update(createdNode, err)
	if err != nil {
		log.Errorf("Error trying to update healing event: %s", err.Error())
	}
	if createdNode.Address != "" {
		return 0
	}
	return h.disabledTime
}
Example #7
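The provisioner-level UpdateNode maps the Enable/Disable options onto the node's creation status and translates the storage-level ErrNoSuchNode into provision.ErrNodeNotFound.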
func (p *dockerProvisioner) UpdateNode(opts provision.UpdateNodeOptions) error {
	node := cluster.Node{Address: opts.Address, Metadata: opts.Metadata}
	if opts.Disable {
		node.CreationStatus = cluster.NodeCreationStatusDisabled
	}
	if opts.Enable {
		node.CreationStatus = cluster.NodeCreationStatusCreated
	}
	_, err := mainDockerProvisioner.Cluster().UpdateNode(node)
	if err == clusterStorage.ErrNoSuchNode {
		return provision.ErrNodeNotFound
	}
	return err
}
Example #8
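create pulls the node-container image, labels the container with pool, address, and provisioner, and starts it; with relaunch set, an existing container with the same name is removed and re-created.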
func create(c *nodecontainer.NodeContainerConfig, node *cluster.Node, poolName string, p DockerProvisioner, relaunch bool) error {
	client, err := node.Client()
	if err != nil {
		return err
	}
	c.Config.Image, err = pullImage(c, client, p, poolName)
	if err != nil {
		return err
	}
	c.Config.Env = append([]string{"DOCKER_ENDPOINT=" + node.Address}, c.Config.Env...)
	if c.Config.Labels == nil {
		c.Config.Labels = map[string]string{}
	}
	c.Config.Labels["tsuru.nodecontainer"] = strconv.FormatBool(true)
	c.Config.Labels["tsuru.node.pool"] = poolName
	c.Config.Labels["tsuru.node.address"] = node.Address
	c.Config.Labels["tsuru.node.provisioner"] = p.GetName()
	opts := docker.CreateContainerOptions{
		Name:       c.Name,
		HostConfig: &c.HostConfig,
		Config:     &c.Config,
	}
	_, err = client.CreateContainer(opts)
	if err != nil {
		if err != docker.ErrContainerAlreadyExists {
			return err
		}
		if relaunch {
			multiErr := tsuruErrors.NewMultiError()
			err = tryRemovingOld(client, opts.Name)
			if err != nil {
				multiErr.Add(errors.Wrapf(err, "unable to remove old node-container"))
			}
			_, err = client.CreateContainer(opts)
			if err != nil {
				multiErr.Add(errors.Wrapf(err, "unable to create new node-container"))
				return multiErr
			}
		}
	}
	err = client.StartContainer(c.Name, nil)
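	// Note: a nil StartContainer error also fails this type assertion, so
	// nil is returned unchanged; only ContainerAlreadyRunning is swallowed.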
	if _, ok := err.(*docker.ContainerAlreadyRunning); !ok {
		return err
	}
	return nil
}
Example #9
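An earlier, simpler version of updateNodeHandler that performs no permission checks and only supports disabling a node.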
func updateNodeHandler(w http.ResponseWriter, r *http.Request, t auth.Token) error {
	params, err := unmarshal(r.Body)
	if err != nil {
		return err
	}
	address := params["address"]
	if address == "" {
		return &errors.HTTP{Code: http.StatusBadRequest, Message: "address is required"}
	}
	delete(params, "address")
	node := cluster.Node{Address: address, Metadata: params}
	disabled, _ := strconv.ParseBool(r.URL.Query().Get("disabled"))
	if disabled {
		node.CreationStatus = cluster.NodeCreationStatusDisabled
	}
	_, err = mainDockerProvisioner.Cluster().UpdateNode(node)
	return err
}
Example #10
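A slimmer HandleError that, once the failure threshold is reached, delegates the actual work to tryHealingNode and always returns the disabled interval.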
func (h *NodeHealer) HandleError(node *cluster.Node) time.Duration {
	h.wg.Add(1)
	defer h.wg.Done()
	failures := node.FailureCount()
	if failures < h.failuresBeforeHealing {
		log.Debugf("%d failures detected in node %q, waiting for more failures before healing.", failures, node.Address)
		return h.disabledTime
	}
	if !node.HasSuccess() {
		log.Debugf("Node %q has never been successfully reached, healing won't run on it.", node.Address)
		return h.disabledTime
	}
	err := h.tryHealingNode(node, fmt.Sprintf("%d consecutive failures", failures), nil)
	if err != nil {
		log.Errorf("[node healer handle error] %s", err)
	}
	return h.disabledTime
}
Example #11
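A storage conformance test: UpdateNode must persist metadata changes and must fail with ErrNoSuchNode for an address that was never stored.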
func testStorageStoreUpdateNode(storage cluster.Storage, t *testing.T) {
	node1 := cluster.Node{Address: "my-addr-1", Metadata: map[string]string{"abc": "def", "x": "y"}}
	defer storage.RemoveNode("my-addr-1")
	err := storage.StoreNode(node1)
	assertIsNil(err, t)
	delete(node1.Metadata, "x")
	node1.Metadata["ahoy"] = "foo"
	err = storage.UpdateNode(node1)
	assertIsNil(err, t)
	nd, err := storage.RetrieveNode("my-addr-1")
	if !reflect.DeepEqual(nd, node1) {
		t.Errorf("unexpected node, expected: %#v, got: %#v", node1, nd)
	}
	node1.Address = "my-addr-xxxxxx"
	err = storage.UpdateNode(node1)
	if err != cstorage.ErrNoSuchNode {
		t.Errorf("Expected ErrNoSuchNode got: %#v", err)
	}
}
Example #12
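An integration test against a local MongoDB asserting that updating a node registered under a different address yields storage.ErrNoSuchNode.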
func TestUpdateNodeDoesNotExist(t *testing.T) {
	mongo, err := mongodb.Mongodb("mongodb://localhost:27017", "test-docker-node-update")
	if err != nil {
		t.Fatal(err)
	}
	clu, err := cluster.New(nil, mongo)
	if err != nil {
		t.Fatal(err)
	}
	node := cluster.Node{Address: "http://localhost:4243"}
	err = clu.Register(node)
	if err != nil {
		t.Fatal(err)
	}
	defer clu.Unregister("http://localhost:4243")
	nodeUpd := cluster.Node{Address: "http://localhost:4223"}
	nodeUpd.Metadata = map[string]string{"k1": "v1", "k2": "v2"}
	nodeUpd, err = clu.UpdateNode(nodeUpd)
	if err != storage.ErrNoSuchNode {
		t.Error("Expected: No such node in storage, got: ", err)
	}
}
Example #13
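healNode replaces a failing node end to end: create a new IaaS machine, swap the cluster registration, move the containers, and destroy the old machine, rolling the registration back when the new node can't be registered.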
func (h *nodeHealer) healNode(node *cluster.Node) (cluster.Node, error) {
	emptyNode := cluster.Node{}
	failingAddr := node.Address
	nodeMetadata := node.CleanMetadata()
	failingHost := urlToHost(failingAddr)
	failures := node.FailureCount()
	machine, err := iaas.CreateMachineForIaaS(nodeMetadata["iaas"], nodeMetadata)
	if err != nil {
		node.ResetFailures()
		return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error creating new machine: %s", failures, failingHost, err.Error())
	}
	err = h.provisioner.getCluster().Unregister(failingAddr)
	if err != nil {
		machine.Destroy()
		return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error unregistering old node: %s", failures, failingHost, err.Error())
	}
	newAddr := machine.FormatNodeAddress()
	log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr)
	createdNode, err := h.provisioner.getCluster().WaitAndRegister(newAddr, nodeMetadata, h.waitTimeNewMachine)
	if err != nil {
		node.ResetFailures()
		h.provisioner.getCluster().Register(failingAddr, nodeMetadata)
		machine.Destroy()
		return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error registering new node: %s", failures, failingHost, err.Error())
	}
	var buf bytes.Buffer
	err = h.provisioner.moveContainers(failingHost, "", &buf)
	if err != nil {
		log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err.Error(), buf.String())
	}
	failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata["iaas-id"], failingHost)
	if err != nil {
		return createdNode, fmt.Errorf("Unable to find failing machine %s in IaaS: %s", failingHost, err.Error())
	}
	err = failingMachine.Destroy()
	if err != nil {
		return createdNode, fmt.Errorf("Unable to destroy machine %s from IaaS: %s", failingHost, err.Error())
	}
	log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address)
	return createdNode, nil
}
Example #14
An earlier healer from the same code base: it skips nodes that run no containers, re-creates the machine inline, and moves containers to the replacement address.
func (h *Healer) HandleError(node cluster.Node) time.Duration {
	defaultWait := 1 * time.Minute
	failures := node.FailureCount()
	if failures < 5 {
		return defaultWait
	}
	failingAddr := node.Address
	failingHost := urlToHost(failingAddr)
	containers, err := listContainersByHost(failingHost)
	if err != nil {
		log.Errorf("Error in cluster healer, trying to list containers: %s", err.Error())
		return defaultWait
	}
	// Empty host; let's just try again in the future.
	if len(containers) == 0 {
		return defaultWait
	}
	iaasName, hasIaas := node.Metadata["iaas"]
	if !hasIaas {
		log.Errorf("Can't auto-heal after %d failures for node %s: no IaaS information.", failures, failingHost)
		return defaultWait
	}
	machine, err := iaas.CreateMachineForIaaS(iaasName, node.Metadata)
	if err != nil {
		log.Errorf("Can't auto-heal after %d failures for node %s: error creating new machine: %s", failures, failingHost, err.Error())
		return defaultWait
	}
	newAddr, err := machine.FormatNodeAddress()
	if err != nil {
		log.Errorf("Can't auto-heal after %d failures for node %s: error formatting address: %s", failures, failingHost, err.Error())
		machine.Destroy()
		return defaultWait
	}
	clu := dockerCluster()
	err = clu.Unregister(failingAddr)
	if err != nil {
		log.Errorf("Can't auto-heal after %d failures for node %s: error unregistering old node: %s", failures, failingHost, err.Error())
		return defaultWait
	}
	err = clu.WaitAndRegister(newAddr, node.Metadata, 2*time.Minute)
	if err != nil {
		log.Errorf("Can't auto-heal after %d failures for node %s: error registering new node: %s", failures, failingHost, err.Error())
		machine.Destroy()
		return defaultWait
	}
	var buf bytes.Buffer
	encoder := json.NewEncoder(&buf)
	err = moveContainers(failingHost, machine.Address, encoder)
	if err != nil {
		log.Errorf("Unable to move containers from: %s to: %s - %s", failingHost, machine.Address, err.Error())
		return 0
	}
	failingMachine, err := iaas.FindMachineByAddress(failingHost)
	if err != nil {
		log.Errorf("Unable to find failing machine %s in IaaS", failingHost)
		return 0
	}
	err = failingMachine.Destroy()
	if err != nil {
		log.Errorf("Unable to find destroy machine %s from IaaS", failingHost)
	}
	return 0
}
Example #15
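A later healNode that registers the replacement in pending state and delegates container setup to the bs queue task before moving containers off the failing host and destroying its machine.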
func (h *NodeHealer) healNode(node *cluster.Node) (cluster.Node, error) {
	emptyNode := cluster.Node{}
	failingAddr := node.Address
	nodeMetadata := node.CleanMetadata()
	failingHost := net.URLToHost(failingAddr)
	failures := node.FailureCount()
	machine, err := iaas.CreateMachineForIaaS(nodeMetadata["iaas"], nodeMetadata)
	if err != nil {
		node.ResetFailures()
		return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error creating new machine: %s", failures, failingHost, err.Error())
	}
	err = h.provisioner.Cluster().Unregister(failingAddr)
	if err != nil {
		machine.Destroy()
		return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error unregistering old node: %s", failures, failingHost, err.Error())
	}
	newAddr := machine.FormatNodeAddress()
	log.Debugf("New machine created during healing process: %s - Waiting for docker to start...", newAddr)
	createdNode := cluster.Node{
		Address:        newAddr,
		Metadata:       nodeMetadata,
		CreationStatus: cluster.NodeCreationStatusPending,
	}
	err = h.provisioner.Cluster().Register(createdNode)
	if err != nil {
		node.ResetFailures()
		h.provisioner.Cluster().Register(cluster.Node{Address: failingAddr, Metadata: nodeMetadata})
		machine.Destroy()
		return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error registering new node: %s", failures, failingHost, err.Error())
	}
	q, err := queue.Queue()
	if err != nil {
		return emptyNode, err
	}
	jobParams := monsterqueue.JobParams{
		"endpoint": createdNode.Address,
		"machine":  machine.Id,
		"metadata": createdNode.Metadata,
	}
	job, err := q.EnqueueWait(bs.QueueTaskName, jobParams, h.waitTimeNewMachine)
	if err == nil {
		_, err = job.Result()
	}
	if err != nil {
		node.ResetFailures()
		h.provisioner.Cluster().Register(cluster.Node{Address: failingAddr, Metadata: nodeMetadata})
		return emptyNode, fmt.Errorf("Can't auto-heal after %d failures for node %s: error waiting for the bs task: %s", failures, failingHost, err.Error())
	}
	var buf bytes.Buffer
	err = h.provisioner.MoveContainers(failingHost, "", &buf)
	if err != nil {
		log.Errorf("Unable to move containers, skipping containers healing %q -> %q: %s: %s", failingHost, machine.Address, err.Error(), buf.String())
	}
	failingMachine, err := iaas.FindMachineByIdOrAddress(node.Metadata["iaas-id"], failingHost)
	if err != nil {
		return createdNode, fmt.Errorf("Unable to find failing machine %s in IaaS: %s", failingHost, err.Error())
	}
	err = failingMachine.Destroy()
	if err != nil {
		return createdNode, fmt.Errorf("Unable to destroy machine %s from IaaS: %s", failingHost, err.Error())
	}
	log.Debugf("Done auto-healing node %q, node %q created in its place.", failingHost, machine.Address)
	return createdNode, nil
}