예제 #1
0
func GetPeersFromState(state *MasterState, frameworkName string) ([]string, error) {
	var framework *Framework
	for _, f := range state.Frameworks {
		if f.Name == frameworkName {
			framework = &f
			break
		}
	}
	if framework == nil {
		return []string{}, fmt.Errorf("Could not find framework %q in "+
			"the mesos master's state.json", frameworkName)
	}

	peers := []string{}
	for _, t := range framework.Tasks {
		if t.State == "TASK_RUNNING" {
			node, err := config.Parse(t.ID)
			if err != nil {
				return []string{}, err
			}
			peers = append(peers, fmt.Sprintf("%s=http://%s:%d",
				node.Name, node.Host, node.RPCPort))
		}
	}
	return peers, nil
}
예제 #2
0
func (s *EtcdScheduler) StatusUpdate(
	driver scheduler.SchedulerDriver,
	status *mesos.TaskStatus,
) {
	s.mut.Lock()
	defer s.mut.Unlock()
	log.Infoln(
		"Status update: task",
		status.TaskId.GetValue(),
		" is in state ",
		status.State.Enum().String(),
	)

	node, err := config.Parse(status.GetTaskId().GetValue())
	if err != nil {
		log.Errorf("scheduler: failed to unmarshal config.Node from TaskId: %s", err)
		return
	}
	node.SlaveID = status.SlaveId.GetValue()

	// record that we've heard about this task
	s.heardFrom[status.GetTaskId().GetValue()] = struct{}{}

	switch status.GetState() {
	case mesos.TaskState_TASK_LOST,
		mesos.TaskState_TASK_FINISHED,
		mesos.TaskState_TASK_KILLED,
		mesos.TaskState_TASK_ERROR,
		mesos.TaskState_TASK_FAILED:

		log.Errorf("Task contraction: %+v", status.GetState())
		log.Errorf("message: %s", status.GetMessage())
		log.Errorf("reason: %+v", status.GetReason())

		atomic.AddUint32(&s.Stats.FailedServers, 1)

		// TODO(tyler) kill this
		// Pump the brakes so that we have time to deconfigure the lost node
		// before adding a new one.  If we don't deconfigure first, we risk
		// split brain.
		s.PumpTheBrakes()

		// now we know this task is dead
		delete(s.pending, node.Name)
		delete(s.running, node.Name)
		delete(s.tasks, node.Name)

		// We don't have to clean up the state in ZK for this
		// as it is fine to eventually just persist when we
		// receive a new TASK_RUNNING below.
		delete(s.reconciliationInfo, status.TaskId.GetValue())

		s.QueueLaunchAttempt()

		// TODO(tyler) do we want to lock if the first task fails?
		// TODO(tyler) can we handle a total loss at reconciliation time,
		//             when s.state == Immutable?
		if len(s.running) == 0 && s.state == Mutable {
			log.Error("TOTAL CLUSTER LOSS!  LOCKING SCHEDULER, " +
				"FOLLOW RESTORATION GUIDE AT " +
				"https://github.com/mesosphere/" +
				"etcd-mesos/blob/master/docs/response.md")
			s.state = Immutable
		}
	case mesos.TaskState_TASK_STARTING:
	case mesos.TaskState_TASK_RUNNING:
		// We update data to ZK synchronously because it must happen
		// in-order.  If we spun off a goroutine this would possibly retry
		// and succeed in the wrong order, and older data would win.
		// We keep this simple here, as if ZK is healthy this won't take long.
		// If this takes long, we're probably about to die anyway, as ZK is
		// displeased and mesos-go will panic when it loses contact.
		s.reconciliationInfo[status.TaskId.GetValue()] = status.SlaveId.GetValue()
		err = s.updateReconciliationInfoFunc(
			s.reconciliationInfo,
			s.ZkServers,
			s.ZkChroot,
			s.FrameworkName,
		)
		if err != nil {
			log.Errorf("Failed to persist reconciliation info: %+v", err)
		}

		delete(s.pending, node.Name)
		_, present := s.running[node.Name]
		if !present {
			s.running[node.Name] = node
			s.tasks[node.Name] = status.TaskId
		}

		// During reconcilliation, we may find nodes with higher ID's due to ntp drift
		etcdIndexParts := strings.Split(node.Name, "-")
		if len(etcdIndexParts) != 2 {
			log.Warning("Task has a Name that does not follow the form etcd-<index>")
		} else {
			etcdIndex, err := strconv.ParseInt(etcdIndexParts[1], 10, 64)
			if err != nil {
				log.Warning("Task has a Name that does not follow the form etcd-<index>")
			} else {
				if etcdIndex > s.highestInstanceID {
					s.highestInstanceID = etcdIndex + 1
				}
			}
		}
	default:
		log.Warningf("Received unhandled task state: %+v", status.GetState())
	}
}