// GetPeersFromState extracts the etcd peer list for the named framework from
// the mesos master's state, returning one "<name>=http://<host>:<rpc-port>"
// entry for every task currently in TASK_RUNNING.
func GetPeersFromState(state *MasterState, frameworkName string) ([]string, error) {
	var framework *Framework
	for _, f := range state.Frameworks {
		if f.Name == frameworkName {
			framework = &f
			break
		}
	}
	if framework == nil {
		return []string{}, fmt.Errorf("Could not find framework %q in "+
			"the mesos master's state.json", frameworkName)
	}

	peers := []string{}
	for _, t := range framework.Tasks {
		if t.State == "TASK_RUNNING" {
			node, err := config.Parse(t.ID)
			if err != nil {
				return []string{}, err
			}
			peers = append(peers, fmt.Sprintf("%s=http://%s:%d",
				node.Name, node.Host, node.RPCPort))
		}
	}
	return peers, nil
}
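GetPeersFromState only touches a handful of fields in the master's state. As a point of reference, a minimal sketch of the shapes it relies on might look like the following; the field names are taken from the function body, while the JSON tags are assumptions based on the mesos state.json format rather than copied from the etcd-mesos rpc package:

// Minimal, assumed shape of the master state consumed by GetPeersFromState.
// Only the fields referenced above are included; the real types carry more.
type MasterState struct {
	Frameworks []Framework `json:"frameworks"`
}

type Framework struct {
	Name  string `json:"name"`
	Tasks []Task `json:"tasks"`
}

type Task struct {
	ID    string `json:"id"`
	State string `json:"state"`
}

The "name=http://host:port" entries it produces match the format etcd expects for its initial-cluster configuration, which is presumably why the list is built this way.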
// StatusUpdate is invoked by the mesos-go driver whenever a task changes
// state. It keeps the scheduler's view of pending/running tasks in sync,
// persists reconciliation info to ZooKeeper, and reacts to task failures by
// backing off and queueing a relaunch.
func (s *EtcdScheduler) StatusUpdate(
	driver scheduler.SchedulerDriver,
	status *mesos.TaskStatus,
) {
	s.mut.Lock()
	defer s.mut.Unlock()

	log.Infoln(
		"Status update: task", status.TaskId.GetValue(),
		" is in state ", status.State.Enum().String(),
	)

	node, err := config.Parse(status.GetTaskId().GetValue())
	if err != nil {
		log.Errorf("scheduler: failed to unmarshal config.Node from TaskId: %s", err)
		return
	}
	node.SlaveID = status.SlaveId.GetValue()

	// record that we've heard about this task
	s.heardFrom[status.GetTaskId().GetValue()] = struct{}{}

	switch status.GetState() {
	case mesos.TaskState_TASK_LOST,
		mesos.TaskState_TASK_FINISHED,
		mesos.TaskState_TASK_KILLED,
		mesos.TaskState_TASK_ERROR,
		mesos.TaskState_TASK_FAILED:

		log.Errorf("Task contraction: %+v", status.GetState())
		log.Errorf("message: %s", status.GetMessage())
		log.Errorf("reason: %+v", status.GetReason())

		atomic.AddUint32(&s.Stats.FailedServers, 1)

		// TODO(tyler) kill this
		// Pump the brakes so that we have time to deconfigure the lost node
		// before adding a new one. If we don't deconfigure first, we risk
		// split brain.
		s.PumpTheBrakes()

		// now we know this task is dead
		delete(s.pending, node.Name)
		delete(s.running, node.Name)
		delete(s.tasks, node.Name)

		// We don't have to clean up the state in ZK for this
		// as it is fine to eventually just persist when we
		// receive a new TASK_RUNNING below.
		delete(s.reconciliationInfo, status.TaskId.GetValue())

		s.QueueLaunchAttempt()

		// TODO(tyler) do we want to lock if the first task fails?
		// TODO(tyler) can we handle a total loss at reconciliation time,
		// when s.state == Immutable?
		if len(s.running) == 0 && s.state == Mutable {
			log.Error("TOTAL CLUSTER LOSS! LOCKING SCHEDULER, " +
				"FOLLOW RESTORATION GUIDE AT " +
				"https://github.com/mesosphere/" +
				"etcd-mesos/blob/master/docs/response.md")
			s.state = Immutable
		}
	case mesos.TaskState_TASK_STARTING:
	case mesos.TaskState_TASK_RUNNING:
		// We update data to ZK synchronously because it must happen
		// in-order. If we spun off a goroutine this would possibly retry
		// and succeed in the wrong order, and older data would win.
		// We keep this simple here, as if ZK is healthy this won't take long.
		// If this takes long, we're probably about to die anyway, as ZK is
		// displeased and mesos-go will panic when it loses contact.
		s.reconciliationInfo[status.TaskId.GetValue()] = status.SlaveId.GetValue()
		err = s.updateReconciliationInfoFunc(
			s.reconciliationInfo,
			s.ZkServers,
			s.ZkChroot,
			s.FrameworkName,
		)
		if err != nil {
			log.Errorf("Failed to persist reconciliation info: %+v", err)
		}

		delete(s.pending, node.Name)
		_, present := s.running[node.Name]
		if !present {
			s.running[node.Name] = node
			s.tasks[node.Name] = status.TaskId
		}

		// During reconciliation, we may find nodes with higher IDs due to NTP drift.
		etcdIndexParts := strings.Split(node.Name, "-")
		if len(etcdIndexParts) != 2 {
			log.Warning("Task has a Name that does not follow the form etcd-<index>")
		} else {
			etcdIndex, err := strconv.ParseInt(etcdIndexParts[1], 10, 64)
			if err != nil {
				log.Warning("Task has a Name that does not follow the form etcd-<index>")
			} else {
				if etcdIndex > s.highestInstanceID {
					s.highestInstanceID = etcdIndex + 1
				}
			}
		}
	default:
		log.Warningf("Received unhandled task state: %+v", status.GetState())
	}
}
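The TASK_RUNNING branch derives the instance index from the task's node name and uses it to advance s.highestInstanceID. To make that naming convention easier to see in isolation, here is a small hypothetical helper (not part of the scheduler) that mirrors the parsing above:

// etcdIndexFromName mirrors the parsing in the TASK_RUNNING branch:
// node names are expected to look like "etcd-<index>", and the numeric
// suffix is what the scheduler compares against highestInstanceID.
// This helper is illustrative only.
func etcdIndexFromName(name string) (int64, error) {
	parts := strings.Split(name, "-")
	if len(parts) != 2 {
		return 0, fmt.Errorf("name %q does not follow the form etcd-<index>", name)
	}
	return strconv.ParseInt(parts[1], 10, 64)
}

For example, etcdIndexFromName("etcd-3") would return 3, while a malformed name falls through to the same warning path the scheduler logs above.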