func (m *Master) execLaunchTask(segmentId int64, data interface{}) { client.Debug("execLaunchTask", segmentId) m.mu.Lock() defer m.mu.Unlock() tx := m.hd.Begin() segment := GetSegment(tx, segmentId) if segment.Status == SEGMENT_UNASSIGNED { worker := GetRandomAliveWorker(tx) if worker != nil { segment.WorkerId = int64(worker.Id) } else { segment.WorkerId = 0 } saveOrPanic(tx, segment) if segment.WorkerId != 0 { // if a worker was availble inputs, missingRdds := segment.CalculateInputSegments(tx) if len(missingRdds) != 0 { // if any of the input rdds are incomplete, then re-execute them for _, rdd := range missingRdds { client.Debug("missing rdd, reexecuting", rdd) e := Event{ Type: LAUNCH_JOB, Id: int64(rdd.Id), } m.queueEvent(e) } commitOrPanic(tx) } else { // otherwise, launch the task rdd := segment.GetRdd(tx) pj := rdd.GetProtojob(tx) batch := rdd.GetWorkflowBatch(tx) workflow := batch.GetWorkflow(tx) segmentCopies := segment.GetSegmentCopies(tx) commitOrPanic(tx) command := preprocessMasterCommand(pj.Command, batch, segment, workflow) args := &client.ExecArgs{ Command: command, Segments: inputs, OutputSegmentId: int64(segment.Id), Indices: parseIndex(pj.PartitionIndex), Parts: pj.NumBuckets, } c := client.MakeWorkerClerk(worker.Url) // Launch the task on a background goroutine go func() { reply := c.ExecTask(args, 3) if reply != nil { if reply.Err == client.OK { // task success if len(segmentCopies) > 0 { for _, cp := range segmentCopies { e := Event{ Type: LAUNCH_COPY, Id: int64(cp.Id), } m.queueEvent(e) } } else { e := Event{ Type: TASK_SUCCESS, Id: segmentId, } m.queueEvent(e) } } else { if reply.Err == client.DEAD_SEGMENT { client.Debug(client.DEAD_SEGMENT) // task failed due to dead segment host e := Event{ Type: TASK_FAILURE, Id: segmentId, Data: &FailureData{ Type: FAILURE_DEAD_SEGMENT, WorkerId: reply.WorkerId, }, } m.queueEvent(e) } else { client.Debug(client.SEGMENT_NOT_FOUND) // task failed due to a segment host that forgot an RDD e := Event{ Type: TASK_FAILURE, Id: segmentId, Data: &FailureData{ Type: FAILURE_MISSING_SEGMENT, WorkerId: reply.WorkerId, }, } m.queueEvent(e) } } } else { client.Debug("DEAD_WORKER") // Conclude that the worker is dead e := Event{ Type: TASK_FAILURE, Id: segmentId, Data: &FailureData{ Type: FAILURE_DEAD_WORKER, WorkerId: int64(worker.Id), }, } m.queueEvent(e) } }() } } else { // if no workers are available, just re-queue the task client.Debug("no workers available") e := Event{ Type: LAUNCH_TASK, Id: segmentId, } m.queueEvent(e) commitOrPanic(tx) } } }
func (m *Master) execLaunchCopy(segmentCopyId int64, data interface{}) { client.Debug("launchCopy", segmentCopyId) m.mu.Lock() defer m.mu.Unlock() tx := m.hd.Begin() cp := GetSegmentCopy(tx, segmentCopyId) if cp.Status == SEGMENT_COPY_UNASSIGNED { segment := cp.GetSegment(tx) rdd := segment.GetRdd(tx) pj := rdd.GetProtojob(tx) workers := GetAliveWorkers(tx) otherCopies := segment.GetSegmentCopies(tx) if len(workers) < pj.Copies+1 { // Stop the event loop until enough workers join the system // to meet the required replication level client.Debug("not enough workers, need at least", pj.Copies+1) m.increaseMinWorkersTo(int64(pj.Copies + 1)) e := Event{ Type: LAUNCH_COPY, Id: int64(cp.Id), } m.queueEvent(e) } else { // it is safe to launch the copy, so choose a random worker that // doesn't already have an identical segment or a copy workerIds := make(map[int64]*Worker) for _, worker := range workers { workerIds[int64(worker.Id)] = worker } sourceWorker := workerIds[segment.WorkerId] // sourceWorker might be nil if it has already died. In this case, // abort this event and reschedule the RDD if sourceWorker == nil { e := Event{ Type: LAUNCH_JOB, Id: int64(rdd.Id), } m.queueEvent(e) } else { delete(workerIds, segment.WorkerId) for _, c := range otherCopies { if c.Id != cp.Id { delete(workerIds, c.WorkerId) } } workerList := make([]*Worker, 0, len(workerIds)) for _, w := range workerIds { workerList = append(workerList, w) } worker := workerList[rand.Int()%len(workerList)] cp.WorkerId = int64(worker.Id) cp.Status = SEGMENT_COPY_PENDING saveOrPanic(tx, cp) // launch the rpc in the background c := client.MakeWorkerClerk(worker.Url) args := &client.CopySegmentArgs{ SegmentId: int64(segment.Id), WorkerUrl: sourceWorker.Url, WorkerId: int64(sourceWorker.Id), } go func() { reply := c.CopySegment(args, 3) if reply != nil { if reply.Err == client.OK { // task success e := Event{ Type: COPY_SUCCESS, Id: segmentCopyId, } m.queueEvent(e) } else { if reply.Err == client.DEAD_SEGMENT { client.Debug(client.DEAD_SEGMENT) // task failed due to dead segment host e := Event{ Type: COPY_FAILURE, Id: segmentCopyId, Data: &FailureData{ Type: FAILURE_DEAD_SEGMENT, WorkerId: reply.WorkerId, }, } m.queueEvent(e) } else { client.Debug(client.SEGMENT_NOT_FOUND) // task failed due to a segment host that forgot an RDD e := Event{ Type: COPY_FAILURE, Id: segmentCopyId, Data: &FailureData{ Type: FAILURE_MISSING_SEGMENT, WorkerId: reply.WorkerId, }, } m.queueEvent(e) } } } else { client.Debug("DEAD_WORKER") // Conclude that the worker is dead e := Event{ Type: COPY_FAILURE, Id: segmentCopyId, Data: &FailureData{ Type: FAILURE_DEAD_WORKER, WorkerId: int64(worker.Id), }, } m.queueEvent(e) } }() } } } commitOrPanic(tx) }