Beispiel #1
0
// execLaunchTask tries to start execution of the segment identified by
// segmentId on a worker. (Presumably this is the handler for LAUNCH_TASK
// events, since that is the event type it re-queues for itself.)
//
// The whole method runs with m.mu held and inside one transaction obtained
// from m.hd.Begin(). The transaction is committed on every path:
//
//   - The segment is not SEGMENT_UNASSIGNED: nothing is changed.
//   - No alive worker was found: the segment is saved with WorkerId 0 and a
//     LAUNCH_TASK event for the same segment is queued again.
//   - Some input RDDs are missing: one LAUNCH_JOB event is queued per missing
//     RDD and the task itself is not launched.
//   - Otherwise: the transaction is committed and an ExecTask RPC is sent to
//     the chosen worker on a background goroutine. The goroutine reports the
//     outcome by queueing LAUNCH_COPY events (one per segment copy),
//     a TASK_SUCCESS event, or a TASK_FAILURE event.
//
// The data parameter is not used.
func (m *Master) execLaunchTask(segmentId int64, data interface{}) {
	client.Debug("execLaunchTask", segmentId)

	m.mu.Lock()
	defer m.mu.Unlock()
	tx := m.hd.Begin()

	segment := GetSegment(tx, segmentId)

	if segment.Status != SEGMENT_UNASSIGNED {
		// Nothing to launch, but the transaction opened above must still be
		// closed; previously this path returned without committing it.
		commitOrPanic(tx)
		return
	}

	// Record the chosen worker on the segment; 0 marks "no worker".
	worker := GetRandomAliveWorker(tx)
	if worker != nil {
		segment.WorkerId = int64(worker.Id)
	} else {
		segment.WorkerId = 0
	}
	saveOrPanic(tx, segment)

	if segment.WorkerId == 0 {
		// if no workers are available, just re-queue the task
		client.Debug("no workers available")
		m.queueEvent(Event{
			Type: LAUNCH_TASK,
			Id:   segmentId,
		})
		commitOrPanic(tx)
		return
	}

	// A worker was available.
	inputs, missingRdds := segment.CalculateInputSegments(tx)
	if len(missingRdds) != 0 {
		// if any of the input rdds are incomplete, then re-execute them
		for _, missing := range missingRdds {
			client.Debug("missing rdd, reexecuting", missing)
			m.queueEvent(Event{
				Type: LAUNCH_JOB,
				Id:   int64(missing.Id),
			})
		}
		commitOrPanic(tx)
		return
	}

	// All inputs are present: read everything the RPC needs, then commit
	// before the RPC is issued.
	rdd := segment.GetRdd(tx)
	pj := rdd.GetProtojob(tx)
	batch := rdd.GetWorkflowBatch(tx)
	workflow := batch.GetWorkflow(tx)
	segmentCopies := segment.GetSegmentCopies(tx)
	commitOrPanic(tx)

	args := &client.ExecArgs{
		Command:         preprocessMasterCommand(pj.Command, batch, segment, workflow),
		Segments:        inputs,
		OutputSegmentId: int64(segment.Id),
		Indices:         parseIndex(pj.PartitionIndex),
		Parts:           pj.NumBuckets,
	}

	clerk := client.MakeWorkerClerk(worker.Url)

	// Launch the task on a background goroutine so that m.mu is not held
	// for the duration of the RPC.
	go func() {
		reply := clerk.ExecTask(args, 3)

		switch {
		case reply == nil:
			// No reply at all: conclude that the worker is dead.
			client.Debug("DEAD_WORKER")
			m.queueEvent(Event{
				Type: TASK_FAILURE,
				Id:   segmentId,
				Data: &FailureData{
					Type:     FAILURE_DEAD_WORKER,
					WorkerId: int64(worker.Id),
				},
			})

		case reply.Err == client.OK:
			// Task success. With no copies to make the task is reported as
			// done right away; otherwise each copy is launched instead.
			if len(segmentCopies) == 0 {
				m.queueEvent(Event{
					Type: TASK_SUCCESS,
					Id:   segmentId,
				})
				return
			}
			for _, cp := range segmentCopies {
				m.queueEvent(Event{
					Type: LAUNCH_COPY,
					Id:   int64(cp.Id),
				})
			}

		case reply.Err == client.DEAD_SEGMENT:
			// task failed due to dead segment host
			client.Debug(client.DEAD_SEGMENT)
			m.queueEvent(Event{
				Type: TASK_FAILURE,
				Id:   segmentId,
				Data: &FailureData{
					Type:     FAILURE_DEAD_SEGMENT,
					WorkerId: reply.WorkerId,
				},
			})

		default:
			// Any other error is treated as a segment host that forgot
			// an RDD.
			client.Debug(client.SEGMENT_NOT_FOUND)
			m.queueEvent(Event{
				Type: TASK_FAILURE,
				Id:   segmentId,
				Data: &FailureData{
					Type:     FAILURE_MISSING_SEGMENT,
					WorkerId: reply.WorkerId,
				},
			})
		}
	}()
}
Beispiel #2
0
// execLaunchCopy tries to start replication of the segment copy identified
// by segmentCopyId onto a worker. (Presumably this is the handler for
// LAUNCH_COPY events, since that is the event type it re-queues for itself.)
//
// The whole method runs with m.mu held and inside one transaction obtained
// from m.hd.Begin(), which is committed on every path:
//
//   - The copy is not SEGMENT_COPY_UNASSIGNED: nothing is changed.
//   - Fewer than pj.Copies+1 workers are alive: the minimum worker count is
//     raised via increaseMinWorkersTo and LAUNCH_COPY is queued again.
//   - The worker holding the source segment is no longer alive: a LAUNCH_JOB
//     event for the segment's RDD is queued.
//   - No alive worker is left once the source worker and the hosts of the
//     other copies are excluded: LAUNCH_COPY is queued again.
//   - Otherwise: a random eligible worker is recorded on the copy, the copy
//     is marked SEGMENT_COPY_PENDING, and a CopySegment RPC is sent to that
//     worker on a background goroutine. The goroutine reports the outcome by
//     queueing a COPY_SUCCESS or COPY_FAILURE event.
//
// The data parameter is not used.
func (m *Master) execLaunchCopy(segmentCopyId int64, data interface{}) {
	client.Debug("launchCopy", segmentCopyId)

	m.mu.Lock()
	defer m.mu.Unlock()
	tx := m.hd.Begin()

	cp := GetSegmentCopy(tx, segmentCopyId)

	if cp.Status != SEGMENT_COPY_UNASSIGNED {
		commitOrPanic(tx)
		return
	}

	segment := cp.GetSegment(tx)
	rdd := segment.GetRdd(tx)
	pj := rdd.GetProtojob(tx)
	workers := GetAliveWorkers(tx)
	otherCopies := segment.GetSegmentCopies(tx)

	if len(workers) < pj.Copies+1 {
		// Stop the event loop until enough workers join the system
		// to meet the required replication level
		client.Debug("not enough workers, need at least", pj.Copies+1)
		m.increaseMinWorkersTo(int64(pj.Copies + 1))
		m.queueEvent(Event{
			Type: LAUNCH_COPY,
			Id:   int64(cp.Id),
		})
		commitOrPanic(tx)
		return
	}

	// Index the alive workers by id so that ineligible ones can be removed.
	eligible := make(map[int64]*Worker, len(workers))
	for _, w := range workers {
		eligible[int64(w.Id)] = w
	}

	// sourceWorker might be nil if it has already died. In this case,
	// abort this event and reschedule the RDD
	sourceWorker := eligible[segment.WorkerId]
	if sourceWorker == nil {
		m.queueEvent(Event{
			Type: LAUNCH_JOB,
			Id:   int64(rdd.Id),
		})
		commitOrPanic(tx)
		return
	}

	// The target must not already hold the segment itself or another copy.
	delete(eligible, segment.WorkerId)
	for _, other := range otherCopies {
		if other.Id != cp.Id {
			delete(eligible, other.WorkerId)
		}
	}

	candidates := make([]*Worker, 0, len(eligible))
	for _, w := range eligible {
		candidates = append(candidates, w)
	}

	if len(candidates) == 0 {
		// Without this guard the modulo below would panic with an integer
		// divide by zero. The copy stays unassigned and is retried later.
		client.Debug("no eligible worker for copy, re-queueing", segmentCopyId)
		m.queueEvent(Event{
			Type: LAUNCH_COPY,
			Id:   int64(cp.Id),
		})
		commitOrPanic(tx)
		return
	}

	worker := candidates[rand.Int()%len(candidates)]
	cp.WorkerId = int64(worker.Id)
	cp.Status = SEGMENT_COPY_PENDING
	saveOrPanic(tx, cp)

	// launch the rpc in the background
	clerk := client.MakeWorkerClerk(worker.Url)
	args := &client.CopySegmentArgs{
		SegmentId: int64(segment.Id),
		WorkerUrl: sourceWorker.Url,
		WorkerId:  int64(sourceWorker.Id),
	}
	go func() {
		reply := clerk.CopySegment(args, 3)

		switch {
		case reply == nil:
			// No reply at all: conclude that the worker is dead.
			client.Debug("DEAD_WORKER")
			m.queueEvent(Event{
				Type: COPY_FAILURE,
				Id:   segmentCopyId,
				Data: &FailureData{
					Type:     FAILURE_DEAD_WORKER,
					WorkerId: int64(worker.Id),
				},
			})

		case reply.Err == client.OK:
			// task success
			m.queueEvent(Event{
				Type: COPY_SUCCESS,
				Id:   segmentCopyId,
			})

		case reply.Err == client.DEAD_SEGMENT:
			// task failed due to dead segment host
			client.Debug(client.DEAD_SEGMENT)
			m.queueEvent(Event{
				Type: COPY_FAILURE,
				Id:   segmentCopyId,
				Data: &FailureData{
					Type:     FAILURE_DEAD_SEGMENT,
					WorkerId: reply.WorkerId,
				},
			})

		default:
			// Any other error is treated as a segment host that forgot
			// an RDD.
			client.Debug(client.SEGMENT_NOT_FOUND)
			m.queueEvent(Event{
				Type: COPY_FAILURE,
				Id:   segmentCopyId,
				Data: &FailureData{
					Type:     FAILURE_MISSING_SEGMENT,
					WorkerId: reply.WorkerId,
				},
			})
		}
	}()

	commitOrPanic(tx)
}