func TestLRU(t *testing.T) {
	gopath := os.Getenv("GOPATH")
	if _, err := os.Stat(gopath + "/src/segments"); err != nil {
		if os.IsNotExist(err) {
			os.Mkdir(gopath+"/src/segments", 0777)
		} else {
			panic(err)
		}
	}

	tuple1 := Tuple{Slice: []string{"Vedha", "Vikas", "Jeffrey", "Zack"}}
	tuple2 := Tuple{Slice: []string{"Vivek", "Anuhya", "Esha"}}
	tuple3 := Tuple{Slice: []string{"Christina", "Keerti"}}
	tuple4 := Tuple{Slice: []string{"Suganya", "Arooshi"}}

	var segment1 Segment
	segment1.Partitions = make([][]Tuple, 2)
	segment1.Partitions[0] = []Tuple{tuple1, tuple2}
	segment1.Partitions[1] = []Tuple{tuple3, tuple4}
	segment1.Id = 1234

	var segment2 Segment
	segment2.Partitions = make([][]Tuple, 2)
	segment2.Partitions[0] = []Tuple{tuple1, tuple3}
	segment2.Partitions[1] = []Tuple{tuple2, tuple4}
	segment2.Id = 1111

	lru := NewLRU(1, 4)
	lru.Insert(1234, &segment1)
	lru.Insert(1111, &segment2)

	s := lru.Get(1234)
	s2 := lru.Get(1111)
	client.Debug("Here's what I got", s)
	client.Debug(s2)
	client.Debug("Length", lru.Length())
}
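// For reference, a sketch of the LRU interface the test above exercises,
// reconstructed from its call sites here and in StartServer (the actual
// implementation lives elsewhere in this repo). The constructor arguments
// are taken to be a segment capacity and the owning worker's id, matching
// the NewLRU(worker.max_segments, worker.master.GetId()) call in StartServer:
//
//	func NewLRU(maxSegments int, workerId int64) *LRU
//	func (lru *LRU) Insert(segmentId int64, segment *Segment)
//	func (lru *LRU) Get(segmentId int64) *Segment
//	func (lru *LRU) Length() int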
func main() {
	hd := master.GetDbConnection()
	workflows := master.GetWorkflows(hd)
	for _, w := range workflows {
		client.Debug("--------------------------Begin--------------------------")
		client.Debug("Workflow ID:", w.Id)
		client.Debug(workflow.WorkflowToString(hd, w))
		client.Debug("---------------------------End---------------------------")
	}
}
func main() {
	if len(os.Args) != 3 {
		printUsage()
		return
	}
	workerhost := os.Args[1]
	masterhost := os.Args[2]
	client.Debug("Starting server on", workerhost)
	client.Debug("Press Ctrl-C to stop")
	worker.StartServer(workerhost, masterhost)
	waitForInterrupt()
}
func main() {
	if len(os.Args) != 2 {
		printUsage()
		return
	}
	host := os.Args[1]
	hd := master.GetDbConnection()
	client.Debug("Starting server on", host)
	client.Debug("Press Ctrl-C to stop")
	master.StartServer(host, hd)
	waitForInterrupt()
}
// Execute a UDF command that accepts zero or more input lists of tuples, and
// returns one output list of tuples. This function blocks until the UDF is
// done executing.
func runUDF(command string, inputTuples map[int][]Tuple) []Tuple {
	// spawn the external process
	splits := strings.Split(command, " ")
	client.Debug(preprocessCommand(splits[0]))
	cmd := exec.Command(preprocessCommand(splits[0]), splits[1:]...)
	stdin, err := cmd.StdinPipe()
	if err != nil {
		log.Panic(err)
	}
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Panic(err)
	}
	if err := cmd.Start(); err != nil {
		log.Panic(err)
	}

	// write tuples to standard input on a background goroutine
	go func() {
		for index, tupleList := range inputTuples {
			for _, tuple := range tupleList {
				stdin.Write(tuple.SerializeTuple(index))
				stdin.Write([]byte{'\n'})
			}
		}
		stdin.Close()
	}()

	// read from standard output to get the output tuples
	outputTuples := make([]Tuple, 0)
	ReadTupleStream(stdout, func(tuple Tuple, index int) {
		outputTuples = append(outputTuples, tuple)
	})

	return outputTuples
}
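// For illustration only: a minimal sketch of a UDF executable that runUDF
// could launch. It assumes tuples cross the pipe as newline-delimited records
// (the real wire format is defined by SerializeTuple and ReadTupleStream,
// which are not shown here), so an identity UDF can simply echo each record
// from stdin back to stdout.
package main

import (
	"bufio"
	"fmt"
	"os"
)

func main() {
	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		// pass each serialized tuple through unchanged
		fmt.Println(scanner.Text())
	}
}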
func (m *Master) execLaunchJob(rddId int64, data interface{}) {
	client.Debug("execLaunchJob", rddId)

	m.mu.Lock()
	defer m.mu.Unlock()

	tx := m.hd.Begin()

	// TODO: check that all of the input RDDs are available
	rdd := GetRdd(tx, rddId)

	// check whether the rdd is already complete
	if rdd.State != RDD_COMPLETE {
		sourceRdds := rdd.GetSourceRdds(tx)
		readyToContinue := true
		for _, srcRdd := range sourceRdds {
			if srcRdd.State != RDD_COMPLETE {
				// relaunch any dependencies that are not complete
				readyToContinue = false
				e := Event{
					Type: LAUNCH_JOB,
					Id:   int64(srcRdd.Id),
				}
				m.queueEvent(e)
			}
		}

		// If all the dependencies are met, then launch the next
		// RDD (dependencies are also checked for each individual task)
		if readyToContinue {
			// check whether we already created segments for this rdd
			segments := rdd.GetSegments(tx)
			if len(segments) > 0 {
				// if segments are already present, just run the ones that are
				// not complete (this is part of the recovery protocol)
				for _, segment := range segments {
					if segment.Status != SEGMENT_COMPLETE {
						e := Event{
							Type: LAUNCH_TASK,
							Id:   int64(segment.Id),
						}
						m.queueEvent(e)
					}
				}
			} else {
				// otherwise, create the new segments and run them
				segments, _ := rdd.CreateSegments(tx)
				for _, segment := range segments {
					e := Event{
						Type: LAUNCH_TASK,
						Id:   int64(segment.Id),
					}
					m.queueEvent(e)
				}
			}
		}
	}

	commitOrPanic(tx)
}
func (m *Master) tryLaunchingDependentJobs(tx *hood.Hood, rdd *Rdd, pj *Protojob) {
	destRdds := rdd.GetDestRdds(tx)

	// For each destRdd, check whether all of the srcRdds
	// for that destRdd are complete. If so, launch the job
	// for destRdd
	// TODO: this logic will have to be re-written when fault-tolerance
	// is implemented
	for _, destRdd := range destRdds {
		srcRdds := destRdd.GetSourceRdds(tx)
		isComplete := true
		for _, srcRdd := range srcRdds {
			if (srcRdd.State != RDD_COMPLETE) && (srcRdd.Id != rdd.Id) {
				isComplete = false
			}
		}
		if isComplete {
			client.Debug("launching next job", destRdd)
			e := Event{
				Type: LAUNCH_JOB,
				Id:   int64(destRdd.Id),
			}
			m.queueEvent(e)
		}
	}
}
func (m *Master) execTaskSuccess(segmentId int64, data interface{}) {
	client.Debug("execTaskSuccess", segmentId)

	m.mu.Lock()
	defer m.mu.Unlock()

	tx := m.hd.Begin()

	segment := GetSegment(tx, segmentId)
	rdd := segment.GetRdd(tx)
	pj := rdd.GetProtojob(tx)

	segment.Status = SEGMENT_COMPLETE
	saveOrPanic(tx, segment)

	numComplete := rdd.GetNumSegmentsComplete(tx, segment)
	if numComplete == pj.NumSegments {
		batch := rdd.GetWorkflowBatch(tx)
		workflow := batch.GetWorkflow(tx)
		fmt.Println("Job complete", rdd.Id, pj.Command,
			time.Now().UnixNano()/1000000-batch.StartTime-workflow.Duration)
		rdd.State = RDD_COMPLETE
		saveOrPanic(tx, rdd)
		m.tryLaunchingDependentJobs(tx, rdd, pj)
	}

	commitOrPanic(tx)
}
//
// server Register RPC handler.
//
func (m *Master) Register(args *client.RegisterArgs, reply *client.RegisterReply) error {
	client.Debug("Registering", args)

	m.mu.Lock()
	defer m.mu.Unlock()

	tx := m.hd.Begin()
	existingWorkers := GetWorkersAtAddress(tx, args.Me)
	for _, w := range existingWorkers {
		w.Status = WORKER_DEAD
		tx.Save(w)
	}
	newWorker := Worker{
		Url: args.Me,
	}
	tx.Save(&newWorker)
	commitOrPanic(tx)

	tx = m.hd.Begin()
	m.getNumAliveWorkers(tx)
	commitOrPanic(tx)

	reply.Err = client.OK
	reply.Id = int64(newWorker.Id)

	return nil
}
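// The argument and reply shapes assumed by the Register handler above,
// reconstructed from usage. The canonical definitions live in the client
// package; the field types shown here (notably Err) are assumptions:
//
//	type RegisterArgs struct {
//		Me string // the worker's url, e.g. "localhost:2112"
//	}
//
//	type RegisterReply struct {
//		Err string
//		Id  int64
//	}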
func waitForInterrupt() {
	c := make(chan os.Signal, 1)
	signal.Notify(c, os.Interrupt)
	for sig := range c {
		client.Debug("\ncaptured signal, stopping and exiting.\n", sig)
		return
	}
}
func (m *Master) HandleFailureData(data *FailureData) {
	client.Debug("HANDLING FAILURE", data)

	tx := m.hd.Begin()
	worker := GetWorker(tx, data.WorkerId)
	worker.Status = WORKER_DEAD
	saveOrPanic(tx, worker)
	m.getNumAliveWorkers(tx)
	commitOrPanic(tx)
}
func (w *Worker) ExecTask(args *client.ExecArgs, reply *client.ExecReply) error {
	inputTuples := make(map[int][]Tuple)
	fmt.Println("executing task", args)

	for _, segment := range args.Segments {
		localSegment := w.LocalGetSegment(segment.SegmentId)
		// fetch the segment if it is not already stored locally
		if localSegment == nil {
			client.Debug("fetching tuples", segment)
			clerk := MakeWorkerInternalClerk(segment.WorkerUrl)
			args2 := GetTuplesArgs{SegmentId: segment.SegmentId, PartitionIndex: segment.PartitionIndex}
			reply2 := clerk.GetTuples(&args2, 3)
			if reply2 != nil {
				if reply2.Err == client.OK {
					client.Debug("fetched tuples", len(reply2.Tuples))
					inputTuples[segment.Index] = append(inputTuples[segment.Index], reply2.Tuples...)
				} else {
					reply.Err = reply2.Err
					reply.WorkerId = segment.WorkerId
					client.Debug(reply.Err)
					return nil
				}
			} else {
				reply.Err = client.DEAD_SEGMENT
				reply.WorkerId = segment.WorkerId
				client.Debug(reply.Err)
				return nil
			}
		} else {
			// use the locally stored copy
			inputTuples[segment.Index] = append(inputTuples[segment.Index], localSegment.Partitions[segment.PartitionIndex]...)
		}
	}

	client.Debug("running udf")
	start := time.Now()
	outputTuples := runUDF(args.Command, inputTuples)
	end := time.Now()
	client.Debug("duration:", end.Sub(start))
	client.Debug("got output tuples", len(outputTuples))

	client.Debug("writing segment")
	segment := MakeSegment(outputTuples, args.Indices, args.Parts)
	w.LocalPutSegment(args.OutputSegmentId, segment)

	client.Debug("success")
	reply.Err = client.OK
	return nil
}
func (m *Master) eventLoop() {
	to := 1
	iteration := 0
	for {
		if iteration%10 == 0 {
			fmt.Println("on iteration", iteration)
		}
		iteration += 1
		if atomic.LoadInt64(&m.numAliveWorkers) >= atomic.LoadInt64(&m.minWorkers) {
			start := time.Now()
			to = 1
			e := <-m.events
			atomic.AddInt64(&m.numQueuedEvents, -1)
			switch e.Type {
			case NEW_BATCH:
				m.execNewBatch(e.Id, e.Data)
			case LAUNCH_TASK:
				m.execLaunchTask(e.Id, e.Data)
			case TASK_SUCCESS:
				m.execTaskSuccess(e.Id, e.Data)
			case TASK_FAILURE:
				m.execTaskFailure(e.Id, e.Data)
			case LAUNCH_JOB:
				m.execLaunchJob(e.Id, e.Data)
			case COPY_SUCCESS:
				m.execCopySuccess(e.Id, e.Data)
			case COPY_FAILURE:
				m.execCopyFailure(e.Id, e.Data)
			case LAUNCH_COPY:
				m.execLaunchCopy(e.Id, e.Data)
			}
			diff := time.Now().Sub(start)
			client.Debug("duration", diff)
		} else {
			// not enough workers are alive: back off exponentially
			// (capped at 1 second) before checking again
			client.Debug("sleeping", to)
			time.Sleep(time.Duration(to) * time.Millisecond)
			if to < 1000 {
				to *= 2
			}
		}
	}
}
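// A minimal sketch of the queueEvent counterpart assumed by eventLoop (the
// real implementation lives elsewhere in this repo): it must increment the
// m.numQueuedEvents counter that eventLoop decrements and deliver the event
// on the m.events channel. Whether and how much the channel is buffered is
// an assumption here.
func (m *Master) queueEvent(e Event) {
	atomic.AddInt64(&m.numQueuedEvents, 1)
	// assumes m.events has enough buffer capacity that callers holding
	// m.mu do not block against the event loop
	m.events <- e
}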
func (w *Worker) CopySegment(args *client.CopySegmentArgs, reply *client.CopySegmentReply) error {
	client.Debug("copying segment", args)

	if w.LocalGetSegment(args.SegmentId) != nil {
		// this should never happen during normal operation (though it might
		// happen during the master recovery procedure)
		client.Debug("already have segment, overwriting...")
	}

	client.Debug("fetching segment", args.SegmentId)
	clerk := MakeWorkerInternalClerk(args.WorkerUrl)
	args2 := GetSegmentArgs{SegmentId: args.SegmentId}
	reply2 := clerk.GetSegment(&args2, 3)
	if reply2 != nil {
		if reply2.Err == client.OK {
			client.Debug("fetched segment", args.SegmentId)
			w.LocalPutSegment(args.SegmentId, reply2.Segment)
			reply.Err = client.OK
		} else {
			reply.Err = reply2.Err
			reply.WorkerId = args.WorkerId
			client.Debug(reply.Err)
			return nil
		}
	} else {
		reply.Err = client.DEAD_SEGMENT
		reply.WorkerId = args.WorkerId
		client.Debug(reply.Err)
		return nil
	}

	return nil
}
func (m *Master) execTaskFailure(segmentId int64, data interface{}) {
	client.Debug("execTaskFailure", segmentId)

	m.mu.Lock()
	defer m.mu.Unlock()

	m.HandleFailureData(data.(*FailureData))

	e := Event{
		Type: LAUNCH_TASK,
		Id:   int64(segmentId),
	}
	m.queueEvent(e)
}
func (m *Master) execNewBatch(workflowId int64, data interface{}) {
	client.Debug("execNewBatch", workflowId)

	m.mu.Lock()
	defer m.mu.Unlock()

	tx := m.hd.Begin()

	// look up workflow
	workflow := GetWorkflow(tx, workflowId)

	// create new workflowbatch
	lastBatch := workflow.GetLastWorkflowBatch(tx)

	now := time.Now().UnixNano() / 1000000

	var batch *WorkflowBatch
	if lastBatch == nil {
		// if there is no last batch, then create the first batch starting at
		// now - duration - TIME_ERROR
		client.Debug("No last batch")
		batch = workflow.MakeBatch(tx, now-workflow.Duration-TIME_ERROR)
	} else {
		// TODO: figure out what exactly to do if there are multiple
		// batches to catch up on, or if it is not yet time to execute
		// the next job

		// for now, only launch a new batch if the proper time has arrived
		// (i.e. the end time of the new batch has definitely passed)
		client.Debug(now, lastBatch.StartTime)
		if now > lastBatch.StartTime+2*workflow.Duration+TIME_ERROR {
			client.Debug("add new batch", workflow.Duration)
			batch = workflow.MakeBatch(tx, lastBatch.StartTime+workflow.Duration)
		}
	}

	commitOrPanic(tx)

	if batch != nil {
		m.launchBatchSourceJobs(batch)
	}
}
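// Worked example of the timing check above, with illustrative values:
// if workflow.Duration = 60000 ms, TIME_ERROR = 5000 ms, and
// lastBatch.StartTime = 0, the next batch covers [60000, 120000) and is
// launched only once now > 0 + 2*60000 + 5000 = 125000 ms, i.e. after the
// new batch's end time has definitely passed.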
func main() {
	if len(os.Args) != 2 {
		printUsage()
		return
	}

	hd := master.GetDbConnection()
	reader, err := os.Open(os.Args[1])
	if err != nil {
		panic(err)
	}
	w, err := workflow.ReadWorkflow(hd, reader)
	if err != nil {
		panic(err)
	}

	client.Debug("Loaded workflow")
	client.Debug("--------------------------Begin--------------------------")
	client.Debug("Workflow ID:", w.Id)
	client.Debug(workflow.WorkflowToString(hd, w))
	client.Debug("---------------------------End---------------------------")
}
func (w *Worker) GetTuples(args *GetTuplesArgs, reply *GetTuplesReply) error {
	client.Debug("GET TUPLES RPC")
	if args.WorkerId != w.master.GetId() {
		segment := w.LocalGetSegment(args.SegmentId)
		if segment != nil {
			reply.Tuples = segment.Partitions[args.PartitionIndex]
			reply.Err = client.OK
		} else {
			reply.Err = client.SEGMENT_NOT_FOUND
		}
	} else {
		// The request is old, and this worker has died, rebooted,
		// and re-registered
		reply.Err = client.DEAD_SEGMENT
	}
	return nil
}
func (w *Worker) GetSegment(args *GetSegmentArgs, reply *GetSegmentReply) error {
	client.Debug("GET SEGMENT RPC")
	if args.WorkerId != w.master.GetId() {
		segment := w.LocalGetSegment(args.SegmentId)
		if segment != nil {
			reply.Segment = segment
			reply.Err = client.OK
		} else {
			reply.Err = client.SEGMENT_NOT_FOUND
		}
	} else {
		// The request is old, and this worker has died, rebooted,
		// and re-registered
		reply.Err = client.DEAD_SEGMENT
	}
	return nil
}
//
// server Ping RPC handler.
//
func (m *Master) Ping(args *client.PingArgs, reply *client.PingReply) error {
	client.Debug("Pinging", args.Id)

	m.mu.Lock()
	defer m.mu.Unlock()

	tx := m.hd.Begin()
	w := GetWorker(m.hd, args.Id)
	if w != nil {
		reply.Err = client.OK
		w.Status = WORKER_ALIVE
		tx.Save(w)
	} else {
		// The worker was not found in our database, so tell it to reset
		reply.Err = client.RESET
	}
	// Timestamp is automatically updated on save
	commitOrPanic(tx)

	return nil
}
func (m *Master) execCopySuccess(segmentCopyId int64, data interface{}) {
	client.Debug("copySuccess", segmentCopyId)

	m.mu.Lock()
	defer m.mu.Unlock()

	tx := m.hd.Begin()

	cp := GetSegmentCopy(tx, segmentCopyId)
	cp.Status = SEGMENT_COPY_COMPLETE
	saveOrPanic(tx, cp)

	segment := cp.GetSegment(tx)
	otherCopies := segment.GetSegmentCopies(tx)

	numComplete := 0
	for _, c := range otherCopies {
		if (c.Status == SEGMENT_COPY_COMPLETE) || (c.Id == cp.Id) {
			numComplete += 1
		}
	}

	rdd := segment.GetRdd(tx)
	pj := rdd.GetProtojob(tx)

	// If all of the segment copies are finished transmitting, declare
	// the task complete
	if numComplete >= pj.Copies {
		e := Event{
			Type: TASK_SUCCESS,
			Id:   int64(segment.Id),
		}
		m.queueEvent(e)
	}

	commitOrPanic(tx)
}
/* Reset and initialize the master database tables. Note that this
 * will delete any existing data.
 *
 * Usage:
 *    go run init_dabatase.go
 *
 */
func main() {
	hd := master.GetDbConnection()
	master.ResetDb(hd)
	master.CreateTables(hd)
	client.Debug("Success")
}
func StartServer(hostname string, masterhost string) *Worker {
	// call gob.Register on structures you want
	// Go's RPC library to marshal/unmarshal.
	// gob.Register()

	runtime.GOMAXPROCS(7)

	gopath := os.Getenv("GOPATH")
	if _, err := os.Stat(gopath + "/src/segments"); err != nil {
		if os.IsNotExist(err) {
			os.Mkdir(gopath+"/src/segments", 0777)
		} else {
			panic(err)
		}
	}

	client.Debug("Starting worker")

	worker := new(Worker)
	worker.master = client.MakeMasterClerk(hostname, masterhost)
	worker.batches = make(map[int][]int64)
	worker.max_segments = 50000

	rpcs := rpc.NewServer()
	rpcs.Register(worker)

	// ignore the domain name: listen on all urls
	splitName := strings.Split(hostname, ":")
	l, e := net.Listen("tcp", ":"+splitName[1])
	if e != nil {
		log.Fatal("listen error: ", e)
	}
	worker.l = l

	// Register the worker with the master
	client.Debug("Registering worker")
	worker.master.Register(true)
	client.Debug("Registered worker")

	worker.segments = NewLRU(worker.max_segments, worker.master.GetId())

	go func() {
		for {
			if conn, err := worker.l.Accept(); err == nil {
				go rpcs.ServeConn(conn)
			} else {
				worker.kill()
			}
		}
	}()

	go func() {
		for {
			// Continuously ping the master so that the master is notified
			// when a network partition is resolved.
			reply := worker.master.Ping(true)
			if reply == client.RESET {
				panic("ping rejected by master")
			}
			time.Sleep(1 * time.Second)
		}
	}()

	return worker
}
func printUsage() { client.Debug("Usage\n go run start_worker.go worker_interface:port master_interface:port\n") client.Debug("Example ports\n localhost:1324\n :2112\n 192.168.0.15:3333") }
func (m *Master) execLaunchTask(segmentId int64, data interface{}) {
	client.Debug("execLaunchTask", segmentId)

	m.mu.Lock()
	defer m.mu.Unlock()

	tx := m.hd.Begin()

	segment := GetSegment(tx, segmentId)
	if segment.Status == SEGMENT_UNASSIGNED {
		worker := GetRandomAliveWorker(tx)
		if worker != nil {
			segment.WorkerId = int64(worker.Id)
		} else {
			segment.WorkerId = 0
		}
		saveOrPanic(tx, segment)

		if segment.WorkerId != 0 { // if a worker was available
			inputs, missingRdds := segment.CalculateInputSegments(tx)
			if len(missingRdds) != 0 {
				// if any of the input rdds are incomplete, then re-execute them
				for _, rdd := range missingRdds {
					client.Debug("missing rdd, reexecuting", rdd)
					e := Event{
						Type: LAUNCH_JOB,
						Id:   int64(rdd.Id),
					}
					m.queueEvent(e)
				}
				commitOrPanic(tx)
			} else {
				// otherwise, launch the task
				rdd := segment.GetRdd(tx)
				pj := rdd.GetProtojob(tx)
				batch := rdd.GetWorkflowBatch(tx)
				workflow := batch.GetWorkflow(tx)
				segmentCopies := segment.GetSegmentCopies(tx)
				commitOrPanic(tx)

				command := preprocessMasterCommand(pj.Command, batch, segment, workflow)

				args := &client.ExecArgs{
					Command:         command,
					Segments:        inputs,
					OutputSegmentId: int64(segment.Id),
					Indices:         parseIndex(pj.PartitionIndex),
					Parts:           pj.NumBuckets,
				}

				c := client.MakeWorkerClerk(worker.Url)

				// Launch the task on a background goroutine
				go func() {
					reply := c.ExecTask(args, 3)
					if reply != nil {
						if reply.Err == client.OK {
							// task success
							if len(segmentCopies) > 0 {
								for _, cp := range segmentCopies {
									e := Event{
										Type: LAUNCH_COPY,
										Id:   int64(cp.Id),
									}
									m.queueEvent(e)
								}
							} else {
								e := Event{
									Type: TASK_SUCCESS,
									Id:   segmentId,
								}
								m.queueEvent(e)
							}
						} else {
							if reply.Err == client.DEAD_SEGMENT {
								client.Debug(client.DEAD_SEGMENT)
								// task failed due to dead segment host
								e := Event{
									Type: TASK_FAILURE,
									Id:   segmentId,
									Data: &FailureData{
										Type:     FAILURE_DEAD_SEGMENT,
										WorkerId: reply.WorkerId,
									},
								}
								m.queueEvent(e)
							} else {
								client.Debug(client.SEGMENT_NOT_FOUND)
								// task failed due to a segment host that forgot an RDD
								e := Event{
									Type: TASK_FAILURE,
									Id:   segmentId,
									Data: &FailureData{
										Type:     FAILURE_MISSING_SEGMENT,
										WorkerId: reply.WorkerId,
									},
								}
								m.queueEvent(e)
							}
						}
					} else {
						client.Debug("DEAD_WORKER")
						// Conclude that the worker is dead
						e := Event{
							Type: TASK_FAILURE,
							Id:   segmentId,
							Data: &FailureData{
								Type:     FAILURE_DEAD_WORKER,
								WorkerId: int64(worker.Id),
							},
						}
						m.queueEvent(e)
					}
				}()
			}
		} else {
			// if no workers are available, just re-queue the task
			client.Debug("no workers available")
			e := Event{
				Type: LAUNCH_TASK,
				Id:   segmentId,
			}
			m.queueEvent(e)
			commitOrPanic(tx)
		}
	}
}
func printUsage() { client.Debug("Usage\n go run load_workflow.go filename\n") }
func (m *Master) execLaunchCopy(segmentCopyId int64, data interface{}) {
	client.Debug("launchCopy", segmentCopyId)

	m.mu.Lock()
	defer m.mu.Unlock()

	tx := m.hd.Begin()

	cp := GetSegmentCopy(tx, segmentCopyId)
	if cp.Status == SEGMENT_COPY_UNASSIGNED {
		segment := cp.GetSegment(tx)
		rdd := segment.GetRdd(tx)
		pj := rdd.GetProtojob(tx)
		workers := GetAliveWorkers(tx)
		otherCopies := segment.GetSegmentCopies(tx)

		if len(workers) < pj.Copies+1 {
			// Stop the event loop until enough workers join the system
			// to meet the required replication level
			client.Debug("not enough workers, need at least", pj.Copies+1)
			m.increaseMinWorkersTo(int64(pj.Copies + 1))
			e := Event{
				Type: LAUNCH_COPY,
				Id:   int64(cp.Id),
			}
			m.queueEvent(e)
		} else {
			// it is safe to launch the copy, so choose a random worker that
			// doesn't already have an identical segment or a copy
			workerIds := make(map[int64]*Worker)
			for _, worker := range workers {
				workerIds[int64(worker.Id)] = worker
			}
			sourceWorker := workerIds[segment.WorkerId]
			// sourceWorker might be nil if it has already died. In this case,
			// abort this event and reschedule the RDD
			if sourceWorker == nil {
				e := Event{
					Type: LAUNCH_JOB,
					Id:   int64(rdd.Id),
				}
				m.queueEvent(e)
			} else {
				delete(workerIds, segment.WorkerId)
				for _, c := range otherCopies {
					if c.Id != cp.Id {
						delete(workerIds, c.WorkerId)
					}
				}
				workerList := make([]*Worker, 0, len(workerIds))
				for _, w := range workerIds {
					workerList = append(workerList, w)
				}
				worker := workerList[rand.Int()%len(workerList)]

				cp.WorkerId = int64(worker.Id)
				cp.Status = SEGMENT_COPY_PENDING
				saveOrPanic(tx, cp)

				// launch the rpc in the background
				c := client.MakeWorkerClerk(worker.Url)
				args := &client.CopySegmentArgs{
					SegmentId: int64(segment.Id),
					WorkerUrl: sourceWorker.Url,
					WorkerId:  int64(sourceWorker.Id),
				}
				go func() {
					reply := c.CopySegment(args, 3)
					if reply != nil {
						if reply.Err == client.OK {
							// copy success
							e := Event{
								Type: COPY_SUCCESS,
								Id:   segmentCopyId,
							}
							m.queueEvent(e)
						} else {
							if reply.Err == client.DEAD_SEGMENT {
								client.Debug(client.DEAD_SEGMENT)
								// copy failed due to dead segment host
								e := Event{
									Type: COPY_FAILURE,
									Id:   segmentCopyId,
									Data: &FailureData{
										Type:     FAILURE_DEAD_SEGMENT,
										WorkerId: reply.WorkerId,
									},
								}
								m.queueEvent(e)
							} else {
								client.Debug(client.SEGMENT_NOT_FOUND)
								// copy failed due to a segment host that forgot an RDD
								e := Event{
									Type: COPY_FAILURE,
									Id:   segmentCopyId,
									Data: &FailureData{
										Type:     FAILURE_MISSING_SEGMENT,
										WorkerId: reply.WorkerId,
									},
								}
								m.queueEvent(e)
							}
						}
					} else {
						client.Debug("DEAD_WORKER")
						// Conclude that the worker is dead
						e := Event{
							Type: COPY_FAILURE,
							Id:   segmentCopyId,
							Data: &FailureData{
								Type:     FAILURE_DEAD_WORKER,
								WorkerId: int64(worker.Id),
							},
						}
						m.queueEvent(e)
					}
				}()
			}
		}
	}

	commitOrPanic(tx)
}