func main() { programType := flag.String("type", "", "(c) controller or (t) task") job := flag.String("job", "", "job name") etcdURLs := []string{"http://localhost:4001"} flag.Parse() if *job == "" { log.Fatalf("Please specify a job name") } ntask := uint64(2) switch *programType { case "c": log.Printf("controller") controller := controller.New(*job, etcd.NewClient(etcdURLs), ntask) controller.Start() controller.WaitForJobDone() case "t": log.Printf("task") bootstrap := framework.NewBootStrap(*job, etcdURLs, createListener(), nil) taskBuilder := ®ression.SimpleTaskBuilder{ GDataChan: make(chan int32, 11), NumberOfIterations: 10, MasterConfig: map[string]string{"writefile": "result.txt"}, } bootstrap.SetTaskBuilder(taskBuilder) bootstrap.AddLinkage("Parents" : topo.NewTreeTopologyOfParent(2, ntask)) bootstrap.AddLinkage("Children" : topo.NewTreeTopologyOfChildren(2, ntask)) bootstrap.Start() default: log.Fatal("Please choose a type: (c) controller, (t) task") } }
// TestRequestDataEpochMismatch creates a scenario where data request happened // with two different epochs. In this case, the server should back pressure and // request client should get notified and return error. func TestRequestDataEpochMismatch(t *testing.T) { t.Skip("TODO") job := "TestRequestDataEpochMismatch" etcdURLs := []string{"http://localhost:4001"} ctl := controller.New(job, etcd.NewClient(etcdURLs), 1, []string{"Parents", "Children"}) ctl.InitEtcdLayout() defer ctl.DestroyEtcdLayout() fw := &framework{ name: job, etcdURLs: etcdURLs, ln: createListener(t), } var wg sync.WaitGroup fw.SetTaskBuilder(&testableTaskBuilder{ setupLatch: &wg, }) fw.AddLinkage("Parents", topo.NewTreeTopologyOfParent(1, 1)) fw.AddLinkage("Children", topo.NewTreeTopologyOfChildren(1, 1)) wg.Add(1) go fw.Start() wg.Wait() defer fw.ShutdownJob() addr, err := etcdutil.GetAddress(fw.etcdClient, job, fw.GetTaskID()) if err != nil { t.Fatalf("GetAddress failed: %v", err) } addr = addr // _, err = frameworkhttp.RequestData(addr, "Parents", "req", 0, fw.GetTaskID(), 10, fw.GetLogger()) // if err != frameworkhttp.ErrReqEpochMismatch { // t.Fatalf("error want = %v, but get = (%)", frameworkhttp.ErrReqEpochMismatch, err.Error()) // } }
func TestRegressionFramework(t *testing.T) { etcdURLs := []string{"http://localhost:4001"} job := "framework_regression_test" numOfTasks := uint64(15) numOfIterations := uint64(10) // controller start first to setup task directories in etcd controller := controller.New(job, etcd.NewClient(etcdURLs), numOfTasks, []string{"Parents", "Children"}) controller.Start() // We need to set etcd so that nodes know what to do. taskBuilder := ®ression.SimpleTaskBuilder{ GDataChan: make(chan int32, 11), NumberOfIterations: numOfIterations, } for i := uint64(0); i < numOfTasks; i++ { go driveWithTreeTopo(t, job, etcdURLs, numOfTasks, taskBuilder) } wantData := []int32{0, 105, 210, 315, 420, 525, 630, 735, 840, 945, 1050} getData := make([]int32, numOfIterations+1) for i := uint64(0); i <= numOfIterations; i++ { getData[i] = <-taskBuilder.GDataChan } for i := range wantData { if wantData[i] != getData[i] { t.Errorf("#%d: data want = %d, get = %d\n", i, wantData[i], getData[i]) } } controller.WaitForJobDone() controller.Stop() }
func main() { etcdUrlList := flag.String("etcd_urls", "", "ETCD server lists, sep by a comma.") jobName := flag.String("job_name", "bwmf", "Job name in etcd path.") jobType := flag.String("job_type", "c", "Job type, either 'c' for controller or 't' for task.") numTasks := flag.Int("num_tasks", 1, "Num of tasks.") taskConfigFile := flag.String("task_config", "", "Path to task config json file.") flag.Parse() if *jobName == "" { log.Fatal("Job name is required.") } crd, oErr := filesystem.NewLocalFSClient().OpenReadCloser(*taskConfigFile) if oErr != nil { log.Fatalf("Failed opening task config file. %s", oErr) } confData, rdErr := ioutil.ReadAll(crd) if rdErr != nil { log.Fatalf("Failed reading task config. %s", rdErr) } log.Printf("conf data: %s", confData) if *etcdUrlList == "" { log.Fatal("Please specify the etcd server urls.") } etcdUrls := strings.Split(*etcdUrlList, ",") log.Println("etcd urls: ", etcdUrls) topoMaster := topo.NewFullTopologyOfMaster(uint64(*numTasks)) topoNeighbors := topo.NewFullTopologyOfNeighbor(uint64(*numTasks)) switch *jobType { case "t": bootstrap := framework.NewBootStrap(*jobName, etcdUrls, createListener(), nil) taskBuilder := &bwmf.BWMFTaskBuilder{ NumOfTasks: uint64(*numTasks), ConfBytes: confData, } bootstrap.SetTaskBuilder(taskBuilder) bootstrap.AddLinkage("Master", topoMaster) bootstrap.AddLinkage("Neighbors", topoNeighbors) log.Println("Starting task..") bootstrap.Start() case "c": controller := controller.New(*jobName, etcd.NewClient(etcdUrls), uint64(*numTasks), []string{"Master", "Neighbors"}) controller.Start() log.Println("Controller started.") controller.WaitForJobDone() controller.Stop() default: log.Fatal("Please choose a type via '-jobtype': (c) controller, (t) task") } }
func TestBWMF(t *testing.T) { etcdURLs := []string{"http://localhost:4001"} job := "bwmf_basic_test" numOfTasks := uint64(2) generateTestData(t) ctl := controller.New(job, etcd.NewClient(etcdURLs), numOfTasks, []string{"Neighbors", "Master"}) ctl.Start() tb := &bwmf.BWMFTaskBuilder{ NumOfTasks: numOfTasks, ConfBytes: []byte(`{ "OptConf": { "Sigma":0.01, "Alpha":1, "Beta":0.1, "GradTol":1e-06, "FixedCnt": 200000, "NumIters":4, "DimLatent":2 }, "IOConf": { "Fs":"local", "IDPath":"../.tmp/row_shard.dat", "ITPath":"../.tmp/column_shard.dat", "ODPath":"../.tmp/dShard.dat", "OTPath":"../.tmp/tShard.dat" } }`), } for i := uint64(0); i < numOfTasks; i++ { go drive( t, job, etcdURLs, tb, map[string]taskgraph.Topology{ "Master": topo.NewFullTopologyOfMaster(numOfTasks), "Neighbors": topo.NewFullTopologyOfNeighbor(numOfTasks), }, ) } ctl.WaitForJobDone() ctl.Stop() }
// TestMasterSetEpochFailure checks if a master task failed at SetEpoch, // 1. a new boostrap will be created to take over // 2. continue what's left; // 3. finish the job with the same result. func TestMasterSetEpochFailure(t *testing.T) { job := "TestMasterSetEpochFailure" etcdURLs := []string{"http://localhost:4001"} numOfTasks := uint64(15) numOfIterations := uint64(10) // controller start first to setup task directories in etcd controller := controller.New(job, etcd.NewClient(etcdURLs), numOfTasks, []string{"Parents", "Children"}) controller.Start() taskBuilder := ®ression.SimpleTaskBuilder{ GDataChan: make(chan int32, 11), NodeProducer: make(chan bool, 1), MasterConfig: map[string]string{ "SetEpoch": "fail", "failepoch": "1", "faillevel": "100", }, NumberOfIterations: numOfIterations, } for i := uint64(0); i < numOfTasks; i++ { go driveWithTreeTopo(t, job, etcdURLs, numOfTasks, taskBuilder) } if <-taskBuilder.NodeProducer { taskBuilder.MasterConfig = nil log.Println("Starting a new node") // this time we start a new bootstrap whose task master doesn't fail. go driveWithTreeTopo(t, job, etcdURLs, numOfTasks, taskBuilder) } wantData := []int32{0, 105, 210, 315, 420, 525, 630, 735, 840, 945, 1050} getData := make([]int32, numOfIterations+1) for i := uint64(0); i <= numOfIterations; i++ { getData[i] = <-taskBuilder.GDataChan } for i := range wantData { if wantData[i] != getData[i] { t.Errorf("#%d: data want = %d, get = %d", i, wantData[i], getData[i]) } } controller.WaitForJobDone() controller.Stop() }
func testSlaveFailure(t *testing.T, job string, slaveConfig map[string]string) { etcdURLs := []string{"http://localhost:4001"} numOfTasks := uint64(15) numOfIterations := uint64(10) // controller start first to setup task directories in etcd controller := controller.New(job, etcd.NewClient(etcdURLs), numOfTasks, []string{"Parents", "Children"}) controller.Start() defer controller.Stop() taskBuilder := ®ression.SimpleTaskBuilder{ GDataChan: make(chan int32, 11), NodeProducer: make(chan bool, 1), SlaveConfig: slaveConfig, NumberOfIterations: numOfIterations, } go func() { for _ = range taskBuilder.NodeProducer { log.Println("Starting a new node") go driveWithTreeTopo(t, job, etcdURLs, numOfTasks, taskBuilder) } }() for i := uint64(0); i < numOfTasks; i++ { taskBuilder.NodeProducer <- true } wantData := []int32{0, 105, 210, 315, 420, 525, 630, 735, 840, 945, 1050} getData := make([]int32, numOfIterations+1) for i := uint64(0); i <= numOfIterations; i++ { getData[i] = <-taskBuilder.GDataChan } for i := range wantData { if wantData[i] != getData[i] { t.Errorf("#%d: data want = %d, get = %d", i, wantData[i], getData[i]) } } controller.WaitForJobDone() controller.Stop() close(taskBuilder.NodeProducer) }
// TestFrameworkFlagMetaReady and TestFrameworkDataRequest test basic workflows of // framework impl. It uses a scenario with two nodes: 0 as parent, 1 as child. // The basic idea is that when parent tries to talk to child and vice versa, // there will be some data transferring and captured by application task. // Here we have implemented a helper user task to capture those data, test if // it's passed from framework correctly and unmodified. func TestFrameworkFlagMeta(t *testing.T) { appName := "TestFrameworkFlagMeta" etcdURLs := []string{"http://localhost:4001"} // launch controller to setup etcd layout ctl := controller.New(appName, etcd.NewClient(etcdURLs), 2, []string{"Parents", "Children"}) if err := ctl.InitEtcdLayout(); err != nil { t.Fatalf("initEtcdLayout failed: %v", err) } defer ctl.DestroyEtcdLayout() pDataChan := make(chan *tDataBundle, 1) cDataChan := make(chan *tDataBundle, 1) // simulate two tasks on two nodes -- 0 and 1 // 0 is parent, 1 is child f0 := &framework{ name: appName, etcdURLs: etcdURLs, ln: createListener(t), } f1 := &framework{ name: appName, etcdURLs: etcdURLs, ln: createListener(t), } var wg sync.WaitGroup taskBuilder := &testableTaskBuilder{ cDataChan: cDataChan, pDataChan: pDataChan, setupLatch: &wg, } f0.SetTaskBuilder(taskBuilder) f0.AddLinkage("Parents", topo.NewTreeTopologyOfParent(2, 2)) f0.AddLinkage("Children", topo.NewTreeTopologyOfChildren(2, 2)) f1.SetTaskBuilder(taskBuilder) f1.AddLinkage("Parents", topo.NewTreeTopologyOfParent(2, 2)) f1.AddLinkage("Children", topo.NewTreeTopologyOfChildren(2, 2)) taskBuilder.setupLatch.Add(2) go f0.Start() go f1.Start() taskBuilder.setupLatch.Wait() if f0.GetTaskID() != 0 { f0, f1 = f1, f0 } defer f0.ShutdownJob() tests := []struct { cMeta string pMeta string }{ {"parent", "child"}, {"ParamReady", "GradientReady"}, } ctx := context.WithValue(context.Background(), epochKey, uint64(0)) for i, tt := range tests { // 0: F#FlagChildMetaReady -> 1: T#ParentMetaReady f0.FlagMeta(ctx, "Children", tt.cMeta) // from child(1)'s view data := <-cDataChan expected := &tDataBundle{id: 0, meta: tt.cMeta} if !reflect.DeepEqual(data, expected) { t.Errorf("#%d: data bundle want = %v, get = %v", i, expected, data) } // 1: F#FlagParentMetaReady -> 0: T#ChildMetaReady f1.FlagMeta(ctx, "Parents", tt.pMeta) // from parent(0)'s view data = <-pDataChan expected = &tDataBundle{id: 1, meta: tt.pMeta} if !reflect.DeepEqual(data, expected) { t.Errorf("#%d: data bundle want = %v, get = %v", i, expected, data) } } }
func TestFrameworkDataRequest(t *testing.T) { appName := "framework_test_datarequest" etcdURLs := []string{"http://localhost:4001"} // launch controller to setup etcd layout ctl := controller.New(appName, etcd.NewClient(etcdURLs), 2, []string{"Parents", "Children"}) if err := ctl.InitEtcdLayout(); err != nil { t.Fatalf("initEtcdLayout failed: %v", err) } defer ctl.DestroyEtcdLayout() pDataChan := make(chan *tDataBundle, 1) cDataChan := make(chan *tDataBundle, 1) // simulate two tasks on two nodes -- 0 and 1 // 0 is parent, 1 is child f0 := &framework{ name: appName, etcdURLs: etcdURLs, ln: createListener(t), } f1 := &framework{ name: appName, etcdURLs: etcdURLs, ln: createListener(t), } var wg sync.WaitGroup taskBuilder := &testableTaskBuilder{ cDataChan: cDataChan, pDataChan: pDataChan, setupLatch: &wg, } f0.SetTaskBuilder(taskBuilder) f0.AddLinkage("Parents", topo.NewTreeTopologyOfParent(2, 2)) f0.AddLinkage("Children", topo.NewTreeTopologyOfChildren(2, 2)) f1.SetTaskBuilder(taskBuilder) f1.AddLinkage("Parents", topo.NewTreeTopologyOfParent(2, 2)) f1.AddLinkage("Children", topo.NewTreeTopologyOfChildren(2, 2)) taskBuilder.setupLatch.Add(2) go f0.Start() go f1.Start() taskBuilder.setupLatch.Wait() if f0.GetTaskID() != 0 { f0, f1 = f1, f0 } defer f0.ShutdownJob() ctx := context.WithValue(context.Background(), epochKey, uint64(0)) f0.DataRequest(ctx, 1, "/proto.Regression/GetGradient", nil) data := <-pDataChan expected := &tDataBundle{ id: 1, method: "/proto.Regression/GetGradient", output: &pb.Gradient{1}, } if !reflect.DeepEqual(data, expected) { t.Errorf("data bundle want = %v, get = %v", expected, data) } f1.DataRequest(ctx, 0, "/proto.Regression/GetParameter", nil) data = <-cDataChan expected = &tDataBundle{ id: 0, method: "/proto.Regression/GetParameter", output: &pb.Parameter{1}, } if !reflect.DeepEqual(data, expected) { t.Errorf("data bundle want = %v, get = %v", expected, data) } }