// createScript packages the current binary as a self-extracting bash
// script: the executable is checked, gzipped, base64-encoded, and
// inlined in a heredoc that the script decodes to a temp file, runs as
// the named tool with args, and then removes.
func createScript(t tool.Interface, checker ToolChecker, args ...string) string {
	cmd := "go/bin/" + os.Args[0]
	if checker == nil {
		// Default to the LAPACK check, since Amazon EMR doesn't have
		// these libs.
		checker = LapackToolChecker
	}
	if err := checker(cmd, t); err != nil {
		panic(fmt.Sprintf("tool %s doesn't check out: %v", tool.Name(t), err))
	}
	buf, err := ioutil.ReadFile(cmd)
	check(err)
	{
		// Gzip the binary before embedding it in the script.
		var g bytes.Buffer
		gz := gzip.NewWriter(&g)
		gz.Write(buf)
		gz.Close()
		buf = g.Bytes()
	}
	var f bytes.Buffer
	w := func(s string) { // write a line
		f.WriteString(s)
		f.WriteString("\n")
	}
	w2 := func(s string) { // write without a trailing newline
		f.WriteString(s)
	}
	w("#!/bin/bash")
	w(fmt.Sprintf("CMD=/tmp/`/bin/mktemp -u %s_XXXXXXXXXXXXX`", tool.Name(t)))
	w("/usr/bin/base64 -d <<END_TEXT | /bin/gzip -d > $CMD")
	w2(split(base64.StdEncoding.EncodeToString(buf)))
	w("END_TEXT")
	w("/bin/chmod 777 $CMD")
	run := func() string {
		b := new(bytes.Buffer)
		fmt.Fprintf(b, "$CMD %s", tool.Name(t))
		for _, a := range args {
			fmt.Fprintf(b, " %s", a)
		}
		return b.String()
	}()
	w(run)
	w("/bin/rm $CMD")
	return f.String()
}
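// For illustration only, roughly what createScript emits for a
// hypothetical tool named "wordcount" run with "-indirect=false" (the
// temp-file suffix is random, and the base64 payload is elided here):
//
//	#!/bin/bash
//	CMD=/tmp/`/bin/mktemp -u wordcount_XXXXXXXXXXXXX`
//	/usr/bin/base64 -d <<END_TEXT | /bin/gzip -d > $CMD
//	H4sIAAAA... (base64 of the gzipped binary, line-wrapped by split)
//	END_TEXT
//	/bin/chmod 777 $CMD
//	$CMD wordcount -indirect=false
//	/bin/rm $CMD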
// Run validates the flow, uploads the mapper and reducer scripts to S3,
// and submits a RunJobFlow request to Amazon Elastic MapReduce.
func Run(flow Flow) (*RunFlowResponse, error) {
	validate(flow)
	if !flow.IsSpot {
		flow.MasterSpotPrice = 0
		flow.SlaveSpotPrice = 0
	}
	// Job flow name: mapper-reducer_timestamp_short-uuid.
	id := fmt.Sprintf("%s-%s_%s_%s",
		tool.Name(flow.Steps[0].Mapper),
		tool.Name(flow.Steps[0].Reducer),
		time.Now().UTC().Format("20060102T150405Z"),
		uuid.New()[:4])
	ss3 := s3.GetDefault(flow.Auth)
	v := make(url.Values)
	v.Set("Action", "RunJobFlow")
	v.Set("Name", id)
	v.Set("AmiVersion", "2.4.3")
	v.Set("LogUri", fmt.Sprintf("s3n://%s/%s", flow.LogBucket, id))
	v.Set("Instances.Ec2KeyName", flow.KeyName)
	v.Set("Instances.HadoopVersion", "1.0.3")
	if len(flow.AvailabilityZone) == 0 {
		flow.AvailabilityZone = "us-east-1d"
	}
	v.Set("Instances.Placement.AvailabilityZone", flow.AvailabilityZone)
	v.Set("Instances.KeepJobFlowAliveWhenNoSteps", fmt.Sprintf("%v", flow.KeepAlive))
	v.Set("Instances.TerminationProtected", "false")
	if flow.IsSpot {
		// Spot clusters are described as explicit instance groups: one
		// MASTER node plus Instances-1 CORE nodes, each with a bid price.
		v.Set("Instances.InstanceGroups.member.1.InstanceRole", "MASTER")
		v.Set("Instances.InstanceGroups.member.1.Market", "SPOT")
		v.Set("Instances.InstanceGroups.member.1.BidPrice", fmt.Sprintf("%.3f", flow.MasterSpotPrice))
		v.Set("Instances.InstanceGroups.member.1.InstanceType", flow.MasterInstanceType)
		v.Set("Instances.InstanceGroups.member.1.InstanceCount", "1")
		v.Set("Instances.InstanceGroups.member.2.InstanceRole", "CORE")
		v.Set("Instances.InstanceGroups.member.2.Market", "SPOT")
		v.Set("Instances.InstanceGroups.member.2.BidPrice", fmt.Sprintf("%.3f", flow.SlaveSpotPrice))
		v.Set("Instances.InstanceGroups.member.2.InstanceType", flow.SlaveInstanceType)
		v.Set("Instances.InstanceGroups.member.2.InstanceCount", fmt.Sprintf("%d", flow.Instances-1))
	} else {
		v.Set("Instances.MasterInstanceType", flow.MasterInstanceType)
		v.Set("Instances.SlaveInstanceType", flow.SlaveInstanceType)
		v.Set("Instances.InstanceCount", fmt.Sprintf("%d", flow.Instances))
	}
	failureAction := func() string {
		if flow.KeepAlive {
			return "CANCEL_AND_WAIT"
		}
		return "TERMINATE_JOB_FLOW"
	}()
	// Step 1 is always EMR's debugging step (the state pusher), so user
	// steps start at member 2.
	v.Set("Steps.member.1.Name", "debugging")
	v.Set("Steps.member.1.ActionOnFailure", failureAction)
	v.Set("Steps.member.1.HadoopJarStep.Jar", "s3://elasticmapreduce/libs/script-runner/script-runner.jar")
	v.Set("Steps.member.1.HadoopJarStep.Args.member.1", "s3://elasticmapreduce/libs/state-pusher/0.1/fetch")
	for i, step := range flow.Steps {
		n := i + 2
		id := uuid.New()
		mapperObject := s3.Object{Bucket: flow.ScriptBucket, Key: "mapper/" + id}
		reducerObject := s3.Object{Bucket: flow.ScriptBucket, Key: "reducer/" + id}
		{
			// Upload the self-extracting mapper script.
			var args []string
			args = append(args, fmt.Sprintf("-indirect=%v", step.IndirectMapJob))
			args = append(args, step.Args...)
			check(ss3.PutObject(s3.PutObjectRequest{
				BasePut: s3.BasePut{Object: mapperObject, ContentType: "application/octet-stream"},
				Data:    []byte(createScript(step.Mapper, step.ToolChecker, args...)),
			}))
		}
		{
			// Upload the self-extracting reducer script.
			var args []string
			args = append(args, step.Args...)
			check(ss3.PutObject(s3.PutObjectRequest{
				BasePut: s3.BasePut{Object: reducerObject, ContentType: "application/octet-stream"},
				Data:    []byte(createScript(step.Reducer, step.ToolChecker, args...)),
			}))
		}
		{
			v.Set(fmt.Sprintf("Steps.member.%d.Name", n), step.Name)
			v.Set(fmt.Sprintf("Steps.member.%d.ActionOnFailure", n), failureAction)
			v.Set(fmt.Sprintf("Steps.member.%d.HadoopJarStep.Jar", n), "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
			// next yields 1, 2, 3, ... so streaming args keep their order.
			next := func() func() int {
				i := 0
				return func() int {
					i++
					return i
				}
			}()
			arg := func(a string) {
				v.Set(fmt.Sprintf("Steps.member.%d.HadoopJarStep.Args.member.%d", n, next()), a)
			}
			pair := func(a, b string) {
				arg(a)
				arg(b)
			}
			if step.CompressMapOutput {
				pair("-D", "mapred.compress.map.output=true")
			}
			if step.Reducers > 0 {
				pair("-D", fmt.Sprintf("mapred.reduce.tasks=%d", step.Reducers))
			}
			if step.Timeout > 0 {
				// mapred.task.timeout is in milliseconds.
				pair("-D", fmt.Sprintf("mapred.task.timeout=%d", step.Timeout.Nanoseconds()/1000000))
			}
			if step.SortSecondKeyField {
				// Partition on the first field only, so rows sort by the
				// second field within each reducer.
				pair("-D", "stream.num.map.output.key.fields=2")
				pair("-D", "mapred.text.key.partitioner.options=-k1,1")
				pair("-partitioner", "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner")
			}
			if step.Compress {
				pair("-jobconf", "mapred.output.compress=true")
			}
			for _, s := range step.Inputs {
				pair("-input", s)
			}
			pair("-output", step.Output)
			pair("-mapper", toUrl(mapperObject))
			pair("-reducer", toUrl(reducerObject))
			for k, x := range step.Vars {
				pair("-cmdenv", fmt.Sprintf("%s%s=%s", VARS_PREFIX, k, x))
			}
		}
	}
	{
		// Print the request parameters in sorted order for debugging.
		var keys []string
		for k := range v {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		for _, k := range keys {
			fmt.Printf("%s: %s\n", k, v.Get(k))
		}
	}
	u := createSignedURL(flow.Auth, v)
	resp, err := runReq(u)
	if err != nil {
		return nil, err
	}
	return ParseEmrResponse(resp)
}
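// A minimal usage sketch, not a tested invocation: the bucket names,
// key name, and instance types below are placeholders, m and r stand
// for caller-supplied tool.Interface values, auth holds AWS credentials
// as expected by s3.GetDefault, and the step type is assumed to be
// named Step.
//
//	resp, err := Run(Flow{
//		Auth:               auth,
//		KeyName:            "my-ec2-key",
//		LogBucket:          "my-log-bucket",
//		ScriptBucket:       "my-script-bucket",
//		MasterInstanceType: "m1.large",
//		SlaveInstanceType:  "m1.large",
//		Instances:          4,
//		Steps: []Step{{
//			Name:    "wordcount",
//			Mapper:  m,
//			Reducer: r,
//			Inputs:  []string{"s3n://my-bucket/input/"},
//			Output:  "s3n://my-bucket/output/",
//		}},
//	})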