Example #1
// createScript wraps the current binary in a self-extracting bash script:
// the binary is gzipped, base64-encoded, and embedded in a heredoc that the
// script decodes to a temp file, runs with the tool's name and args, and removes.
func createScript(t tool.Interface, checker ToolChecker, args ...string) string {

	// assumes the running binary lives under go/bin/
	cmd := "go/bin/" + os.Args[0]

	if checker == nil {
		// default to LapackToolChecker, since Amazon EMR doesn't have these libs
		checker = LapackToolChecker
	}
	if err := checker(cmd, t); err != nil {
		panic(fmt.Sprintf("tool %s doesn't check out: %v", tool.Name(t), err))
	}

	// read our own executable so it can be embedded in the script
	buf, err := ioutil.ReadFile(cmd)
	check(err)

	{
		// gzip-compress the binary before encoding it
		var g bytes.Buffer
		gz := gzip.NewWriter(&g)
		_, err := gz.Write(buf)
		check(err)
		check(gz.Close())
		buf = g.Bytes()
	}

	var f bytes.Buffer
	w := func(s string) { // write s followed by a newline
		f.WriteString(s)
		f.WriteString("\n")
	}
	w2 := func(s string) { // write s with no trailing newline
		f.WriteString(s)
	}

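	// emit the self-extracting script: decode the base64 heredoc, gunzip it
	// into a temp file, and make that file executable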
	w("#!/bin/bash")
	w(fmt.Sprintf("CMD=/tmp/`/bin/mktemp -u %s_XXXXXXXXXXXXX`", tool.Name(t)))
	w("/usr/bin/base64 -d <<END_TEXT | /bin/gzip -d > $CMD")
	w2(split(base64.StdEncoding.EncodeToString(buf)))
	w("END_TEXT")
	w("/bin/chmod 777 $CMD")

	run := func() string {
		var b bytes.Buffer // separate buffer; avoids shadowing f above
		fmt.Fprintf(&b, "$CMD %s", tool.Name(t))
		for _, a := range args {
			fmt.Fprintf(&b, " %s", a)
		}
		return b.String()
	}()

	w(run)

	w("/bin/rm $CMD")

	return f.String()
}
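The split helper referenced in createScript isn't shown on this page; presumably it wraps the base64 payload into fixed-width lines so the heredoc stays manageable. A minimal sketch under that assumption (the 76-column width is a guess, not taken from the source):

// split is assumed to chunk a base64 string into newline-terminated lines;
// the 76-character width is a guess matching conventional base64 wrapping.
func split(s string) string {
	const width = 76
	var b bytes.Buffer
	for len(s) > width {
		b.WriteString(s[:width])
		b.WriteString("\n")
		s = s[width:]
	}
	// the final (possibly short) chunk also gets a trailing newline,
	// so END_TEXT lands on its own line in the generated script
	b.WriteString(s)
	b.WriteString("\n")
	return b.String()
}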
Example #2
// Run submits the Flow to Amazon EMR as a RunJobFlow request: it uploads
// generated mapper/reducer scripts to S3, assembles the request parameters,
// and parses the EMR response.
func Run(flow Flow) (*RunFlowResponse, error) {

	validate(flow)

	if !flow.IsSpot {
		// spot prices are meaningless for on-demand flows
		flow.MasterSpotPrice = 0
		flow.SlaveSpotPrice = 0
	}

	// job flow name: <mapper>-<reducer>_<utc timestamp>_<short uuid>
	id := fmt.Sprintf("%s-%s_%s_%s", tool.Name(flow.Steps[0].Mapper), tool.Name(flow.Steps[0].Reducer), time.Now().UTC().Format("20060102T150405Z"), uuid.New()[:4])

	ss3 := s3.GetDefault(flow.Auth)

	v := make(url.Values)

	v.Set("Action", "RunJobFlow")

	v.Set("Name", id)
	v.Set("AmiVersion", "2.4.3")
	v.Set("LogUri", fmt.Sprintf("s3n://%s/%s", flow.LogBucket, id))

	v.Set("Instances.Ec2KeyName", flow.KeyName)
	v.Set("Instances.HadoopVersion", "1.0.3")

	if len(flow.AvailabilityZone) == 0 {
		flow.AvailabilityZone = "us-east-1d"
	}

	v.Set("Instances.Placement.AvailabilityZone", flow.AvailabilityZone)
	v.Set("Instances.KeepJobFlowAliveWhenNoSteps", fmt.Sprintf("%v", flow.KeepAlive))
	v.Set("Instances.TerminationProtected", "false")

	if flow.IsSpot {
		v.Set("Instances.InstanceGroups.member.1.InstanceRole", "MASTER")
		v.Set("Instances.InstanceGroups.member.1.Market", "SPOT")
		v.Set("Instances.InstanceGroups.member.1.BidPrice", fmt.Sprintf("%.3f", flow.MasterSpotPrice))
		v.Set("Instances.InstanceGroups.member.1.InstanceType", flow.MasterInstanceType)
		v.Set("Instances.InstanceGroups.member.1.InstanceCount", "1")

		v.Set("Instances.InstanceGroups.member.2.InstanceRole", "CORE")
		v.Set("Instances.InstanceGroups.member.2.Market", "SPOT")
		v.Set("Instances.InstanceGroups.member.2.BidPrice", fmt.Sprintf("%.3f", flow.SlaveSpotPrice))
		v.Set("Instances.InstanceGroups.member.2.InstanceType", flow.SlaveInstanceType)
		v.Set("Instances.InstanceGroups.member.2.InstanceCount", fmt.Sprintf("%d", flow.Instances-1))
	} else {
		v.Set("Instances.MasterInstanceType", flow.MasterInstanceType)
		v.Set("Instances.SlaveInstanceType", flow.SlaveInstanceType)
		v.Set("Instances.InstanceCount", fmt.Sprintf("%d", flow.Instances))
	}

	failureAction := func() string {
		if flow.KeepAlive {
			return "CANCEL_AND_WAIT"
		}
		return "TERMINATE_JOB_FLOW"
	}()

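	// step 1 is always EMR's standard debugging/state-pusher step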
	v.Set("Steps.member.1.Name", "debugging")
	v.Set("Steps.member.1.ActionOnFailure", failureAction)
	v.Set("Steps.member.1.HadoopJarStep.Jar", "s3://elasticmapreduce/libs/script-runner/script-runner.jar")
	v.Set("Steps.member.1.HadoopJarStep.Args.member.1", "s3://elasticmapreduce/libs/state-pusher/0.1/fetch")

	for i, step := range flow.Steps {

		n := i + 2 // EMR step indices are 1-based and step 1 is the debugging step

		id := uuid.New()

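		// upload the generated mapper and reducer scripts to S3 under a fresh per-step uuid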
		mapperObject := s3.Object{Bucket: flow.ScriptBucket, Key: "mapper/" + id}
		reducerObject := s3.Object{Bucket: flow.ScriptBucket, Key: "reducer/" + id}

		{
			var args []string
			args = append(args, fmt.Sprintf("-indirect=%v", step.IndirectMapJob))
			args = append(args, step.Args...)
			check(ss3.PutObject(s3.PutObjectRequest{
				BasePut: s3.BasePut{Object: mapperObject, ContentType: "application/octet-stream"},
				Data:    []byte(createScript(step.Mapper, step.ToolChecker, args...)),
			}))
		}

		{
			var args []string
			args = append(args, step.Args...)
			check(ss3.PutObject(s3.PutObjectRequest{
				BasePut: s3.BasePut{Object: reducerObject, ContentType: "application/octet-stream"},
				Data:    []byte(createScript(step.Reducer, step.ToolChecker, args...)),
			}))
		}

		{
			v.Set(fmt.Sprintf("Steps.member.%d.Name", n), step.Name)
			v.Set(fmt.Sprintf("Steps.member.%d.ActionOnFailure", n), failureAction)
			v.Set(fmt.Sprintf("Steps.member.%d.HadoopJarStep.Jar", n), "/home/hadoop/contrib/streaming/hadoop-streaming.jar")

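			// i() yields 1, 2, 3, ... so successive args get increasing Args.member indices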
			i := func() func() int {
				i := 0
				return func() int {
					i++
					return i
				}
			}()

			arg := func(a string) {
				v.Set(fmt.Sprintf("Steps.member.%d.HadoopJarStep.Args.member.%d", n, i()), a)
			}

			pair := func(a, b string) {
				arg(a)
				arg(b)
			}

			if step.CompressMapOutput {
				pair("-D", "mapred.compress.map.output=true")
			}

			if step.Reducers > 0 {
				pair("-D", fmt.Sprintf("mapred.reduce.tasks=%d", step.Reducers))
			}

			if step.Timeout > 0 {
				// mapred.task.timeout expects milliseconds
				pair("-D", fmt.Sprintf("mapred.task.timeout=%d", step.Timeout.Nanoseconds()/1000000))
			}

			if step.SortSecondKeyField {
				pair("-D", "stream.num.map.output.key.fields=2")
				pair("-D", "mapred.text.key.partitioner.options=-k1,1")
				pair("-partitioner", "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner")
			}

			if step.Compress {
				pair("-jobconf", "mapred.output.compress=true")
			}

			for _, s := range step.Inputs {
				pair("-input", s)
			}

			pair("-output", step.Output)
			pair("-mapper", toUrl(mapperObject))
			pair("-reducer", toUrl(reducerObject))

			for k, x := range step.Vars {
				pair("-cmdenv", fmt.Sprintf("%s%s=%s", VARS_PREFIX, k, x))
			}
		}

	}

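	// dump the request parameters in sorted order, for debugging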
	{
		var keys []string
		for k := range v {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		for _, k := range keys {
			fmt.Printf("%s: %s\n", k, v.Get(k))
		}
	}

	u := createSignedURL(flow.Auth, v)

	resp, err := runReq(u)
	if err != nil {
		return nil, err
	}
	return ParseEmrResponse(resp)
}
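For context, a hypothetical call to Run could look like the sketch below. The Flow and step field names follow the usage inside Run, but the Step type name, the tool values, and the auth setup are placeholders, not taken from this page:

// A hypothetical invocation of Run; field names match those referenced in Run
// above, but wordCountMapper, wordCountReducer, Step, and auth are placeholders.
resp, err := Run(Flow{
	Auth:               auth,             // credentials, however this package loads them
	KeyName:            "my-ec2-key",     // EC2 key pair for the cluster
	LogBucket:          "my-emr-logs",    // becomes LogUri s3n://my-emr-logs/<id>
	ScriptBucket:       "my-emr-scripts", // mapper/reducer scripts are uploaded here
	Instances:          4,                // 1 master + 3 slaves (CORE nodes if IsSpot)
	MasterInstanceType: "m1.large",
	SlaveInstanceType:  "m1.large",
	Steps: []Step{{
		Name:    "word-count",
		Mapper:  wordCountMapper,  // a tool.Interface implementation
		Reducer: wordCountReducer, // likewise
		Inputs:  []string{"s3n://my-bucket/input/"},
		Output:  "s3n://my-bucket/output/",
	}},
})
if err != nil {
	log.Fatal(err)
}
fmt.Printf("started job flow: %+v\n", resp)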