Example #1
File: cache.go Project: MG-RAST/AWE
//fetch input data
func MoveInputData(work *core.Workunit) (size int64, err error) {
	for _, io := range work.Inputs {
		inputname := io.FileName
		// skip if NoFile == true
		if !io.NoFile { // this input is an actual file
			dataUrl, uerr := io.DataUrl()
			if uerr != nil {
				return 0, uerr
			}
			inputFilePath := fmt.Sprintf("%s/%s", work.Path(), inputname)

			if work.Rank == 0 {
				if conf.CACHE_ENABLED && io.Node != "" {
					if file_path, err := StatCacheFilePath(io.Node); err == nil {
						//make a link in work dir from cached file
						linkname := fmt.Sprintf("%s/%s", work.Path(), inputname)
						fmt.Printf("input found in cache, making link: " + file_path + " -> " + linkname + "\n")
						err = os.Symlink(file_path, linkname)
						if err == nil {
							logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl)
						}
						return 0, err
					}
				}
			} else {
				dataUrl = fmt.Sprintf("%s&index=%s&part=%s", dataUrl, work.IndexType(), work.Part())
			}
			logger.Debug(2, "mover: fetching input file from url:"+dataUrl)
			logger.Event(event.FILE_IN, "workid="+work.Id+";url="+dataUrl)

			// download file
			if datamoved, _, err := shock.FetchFile(inputFilePath, dataUrl, work.Info.DataToken, io.Uncompress, false); err != nil {
				return size, errors.New("shock.FetchFile returned: " + err.Error())
			} else {
				size += datamoved
			}
			logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl)
		}

		// download node attributes if requested
		if io.AttrFile != "" {
			// get node
			node, err := shock.ShockGet(io.Host, io.Node, work.Info.DataToken)
			if err != nil {
				//return size, err
				return size, errors.New("shock.ShockGet (node attributes) returned: " + err.Error())
			}
			logger.Debug(2, "mover: fetching input attributes from node:"+node.Id)
			logger.Event(event.ATTR_IN, "workid="+work.Id+";node="+node.Id)
			// print node attributes
			attrFilePath := fmt.Sprintf("%s/%s", work.Path(), io.AttrFile)
			attr_json, _ := json.Marshal(node.Attributes)
			if err := ioutil.WriteFile(attrFilePath, attr_json, 0644); err != nil {
				return size, err
			}
			logger.Event(event.ATTR_READY, "workid="+work.Id+";path="+attrFilePath)
		}
	}
	return
}
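The branch above implements a cache-or-fetch strategy: if a cached copy of the input node exists, it is symlinked into the work directory instead of being downloaded again. Below is a minimal standalone sketch of that pattern; stageInput and its fetch callback are illustrative stand-ins, not part of the AWE API.

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// stageInput links a cached copy of an input into the work directory when one
// exists, and falls back to the supplied fetch function otherwise.
func stageInput(workDir, name, cachePath string, fetch func(dst string) error) error {
	dst := filepath.Join(workDir, name)
	if cachePath != "" {
		if _, err := os.Stat(cachePath); err == nil {
			// cached copy found: link it instead of downloading again
			return os.Symlink(cachePath, dst)
		}
	}
	return fetch(dst)
}

func main() {
	err := stageInput(os.TempDir(), "input.fa", "", func(dst string) error {
		fmt.Println("no cached copy, would download to", dst)
		return nil
	})
	fmt.Println("stage result:", err)
}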
Example #2
func (qm *CQMgr) ClientChecker() {
	for {
		time.Sleep(30 * time.Second)
		for clientid, client := range qm.clientMap {
			if client.Tag == true {
				client.Tag = false
				total_minutes := int(time.Now().Sub(client.RegTime).Minutes())
				hours := total_minutes / 60
				minutes := total_minutes % 60
				client.Serve_time = fmt.Sprintf("%dh%dm", hours, minutes)
				if len(client.Current_work) > 0 {
					client.Idle_time = 0
				} else {
					client.Idle_time += 30
				}
			} else {
				//the client must be gone: its tag was set to false 30 seconds ago and no heartbeat has been received since
				logger.Event(event.CLIENT_UNREGISTER, "clientid="+clientid+";name="+qm.clientMap[clientid].Name)

				//requeue unfinished workunits associated with the failed client
				workids := qm.getWorkByClient(clientid)
				for _, workid := range workids {
					if qm.workQueue.Has(workid) {
						qm.workQueue.StatusChange(workid, WORK_STAT_QUEUED)
						logger.Event(event.WORK_REQUEUE, "workid="+workid)
					}
				}
				//delete the client from client map
				delete(qm.clientMap, clientid)
			}
		}
	}
}
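The checker relies on a tag-and-sweep liveness convention: each heartbeat sets Tag, and every sweep clears it and evicts any client whose tag was already cleared, meaning no heartbeat arrived during the last interval. Below is a standalone sketch of that idea with illustrative types, not the AWE API; a mutex is added here because the sketch's map is shared, whereas the example above accesses qm.clientMap directly.

package main

import (
	"fmt"
	"sync"
	"time"
)

type client struct {
	id  string
	tag bool
}

type registry struct {
	mu      sync.Mutex
	clients map[string]*client
}

// Heartbeat marks a client as alive for the current sweep interval.
func (r *registry) Heartbeat(id string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if c, ok := r.clients[id]; ok {
		c.tag = true
	}
}

// sweep clears every tag and evicts clients whose tag was already cleared.
func (r *registry) sweep() {
	r.mu.Lock()
	defer r.mu.Unlock()
	for id, c := range r.clients {
		if c.tag {
			c.tag = false // must heartbeat again before the next sweep
			continue
		}
		delete(r.clients, id) // no heartbeat since the last sweep: treat as gone
		fmt.Println("unregistered client", id)
	}
}

// run sweeps at a fixed interval, mirroring the 30-second loop above.
func (r *registry) run(interval time.Duration) {
	for range time.Tick(interval) {
		r.sweep()
	}
}

func main() {
	r := &registry{clients: map[string]*client{
		"c1": {id: "c1", tag: true},
		"c2": {id: "c2", tag: false},
	}}
	r.sweep() // c2 is evicted, c1 survives but must heartbeat before the next sweep
	r.Heartbeat("c1")
	fmt.Println(len(r.clients), "client(s) left")
}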
Example #3
//fetch input data
func moveInputData(work *core.Workunit) (size int64, err error) {
	for _, io := range work.Inputs {
		inputname := io.FileName
		dataUrl, uerr := io.DataUrl()
		if uerr != nil {
			return 0, uerr
		}
		if work.Rank > 0 {
			dataUrl = fmt.Sprintf("%s&index=%s&part=%s", dataUrl, work.IndexType(), work.Part())
		}

		inputFilePath := path.Join(work.Path(), inputname)

		logger.Debug(2, "mover: fetching input from url:"+dataUrl)
		logger.Event(event.FILE_IN, "workid="+work.Id+" url="+dataUrl)

		// this gets file from any downloadable url, not just shock
		if datamoved, _, err := shock.FetchFile(inputFilePath, dataUrl, work.Info.DataToken, io.Uncompress, false); err != nil {
			return size, err
		} else {
			size += datamoved
		}
		logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl)
	}
	return
}
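For ranked workunits the snippet appends index and part parameters to the data URL by string formatting. The sketch below shows the same idea via net/url, which also handles escaping; the parameter names come from the snippet, while the helper and the example URL are illustrative.

package main

import (
	"fmt"
	"net/url"
)

// partURL appends index/part query parameters to a data URL, mirroring the
// fmt.Sprintf in the rank > 0 branch above.
func partURL(dataURL, indexType, part string) (string, error) {
	u, err := url.Parse(dataURL)
	if err != nil {
		return "", err
	}
	q := u.Query()
	q.Set("index", indexType)
	q.Set("part", part)
	u.RawQuery = q.Encode()
	return u.String(), nil
}

func main() {
	// hypothetical node URL, for illustration only
	s, err := partURL("http://shock.example/node/abc?download", "record", "3")
	fmt.Println(s, err)
}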
Example #4
// PUT: /queue
func (cr *QueueController) UpdateMany(cx *goweb.Context) {
	LogRequest(cx.Request)

	// Try to authenticate user.
	u, err := request.Authenticate(cx.Request)
	if err != nil && err.Error() != e.NoAuth {
		cx.RespondWithErrorMessage(err.Error(), http.StatusUnauthorized)
		return
	}
	// must be admin user
	if u == nil || u.Admin == false {
		cx.RespondWithErrorMessage(e.NoAuth, http.StatusUnauthorized)
		return
	}

	// Gather query params
	query := &Query{Li: cx.Request.URL.Query()}

	if query.Has("resume") {
		core.QMgr.ResumeQueue()
		logger.Event(event.QUEUE_RESUME, "user="+u.Username)
		cx.RespondWithData("work queue resumed")
		return
	}
	if query.Has("suspend") {
		core.QMgr.SuspendQueue()
		logger.Event(event.QUEUE_SUSPEND, "user="+u.Username)
		cx.RespondWithData("work queue suspended")
		return
	}

	cx.RespondWithErrorMessage("requested queue operation not supported", http.StatusBadRequest)
	return
}
Example #5
func RunWorkunit(work *core.Workunit) (err error) {

	args := work.Cmd.ParsedArgs

	//change cwd to the workunit's working directory
	if err := work.CDworkpath(); err != nil {
		return err
	}

	commandName := work.Cmd.Name
	cmd := exec.Command(commandName, args...)

	msg := fmt.Sprintf("worker: start cmd=%s, args=%v", commandName, args)
	fmt.Println(msg)
	logger.Debug(1, msg)
	logger.Event(event.WORK_START, "workid="+work.Id,
		"cmd="+commandName,
		fmt.Sprintf("args=%v", args))

	var stdout, stderr io.ReadCloser
	if conf.PRINT_APP_MSG {
		stdout, err = cmd.StdoutPipe()
		if err != nil {
			return err
		}
		stderr, err = cmd.StderrPipe()
		if err != nil {
			return err
		}
	}

	if err := cmd.Start(); err != nil {
		return errors.New(fmt.Sprintf("start_cmd=%s, err=%s", commandName, err.Error()))
	}

	if conf.PRINT_APP_MSG {
		go io.Copy(os.Stdout, stdout)
		go io.Copy(os.Stderr, stderr)
	}

	done := make(chan error)
	go func() {
		done <- cmd.Wait()
	}()
	select {
	case <-chankill:
		if err := cmd.Process.Kill(); err != nil {
			fmt.Println("failed to kill" + err.Error())
		}
		<-done // allow goroutine to exit
		fmt.Println("process killed")
		return errors.New("process killed")
	case err := <-done:
		if err != nil {
			return errors.New(fmt.Sprintf("wait_cmd=%s, err=%s", commandName, err.Error()))
		}
	}
	logger.Event(event.WORK_END, "workid="+work.Id)
	return
}
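The select between chankill and the cmd.Wait goroutine is the core of this runner: the command is raced against a kill request, and the done channel is drained after a kill so the waiting goroutine can exit. Here is a minimal standalone sketch of that pattern, with the kill channel passed in as a parameter rather than the package-level chankill used above.

package main

import (
	"errors"
	"fmt"
	"os/exec"
)

// runWithKill starts a command, waits for it in a goroutine, and races
// completion against a kill signal.
func runWithKill(kill <-chan struct{}, name string, args ...string) error {
	cmd := exec.Command(name, args...)
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("start_cmd=%s, err=%s", name, err)
	}

	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	select {
	case <-kill:
		if err := cmd.Process.Kill(); err != nil {
			fmt.Println("failed to kill: " + err.Error())
		}
		<-done // reap the process so the waiting goroutine exits
		return errors.New("process killed")
	case err := <-done:
		if err != nil {
			return fmt.Errorf("wait_cmd=%s, err=%s", name, err)
		}
	}
	return nil
}

func main() {
	kill := make(chan struct{})
	fmt.Println(runWithKill(kill, "echo", "hello"))
}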
Example #6
func deliverer(control chan int) {
	fmt.Printf("deliverer lanched, client=%s\n", core.Self.Id)
	defer fmt.Printf("deliverer exiting...\n")
	for {
		processed := <-fromProcessor
		work := processed.workunit
		workmap[work.Id] = ID_DELIVERER
		perfstat := processed.perfstat

		//post-process for works computed successfully: push output data to Shock
		move_start := time.Now().Unix()
		if work.State == core.WORK_STAT_COMPUTED {
			if err := core.PushOutputData(work); err != nil {
				work.State = core.WORK_STAT_FAIL
				logger.Error("err@pushOutputData: workid=" + work.Id + ", err=" + err.Error())
			} else {
				work.State = core.WORK_STAT_DONE
			}
		}
		move_end := time.Now().Unix()
		perfstat.DataOut = move_end - move_start
		perfstat.Deliver = move_end
		perfstat.ClientResp = perfstat.Deliver - perfstat.Checkout
		perfstat.ClientId = core.Self.Id

		//notify server the final process results
		if err := core.NotifyWorkunitProcessed(work, perfstat); err != nil {
			time.Sleep(3 * time.Second) //wait 3 seconds and try another time
			if err := core.NotifyWorkunitProcessed(work, perfstat); err != nil {
				fmt.Printf("!!!NotifyWorkunitDone returned error: %s\n", err.Error())
				logger.Error("err@NotifyWorkunitProcessed: workid=" + work.Id + ", err=" + err.Error())
				//mark this work in the Current_work map as false; something needs to be done in the future
				//to clean up this kind of work that has been processed but whose result could not be sent to the server
				core.Self.Current_work[work.Id] = false //server doesn't know this yet
			}
		}
		//now final status report sent to server, update some local info
		if work.State == core.WORK_STAT_DONE {
			logger.Event(event.WORK_DONE, "workid="+work.Id)
			core.Self.Total_completed += 1

			if conf.AUTO_CLEAN_DIR {
				if err := work.RemoveDir(); err != nil {
					logger.Error("[email protected](): workid=" + work.Id + ", err=" + err.Error())
				}
			}
		} else {
			logger.Event(event.WORK_RETURN, "workid="+work.Id)
			core.Self.Total_failed += 1
		}
		delete(core.Self.Current_work, work.Id)
		delete(workmap, work.Id)

	}
	control <- ID_DELIVERER //we are ending
}
Example #7
func (qm *ServerMgr) SuspendJob(jobid string, reason string, id string) (err error) {
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	if id != "" {
		job.LastFailed = id
	}
	if err := job.UpdateState(JOB_STAT_SUSPEND, reason); err != nil {
		return err
	}
	qm.putSusJob(jobid)

	//suspend queueing workunits
	for _, workid := range qm.workQueue.List() {
		if jobid == getParentJobId(workid) {
			qm.workQueue.StatusChange(workid, WORK_STAT_SUSPEND)
		}
	}

	//suspend parsed tasks
	for _, task := range job.Tasks {
		if task.State == TASK_STAT_QUEUED || task.State == TASK_STAT_INIT || task.State == TASK_STAT_INPROGRESS {
			qm.taskStateChange(task.Id, TASK_STAT_SUSPEND)
			task.State = TASK_STAT_SUSPEND
			job.UpdateTask(task)
		}
	}
	qm.LogJobPerf(jobid)
	qm.removeActJob(jobid)
	logger.Event(event.JOB_SUSPEND, "jobid="+jobid+";reason="+reason)
	return
}
Example #8
// PUT: /logger
func (cr *LoggerController) UpdateMany(cx *goweb.Context) {
	LogRequest(cx.Request)

	// Try to authenticate user.
	u, err := request.Authenticate(cx.Request)
	if err != nil && err.Error() != e.NoAuth {
		cx.RespondWithErrorMessage(err.Error(), http.StatusUnauthorized)
		return
	}
	// must be admin user
	if u == nil || u.Admin == false {
		cx.RespondWithErrorMessage(e.NoAuth, http.StatusUnauthorized)
		return
	}

	// Gather query params
	query := &Query{Li: cx.Request.URL.Query()}

	// currently can only reset debug level
	if query.Has("debug") {
		levelStr := query.Value("debug")
		levelInt, err := strconv.Atoi(levelStr)
		if err != nil {
			cx.RespondWithErrorMessage("invalid debug level: "+err.Error(), http.StatusBadRequest)
			return
		}
		conf.DEBUG_LEVEL = levelInt
		logger.Event(event.DEBUG_LEVEL, "level="+levelStr+";user="+u.Username)
		cx.RespondWithData(map[string]int{"debuglevel": conf.DEBUG_LEVEL})
		return
	}

	cx.RespondWithError(http.StatusNotImplemented)
	return
}
Example #9
func (qm *ServerMgr) DeleteJob(jobid string) (err error) {
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	if err := job.UpdateState(JOB_STAT_DELETED, "deleted"); err != nil {
		return err
	}
	//delete queueing workunits
	for workid, _ := range qm.workQueue.workMap {
		if jobid == strings.Split(workid, "_")[0] {
			qm.workQueue.Delete(workid)
		}
	}
	//delete parsed tasks
	for i := 0; i < len(job.TaskList()); i++ {
		task_id := fmt.Sprintf("%s_%d", jobid, i)
		delete(qm.taskMap, task_id)
	}
	qm.DeleteJobPerf(jobid)
	delete(qm.susJobs, jobid)

	logger.Event(event.JOB_DELETED, "jobid="+jobid)
	return
}
Example #10
File: cqmgr.go Project: edwardt/AWE
func (qm *CQMgr) ClientChecker() {
	for {
		time.Sleep(30 * time.Second)
		logger.Debug(3, "time to update client list....\n")
		for _, client := range qm.GetAllClients() {
			if client.Tag == true {
				client.Tag = false
				total_minutes := int(time.Now().Sub(client.RegTime).Minutes())
				hours := total_minutes / 60
				minutes := total_minutes % 60
				client.Serve_time = fmt.Sprintf("%dh%dm", hours, minutes)
				if len(client.Current_work) > 0 {
					client.Idle_time = 0
				} else {
					client.Idle_time += 30
				}
				qm.PutClient(client)
			} else {
				if ok := qm.HasClient(client.Id); !ok {
					continue
				}
				//the client must be gone: its tag was set to false 30 seconds ago and no heartbeat has been received since
				logger.Event(event.CLIENT_UNREGISTER, "clientid="+client.Id+";name="+client.Name)
				//requeue unfinished workunits associated with the failed client
				qm.ReQueueWorkunitByClient(client.Id)
				//delete the client from client map
				qm.RemoveClient(client.Id)
			}
		}
	}
}
Example #11
func (qm *ServerMgr) DeleteJobByUser(jobid string, u *user.User) (err error) {
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	// User must have delete permissions on job or be job owner or be an admin
	rights := job.Acl.Check(u.Uuid)
	if job.Acl.Owner != u.Uuid && rights["delete"] == false && u.Admin == false {
		return errors.New(e.UnAuth)
	}
	if err := job.UpdateState(JOB_STAT_DELETED, "deleted"); err != nil {
		return err
	}
	//delete queueing workunits
	for _, workid := range qm.workQueue.List() {
		if jobid == getParentJobId(workid) {
			qm.workQueue.Delete(workid)
		}
	}
	//delete parsed tasks
	for i := 0; i < len(job.TaskList()); i++ {
		task_id := fmt.Sprintf("%s_%d", jobid, i)
		qm.deleteTask(task_id)
	}
	qm.removeActJob(jobid)
	qm.removeSusJob(jobid)

	logger.Event(event.JOB_DELETED, "jobid="+jobid)
	return
}
Example #12
func (qm *ProxyMgr) ClientChecker() {
	for {
		time.Sleep(30 * time.Second)
		for _, client := range qm.GetAllClients() {
			if client.Tag == true {
				client.Tag = false
				total_minutes := int(time.Now().Sub(client.RegTime).Minutes())
				hours := total_minutes / 60
				minutes := total_minutes % 60
				client.Serve_time = fmt.Sprintf("%dh%dm", hours, minutes)
				if client.Current_work_length() > 0 {
					client.Idle_time = 0
				} else {
					client.Idle_time += 30
				}
				qm.PutClient(client)
			} else {
				//the client must be gone: its tag was set to false 30 seconds ago and no heartbeat has been received since
				logger.Event(event.CLIENT_UNREGISTER, "clientid="+client.Id+";name="+client.Name)

				//requeue unfinished workunits associated with the failed client
				qm.ReQueueWorkunitByClient(client.Id)
				//delete the client from client map
				qm.RemoveClient(client.Id)
				//proxy specific
				Self.SubClients -= 1
				notifySubClients(Self.Id, Self.SubClients)
			}
		}
	}
}
Example #13
func (qm *ServerMgr) DeleteJob(jobid string) (err error) {
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	if err := job.UpdateState(JOB_STAT_DELETED, "deleted"); err != nil {
		return err
	}
	//delete queueing workunits
	for _, workid := range qm.workQueue.List() {
		if jobid == getParentJobId(workid) {
			qm.workQueue.Delete(workid)
		}
	}
	//delete parsed tasks
	for i := 0; i < len(job.TaskList()); i++ {
		task_id := fmt.Sprintf("%s_%d", jobid, i)
		qm.deleteTask(task_id)
	}
	qm.removeActJob(jobid)
	qm.removeSusJob(jobid)

	logger.Event(event.JOB_DELETED, "jobid="+jobid)
	return
}
Example #14
func (qm *ServerMgr) SuspendJob(jobid string, reason string) (err error) {
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	if err := job.UpdateState(JOB_STAT_SUSPEND, reason); err != nil {
		return err
	}
	//qm.DeleteJobPerf(jobid)
	qm.susJobs[jobid] = true

	//suspend queueing workunits
	for workid, _ := range qm.workQueue.workMap {
		if jobid == strings.Split(workid, "_")[0] {
			qm.workQueue.StatusChange(workid, WORK_STAT_SUSPEND)
		}
	}

	//suspend parsed tasks
	for _, task := range job.Tasks {
		if task.State == TASK_STAT_QUEUED || task.State == TASK_STAT_INIT {
			if _, ok := qm.taskMap[task.Id]; ok {
				qm.taskMap[task.Id].State = TASK_STAT_SUSPEND
				task.State = TASK_STAT_SUSPEND
				job.UpdateTask(task)
			}
		}
	}

	qm.DeleteJobPerf(jobid)
	logger.Event(event.JOB_SUSPEND, "jobid="+jobid+";reason="+reason)
	return
}
Example #15
//update job info when a task in that job changed to a new state
func (qm *ServerMgr) updateJobTask(task *Task) (err error) {
	parts := strings.Split(task.Id, "_")
	jobid := parts[0]
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	remainTasks, err := job.UpdateTask(task)
	if err != nil {
		return err
	}

	logger.Debug(2, fmt.Sprintf("remaining tasks for task %s: %d", task.Id, remainTasks))

	if remainTasks == 0 { //job done
		qm.FinalizeJobPerf(jobid)
		qm.LogJobPerf(jobid)
		qm.removeActJob(jobid)
		//delete tasks in task map
		//delete from shock output flagged for deletion
		for _, task := range job.TaskList() {
			task.DeleteOutput()
			task.DeleteInput()
			qm.deleteTask(task.Id)
		}
		//log event about job done (JD)
		logger.Event(event.JOB_DONE, "jobid="+job.Id+";jid="+job.Jid+";project="+job.Info.Project+";name="+job.Info.Name)
	}
	return
}
Example #16
//update job info when a task in that job changed to a new state
func (qm *ServerMgr) updateJobTask(task *Task) (err error) {
	parts := strings.Split(task.Id, "_")
	jobid := parts[0]
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	remainTasks, err := job.UpdateTask(task)
	if err != nil {
		return err
	}

	if remainTasks == 0 { //job done
		qm.FinalizeJobPerf(jobid)
		qm.LogJobPerf(jobid)
		qm.DeleteJobPerf(jobid)
		//delete tasks in task map
		for _, task := range job.TaskList() {
			delete(qm.taskMap, task.Id)
		}
		//log event about job done (JD)
		logger.Event(event.JOB_DONE, "jobid="+job.Id+";jid="+job.Jid+";project="+job.Info.Project+";name="+job.Info.Name)
	}
	return
}
Example #17
//parse workunit, fetch input data, compose command arguments
func ParseWorkunitArgs(work *core.Workunit) (args []string, err error) {
	argstr := work.Cmd.Args
	if argstr == "" {
		return
	}

	argList := strings.Fields(argstr)
	inputsMap := work.Inputs

	for _, arg := range argList {
		if strings.Contains(arg, "@") { //parse input/output to accessible local file
			segs := strings.Split(arg, "@")
			if len(segs) > 2 {
				return []string{}, errors.New("invalid format in command args, multiple @ within one arg")
			}
			inputname := segs[1]

			if inputsMap.Has(inputname) {
				io := inputsMap[inputname]

				var dataUrl string
				if work.Rank == 0 {
					dataUrl = io.DataUrl()
				} else {
					dataUrl = fmt.Sprintf("%s&index=%s&part=%s", io.DataUrl(), work.IndexType(), work.Part())
				}

				inputFilePath := fmt.Sprintf("%s/%s", work.Path(), inputname)

				logger.Debug(2, "mover: fetching input from url:"+dataUrl)
				logger.Event(event.FILE_IN, "workid="+work.Id+" url="+dataUrl)

				if err := fetchFile(inputFilePath, dataUrl, work.Info.DataToken); err != nil { //get file from Shock
					return []string{}, err
				}
				logger.Event(event.FILE_READY, "workid="+work.Id+" url="+dataUrl)

				parsedArg := fmt.Sprintf("%s%s", segs[0], inputFilePath)
				args = append(args, parsedArg)
			}
		} else { //no @, has nothing to do with input/output, append directly
			args = append(args, arg)
		}
	}

	return args, nil
}
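The loop above rewrites any argument of the form prefix@inputname into the prefix plus the local path of that input under the work directory. The following standalone sketch shows that substitution with illustrative types (a plain map instead of the Inputs collection, and a helper that is not part of AWE).

package main

import (
	"errors"
	"fmt"
	"path"
	"strings"
)

// resolveArgs replaces "@name" references with local paths under workDir.
func resolveArgs(argstr, workDir string, inputs map[string]bool) ([]string, error) {
	var args []string
	for _, arg := range strings.Fields(argstr) {
		if !strings.Contains(arg, "@") {
			args = append(args, arg) // nothing to resolve
			continue
		}
		segs := strings.Split(arg, "@")
		if len(segs) > 2 {
			return nil, errors.New("invalid format in command args, multiple @ within one arg")
		}
		name := segs[1]
		if !inputs[name] {
			continue // unknown input: the snippet above silently drops such args too
		}
		args = append(args, segs[0]+path.Join(workDir, name))
	}
	return args, nil
}

func main() {
	out, err := resolveArgs("-i @reads.fq -t 4", "/tmp/work", map[string]bool{"reads.fq": true})
	fmt.Println(out, err)
}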
Example #18
func main() {

	if !conf.INIT_SUCCESS {
		conf.PrintClientUsage()
		os.Exit(1)
	}

	if _, err := os.Stat(conf.WORK_PATH); err != nil && os.IsNotExist(err) {
		if err := os.MkdirAll(conf.WORK_PATH, 0777); err != nil {
			fmt.Fprintf(os.Stderr, "ERROR in creating work_path %s\n", err.Error())
			os.Exit(1)
		}
	}

	if _, err := os.Stat(conf.DATA_PATH); err != nil && os.IsNotExist(err) {
		if err := os.MkdirAll(conf.DATA_PATH, 0777); err != nil {
			fmt.Fprintf(os.Stderr, "ERROR in creating data_path %s\n", err.Error())
			os.Exit(1)
		}
	}

	if _, err := os.Stat(conf.LOGS_PATH); err != nil && os.IsNotExist(err) {
		if err := os.MkdirAll(conf.LOGS_PATH, 0777); err != nil {
			fmt.Fprintf(os.Stderr, "ERROR in creating log_path %s\n", err.Error())
			os.Exit(1)
		}
	}

	profile, err := worker.ComposeProfile()
	if err != nil {
		fmt.Fprintf(os.Stderr, "fail to compose profile: %s\n", err.Error())
		os.Exit(1)
	}

	self, err := worker.RegisterWithAuth(conf.SERVER_URL, profile)
	if err != nil {
		fmt.Fprintf(os.Stderr, "fail to register: %s\n", err.Error())
		os.Exit(1)
	}
	core.InitClientProfile(self)

	var logdir string
	if self.Name != "" {
		logdir = self.Name
	} else {
		logdir = conf.CLIENT_NAME
	}

	logger.Initialize("client-" + logdir)

	fmt.Printf("Client registered, name=%s, id=%s\n", self.Name, self.Id)
	logger.Event(event.CLIENT_REGISTRATION, "clientid="+self.Id)

	if err := worker.InitWorkers(self); err == nil {
		worker.StartClientWorkers()
	} else {
		fmt.Printf("failed to initialize and start workers:" + err.Error())
	}
}
Example #19
func workStealer(control chan int) {
	fmt.Printf("workStealer lanched, client=%s\n", core.Self.Id)
	defer fmt.Printf("workStealer exiting...\n")
	retry := 0
	for {
		if core.Service == "proxy" {
			<-core.ProxyWorkChan
		}
		wu, err := CheckoutWorkunitRemote()
		if err != nil {
			if err.Error() == e.QueueEmpty || err.Error() == e.NoEligibleWorkunitFound {
				//normal, do nothing
			} else if err.Error() == e.ClientNotFound {
				//server may have been restarted; wait for the heartbeater goroutine to try to re-register
				ReRegisterWithSelf(conf.SERVER_URL)
			} else if err.Error() == e.ClientSuspended {
				fmt.Printf("client suspended, waiting for repair or resume request...\n")
				//to-do: send out an email notice that this client has a problem and has been suspended
				time.Sleep(2 * time.Minute)
			} else {
				//something is wrong, server may be down
				fmt.Printf("error in checking out workunits: %v\n", err)
				retry += 1
			}
			if retry == 3 {
				os.Exit(1)
			}
			if core.Service != "proxy" { //proxy: event driven, client: timer driven
				time.Sleep(10 * time.Second)
			}
			continue
		} else {
			retry = 0
		}
		logger.Debug(2, "workStealer: checked out a workunit: id="+wu.Id)
		//log event about work checkout (WC)
		logger.Event(event.WORK_CHECKOUT, "workid="+wu.Id)
		core.Self.Total_checkout += 1
		core.Self.Current_work[wu.Id] = true
		workmap[wu.Id] = ID_WORKSTEALER

		//hand the work to the next step handler: dataMover
		workstat := core.NewWorkPerf(wu.Id)
		workstat.Checkout = time.Now().Unix()
		rawWork := &mediumwork{
			workunit: wu,
			perfstat: workstat,
		}
		fromStealer <- rawWork

		//if worker overlap is inhibited, wait until deliverer finishes processing the workunit
		if conf.WORKER_OVERLAP == false && core.Service != "proxy" {
			chanPermit <- true
		}
	}
	control <- ID_WORKSTEALER //we are ending
}
Example #20
// POST: /job
func (cr *JobController) Create(cx *goweb.Context) {
	// Log Request and check for Auth
	LogRequest(cx.Request)

	// Parse uploaded form
	params, files, err := ParseMultipartForm(cx.Request)

	if err != nil {
		if err.Error() == "request Content-Type isn't multipart/form-data" {
			cx.RespondWithErrorMessage("No job file is submitted", http.StatusBadRequest)
		} else {
			// Some error other than request encoding. Theoretically
			// could be a lost db connection between user lookup and parsing.
			// Blame the user; it's probably their fault anyway.
			logger.Error("Error parsing form: " + err.Error())
			cx.RespondWithError(http.StatusBadRequest)
		}
		return
	}

	_, has_upload := files["upload"]
	_, has_awf := files["awf"]

	if !has_upload && !has_awf {
		cx.RespondWithErrorMessage("No job script or awf is submitted", http.StatusBadRequest)
		return
	}

	//send job submission request and get back an assigned job number (jid)
	var jid string
	jid, err = core.QMgr.JobRegister()
	if err != nil {
		logger.Error("Err@job_Create:GetNextJobNum: " + err.Error())
		cx.RespondWithErrorMessage(err.Error(), http.StatusBadRequest)
		return
	}

	var job *core.Job
	job, err = core.CreateJobUpload(params, files, jid)
	if err != nil {
		logger.Error("Err@job_Create:CreateJobUpload: " + err.Error())
		cx.RespondWithErrorMessage(err.Error(), http.StatusBadRequest)
		return
	}

	if token, err := request.RetrieveToken(cx.Request); err == nil {
		job.SetDataToken(token)
	}

	core.QMgr.EnqueueTasksByJobId(job.Id, job.TaskList())

	//log event about job submission (JB)
	logger.Event(event.JOB_SUBMISSION, "jobid="+job.Id+";jid="+job.Jid+";name="+job.Info.Name+";project="+job.Info.Project)
	cx.RespondWithData(job)
	return
}
Example #21
func (qm *ServerMgr) skipTask(task *Task) (err error) {
	task.State = TASK_STAT_SKIPPED
	task.RemainWork = 0
	//update job and queue info; a skipped task behaves like a finished task
	if err = qm.updateJobTask(task); err != nil { //TASK state  -> SKIPPED
		return
	}
	logger.Event(event.TASK_SKIPPED, "taskid="+task.Id)
	return
}
Example #22
File: job.go Project: MG-RAST/AWE
func (job *Job) Delete() (err error) {
	if err = dbDelete(bson.M{"id": job.Id}, conf.DB_COLL_JOBS); err != nil {
		return err
	}
	if err = job.Rmdir(); err != nil {
		return err
	}
	logger.Event(event.JOB_FULL_DELETE, "jobid="+job.Id)
	return
}
Example #23
func ReRegisterWithSelf(host string) (client *core.Client, err error) {
	fmt.Printf("lost contact with server, try to re-register\n")
	client, err = RegisterWithAuth(host, core.Self)
	if err != nil {
		logger.Error("Error: fail to re-register, clientid=" + core.Self.Id)
		fmt.Printf("failed to re-register\n")
	} else {
		logger.Event(event.CLIENT_AUTO_REREGI, "clientid="+core.Self.Id)
		fmt.Printf("re-register successfully\n")
	}
	return
}
Example #24
//fetch input data
func moveInputData(work *core.Workunit) (size int64, err error) {
	for inputname, io := range work.Inputs {
		var dataUrl string
		if work.Rank == 0 {
			dataUrl = io.DataUrl()
		} else {
			dataUrl = fmt.Sprintf("%s&index=%s&part=%s", io.DataUrl(), work.IndexType(), work.Part())
		}

		inputFilePath := path.Join(work.Path(), inputname)

		logger.Debug(2, "mover: fetching input from url:"+dataUrl)
		logger.Event(event.FILE_IN, "workid="+work.Id+" url="+dataUrl)

		if datamoved, err := shock.FetchFile(inputFilePath, dataUrl, work.Info.DataToken, io.Uncompress); err != nil {
			return size, err
		} else {
			size += datamoved
		}
		logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl)
	}
	return
}
Example #25
// GET: /work
// checkout a workunit with the earliest submission time
// to-do: support more options for workunit checkout
func (cr *WorkController) ReadMany(cx *goweb.Context) {

	// Gather query params
	query := &Query{Li: cx.Request.URL.Query()}

	if !query.Has("client") { //view workunits
		var workunits []*core.Workunit
		if query.Has("state") {
			workunits = core.QMgr.ShowWorkunits(query.Value("state"))
		} else {
			workunits = core.QMgr.ShowWorkunits("")
		}
		cx.RespondWithData(workunits)
		return
	}

	if core.Service == "proxy" { //drive proxy workStealer to checkout work from server
		core.ProxyWorkChan <- true
	}

	//checkout a workunit in FCFS order
	clientid := query.Value("client")
	workunits, err := core.QMgr.CheckoutWorkunits("FCFS", clientid, 1)

	if err != nil {
		if err.Error() != e.QueueEmpty && err.Error() != e.NoEligibleWorkunitFound && err.Error() != e.ClientNotFound {
			logger.Error("Err@work_ReadMany:core.QMgr.GetWorkByFCFS(): " + err.Error() + ";client=" + clientid)
		}
		cx.RespondWithErrorMessage(err.Error(), http.StatusBadRequest)
		return
	}

	//log access info only when the queue is not empty, to reduce log volume
	LogRequest(cx.Request)

	//log event about workunit checkout (WO)
	workids := []string{}
	for _, work := range workunits {
		workids = append(workids, work.Id)
	}

	logger.Event(event.WORK_CHECKOUT,
		"workids="+strings.Join(workids, ","),
		"clientid="+clientid)

	// Base case respond with node in json
	cx.RespondWithData(workunits[0])
	return
}
Example #26
File: cqmgr.go Project: MG-RAST/AWE
func (qm *CQMgr) ReQueueWorkunitByClient(clientid string) (err error) {
	workids := qm.getWorkByClient(clientid)
	for _, workid := range workids {
		if qm.workQueue.Has(workid) {
			jobid, _ := GetJobIdByWorkId(workid)
			if job, err := LoadJob(jobid); err == nil {
				if contains(JOB_STATS_ACTIVE, job.State) { //only requeue workunits belonging to active jobs (rule out suspended jobs)
					qm.workQueue.StatusChange(workid, WORK_STAT_QUEUED)
					logger.Event(event.WORK_REQUEUE, "workid="+workid)
				}
			}
		}
	}
	return
}
Example #27
func PushOutputData(work *Workunit) (err error) {
	for name, io := range work.Outputs {
		file_path := fmt.Sprintf("%s/%s", work.Path(), name)
		//use full path here, cwd could be changed by Worker (likely in worker-overlapping mode)
		if fi, err := os.Stat(file_path); err != nil {
			if io.Optional {
				continue
			} else {
				return errors.New(fmt.Sprintf("output %s not generated for workunit %s", name, work.Id))
			}
		} else {
			if io.Nonzero && fi.Size() == 0 {
				return errors.New(fmt.Sprintf("workunit %s generated zero-sized output %s while non-zero-sized file required", work.Id, name))
			}
		}
		logger.Debug(2, "deliverer: push output to shock, filename="+name)
		logger.Event(event.FILE_OUT,
			"workid="+work.Id,
			"filename="+name,
			fmt.Sprintf("url=%s/node/%s", io.Host, io.Node))

		if err := putFileToShock(file_path, io.Host, io.Node, work.Rank, work.Info.DataToken); err != nil {
			time.Sleep(3 * time.Second) //wait for 3 seconds and try again
			if err := putFileToShock(file_path, io.Host, io.Node, work.Rank, work.Info.DataToken); err != nil {
				fmt.Errorf("push file error\n")
				logger.Error("op=pushfile,err=" + err.Error())
				return err
			}
		}
		logger.Event(event.FILE_DONE,
			"workid="+work.Id,
			"filename="+name,
			fmt.Sprintf("url=%s/node/%s", io.Host, io.Node))
	}
	return
}
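Both the output push here and the result notification in the deliverer use the same recovery tactic: on failure, sleep three seconds and try exactly once more. The sketch below factors that pattern into a helper; retryOnce is illustrative and not part of AWE.

package main

import (
	"errors"
	"fmt"
	"time"
)

// retryOnce runs op; if it fails, it waits for the given delay and runs op one
// more time, returning the second result as the final outcome.
func retryOnce(delay time.Duration, op func() error) error {
	if err := op(); err == nil {
		return nil
	}
	time.Sleep(delay) // transient failure: give the other side a moment
	return op()
}

func main() {
	calls := 0
	err := retryOnce(100*time.Millisecond, func() error {
		calls++
		if calls == 1 {
			return errors.New("temporary outage")
		}
		return nil
	})
	fmt.Printf("calls=%d err=%v\n", calls, err)
}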
Example #28
func proxy_relay_workunit(work *Workunit, perfstat *WorkPerf) (err error) {
	//notify server the final process results
	if err := NotifyWorkunitProcessed(work, perfstat); err != nil {
		time.Sleep(3 * time.Second) //wait 3 seconds and try another time
		if err := NotifyWorkunitProcessed(work, perfstat); err != nil {
			fmt.Printf("!!!NotifyWorkunitDone returned error: %s\n", err.Error())
			logger.Error("err@NotifyWorkunitProcessed: workid=" + work.Id + ", err=" + err.Error())
			//mark this work in the Current_work map as false; something needs to be done in the future
			//to clean up this kind of work that has been processed but whose result could not be sent to the server
			Self.Current_work[work.Id] = false
		}
	}

	//now final status report sent to server, update some local info
	if work.State == WORK_STAT_DONE {
		logger.Event(event.WORK_DONE, "workid="+work.Id)
		Self.Total_completed += 1
	} else {
		logger.Event(event.WORK_RETURN, "workid="+work.Id)
		Self.Total_failed += 1
	}
	delete(Self.Current_work, work.Id)
	return
}
Example #29
func (jr *JobReaper) Handle() {
	waitDuration := time.Duration(conf.EXPIRE_WAIT) * time.Minute
	for {
		// sleep
		time.Sleep(waitDuration)
		// query to get expired jobs
		jobs := Jobs{}
		query := jr.getQuery()
		jobs.GetAllUnsorted(query)
		// delete expired jobs
		for _, j := range jobs {
			logger.Event(event.JOB_EXPIRED, "jobid="+j.Id)
			if err := j.Delete(); err != nil {
				logger.Error("Err@job_delete: " + err.Error())
			}
		}
	}
}
Example #30
func (qm *ServerMgr) taskEnQueue(task *Task) (err error) {

	logger.Debug(2, "trying to enqueue task "+task.Id)

	if err := qm.locateInputs(task); err != nil {
		logger.Error("qmgr.taskEnQueue locateInputs:" + err.Error())
		return err
	}

	//create shock index on input nodes (if set in workflow document)
	if err := task.CreateIndex(); err != nil {
		logger.Error("qmgr.taskEnQueue CreateIndex:" + err.Error())
		return err
	}

	//init partition
	if err := task.InitPartIndex(); err != nil {
		logger.Error("qmgr.taskEnQueue InitPartitionIndex:" + err.Error())
		return err
	}

	if err := qm.createOutputNode(task); err != nil {
		logger.Error("qmgr.taskEnQueue createOutputNode:" + err.Error())
		return err
	}
	if err := qm.parseTask(task); err != nil {
		logger.Error("qmgr.taskEnQueue parseTask:" + err.Error())
		return err
	}
	task.State = TASK_STAT_QUEUED
	task.CreatedDate = time.Now()
	task.StartedDate = time.Now() //to-do: will be changed to the time when the first workunit is checked out
	qm.updateJobTask(task)        //task status PENDING->QUEUED

	//log event about task enqueue (TQ)
	logger.Event(event.TASK_ENQUEUE, fmt.Sprintf("taskid=%s;totalwork=%d", task.Id, task.TotalWork))
	qm.CreateTaskPerf(task.Id)

	if IsFirstTask(task.Id) {
		jobid, _ := GetJobIdByTaskId(task.Id)
		UpdateJobState(jobid, JOB_STAT_QUEUED, []string{JOB_STAT_INIT, JOB_STAT_SUSPEND})
	}
	return
}