//fetch input data func MoveInputData(work *core.Workunit) (size int64, err error) { for _, io := range work.Inputs { inputname := io.FileName // skip if NoFile == true if !io.NoFile { // is file ! dataUrl, uerr := io.DataUrl() if uerr != nil { return 0, uerr } inputFilePath := fmt.Sprintf("%s/%s", work.Path(), inputname) if work.Rank == 0 { if conf.CACHE_ENABLED && io.Node != "" { if file_path, err := StatCacheFilePath(io.Node); err == nil { //make a link in work dir from cached file linkname := fmt.Sprintf("%s/%s", work.Path(), inputname) fmt.Printf("input found in cache, making link: " + file_path + " -> " + linkname + "\n") err = os.Symlink(file_path, linkname) if err == nil { logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl) } return 0, err } } } else { dataUrl = fmt.Sprintf("%s&index=%s&part=%s", dataUrl, work.IndexType(), work.Part()) } logger.Debug(2, "mover: fetching input file from url:"+dataUrl) logger.Event(event.FILE_IN, "workid="+work.Id+";url="+dataUrl) // download file if datamoved, _, err := shock.FetchFile(inputFilePath, dataUrl, work.Info.DataToken, io.Uncompress, false); err != nil { return size, errors.New("shock.FetchFile returned: " + err.Error()) } else { size += datamoved } logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl) } // download node attributes if requested if io.AttrFile != "" { // get node node, err := shock.ShockGet(io.Host, io.Node, work.Info.DataToken) if err != nil { //return size, err return size, errors.New("shock.ShockGet (node attributes) returned: " + err.Error()) } logger.Debug(2, "mover: fetching input attributes from node:"+node.Id) logger.Event(event.ATTR_IN, "workid="+work.Id+";node="+node.Id) // print node attributes attrFilePath := fmt.Sprintf("%s/%s", work.Path(), io.AttrFile) attr_json, _ := json.Marshal(node.Attributes) if err := ioutil.WriteFile(attrFilePath, attr_json, 0644); err != nil { return size, err } logger.Event(event.ATTR_READY, "workid="+work.Id+";path="+attrFilePath) 
} } return }
// ClientChecker runs forever, waking every 30 seconds to age the client map:
// clients whose heartbeat tag is still set are refreshed (serve/idle time
// bookkeeping); clients whose tag was not re-set since the last pass are
// presumed dead, their workunits are requeued, and they are removed.
// NOTE(review): qm.clientMap is read, written and deleted from here with no
// visible locking — confirm a mutex guards it elsewhere, otherwise this races
// with heartbeat updates.
func (qm *CQMgr) ClientChecker() {
	for {
		time.Sleep(30 * time.Second)
		for clientid, client := range qm.clientMap {
			if client.Tag == true {
				// heartbeat seen since last pass: clear the tag so the next
				// pass can detect silence, then update serve/idle accounting
				client.Tag = false
				total_minutes := int(time.Now().Sub(client.RegTime).Minutes())
				hours := total_minutes / 60
				minutes := total_minutes % 60
				client.Serve_time = fmt.Sprintf("%dh%dm", hours, minutes)
				if len(client.Current_work) > 0 {
					client.Idle_time = 0
				} else {
					// idle counter advances by the sleep interval
					client.Idle_time += 30
				}
			} else {
				//now client must be gone as tag set to false 30 seconds ago and no heartbeat received thereafter
				logger.Event(event.CLIENT_UNREGISTER, "clientid="+clientid+";name="+qm.clientMap[clientid].Name)
				//requeue unfinished workunits associated with the failed client
				workids := qm.getWorkByClient(clientid)
				for _, workid := range workids {
					if qm.workQueue.Has(workid) {
						qm.workQueue.StatusChange(workid, WORK_STAT_QUEUED)
						logger.Event(event.WORK_REQUEUE, "workid="+workid)
					}
				}
				//delete the client from client map
				// (deleting during range is safe in Go)
				delete(qm.clientMap, clientid)
			}
		}
	}
}
//fetch input data func moveInputData(work *core.Workunit) (size int64, err error) { for _, io := range work.Inputs { inputname := io.FileName dataUrl, uerr := io.DataUrl() if uerr != nil { return 0, uerr } if work.Rank > 0 { dataUrl = fmt.Sprintf("%s&index=%s&part=%s", dataUrl, work.IndexType(), work.Part()) } inputFilePath := path.Join(work.Path(), inputname) logger.Debug(2, "mover: fetching input from url:"+dataUrl) logger.Event(event.FILE_IN, "workid="+work.Id+" url="+dataUrl) // this gets file from any downloadable url, not just shock if datamoved, _, err := shock.FetchFile(inputFilePath, dataUrl, work.Info.DataToken, io.Uncompress, false); err != nil { return size, err } else { size += datamoved } logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl) } return }
// PUT: /queue func (cr *QueueController) UpdateMany(cx *goweb.Context) { LogRequest(cx.Request) // Try to authenticate user. u, err := request.Authenticate(cx.Request) if err != nil && err.Error() != e.NoAuth { cx.RespondWithErrorMessage(err.Error(), http.StatusUnauthorized) return } // must be admin user if u == nil || u.Admin == false { cx.RespondWithErrorMessage(e.NoAuth, http.StatusUnauthorized) return } // Gather query params query := &Query{Li: cx.Request.URL.Query()} if query.Has("resume") { core.QMgr.ResumeQueue() logger.Event(event.QUEUE_RESUME, "user="******"work queue resumed") return } if query.Has("suspend") { core.QMgr.SuspendQueue() logger.Event(event.QUEUE_SUSPEND, "user="******"work queue suspended") return } cx.RespondWithErrorMessage("requested queue operation not supported", http.StatusBadRequest) return }
func RunWorkunit(work *core.Workunit) (err error) { args := work.Cmd.ParsedArgs //change cwd to the workunit's working directory if err := work.CDworkpath(); err != nil { return err } commandName := work.Cmd.Name cmd := exec.Command(commandName, args...) msg := fmt.Sprintf("worker: start cmd=%s, args=%v", commandName, args) fmt.Println(msg) logger.Debug(1, msg) logger.Event(event.WORK_START, "workid="+work.Id, "cmd="+commandName, fmt.Sprintf("args=%v", args)) var stdout, stderr io.ReadCloser if conf.PRINT_APP_MSG { stdout, err = cmd.StdoutPipe() if err != nil { return err } stderr, err = cmd.StderrPipe() if err != nil { return err } } if err := cmd.Start(); err != nil { return errors.New(fmt.Sprintf("start_cmd=%s, err=%s", commandName, err.Error())) } if conf.PRINT_APP_MSG { go io.Copy(os.Stdout, stdout) go io.Copy(os.Stderr, stderr) } done := make(chan error) go func() { done <- cmd.Wait() }() select { case <-chankill: if err := cmd.Process.Kill(); err != nil { fmt.Println("failed to kill" + err.Error()) } <-done // allow goroutine to exit fmt.Println("process killed") return errors.New("process killed") case err := <-done: if err != nil { return errors.New(fmt.Sprintf("wait_cmd=%s, err=%s", commandName, err.Error())) } } logger.Event(event.WORK_END, "workid="+work.Id) return }
// deliverer is the final stage of the worker pipeline: it receives processed
// workunits from the processor stage, pushes output data to Shock, notifies
// the server of the result (with one retry), and updates local bookkeeping.
// It never returns under normal operation (infinite loop).
func deliverer(control chan int) {
	fmt.Printf("deliverer lanched, client=%s\n", core.Self.Id)
	defer fmt.Printf("deliverer exiting...\n")
	for {
		processed := <-fromProcessor
		work := processed.workunit
		workmap[work.Id] = ID_DELIVERER
		perfstat := processed.perfstat
		//post-process for works computed successfully: push output data to Shock
		move_start := time.Now().Unix()
		if work.State == core.WORK_STAT_COMPUTED {
			if err := core.PushOutputData(work); err != nil {
				work.State = core.WORK_STAT_FAIL
				logger.Error("err@pushOutputData: workid=" + work.Id + ", err=" + err.Error())
			} else {
				work.State = core.WORK_STAT_DONE
			}
		}
		move_end := time.Now().Unix()
		// record data-out duration and end-to-end client response time
		perfstat.DataOut = move_end - move_start
		perfstat.Deliver = move_end
		perfstat.ClientResp = perfstat.Deliver - perfstat.Checkout
		perfstat.ClientId = core.Self.Id
		//notify server the final process results
		if err := core.NotifyWorkunitProcessed(work, perfstat); err != nil {
			time.Sleep(3 * time.Second) //wait 3 seconds and try another time
			if err := core.NotifyWorkunitProcessed(work, perfstat); err != nil {
				fmt.Printf("!!!NotifyWorkunitDone returned error: %s\n", err.Error())
				logger.Error("err@NotifyWorkunitProcessed: workid=" + work.Id + ", err=" + err.Error())
				//mark this work in Current_work map as false, something needs to be done in the future
				//to clean this kind of work that has been proccessed but its result can't be sent to server!
				core.Self.Current_work[work.Id] = false //server doesn't know this yet
			}
		}
		//now final status report sent to server, update some local info
		if work.State == core.WORK_STAT_DONE {
			logger.Event(event.WORK_DONE, "workid="+work.Id)
			core.Self.Total_completed += 1
			if conf.AUTO_CLEAN_DIR {
				if err := work.RemoveDir(); err != nil {
					logger.Error("[email protected](): workid=" + work.Id + ", err=" + err.Error())
				}
			}
		} else {
			logger.Event(event.WORK_RETURN, "workid="+work.Id)
			core.Self.Total_failed += 1
		}
		delete(core.Self.Current_work, work.Id)
		delete(workmap, work.Id)
	}
	// NOTE(review): unreachable — the loop above never breaks; kept for
	// symmetry with the other pipeline goroutines.
	control <- ID_DELIVERER //we are ending
}
// SuspendJob moves a job into the suspended state: it persists the new job
// state (recording the failing workunit/client id when given), parks the job
// in the suspended-jobs map, and suspends all of its queued workunits and
// not-yet-finished tasks.
func (qm *ServerMgr) SuspendJob(jobid string, reason string, id string) (err error) {
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	// id, when non-empty, identifies what triggered the suspension
	if id != "" {
		job.LastFailed = id
	}
	if err := job.UpdateState(JOB_STAT_SUSPEND, reason); err != nil {
		return err
	}
	qm.putSusJob(jobid)
	//suspend queueing workunits
	for _, workid := range qm.workQueue.List() {
		if jobid == getParentJobId(workid) {
			qm.workQueue.StatusChange(workid, WORK_STAT_SUSPEND)
		}
	}
	//suspend parsed tasks
	for _, task := range job.Tasks {
		if task.State == TASK_STAT_QUEUED || task.State == TASK_STAT_INIT || task.State == TASK_STAT_INPROGRESS {
			qm.taskStateChange(task.Id, TASK_STAT_SUSPEND)
			task.State = TASK_STAT_SUSPEND
			// NOTE(review): UpdateTask's error is discarded here — a failed
			// persist would leave DB and memory out of sync; confirm intent.
			job.UpdateTask(task)
		}
	}
	qm.LogJobPerf(jobid)
	qm.removeActJob(jobid)
	logger.Event(event.JOB_SUSPEND, "jobid="+jobid+";reason="+reason)
	return
}
// PUT: /logger func (cr *LoggerController) UpdateMany(cx *goweb.Context) { LogRequest(cx.Request) // Try to authenticate user. u, err := request.Authenticate(cx.Request) if err != nil && err.Error() != e.NoAuth { cx.RespondWithErrorMessage(err.Error(), http.StatusUnauthorized) return } // must be admin user if u == nil || u.Admin == false { cx.RespondWithErrorMessage(e.NoAuth, http.StatusUnauthorized) return } // Gather query params query := &Query{Li: cx.Request.URL.Query()} // currently can only reset debug level if query.Has("debug") { levelStr := query.Value("debug") levelInt, err := strconv.Atoi(levelStr) if err != nil { cx.RespondWithErrorMessage("invalid debug level: "+err.Error(), http.StatusBadRequest) } conf.DEBUG_LEVEL = levelInt logger.Event(event.DEBUG_LEVEL, "level="+levelStr+";user="******"debuglevel": conf.DEBUG_LEVEL}) return } cx.RespondWithError(http.StatusNotImplemented) return }
// DeleteJob marks a job deleted and removes all of its in-memory state:
// queued workunits, parsed tasks, perf records, and its suspended-jobs entry.
func (qm *ServerMgr) DeleteJob(jobid string) (err error) {
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	if err := job.UpdateState(JOB_STAT_DELETED, "deleted"); err != nil {
		return err
	}
	//delete queueing workunits
	// workunit ids are "<jobid>_<task>_<rank>", so the prefix before the
	// first '_' is the owning job id
	for workid, _ := range qm.workQueue.workMap {
		if jobid == strings.Split(workid, "_")[0] {
			qm.workQueue.Delete(workid)
		}
	}
	//delete parsed tasks
	// task ids are "<jobid>_<index>" with indices 0..len(tasks)-1
	for i := 0; i < len(job.TaskList()); i++ {
		task_id := fmt.Sprintf("%s_%d", jobid, i)
		delete(qm.taskMap, task_id)
	}
	qm.DeleteJobPerf(jobid)
	delete(qm.susJobs, jobid)
	logger.Event(event.JOB_DELETED, "jobid="+jobid)
	return
}
// ClientChecker runs forever, waking every 30 seconds to age the client list
// via the manager's accessor methods: clients with a fresh heartbeat tag get
// their serve/idle times updated and are written back; silent clients are
// unregistered and their workunits requeued.
func (qm *CQMgr) ClientChecker() {
	for {
		time.Sleep(30 * time.Second)
		logger.Debug(3, "time to update client list....\n")
		for _, client := range qm.GetAllClients() {
			if client.Tag == true {
				// heartbeat seen since last pass: clear tag so silence is
				// detectable next pass, then refresh accounting
				client.Tag = false
				total_minutes := int(time.Now().Sub(client.RegTime).Minutes())
				hours := total_minutes / 60
				minutes := total_minutes % 60
				client.Serve_time = fmt.Sprintf("%dh%dm", hours, minutes)
				if len(client.Current_work) > 0 {
					client.Idle_time = 0
				} else {
					// idle counter advances by the sleep interval
					client.Idle_time += 30
				}
				// persist the mutated snapshot back into the manager
				qm.PutClient(client)
			} else {
				// client may already have been removed since GetAllClients
				// took its snapshot
				if ok := qm.HasClient(client.Id); !ok {
					continue
				}
				//now client must be gone as tag set to false 30 seconds ago and no heartbeat received thereafter
				logger.Event(event.CLIENT_UNREGISTER, "clientid="+client.Id+";name="+client.Name)
				//requeue unfinished workunits associated with the failed client
				qm.ReQueueWorkunitByClient(client.Id)
				//delete the client from client map
				qm.RemoveClient(client.Id)
			}
		}
	}
}
func (qm *ServerMgr) DeleteJobByUser(jobid string, u *user.User) (err error) { job, err := LoadJob(jobid) if err != nil { return } // User must have delete permissions on job or be job owner or be an admin rights := job.Acl.Check(u.Uuid) if job.Acl.Owner != u.Uuid && rights["delete"] == false && u.Admin == false { return errors.New(e.UnAuth) } if err := job.UpdateState(JOB_STAT_DELETED, "deleted"); err != nil { return err } //delete queueing workunits for _, workid := range qm.workQueue.List() { if jobid == getParentJobId(workid) { qm.workQueue.Delete(workid) } } //delete parsed tasks for i := 0; i < len(job.TaskList()); i++ { task_id := fmt.Sprintf("%s_%d", jobid, i) qm.deleteTask(task_id) } qm.removeActJob(jobid) qm.removeSusJob(jobid) logger.Event(event.JOB_DELETED, "jobid="+jobid) return }
// ClientChecker (proxy variant) runs forever, waking every 30 seconds to age
// the sub-client list: fresh-heartbeat clients get their serve/idle times
// refreshed; silent clients are unregistered, their work requeued, and the
// proxy's sub-client count is reported upstream.
func (qm *ProxyMgr) ClientChecker() {
	for {
		time.Sleep(30 * time.Second)
		for _, client := range qm.GetAllClients() {
			if client.Tag == true {
				// heartbeat seen: clear tag for next-pass detection, refresh accounting
				client.Tag = false
				total_minutes := int(time.Now().Sub(client.RegTime).Minutes())
				hours := total_minutes / 60
				minutes := total_minutes % 60
				client.Serve_time = fmt.Sprintf("%dh%dm", hours, minutes)
				if client.Current_work_length() > 0 {
					client.Idle_time = 0
				} else {
					// idle counter advances by the sleep interval
					client.Idle_time += 30
				}
				qm.PutClient(client)
			} else {
				//now client must be gone as tag set to false 30 seconds ago and no heartbeat received thereafter
				logger.Event(event.CLIENT_UNREGISTER, "clientid="+client.Id+";name="+client.Name)
				//requeue unfinished workunits associated with the failed client
				qm.ReQueueWorkunitByClient(client.Id)
				//delete the client from client map
				qm.RemoveClient(client.Id)
				//proxy specific: decrement and report the sub-client count
				Self.SubClients -= 1
				notifySubClients(Self.Id, Self.SubClients)
			}
		}
	}
}
func (qm *ServerMgr) DeleteJob(jobid string) (err error) { job, err := LoadJob(jobid) if err != nil { return } if err := job.UpdateState(JOB_STAT_DELETED, "deleted"); err != nil { return err } //delete queueing workunits for _, workid := range qm.workQueue.List() { if jobid == getParentJobId(workid) { qm.workQueue.Delete(workid) } } //delete parsed tasks for i := 0; i < len(job.TaskList()); i++ { task_id := fmt.Sprintf("%s_%d", jobid, i) qm.deleteTask(task_id) } qm.removeActJob(jobid) qm.removeSusJob(jobid) logger.Event(event.JOB_DELETED, "jobid="+jobid) return }
// SuspendJob (two-argument variant) persists the suspended state for a job,
// flags it in the suspended-jobs map, and suspends its queued workunits and
// queued/initial tasks.
func (qm *ServerMgr) SuspendJob(jobid string, reason string) (err error) {
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	if err := job.UpdateState(JOB_STAT_SUSPEND, reason); err != nil {
		return err
	}
	//qm.DeleteJobPerf(jobid)
	qm.susJobs[jobid] = true
	//suspend queueing workunits
	// workunit ids are "<jobid>_..." — prefix before first '_' is the job id
	for workid, _ := range qm.workQueue.workMap {
		if jobid == strings.Split(workid, "_")[0] {
			qm.workQueue.StatusChange(workid, WORK_STAT_SUSPEND)
		}
	}
	//suspend parsed tasks
	for _, task := range job.Tasks {
		if task.State == TASK_STAT_QUEUED || task.State == TASK_STAT_INIT {
			if _, ok := qm.taskMap[task.Id]; ok {
				// update both the in-memory task map and the job document
				qm.taskMap[task.Id].State = TASK_STAT_SUSPEND
				task.State = TASK_STAT_SUSPEND
				// NOTE(review): UpdateTask's error is discarded — confirm intent
				job.UpdateTask(task)
			}
		}
	}
	qm.DeleteJobPerf(jobid)
	logger.Event(event.JOB_SUSPEND, "jobid="+jobid+";reason="+reason)
	return
}
//update job info when a task in that job changed to a new state func (qm *ServerMgr) updateJobTask(task *Task) (err error) { parts := strings.Split(task.Id, "_") jobid := parts[0] job, err := LoadJob(jobid) if err != nil { return } remainTasks, err := job.UpdateTask(task) if err != nil { return err } logger.Debug(2, fmt.Sprintf("remaining tasks for task %s: %d", task.Id, remainTasks)) if remainTasks == 0 { //job done qm.FinalizeJobPerf(jobid) qm.LogJobPerf(jobid) qm.removeActJob(jobid) //delete tasks in task map //delete from shock output flagged for deletion for _, task := range job.TaskList() { task.DeleteOutput() task.DeleteInput() qm.deleteTask(task.Id) } //log event about job done (JD) logger.Event(event.JOB_DONE, "jobid="+job.Id+";jid="+job.Jid+";project="+job.Info.Project+";name="+job.Info.Name) } return }
//update job info when a task in that job changed to a new state func (qm *ServerMgr) updateJobTask(task *Task) (err error) { parts := strings.Split(task.Id, "_") jobid := parts[0] job, err := LoadJob(jobid) if err != nil { return } remainTasks, err := job.UpdateTask(task) if err != nil { return err } if remainTasks == 0 { //job done qm.FinalizeJobPerf(jobid) qm.LogJobPerf(jobid) qm.DeleteJobPerf(jobid) //delete tasks in task map for _, task := range job.TaskList() { delete(qm.taskMap, task.Id) } //log event about job done (JD) logger.Event(event.JOB_DONE, "jobid="+job.Id+";jid="+job.Jid+";project="+job.Info.Project+";name="+job.Info.Name) } return }
// ParseWorkunitArgs splits the workunit's command argument string, and for
// any argument containing "<prefix>@<inputname>" downloads the named input
// into the work directory and substitutes the local file path. All other
// arguments pass through unchanged.
func ParseWorkunitArgs(work *core.Workunit) (args []string, err error) {
	argstr := work.Cmd.Args
	if argstr == "" {
		return
	}
	argList := strings.Fields(argstr)
	inputsMap := work.Inputs
	for _, arg := range argList {
		if strings.Contains(arg, "@") { //parse input/output to accessible local file
			segs := strings.Split(arg, "@")
			if len(segs) > 2 {
				return []string{}, errors.New("invalid format in command args, multiple @ within one arg")
			}
			inputname := segs[1]
			if inputsMap.Has(inputname) {
				io := inputsMap[inputname]
				var dataUrl string
				if work.Rank == 0 {
					dataUrl = io.DataUrl()
				} else {
					// ranked workunit: fetch only this work's index part
					dataUrl = fmt.Sprintf("%s&index=%s&part=%s", io.DataUrl(), work.IndexType(), work.Part())
				}
				inputFilePath := fmt.Sprintf("%s/%s", work.Path(), inputname)
				logger.Debug(2, "mover: fetching input from url:"+dataUrl)
				logger.Event(event.FILE_IN, "workid="+work.Id+" url="+dataUrl)
				if err := fetchFile(inputFilePath, dataUrl, work.Info.DataToken); err != nil { //get file from Shock
					return []string{}, err
				}
				logger.Event(event.FILE_READY, "workid="+work.Id+" url="+dataUrl)
				// replace "<prefix>@<name>" with "<prefix><local path>"
				parsedArg := fmt.Sprintf("%s%s", segs[0], inputFilePath)
				args = append(args, parsedArg)
			}
			// NOTE(review): an @-arg whose input name is NOT in inputsMap is
			// silently dropped from the command line — confirm this is intended.
		} else { //no @, has nothing to do with input/output, append directly
			args = append(args, arg)
		}
	}
	return args, nil
}
func main() { if !conf.INIT_SUCCESS { conf.PrintClientUsage() os.Exit(1) } if _, err := os.Stat(conf.WORK_PATH); err != nil && os.IsNotExist(err) { if err := os.MkdirAll(conf.WORK_PATH, 0777); err != nil { fmt.Fprintf(os.Stderr, "ERROR in creating work_path %s\n", err.Error()) os.Exit(1) } } if _, err := os.Stat(conf.DATA_PATH); err != nil && os.IsNotExist(err) { if err := os.MkdirAll(conf.DATA_PATH, 0777); err != nil { fmt.Fprintf(os.Stderr, "ERROR in creating data_path %s\n", err.Error()) os.Exit(1) } } if _, err := os.Stat(conf.LOGS_PATH); err != nil && os.IsNotExist(err) { if err := os.MkdirAll(conf.LOGS_PATH, 0777); err != nil { fmt.Fprintf(os.Stderr, "ERROR in creating log_path %s\n", err.Error()) os.Exit(1) } } profile, err := worker.ComposeProfile() if err != nil { fmt.Fprintf(os.Stderr, "fail to compose profile: %s\n", err.Error()) os.Exit(1) } self, err := worker.RegisterWithAuth(conf.SERVER_URL, profile) if err != nil { fmt.Fprintf(os.Stderr, "fail to register: %s\n", err.Error()) os.Exit(1) } core.InitClientProfile(self) var logdir string if self.Name != "" { logdir = self.Name } else { logdir = conf.CLIENT_NAME } logger.Initialize("client-" + logdir) fmt.Printf("Client registered, name=%s, id=%s\n", self.Name, self.Id) logger.Event(event.CLIENT_REGISTRATION, "clientid="+self.Id) if err := worker.InitWorkers(self); err == nil { worker.StartClientWorkers() } else { fmt.Printf("failed to initialize and start workers:" + err.Error()) } }
// workStealer is the first stage of the worker pipeline: it repeatedly checks
// out workunits from the server (event-driven when acting as a proxy, timer-
// driven otherwise), records checkout bookkeeping, and hands the work to the
// dataMover stage via fromStealer.
func workStealer(control chan int) {
	fmt.Printf("workStealer lanched, client=%s\n", core.Self.Id)
	defer fmt.Printf("workStealer exiting...\n")
	retry := 0
	for {
		if core.Service == "proxy" {
			// proxy mode: block until a sub-client requests work
			<-core.ProxyWorkChan
		}
		wu, err := CheckoutWorkunitRemote()
		if err != nil {
			if err.Error() == e.QueueEmpty || err.Error() == e.NoEligibleWorkunitFound {
				//normal, do nothing
			} else if err.Error() == e.ClientNotFound {
				//server may be restarted, waiting for the hearbeater goroutine to try re-register
				ReRegisterWithSelf(conf.SERVER_URL)
			} else if err.Error() == e.ClientSuspended {
				fmt.Printf("client suspended, waiting for repair or resume request...\n")
				//to-do: send out email notice that this client has problem and been suspended
				time.Sleep(2 * time.Minute)
			} else {
				//something is wrong, server may be down
				fmt.Printf("error in checking out workunits: %v\n", err)
				retry += 1
			}
			// three consecutive unexplained failures: give up and exit the process
			if retry == 3 {
				os.Exit(1)
			}
			if core.Service != "proxy" { //proxy: event driven, client: timer driven
				time.Sleep(10 * time.Second)
			}
			continue
		} else {
			// successful checkout resets the failure counter
			retry = 0
		}
		logger.Debug(2, "workStealer: checked out a workunit: id="+wu.Id)
		//log event about work checktout (WC)
		logger.Event(event.WORK_CHECKOUT, "workid="+wu.Id)
		core.Self.Total_checkout += 1
		core.Self.Current_work[wu.Id] = true
		workmap[wu.Id] = ID_WORKSTEALER
		//hand the work to the next step handler: dataMover
		workstat := core.NewWorkPerf(wu.Id)
		workstat.Checkout = time.Now().Unix()
		rawWork := &mediumwork{
			workunit: wu,
			perfstat: workstat,
		}
		fromStealer <- rawWork
		//if worker overlap is inhibited, wait until deliverer finishes processing the workunit
		if conf.WORKER_OVERLAP == false && core.Service != "proxy" {
			chanPermit <- true
		}
	}
	// NOTE(review): unreachable — the loop above never breaks; kept for
	// symmetry with the other pipeline goroutines.
	control <- ID_WORKSTEALER //we are ending
}
// POST: /job
// Create accepts a multipart job submission (a job script upload or an awf
// workflow document), registers a new job id, builds the job from the upload,
// attaches the submitter's data token when present, and enqueues its tasks.
func (cr *JobController) Create(cx *goweb.Context) {
	// Log Request and check for Auth
	LogRequest(cx.Request)
	// Parse uploaded form
	params, files, err := ParseMultipartForm(cx.Request)
	if err != nil {
		if err.Error() == "request Content-Type isn't multipart/form-data" {
			cx.RespondWithErrorMessage("No job file is submitted", http.StatusBadRequest)
		} else {
			// Some error other than request encoding. Theoretically
			// could be a lost db connection between user lookup and parsing.
			// Blame the user; it's probably their fault anyway.
			logger.Error("Error parsing form: " + err.Error())
			cx.RespondWithError(http.StatusBadRequest)
		}
		return
	}
	// a submission must carry either a job script ("upload") or a workflow ("awf")
	_, has_upload := files["upload"]
	_, has_awf := files["awf"]
	if !has_upload && !has_awf {
		cx.RespondWithErrorMessage("No job script or awf is submitted", http.StatusBadRequest)
		return
	}
	//send job submission request and get back an assigned job number (jid)
	var jid string
	jid, err = core.QMgr.JobRegister()
	if err != nil {
		logger.Error("Err@job_Create:GetNextJobNum: " + err.Error())
		cx.RespondWithErrorMessage(err.Error(), http.StatusBadRequest)
		return
	}
	var job *core.Job
	job, err = core.CreateJobUpload(params, files, jid)
	if err != nil {
		logger.Error("Err@job_Create:CreateJobUpload: " + err.Error())
		cx.RespondWithErrorMessage(err.Error(), http.StatusBadRequest)
		return
	}
	// best-effort: attach the submitter's data token if one came with the request
	if token, err := request.RetrieveToken(cx.Request); err == nil {
		job.SetDataToken(token)
	}
	// NOTE(review): any error from EnqueueTasksByJobId is discarded here —
	// confirm whether enqueue failures should fail the submission.
	core.QMgr.EnqueueTasksByJobId(job.Id, job.TaskList())
	//log event about job submission (JB)
	logger.Event(event.JOB_SUBMISSION, "jobid="+job.Id+";jid="+job.Jid+";name="+job.Info.Name+";project="+job.Info.Project)
	cx.RespondWithData(job)
	return
}
func (qm *ServerMgr) skipTask(task *Task) (err error) { task.State = TASK_STAT_SKIPPED task.RemainWork = 0 //update job and queue info. Skipped task behaves as finished tasks if err = qm.updateJobTask(task); err != nil { //TASK state -> SKIPPED return } logger.Event(event.TASK_SKIPPED, "taskid="+task.Id) return }
func (job *Job) Delete() (err error) { if err = dbDelete(bson.M{"id": job.Id}, conf.DB_COLL_JOBS); err != nil { return err } if err = job.Rmdir(); err != nil { return err } logger.Event(event.JOB_FULL_DELETE, "jobid="+job.Id) return }
func ReRegisterWithSelf(host string) (client *core.Client, err error) { fmt.Printf("lost contact with server, try to re-register\n") client, err = RegisterWithAuth(host, core.Self) if err != nil { logger.Error("Error: fail to re-register, clientid=" + core.Self.Id) fmt.Printf("failed to re-register\n") } else { logger.Event(event.CLIENT_AUTO_REREGI, "clientid="+core.Self.Id) fmt.Printf("re-register successfully\n") } return }
// moveInputData (map-keyed inputs variant) downloads every input file of a
// workunit into the work directory and returns the total bytes transferred.
func moveInputData(work *core.Workunit) (size int64, err error) {
	for inputname, io := range work.Inputs {
		var dataUrl string
		if work.Rank == 0 {
			dataUrl = io.DataUrl()
		} else {
			// ranked (partitioned) workunit: fetch only this work's index part
			dataUrl = fmt.Sprintf("%s&index=%s&part=%s", io.DataUrl(), work.IndexType(), work.Part())
		}
		inputFilePath := path.Join(work.Path(), inputname)
		logger.Debug(2, "mover: fetching input from url:"+dataUrl)
		logger.Event(event.FILE_IN, "workid="+work.Id+" url="+dataUrl)
		if datamoved, err := shock.FetchFile(inputFilePath, dataUrl, work.Info.DataToken, io.Uncompress); err != nil {
			return size, err
		} else {
			size += datamoved
		}
		logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl)
	}
	return
}
// GET: /work // checkout a workunit with earliest submission time // to-do: to support more options for workunit checkout func (cr *WorkController) ReadMany(cx *goweb.Context) { // Gather query params query := &Query{Li: cx.Request.URL.Query()} if !query.Has("client") { //view workunits var workunits []*core.Workunit if query.Has("state") { workunits = core.QMgr.ShowWorkunits(query.Value("state")) } else { workunits = core.QMgr.ShowWorkunits("") } cx.RespondWithData(workunits) return } if core.Service == "proxy" { //drive proxy workStealer to checkout work from server core.ProxyWorkChan <- true } //checkout a workunit in FCFS order clientid := query.Value("client") workunits, err := core.QMgr.CheckoutWorkunits("FCFS", clientid, 1) if err != nil { if err.Error() != e.QueueEmpty && err.Error() != e.NoEligibleWorkunitFound && err.Error() != e.ClientNotFound { logger.Error("Err@work_ReadMany:core.QMgr.GetWorkByFCFS(): " + err.Error() + ";client=" + clientid) } cx.RespondWithErrorMessage(err.Error(), http.StatusBadRequest) return } //log access info only when the queue is not empty, save some log LogRequest(cx.Request) //log event about workunit checkout (WO) workids := []string{} for _, work := range workunits { workids = append(workids, work.Id) } logger.Event(event.WORK_CHECKOUT, "workids="+strings.Join(workids, ","), "clientid="+clientid) // Base case respond with node in json cx.RespondWithData(workunits[0]) return }
// ReQueueWorkunitByClient requeues the still-queued workunits that were
// checked out by the given (presumed dead) client, but only those belonging
// to jobs that are still active — workunits of suspended jobs are left alone.
func (qm *CQMgr) ReQueueWorkunitByClient(clientid string) (err error) {
	workids := qm.getWorkByClient(clientid)
	for _, workid := range workids {
		if qm.workQueue.Has(workid) {
			// NOTE(review): errors from GetJobIdByWorkId/LoadJob are silently
			// swallowed — a workunit whose job fails to load is simply not
			// requeued; confirm this best-effort behavior is intended.
			jobid, _ := GetJobIdByWorkId(workid)
			if job, err := LoadJob(jobid); err == nil {
				if contains(JOB_STATS_ACTIVE, job.State) { //only requeue workunits belonging to active jobs (rule out suspended jobs)
					qm.workQueue.StatusChange(workid, WORK_STAT_QUEUED)
					logger.Event(event.WORK_REQUEUE, "workid="+workid)
				}
			}
		}
	}
	return
}
func PushOutputData(work *Workunit) (err error) { for name, io := range work.Outputs { file_path := fmt.Sprintf("%s/%s", work.Path(), name) //use full path here, cwd could be changed by Worker (likely in worker-overlapping mode) if fi, err := os.Stat(file_path); err != nil { if io.Optional { continue } else { return errors.New(fmt.Sprintf("output %s not generated for workunit %s", name, work.Id)) } } else { if io.Nonzero && fi.Size() == 0 { return errors.New(fmt.Sprintf("workunit %s generated zero-sized output %s while non-zero-sized file required", work.Id, name)) } } logger.Debug(2, "deliverer: push output to shock, filename="+name) logger.Event(event.FILE_OUT, "workid="+work.Id, "filename="+name, fmt.Sprintf("url=%s/node/%s", io.Host, io.Node)) if err := putFileToShock(file_path, io.Host, io.Node, work.Rank, work.Info.DataToken); err != nil { time.Sleep(3 * time.Second) //wait for 3 seconds and try again if err := putFileToShock(file_path, io.Host, io.Node, work.Rank, work.Info.DataToken); err != nil { fmt.Errorf("push file error\n") logger.Error("op=pushfile,err=" + err.Error()) return err } } logger.Event(event.FILE_DONE, "workid="+work.Id, "filename="+name, fmt.Sprintf("url=%s/node/%s", io.Host, io.Node)) } return }
// proxy_relay_workunit reports a sub-client's finished workunit to the
// server (retrying once after a short pause) and updates the proxy's local
// completion/failure bookkeeping.
func proxy_relay_workunit(work *Workunit, perfstat *WorkPerf) (err error) {
	//notify server the final process results
	if err := NotifyWorkunitProcessed(work, perfstat); err != nil {
		time.Sleep(3 * time.Second) //wait 3 seconds and try another time
		if err := NotifyWorkunitProcessed(work, perfstat); err != nil {
			fmt.Printf("!!!NotifyWorkunitDone returned error: %s\n", err.Error())
			logger.Error("err@NotifyWorkunitProcessed: workid=" + work.Id + ", err=" + err.Error())
			//mark this work in Current_work map as false, something needs to be done in the future
			//to clean this kind of work that has been proccessed but its result can't be sent to server!
			Self.Current_work[work.Id] = false
		}
	}
	//now final status report sent to server, update some local info
	if work.State == WORK_STAT_DONE {
		logger.Event(event.WORK_DONE, "workid="+work.Id)
		Self.Total_completed += 1
	} else {
		logger.Event(event.WORK_RETURN, "workid="+work.Id)
		Self.Total_failed += 1
	}
	delete(Self.Current_work, work.Id)
	return
}
func (jr *JobReaper) Handle() { waitDuration := time.Duration(conf.EXPIRE_WAIT) * time.Minute for { // sleep time.Sleep(waitDuration) // query to get expired jobs jobs := Jobs{} query := jr.getQuery() jobs.GetAllUnsorted(query) // delete expired jobs for _, j := range jobs { logger.Event(event.JOB_EXPIRED, "jobid="+j.Id) if err := j.Delete(); err != nil { logger.Error("Err@job_delete: " + err.Error()) } } } }
// taskEnQueue prepares a parsed task for execution: locates its inputs,
// creates shock indices and output nodes, splits it into workunits
// (parseTask), marks it queued, and updates job state/perf records.
func (qm *ServerMgr) taskEnQueue(task *Task) (err error) {
	logger.Debug(2, "trying to enqueue task "+task.Id)
	if err := qm.locateInputs(task); err != nil {
		logger.Error("qmgr.taskEnQueue locateInputs:" + err.Error())
		return err
	}
	//create shock index on input nodes (if set in workflow document)
	if err := task.CreateIndex(); err != nil {
		logger.Error("qmgr.taskEnQueue CreateIndex:" + err.Error())
		return err
	}
	//init partition
	if err := task.InitPartIndex(); err != nil {
		logger.Error("qmgr.taskEnQueue InitPartitionIndex:" + err.Error())
		return err
	}
	if err := qm.createOutputNode(task); err != nil {
		logger.Error("qmgr.taskEnQueue createOutputNode:" + err.Error())
		return err
	}
	if err := qm.parseTask(task); err != nil {
		logger.Error("qmgr.taskEnQueue parseTask:" + err.Error())
		return err
	}
	task.State = TASK_STAT_QUEUED
	task.CreatedDate = time.Now()
	task.StartedDate = time.Now() //to-do: will be changed to the time when the first workunit is checked out
	// NOTE(review): updateJobTask's error is discarded here — confirm intent
	qm.updateJobTask(task) //task status PENDING->QUEUED
	//log event about task enqueue (TQ)
	logger.Event(event.TASK_ENQUEUE, fmt.Sprintf("taskid=%s;totalwork=%d", task.Id, task.TotalWork))
	qm.CreateTaskPerf(task.Id)
	// the first task of a job moves the whole job from INIT/SUSPEND to QUEUED
	if IsFirstTask(task.Id) {
		jobid, _ := GetJobIdByTaskId(task.Id)
		UpdateJobState(jobid, JOB_STAT_QUEUED, []string{JOB_STAT_INIT, JOB_STAT_SUSPEND})
	}
	return
}