func dockerImportImage(client *docker.Client, Dockerimage string, datatoken string) (err error) {
	_, download_url, err := findDockerImageInShock(Dockerimage, datatoken) // TODO get node
	if err != nil {
		return err
	}
	logger.Debug(1, fmt.Sprintf("docker image url=%s", download_url))

	// TODO import base image if needed

	// *** import image
	Dockerimage_array := strings.Split(Dockerimage, ":")
	if len(Dockerimage_array) != 2 {
		// guard against an index-out-of-range panic for untagged image names
		return errors.New(fmt.Sprintf("could not split repository and tag from image name %s", Dockerimage))
	}
	Dockerimage_repo, Dockerimage_tag := Dockerimage_array[0], Dockerimage_array[1]

	logger.Debug(1, "importing image...")
	var buf bytes.Buffer
	opts := docker.ImportImageOptions{
		Source:       download_url,
		Repository:   Dockerimage_repo,
		Tag:          Dockerimage_tag,
		OutputStream: &buf,
	}

	err = client.ImportImage(opts)
	if err != nil {
		return errors.New(fmt.Sprintf("Error importing docker image, err=%s", err.Error()))
	}

	return
}
func (qm *ServerMgr) Handle() {
	for {
		select {
		case <-qm.jsReq:
			jid := qm.getNextJid()
			qm.jsAck <- jid
			logger.Debug(2, fmt.Sprintf("qmgr:receive a job submission request, assigned jid=%s\n", jid))
		case task := <-qm.taskIn:
			logger.Debug(2, fmt.Sprintf("qmgr:task received from chan taskIn, id=%s\n", task.Id))
			qm.addTask(task)
		case coReq := <-qm.coReq:
			logger.Debug(2, fmt.Sprintf("qmgr: workunit checkout request received, Req=%v\n", coReq))
			works, err := qm.popWorks(coReq)
			ack := CoAck{workunits: works, err: err}
			qm.coAck <- ack
		case notice := <-qm.feedback:
			logger.Debug(2, fmt.Sprintf("qmgr: workunit feedback received, workid=%s, status=%s, clientid=%s\n", notice.WorkId, notice.Status, notice.ClientId))
			if err := qm.handleWorkStatusChange(notice); err != nil {
				logger.Error("handleWorkStatusChange(): " + err.Error())
			}
		case <-qm.reminder:
			logger.Debug(3, "time to update workunit queue....\n")
			qm.updateQueue()
			if conf.DEV_MODE {
				fmt.Println(qm.ShowStatus())
			}
		}
	}
}
func (qm *ServerMgr) locateInputs(task *Task) (err error) {
	logger.Debug(2, "trying to locate Inputs of task "+task.Id)
	jobid := strings.Split(task.Id, "_")[0]
	for name, io := range task.Inputs {
		if io.Url == "" {
			preId := fmt.Sprintf("%s_%s", jobid, io.Origin)
			if preTask, ok := qm.taskMap[preId]; ok {
				if preTask.State == TASK_STAT_SKIPPED ||
					preTask.State == TASK_STAT_FAIL_SKIP {
					// For now we know that skipped tasks have
					// just one input and one output. So we know
					// that we just need to change one file (this
					// may change in the future)
					//locateSkippedInput(qm, preTask, io)
				} else {
					outputs := preTask.Outputs
					if outio, ok := outputs[name]; ok {
						io.Node = outio.Node
					}
				}
			}
		}
		io.DataUrl()
		if io.Node == "-" {
			return errors.New(fmt.Sprintf("error in locate input for task %s, %s", task.Id, name))
		}
		//need time out!
		if io.GetFileSize() < 0 {
			return errors.New(fmt.Sprintf("task %s: input file %s not available", task.Id, name))
		}
		logger.Debug(2, fmt.Sprintf("inputs located %s, %s\n", name, io.Node))
	}
	return
}
// was getDockerImageUrl(Dockerimage string) (download_url string, err error)
func findDockerImageInShock(Dockerimage string) (node *shock.ShockNode, download_url string, err error) {
	shock_docker_repo := shock.ShockClient{conf.SHOCK_DOCKER_IMAGE_REPOSITORY, ""}

	logger.Debug(1, fmt.Sprint("try to import docker image, Dockerimage=", Dockerimage))
	//query url = type=dockerimage&name=wgerlach/bowtie2:2.2.0"
	query_response_p, err := shock_docker_repo.Query(url.Values{"type": {"dockerimage"}, "name": {Dockerimage}})
	if err != nil {
		return nil, "", errors.New(fmt.Sprintf("shock node not found for image=%s, err=%s", Dockerimage, err.Error()))
	}
	logger.Debug(1, fmt.Sprintf("query result: %v", query_response_p))

	datalen := len((*query_response_p).Data)
	if datalen == 0 {
		return nil, "", errors.New(fmt.Sprintf("image %s not found in shock's docker repo", Dockerimage))
	} else if datalen > 1 {
		return nil, "", errors.New(fmt.Sprintf("more than one image %s found in shock's docker repo", Dockerimage))
	}

	node = &(*query_response_p).Data[0]
	logger.Debug(1, fmt.Sprintf("found SHOCK node for docker image: %s", node.Id))

	download_url, err = shock_docker_repo.Get_node_download_url(*node)
	if err != nil {
		return nil, "", errors.New(fmt.Sprintf("Could not create download url, err=%s", err.Error()))
	}

	return
}
// show functions used in debug
func (qm *CQMgr) ShowWorkQueue() {
	logger.Debug(1, fmt.Sprintf("current queuing workunits (%d)", qm.workQueue.Len()))
	for _, id := range qm.workQueue.List() {
		logger.Debug(1, fmt.Sprintf("workid=%s", id))
	}
	return
}
func (qm *ServerMgr) ClientHandle() {
	for {
		select {
		case coReq := <-qm.coReq:
			logger.Debug(2, fmt.Sprintf("qmgr: workunit checkout request received, Req=%v", coReq))
			var ack CoAck
			if qm.suspendQueue {
				// queue is suspended, return suspend error
				ack = CoAck{workunits: nil, err: errors.New(e.QueueSuspend)}
			} else {
				qm.updateQueue()
				works, err := qm.popWorks(coReq)
				if err == nil {
					qm.UpdateJobTaskToInProgress(works)
				}
				ack = CoAck{workunits: works, err: err}
			}
			qm.coAck <- ack
		case notice := <-qm.feedback:
			logger.Debug(2, fmt.Sprintf("qmgr: workunit feedback received, workid=%s, status=%s, clientid=%s", notice.WorkId, notice.Status, notice.ClientId))
			if err := qm.handleWorkStatusChange(notice); err != nil {
				logger.Error("handleWorkStatusChange(): " + err.Error())
			}
			qm.updateQueue()
		}
	}
}
func CreateContainer(create_args []string) (container_id string, err error) {
	//docker create [OPTIONS] IMAGE [COMMAND] [ARG...]
	// first line of stdout contains the container ID
	// must have "-t" to attach, this is not documented in docker.

	// prepend "create"
	create_args = append([]string{"create"}, create_args...)

	logger.Debug(1, fmt.Sprintf("(CreateContainer) cmd: %s %s", conf.DOCKER_BINARY, strings.Join(create_args, " ")))

	stdo, _, err := RunCommand(conf.DOCKER_BINARY, create_args...)
	if err != nil {
		logger.Debug(1, fmt.Sprintf("(CreateContainer) cmd.Wait returned error: %s", err.Error()))
		return "", err
	}

	// extract only the first line, without the trailing newline
	endofline := bytes.IndexByte(stdo, '\n')
	stdout_line := ""
	if endofline >= 0 {
		stdout_line = string(stdo[0:endofline]) // was stdo[0:endofline-1], which dropped the last character of the ID
	} else {
		err = errors.New("docker create returned empty string")
	}

	return stdout_line, err
}
//fetch input data
func MoveInputData(work *core.Workunit) (size int64, err error) {
	for _, io := range work.Inputs {
		inputname := io.FileName
		// skip if NoFile == true
		if !io.NoFile { // is file !
			dataUrl, uerr := io.DataUrl()
			if uerr != nil {
				return 0, uerr
			}
			inputFilePath := fmt.Sprintf("%s/%s", work.Path(), inputname)
			if work.Rank == 0 {
				if conf.CACHE_ENABLED && io.Node != "" {
					if file_path, err := StatCacheFilePath(io.Node); err == nil {
						//make a link in work dir from cached file
						linkname := fmt.Sprintf("%s/%s", work.Path(), inputname)
						fmt.Printf("input found in cache, making link: " + file_path + " -> " + linkname + "\n")
						err = os.Symlink(file_path, linkname)
						if err == nil {
							logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl)
						}
						return 0, err
					}
				}
			} else {
				dataUrl = fmt.Sprintf("%s&index=%s&part=%s", dataUrl, work.IndexType(), work.Part())
			}
			logger.Debug(2, "mover: fetching input file from url:"+dataUrl)
			logger.Event(event.FILE_IN, "workid="+work.Id+";url="+dataUrl)

			// download file
			if datamoved, _, err := shock.FetchFile(inputFilePath, dataUrl, work.Info.DataToken, io.Uncompress, false); err != nil {
				return size, errors.New("shock.FetchFile returned: " + err.Error())
			} else {
				size += datamoved
			}
			logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl)
		}

		// download node attributes if requested
		if io.AttrFile != "" {
			// get node
			node, err := shock.ShockGet(io.Host, io.Node, work.Info.DataToken)
			if err != nil {
				//return size, err
				return size, errors.New("shock.ShockGet (node attributes) returned: " + err.Error())
			}
			logger.Debug(2, "mover: fetching input attributes from node:"+node.Id)
			logger.Event(event.ATTR_IN, "workid="+work.Id+";node="+node.Id)
			// write node attributes to file
			attrFilePath := fmt.Sprintf("%s/%s", work.Path(), io.AttrFile)
			attr_json, _ := json.Marshal(node.Attributes)
			if err := ioutil.WriteFile(attrFilePath, attr_json, 0644); err != nil {
				return size, err
			}
			logger.Event(event.ATTR_READY, "workid="+work.Id+";path="+attrFilePath)
		}
	}
	return
}
func (qm *ServerMgr) createOutputNode(task *Task) (err error) {
	outputs := task.Outputs
	for _, io := range outputs {
		name := io.FileName
		if io.Type == "update" {
			// this is an update output: it will update an existing shock node rather than create a new one
			if (io.Node == "") || (io.Node == "-") {
				if io.Origin == "" {
					return errors.New(fmt.Sprintf("update output %s in task %s is missing required origin", name, task.Id))
				}
				nodeid, err := qm.locateUpdate(task.Id, name, io.Origin)
				if err != nil {
					return err
				}
				io.Node = nodeid
			}
			logger.Debug(2, fmt.Sprintf("output %s in task %s is an update of node %s\n", name, task.Id, io.Node))
		} else {
			// POST empty shock node for this output
			logger.Debug(2, fmt.Sprintf("posting output Shock node for file %s in task %s\n", name, task.Id))
			nodeid, err := PostNodeWithToken(io, task.TotalWork, task.Info.DataToken)
			if err != nil {
				return err
			}
			io.Node = nodeid
			logger.Debug(2, fmt.Sprintf("task %s: output Shock node created, node=%s\n", task.Id, nodeid))
		}
	}
	return
}
func (appr AppRegistry) createIOnodes(job *Job) (err error) {
	// go over tasks
	taskid2task := make(map[string]*Task)
	taskid_processed := make(map[string]bool)

	// create taskid2task
	for _, task := range job.Tasks {
		taskid_split := strings.Split(task.Id, "_")
		taskid := taskid_split[1]
		_, ok := taskid2task[taskid]
		if ok {
			err = errors.New("error: task id not unique, id=" + taskid)
			return
		}
		taskid2task[taskid] = task
		logger.Debug(1, fmt.Sprintf("--------adding to taskid2task map: %s", taskid))
	}

	for _, task := range job.Tasks {
		err = appr.createIOnodes_forTask(job, task, taskid2task, taskid_processed)
		if err != nil {
			return errors.New(fmt.Sprintf("error in task %s: %s", task.App.Name, err.Error()))
		}
	}

	logger.Debug(1, "+++ +++ createIOnodes finished")
	return
}
func (acm AppCommandMode) Get_default_app_variables() (app_variables AppVariables, err error) {
	app_variables = make(AppVariables) // this function is called on the server

	// *** app input arguments (app definition)
	logger.Debug(1, fmt.Sprintf("Get_default_app_variables: size of acm.Input=%d", len(acm.Input)))
	time.Sleep(15 * time.Millisecond)
	for _, input_arg := range acm.Input {
		//logger.Debug(1, fmt.Sprintf("app input arg: %s", strings.Join(input_arg, ", ")))
		// save the defaults if available
		logger.Debug(1, fmt.Sprintf("from app-definition: variable \"%s\"", input_arg.Name))
		app_type, err := String2apptype(input_arg.Type)
		if err != nil {
			err = errors.New(fmt.Sprintf("error converting type, error=%s", err.Error()))
			return app_variables, err
		}
		logger.Debug(1, fmt.Sprintf("from app-definition: variable \"%s\" has type %s", input_arg.Name, apptype2string(app_type)))
		logger.Debug(1, fmt.Sprintf("from app-definition: write variable:\"%s\" - default value: \"%s\"", input_arg.Name, input_arg.DefaultValue))
		app_variables[input_arg.Name] = AppVariable{
			Key:      input_arg.Name,
			Var_type: app_type,
			Value:    input_arg.DefaultValue,
			Option:   input_arg.Option,
			Optional: input_arg.Optional,
		}
	}
	return
}
//fetch prerequisite data (e.g. reference dbs)
func movePreData(workunit *core.Workunit) (size int64, err error) {
	for name, io := range workunit.Predata {
		predata_directory := path.Join(conf.DATA_PATH, "predata")
		err = os.MkdirAll(predata_directory, 0755) // was 755 (decimal), which yields unusable permission bits
		if err != nil {
			return 0, errors.New("error creating predata_directory: " + err.Error())
		}

		file_path := path.Join(predata_directory, name)
		if !isFileExisting(file_path) {
			size, err = shock.FetchFile(file_path, io.Url, workunit.Info.DataToken, io.Uncompress)
			if err != nil {
				return 0, errors.New("error in fetchFile:" + err.Error())
			}
		}

		use_symlink := false
		linkname := path.Join(workunit.Path(), name)
		if workunit.Cmd.Dockerimage != "" || strings.HasPrefix(workunit.Cmd.Name, "app:") { // TODO need a safer way to detect use of docker
			use_symlink = false // TODO mechanism
			if use_symlink {
				file_path = path.Join(conf.DOCKER_WORKUNIT_PREDATA_DIR, name)
				// some tasks want to write in predata dir, thus need symlink
				logger.Debug(1, "dangling symlink:"+linkname+" -> "+file_path)
				// creation of dangling symlinks is not possible with os.Symlink, thus use system ln
				link_out, err := exec.Command("ln", "-s", file_path, linkname).CombinedOutput()
				logger.Debug(1, fmt.Sprintf("ln returned %s", link_out))
				if err != nil {
					return 0, errors.New("error creating predata file symlink (dangling version): " + err.Error())
				}
			} else {
				// some programs do not accept symlinks (e.g. emirge), need to copy the file into the work directory
				// linkname refers to target file now.
				logger.Debug(1, "copy predata:"+file_path+" -> "+linkname)
				_, err := shock.CopyFile(file_path, linkname)
				if err != nil {
					return 0, fmt.Errorf("error copying file from %s to %s: %s", file_path, linkname, err.Error())
				}
			}
		} else {
			//linkname := path.Join(workunit.Path(), name)
			logger.Debug(1, "symlink:"+linkname+" -> "+file_path)
			err = os.Symlink(file_path, linkname)
			if err != nil {
				return 0, errors.New("error creating predata file symlink: " + err.Error())
			}
		}
	}
	return
}
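// isFileExisting is referenced above but not shown in this section; a minimal
// sketch of what it presumably does (assumption: any stat error counts as
// "not existing", which also swallows permission errors):
func isFileExisting(path string) bool {
	_, err := os.Stat(path)
	return err == nil
}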
func InspectImage(client *docker.Client, dockerimage_id string) (image *docker.Image, err error) {
	logger.Debug(1, fmt.Sprintf("(InspectImage) %s:", dockerimage_id))

	if client == nil {
		// if image does not exist, "docker inspect" returns status 1 and text on stderr
		cmd := exec.Command(conf.DOCKER_BINARY, "inspect", dockerimage_id)
		stdout, err := cmd.StdoutPipe()
		if err != nil {
			return nil, err
		}
		stderr, err := cmd.StderrPipe()
		if err != nil {
			return nil, err
		}
		if err = cmd.Start(); err != nil {
			return nil, err
		}

		var image_array []docker.Image
		err_json := json.NewDecoder(stdout).Decode(&image_array)
		if err_json != nil {
			logger.Debug(1, fmt.Sprintf("(InspectImage) err_json: %s", err_json.Error()))
			image = nil
		}

		err = cmd.Wait() // wait just in case
		if err != nil {
			stderr_bytearray, err_read := ioutil.ReadAll(stderr)
			if err_read != nil {
				return nil, err_read
			}
			logger.Debug(1, fmt.Sprintf("(InspectImage) STDERR: %s", stderr_bytearray))
			return nil, err
		} else {
			err = err_json // in case that failed...
		}

		if len(image_array) == 1 {
			image = &image_array[0]
		} else {
			err = errors.New("error: inspect returned zero (or more than one) images")
		}
		return image, err
	} else {
		image, err = client.InspectImage(dockerimage_id)
	}
	return image, err
}
func (sc *ShockClient) Get_request(resource string, query url.Values, response interface{}) (err error) {
	logger.Debug(1, fmt.Sprint("string_url: ", sc.Host))

	myurl, err := url.ParseRequestURI(sc.Host)
	if err != nil {
		return err
	}

	myurl.Path = resource
	myurl.RawQuery = query.Encode()

	shockurl := myurl.String()
	logger.Debug(1, fmt.Sprint("shock request url: ", shockurl))

	if len(shockurl) < 5 {
		return errors.New("could not parse SHOCK_DOCKER_IMAGE_REPOSITORY")
	}

	var res *http.Response
	c := make(chan int, 1)
	go func() {
		res, err = httpclient.Get(shockurl, httpclient.Header{}, nil, nil)
		c <- 1 //we are ending
	}()
	select {
	case <-c:
		//go ahead
	case <-time.After(conf.SHOCK_TIMEOUT):
		return errors.New("timeout when getting node from shock, url=" + shockurl)
	}
	if err != nil {
		return
	}
	defer res.Body.Close()

	jsonstream, err := ioutil.ReadAll(res.Body)
	//logger.Debug(1, string(jsonstream))
	if err != nil {
		return err
	}

	//response := new(result)
	if err := json.Unmarshal(jsonstream, response); err != nil {
		return err
	}

	//if len(response.Errs) > 0 {
	//	return errors.New(strings.Join(response.Errs, ","))
	//}
	//node = &response.Data
	//if node == nil {
	//	err = errors.New("empty node got from Shock")
	//}
	return
}
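// A hedged usage sketch for Get_request: the "/node" resource and the query
// values are illustrative only, and a real caller would typically pass a
// typed response struct rather than a generic map:
func exampleGetRequest(sc *ShockClient) (err error) {
	var response map[string]interface{}
	query := url.Values{"type": {"dockerimage"}}
	return sc.Get_request("/node", query, &response)
}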
func getMetaDataField(field string) (result string, err error) {
	var url = fmt.Sprintf("%s/%s", conf.OPENSTACK_METADATA_URL, field) // TODO this is not OPENSTACK, this is EC2
	logger.Debug(1, fmt.Sprintf("url=%s", url))

	for i := 0; i < 3; i++ {
		var res *http.Response
		c := make(chan error, 1) // buffered, so the goroutine cannot block forever after a timeout
		go func() {
			res, err = http.Get(url)
			if err != nil {
				c <- err //we are ending with error
				return
			}
			defer res.Body.Close()
			bodybytes, err := ioutil.ReadAll(res.Body)
			if err != nil {
				c <- err //we are ending with error
				return
			}
			result = string(bodybytes[:])
			c <- nil //we are ending without error
		}()
		select {
		case err = <-c:
			//go ahead
		case <-time.After(conf.INSTANCE_METADATA_TIMEOUT): //GET timeout
			err = errors.New("timeout: " + url)
		}
		if err != nil {
			logger.Error(fmt.Sprintf("warning: (iteration=%d) %s \"%s\"", i, url, err.Error()))
			continue
		} else if result == "" {
			logger.Error(fmt.Sprintf("warning: (iteration=%d) %s empty result", i, url))
			continue
		}
		break
	}
	if err != nil {
		return "", err
	}
	if result == "" {
		return "", errors.New(fmt.Sprintf("metadata result empty, %s", url))
	}
	logger.Debug(1, fmt.Sprintf("Instance Metadata %s => \"%s\"", url, result))
	return
}
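// A hedged usage sketch: "instance-id" is a standard EC2 metadata field,
// consistent with the TODO above noting that the URL is EC2-style:
func exampleInstanceId() {
	instance_id, err := getMetaDataField("instance-id")
	if err != nil {
		logger.Error("could not read instance-id: " + err.Error())
		return
	}
	logger.Debug(1, "instance-id="+instance_id)
}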
//select workunits, return a slice of ids based on given queuing policy and requested count
func (wq *WQueue) selectWorkunits(workid []string, policy string, count int) (selected []*Workunit, err error) {
	logger.Debug(3, "starting selectWorkunits")
	worklist := wq.GetSet(workid)
	if policy == "FCFS" {
		sort.Sort(byFCFS{worklist})
	}
	// do not read past the end of the list when fewer workunits are available than requested
	if count > len(worklist) {
		count = len(worklist)
	}
	for i := 0; i < count; i++ {
		selected = append(selected, worklist[i])
	}
	logger.Debug(3, "done with selectWorkunits")
	return
}
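// byFCFS is referenced by selectWorkunits but not defined in this section.
// A minimal sketch of a sort.Interface ordering workunits oldest-first; the
// field name WorkList and the Info.SubmitTime timestamp are assumptions
// based on how the value is constructed above:
type byFCFS struct {
	WorkList []*Workunit
}

func (s byFCFS) Len() int      { return len(s.WorkList) }
func (s byFCFS) Swap(i, j int) { s.WorkList[i], s.WorkList[j] = s.WorkList[j], s.WorkList[i] }
func (s byFCFS) Less(i, j int) bool {
	return s.WorkList[i].Info.SubmitTime.Before(s.WorkList[j].Info.SubmitTime)
}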
func (apr AppRegistry) GetAppPackage(app_package string) (ap *AppPackage, err error) {
	ap, ok := apr[app_package]
	if ok {
		return ap, nil
	}

	package_url := conf.APP_REGISTRY_URL + "/" + app_package + ".json"
	var new_app_package AppPackage

	for i := 0; i < 3; i++ {
		if i > 0 {
			time.Sleep(1000 * time.Millisecond)
		}
		logger.Debug(1, fmt.Sprintf("downloading app package \"%s\"", package_url))
		res, err := httpclient.GetTimeout(package_url, nil, nil, nil, 5000*time.Millisecond)
		if err != nil {
			logger.Error("warning: " + conf.APP_REGISTRY_URL + " " + err.Error())
			continue
		}
		app_package_json, err := ioutil.ReadAll(res.Body)
		res.Body.Close() // close explicitly; a defer inside the loop would pile up
		if err != nil {
			logger.Error(fmt.Sprintf("warning, could not read app registry json: %s", err.Error()))
			continue
		}
		// transform json into go struct interface
		//var f map[string]interface{}
		err = json.Unmarshal(app_package_json, &new_app_package)
		if err != nil {
			logger.Error("error unmarshaling app package " + app_package + ", error=" + err.Error())
			continue
		}
		apr[app_package] = &new_app_package
		ap = &new_app_package
		logger.Debug(1, "app package unmarshalled")
		return ap, nil
	}
	ap = nil
	err = errors.New("could not get app package from " + package_url)
	return
}
func (qm *ServerMgr) createOutputNode(task *Task) (err error) {
	outputs := task.Outputs
	for name, io := range outputs {
		logger.Debug(2, fmt.Sprintf("posting output Shock node for file %s in task %s\n", name, task.Id))
		nodeid, err := PostNodeWithToken(io, task.TotalWork, task.Info.DataToken)
		if err != nil {
			return err
		}
		io.Node = nodeid
		logger.Debug(2, fmt.Sprintf("task %s: output Shock node created, node=%s\n", task.Id, nodeid))
	}
	return
}
func (qm *CQMgr) filterWorkByClient(clientid string) (ids []string) {
	client := qm.clientMap[clientid]
	for id := range qm.workQueue.wait {
		if _, ok := qm.workQueue.workMap[id]; !ok {
			logger.Error(fmt.Sprintf("error: workunit %s is in wait queue but not in workMap", id))
			continue
		}
		work := qm.workQueue.workMap[id]
		// Edge case: the pointer to the workunit is in the queue but the workunit has been deleted.
		// If work.Info is nil, this will cause errors in execution.
		// Such entries will be deleted by servermgr.updateQueue()
		if work == nil || work.Info == nil {
			continue
		}
		if client == nil {
			fmt.Fprintf(os.Stderr, "error: client %s is nil", clientid)
			logger.Error(fmt.Sprintf("error: client %s is nil", clientid))
			continue
		}
		if client.Skip_work == nil {
			fmt.Fprintf(os.Stderr, "error: Skip_work for client %s is nil", clientid)
			logger.Error(fmt.Sprintf("error: Skip_work for client %s is nil", clientid))
			continue
		}
		//skip works that are in the client's skip-list
		if contains(client.Skip_work, work.Id) {
			logger.Debug(2, fmt.Sprintf("1) work %s is in the skip-list of the client", id))
			continue
		}
		//skip works that have dedicated client groups which this client doesn't belong to
		if len(work.Info.ClientGroups) > 0 {
			eligible_groups := strings.Split(work.Info.ClientGroups, ",")
			if !contains(eligible_groups, client.Group) {
				logger.Debug(2, fmt.Sprintf("2) client group not eligible for work %s", id))
				continue
			}
		}
		//append works whose apps are supported by the client
		if contains(client.Apps, work.Cmd.Name) || contains(client.Apps, conf.ALL_APP) {
			ids = append(ids, id)
		} else {
			logger.Debug(2, fmt.Sprintf("3) app %s not supported by client, skipping work %s", work.Cmd.Name, id))
		}
	}
	return ids
}
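// contains is used by filterWorkByClient above but not shown in this section;
// presumably a simple linear search over a string slice:
func contains(list []string, elem string) bool {
	for _, item := range list {
		if item == elem {
			return true
		}
	}
	return false
}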
func (qm *ServerMgr) locateInputs(task *Task) (err error) {
	logger.Debug(2, "trying to locate Inputs of task "+task.Id)
	jobid, _ := GetJobIdByTaskId(task.Id)
	for _, io := range task.Inputs {
		name := io.FileName
		if io.Url == "" {
			preId := fmt.Sprintf("%s_%s", jobid, io.Origin)
			if preTask, ok := qm.getTask(preId); ok {
				if preTask.State == TASK_STAT_SKIPPED ||
					preTask.State == TASK_STAT_FAIL_SKIP {
					// For now we know that skipped tasks have
					// just one input and one output. So we know
					// that we just need to change one file (this
					// may change in the future)
					//locateSkippedInput(qm, preTask, io)
				} else {
					outputs := preTask.Outputs
					for _, outio := range outputs {
						if outio.FileName == name {
							io.Node = outio.Node
						}
					}
				}
			}
		}
		logger.Debug(2, fmt.Sprintf("processing input %s, %s\n", name, io.Node))
		if io.Node == "-" {
			return errors.New(fmt.Sprintf("error in locate input for task %s, %s", task.Id, name))
		}
		//need time out!
		if io.Node != "" && io.GetFileSize() < 0 {
			return errors.New(fmt.Sprintf("task %s: input file %s not available", task.Id, name))
		}
		logger.Debug(2, fmt.Sprintf("inputs located %s, %s\n", name, io.Node))
	}
	// locate predata
	for _, io := range task.Predata {
		name := io.FileName
		logger.Debug(2, fmt.Sprintf("processing predata %s, %s\n", name, io.Node))
		// only verify predata that is a shock node
		if (io.Node != "") && (io.Node != "-") {
			// the size check was previously part of the outer condition, which
			// made the "predata located" debug line below unreachable
			if io.GetFileSize() < 0 { // bad shock node
				return errors.New(fmt.Sprintf("task %s: predata file %s not available", task.Id, name))
			}
			logger.Debug(2, fmt.Sprintf("predata located %s, %s\n", name, io.Node))
		}
	}
	return
}
func WaitContainer(container_id string) (status int, err error) {
	logger.Debug(1, fmt.Sprintf("(WaitContainer) container id: %s", container_id))

	stdo, stde, err := RunCommand(conf.DOCKER_BINARY, []string{"wait", container_id}...)
	_ = stde
	if err != nil {
		logger.Debug(1, fmt.Sprintf("(WaitContainer) cmd.Wait returned error: %s", err.Error()))
		logger.Debug(1, fmt.Sprintf("(WaitContainer) cmd.Wait stdout: %s", stdo))
		logger.Debug(1, fmt.Sprintf("(WaitContainer) cmd.Wait stderr: %s", stde))
		return 0, err
	}

	// extract only first line
	endofline := bytes.IndexByte(stdo, '\n')
	stdout_line := ""
	if endofline > 0 {
		stdout_line = string(stdo[0:endofline])
	} else {
		err = errors.New("docker wait returned empty string")
		return 0, err
	}

	negative_status := false
	if strings.HasPrefix(stdout_line, "-") {
		stdout_line = strings.TrimPrefix(stdout_line, "-")
		negative_status = true
	}

	status, err = strconv.Atoi(stdout_line)
	if err != nil {
		logger.Debug(1, fmt.Sprintf("(WaitContainer) could not interpret status code: \"%s\"", stdout_line))
		return 0, err
	}
	if negative_status {
		status *= -1
	}
	return status, nil
}
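// A hedged usage sketch tying CreateContainer and WaitContainer together.
// A created container must still be started before "docker wait" returns;
// the "docker start" step and the image/command below are illustrative and
// not part of this section:
func runContainerExample() (status int, err error) {
	container_id, err := CreateContainer([]string{"-t", "ubuntu:14.04", "echo", "hello"})
	if err != nil {
		return 0, err
	}
	if _, _, err = RunCommand(conf.DOCKER_BINARY, "start", container_id); err != nil {
		return 0, err
	}
	// blocks until the container exits, then parses the printed exit code
	return WaitContainer(container_id)
}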
//update job info when a task in that job changed to a new state
func (qm *ServerMgr) updateJobTask(task *Task) (err error) {
	parts := strings.Split(task.Id, "_")
	jobid := parts[0]
	job, err := LoadJob(jobid)
	if err != nil {
		return
	}
	remainTasks, err := job.UpdateTask(task)
	if err != nil {
		return err
	}
	logger.Debug(2, fmt.Sprintf("remaining tasks for task %s: %d", task.Id, remainTasks))

	if remainTasks == 0 { //job done
		qm.FinalizeJobPerf(jobid)
		qm.LogJobPerf(jobid)
		qm.removeActJob(jobid)
		//delete tasks in task map
		//delete from shock output flagged for deletion
		for _, task := range job.TaskList() {
			task.DeleteOutput()
			task.DeleteInput()
			qm.deleteTask(task.Id)
		}
		//log event about job done (JD)
		logger.Event(event.JOB_DONE, "jobid="+job.Id+";jid="+job.Jid+";project="+job.Info.Project+";name="+job.Info.Name)
	}
	return
}
//fetch input data
func moveInputData(work *core.Workunit) (size int64, err error) {
	for _, io := range work.Inputs {
		inputname := io.FileName
		dataUrl, uerr := io.DataUrl()
		if uerr != nil {
			return 0, uerr
		}
		if work.Rank > 0 {
			dataUrl = fmt.Sprintf("%s&index=%s&part=%s", dataUrl, work.IndexType(), work.Part())
		}

		inputFilePath := path.Join(work.Path(), inputname)

		logger.Debug(2, "mover: fetching input from url:"+dataUrl)
		logger.Event(event.FILE_IN, "workid="+work.Id+";url="+dataUrl)

		// this gets the file from any downloadable url, not just shock
		if datamoved, _, err := shock.FetchFile(inputFilePath, dataUrl, work.Info.DataToken, io.Uncompress, false); err != nil {
			return size, err
		} else {
			size += datamoved
		}
		logger.Event(event.FILE_READY, "workid="+work.Id+";url="+dataUrl)
	}
	return
}
func (qm *ServerMgr) TaskHandle() {
	for {
		task := <-qm.taskIn
		logger.Debug(2, fmt.Sprintf("qmgr:task received from chan taskIn, id=%s\n", task.Id))
		qm.addTask(task)
	}
}
func (qm *CQMgr) ClientChecker() {
	for {
		time.Sleep(30 * time.Second)
		logger.Debug(3, "time to update client list....")
		for _, client := range qm.GetAllClients() {
			if client.Tag {
				client.Tag = false
				total_minutes := int(time.Since(client.RegTime).Minutes())
				hours := total_minutes / 60
				minutes := total_minutes % 60
				client.Serve_time = fmt.Sprintf("%dh%dm", hours, minutes)
				if client.Current_work_length() > 0 {
					client.Idle_time = 0
				} else {
					client.Idle_time += 30
				}
				qm.PutClient(client)
			} else {
				if ok := qm.HasClient(client.Id); !ok {
					continue
				}
				//the client must be gone: the tag was set to false 30 seconds ago and no heartbeat was received since
				logger.Event(event.CLIENT_UNREGISTER, "clientid="+client.Id+";name="+client.Name)
				//requeue unfinished workunits associated with the failed client
				qm.ReQueueWorkunitByClient(client.Id)
				//delete the client from client map
				qm.RemoveClient(client.Id)
			}
		}
	}
}
func (qm *CQMgr) ClientHeartBeat(id string, cg *ClientGroup) (hbmsg HBmsg, err error) {
	hbmsg = make(map[string]string, 1)
	if client, ok := qm.GetClient(id); ok {
		// If the name of the clientgroup (from auth token) does not match the name in the client retrieved, throw an error
		if cg != nil && client.Group != cg.Name {
			return nil, errors.New(e.ClientGroupBadName)
		}
		client.Tag = true
		qm.PutClient(client)
		logger.Debug(3, "HeartBeatFrom:"+"clientid="+id+",name="+client.Name)

		//collect suspended workunits that the client should discard
		workids := qm.getWorkByClient(id)
		suspended := []string{}
		for _, work := range qm.workQueue.GetSet(workids) {
			if work.State == WORK_STAT_SUSPEND {
				suspended = append(suspended, work.Id)
			}
		}
		if len(suspended) > 0 {
			hbmsg["discard"] = strings.Join(suspended, ",")
		}
		if client.Get_Status() == CLIENT_STAT_DELETED {
			hbmsg["stop"] = id
		}
		//hbmsg["discard"] = strings.Join(workids, ",")
		return hbmsg, nil
	}
	return hbmsg, errors.New(e.ClientNotFound)
}
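// HBmsg is not defined in this section; from the make() call above it is
// presumably a plain string map carrying control messages ("discard",
// "stop") back to the client:
type HBmsg map[string]string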
//recover a job in db that is missing from queue (caused by server restarting)
func (qm *ServerMgr) RecoverJob(id string) (err error) {
	//Load job by id
	if qm.isActJob(id) {
		return errors.New("job " + id + " is already active")
	}
	dbjob, err := LoadJob(id)
	if err != nil {
		return errors.New("failed to load job: " + err.Error())
	}

	if dbjob.State == JOB_STAT_SUSPEND {
		qm.putSusJob(dbjob.Id)
	} else {
		if dbjob.State == JOB_STAT_COMPLETED || dbjob.State == JOB_STAT_DELETED {
			return errors.New("job is in " + dbjob.State + " state thus cannot be recovered")
		}
		for _, task := range dbjob.Tasks {
			task.Info = dbjob.Info
		}
		qm.EnqueueTasksByJobId(dbjob.Id, dbjob.TaskList())
	}

	logger.Debug(2, fmt.Sprintf("Recovered job %s", id))
	return
}
func heartbeating(host string, clientid string) (msg core.HBmsg, err error) {
	response := new(HeartbeatResponse)
	targeturl := fmt.Sprintf("%s/client/%s?heartbeat", host, clientid)
	//res, err := http.Get(targeturl)
	var headers httpclient.Header
	if conf.CLIENT_GROUP_TOKEN != "" {
		headers = httpclient.Header{
			"Authorization": []string{"CG_TOKEN " + conf.CLIENT_GROUP_TOKEN},
		}
	}
	res, err := httpclient.Get(targeturl, headers, nil, nil)
	logger.Debug(3, fmt.Sprintf("client %s sent a heartbeat to %s", clientid, host)) // arguments were swapped
	if err != nil {
		return
	}
	defer res.Body.Close()

	jsonstream, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return
	}
	if err = json.Unmarshal(jsonstream, response); err == nil {
		if len(response.Errs) > 0 {
			return msg, errors.New(strings.Join(response.Errs, ","))
		}
		return response.Data, nil
	}
	return
}
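// HeartbeatResponse is decoded above but not defined in this section. Its
// minimal shape can be inferred from the fields accessed (Data, Errs); the
// json tags are assumptions in the usual shock/AWE response style:
type HeartbeatResponse struct {
	Data core.HBmsg `json:"data"`
	Errs []string   `json:"error"`
}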
func resetTask(task *Task, info *Info) {
	task.Info = info
	task.State = TASK_STAT_PENDING
	task.RemainWork = task.TotalWork
	task.ComputeTime = 0
	task.CompletedDate = time.Time{}
	// reset all inputs with an origin
	for _, input := range task.Inputs {
		if input.Origin != "" {
			input.Node = "-"
			input.Url = ""
			input.Size = 0
		}
	}
	// reset / delete all outputs
	for _, output := range task.Outputs {
		if dataUrl, _ := output.DataUrl(); dataUrl != "" {
			// delete dataUrl if it is a shock node
			if strings.HasSuffix(dataUrl, shock.DATA_SUFFIX) {
				if err := shock.ShockDelete(output.Host, output.Node, output.DataToken); err == nil {
					logger.Debug(2, fmt.Sprintf("Deleted node %s from shock", output.Node))
				} else {
					logger.Error(fmt.Sprintf("resetTask: unable to delete node %s from shock: %s", output.Node, err.Error()))
				}
			}
		}
		output.Node = "-"
		output.Url = ""
		output.Size = 0
	}
	// delete all workunit logs
	for _, log := range conf.WORKUNIT_LOGS {
		deleteStdLogByTask(task.Id, log)
	}
}
func (qm *ServerMgr) InitMaxJid() (err error) {
	jidfile := conf.DATA_PATH + "/maxjid"
	if _, err := os.Stat(jidfile); err != nil {
		f, err := os.Create(jidfile)
		if err != nil {
			return err
		}
		f.WriteString("10000")
		qm.nextJid = "10001"
		f.Close()
	} else {
		buf, err := ioutil.ReadFile(jidfile)
		if err != nil {
			return err
		}
		bufstr := strings.TrimSpace(string(buf))
		maxjid, err := strconv.Atoi(bufstr)
		if err != nil {
			return err
		}
		qm.nextJid = strconv.Itoa(maxjid + 1)
	}
	logger.Debug(2, fmt.Sprintf("qmgr:jid initialized, next jid=%s\n", qm.nextJid))
	return
}