func ScoreConfigure(si *ScoreInfo, r io.Reader) { o.Info("Score: %s (%s)", (*si).Name, (*si).Executable) config := NewScoreInfoConfig() err := config.Read(r, 1) o.MightFail(err, "Error Parsing Score Configuration for %s", si.Name) si.updateFromConfig(config) }
func regInternalAdd(hostname string) { o.Info("Adding host: %s", hostname) clientList[hostname] = NewClientInfo() // do this initialisation here since it'll help unmask sequencing errors clientList[hostname].pendingTasks = make(map[uint64]*TaskRequest) clientList[hostname].Player = hostname }
func handleRequest(c net.Conn, message interface{}) { o.Debug("Request Recieved. Decoding!") ptr, ok := message.(*o.TaskRequest) if !ok { o.Assert("CC stuffed up - handleRequest got something that wasn't a TaskRequest.") } task := TaskFromProto(ptr) /* search the registry for the task */ o.Debug("Request for Job.ID %d", task.Id) existing := TaskGet(task.Id) if nil != existing { if existing.MyResponse.IsFinished() { o.Debug("job%d: Resending Response", task.Id) sendResponse(c, existing.MyResponse) } } else { // check to see if we have the score // add the Job to our Registry task.MyResponse = NewTaskResponse() task.MyResponse.id = task.Id task.MyResponse.State = RESP_PENDING TaskAdd(task) o.Info("Added New Task (Job ID %d) to our local registry", task.Id) // and then push it onto the pending job list so we know it needs actioning. appendPendingTask(task) } }
// handle the signals. By default, we ignore everything, but the // three terminal signals, HUP, INT, TERM, we want to explicitly // handle. func signalHandler() { c := make(chan os.Signal) signal.Notify(c, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) for { sig := <-c ux, ok := sig.(syscall.Signal) if !ok { o.Warn("Couldn't handle signal %s, coercion failed", sig) continue } switch ux { case syscall.SIGHUP: o.Info("Reloading configuration...") ConfigLoad() case syscall.SIGINT, syscall.SIGTERM: //FIXME: Gentle Shutdown SaveState() os.Exit(0) } } }
func handleRequest(c net.Conn, message interface{}) { o.Debug("Request Recieved. Decoding!") ptr, ok := message.(*o.ProtoTaskRequest) if !ok { o.Assert("CC stuffed up - handleRequest got something that wasn't a ProtoTaskRequest.") } job := o.JobFromProto(ptr) /* search the registry for the job */ o.Debug("Request for Job.ID %d", job.Id) existing := o.JobGet(job.Id) if nil != existing { if existing.MyResponse.IsFinished() { o.Debug("job%d: Resending Response", job.Id) sendResponse(c, existing.MyResponse) } } else { // check to see if we have the score // add the Job to our Registry job.MyResponse = o.NewTaskResponse() job.MyResponse.Id = job.Id job.MyResponse.State = o.RESP_PENDING o.JobAdd(job) o.Info("Added New Job %d to our local registry", job.Id) // and then push it onto the pending job list so we know it needs actioning. appendPendingJob(job) } }
func connectMe(initialDelay int64) { var backOff int64 = initialDelay for { // Sleep first. if backOff > 0 { o.Info("Sleeping for %d seconds", backOff/1e9) err := time.Sleep(backOff) o.MightFail(err, "Couldn't Sleep") backOff *= ReconnectDelayScale if backOff > MaximumReconnectDelay { backOff = MaximumReconnectDelay } } else { backOff = InitialReconnectDelay } tconf := &tls.Config{ RootCAs: CACertPool, } tconf.Certificates = append(tconf.Certificates, CertPair) // update our local hostname. LocalHostname = GetStringOpt("player name") if LocalHostname == "" { LocalHostname = o.ProbeHostname() o.Warn("No hostname provided - probed hostname: %s", LocalHostname) } masterHostname := GetStringOpt("master") raddr := fmt.Sprintf("%s:%d", masterHostname, 2258) o.Info("Connecting to %s", raddr) conn, err := tls.Dial("tcp", raddr, tconf) if err == nil { conn.Handshake() err = conn.VerifyHostname(masterHostname) } if err == nil { nc := new(NewConnectionInfo) nc.conn = conn nc.timeout = backOff newConnection <- nc return } o.Warn("Couldn't connect to master: %s", err) } }
func batchLogger(jobid uint64, errpipe *os.File) { defer errpipe.Close() r := bufio.NewReader(errpipe) for { lb, _, err := r.ReadLine() if err == io.EOF { return } if err != nil { o.Warn("executionLogger failed: %s", err) return } o.Info("job%d: STDERR: %s", jobid, string(lb)) } }
func handleIdentify(client *ClientInfo, message interface{}) { if client.Player != "" { o.Warn("Client %s: tried to reintroduce itself", client.Name()) client.Abort() return } ic, _ := message.(*o.IdentifyClient) o.Info("Client %s: identified itself as \"%s\"", client.Name(), *ic.Hostname) client.Player = *ic.Hostname if !HostAuthorised(client.Player) { o.Warn("Client %s: not authorised", client.Name()) client.Abort() return } /* if we're TLS, verify the client's certificate given the name it used */ tlsc, ok := client.connection.(*tls.Conn) if ok && !*DontVerifyPeer { cs := tlsc.ConnectionState() if cs.PeerCertificates == nil || cs.PeerCertificates[0] == nil { o.Warn("Client %s: peer didn't provide a certificate", client.Name()) client.Abort() return } err := cs.PeerCertificates[0].VerifyHostname(client.Player) if err != nil { o.Warn("Client %s: couldn't be identified: %s", client.Name(), err) client.Abort() return } } reg := ClientGet(client.Player) if nil == reg { o.Warn("Client %s: couldn't register", client.Name()) client.Abort() return } client.MergeState(reg) }
// handle the signals. By default, we ignore everything, but the // three terminal signals, HUP, INT, TERM, we want to explicitly // handle. func signalHandler() { c := make(chan os.Signal) signal.Notify(c, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) for { sig := <-c ux, ok := sig.(syscall.Signal) if !ok { o.Warn("Couldn't handle signal %s, coercion failed", sig) continue } switch ux { case syscall.SIGHUP: o.Info("Reloading configuration...") reloadScores <- 1 case syscall.SIGINT, syscall.SIGTERM: os.Exit(0) } } }
func regInternalDel(hostname string) { o.Info("Removing host: %s", hostname) /* remove it from the registry */ delete(clientList, hostname) }
func ProcessingLoop() { var conn net.Conn = nil var nextRetryResp *TaskResponse = nil var taskCompletionChan <-chan *TaskResponse = nil var connectDelay time.Duration var doScoreReload bool = false // kick off a new connection attempt. go connectMe(connectDelay) // and this is where we spin! for { var retryDelay time.Duration = 0 var retryChan <-chan time.Time = nil if conn != nil { for nextRetryResp == nil { nextRetryResp = getNextUnacknowledgedResponse() if nil == nextRetryResp { break } retryDelay = nextRetryResp.RetryTime.Sub(time.Now()) if retryDelay < 0 { sendResponse(conn, nextRetryResp) nextRetryResp = nil } } if nextRetryResp != nil { retryChan = time.After(retryDelay) } } if taskCompletionChan == nil { nextTask := getNextPendingTask() if nextTask != nil { taskCompletionChan = ExecuteTask(nextTask) } else { if conn != nil && !pendingTaskRequest { o.Debug("Asking for trouble") p := o.MakeReadyForTask() p.Send(conn) o.Debug("Sent Request for trouble") pendingTaskRequest = true } } } select { // Currently executing job finishes. case newresp := <-taskCompletionChan: o.Debug("job%d: Completed with State %s\n", newresp.id, newresp.State) // preemptively set a retrytime. newresp.RetryTime = time.Now() // ENOCONN - sub it in as our next retryresponse, and prepend the old one onto the queue. if nil == conn { if nil != nextRetryResp { prequeueResponse(nextRetryResp) } o.Debug("job%d: Queuing Initial Response", newresp.id) nextRetryResp = newresp } else { o.Debug("job%d: Sending Initial Response", newresp.id) sendResponse(conn, newresp) } if doScoreReload { o.Info("Performing Deferred score reload") LoadScores() doScoreReload = false } taskCompletionChan = nil // If the current unacknowledged response needs a retry, send it. case <-retryChan: sendResponse(conn, nextRetryResp) nextRetryResp = nil // New connection. Set up the receiver thread and Introduce ourselves. case nci := <-newConnection: if conn != nil { conn.Close() } conn = nci.conn connectDelay = nci.timeout pendingTaskRequest = false // start the reader go Reader(conn) /* Introduce ourself */ p := o.MakeIdentifyClient(LocalHostname, PlayerVersion) p.Send(conn) // Lost connection. Shut downt he connection. case <-lostConnection: o.Warn("Lost Connection to Master") conn.Close() conn = nil // restart the connection attempts go connectMe(connectDelay) // Message received from master. Decode and action. case p := <-receivedMessage: // because the message could possibly be an ACK, push the next retry response back into the queue so acknowledge can find it. if nil != nextRetryResp { prequeueResponse(nextRetryResp) nextRetryResp = nil } var upkt interface{} = nil if p.Length > 0 { var err error upkt, err = p.Decode() o.MightFail(err, "Couldn't decode packet from master") } handler, exists := dispatcher[p.Type] if exists { connectDelay = 0 handler(conn, upkt) } else { o.Fail("Unhandled Pkt Type %d", p.Type) } // Reload scores case <-reloadScores: // fortunately this is actually completely safe as // long as nobody's currently executing. // who'd have thunk it? if taskCompletionChan == nil { o.Info("Reloading scores") LoadScores() } else { o.Info("Deferring score reload (execution in progress)") doScoreReload = true } // Keepalive delay expired. Send Nop. case <-time.After(KeepaliveDelay): if conn == nil { break } o.Debug("Sending NOP") p := o.MakeNop() p.Send(conn) } } }
func handleResult(client *ClientInfo, message interface{}) { jr, _ := message.(*o.ProtoTaskResponse) r := ResponseFromProto(jr) // at this point in time, we only care about terminal // condition codes. a Job that isn't finished is just // prodding us back to let us know it lives. if r.IsFinished() { job := JobGet(r.id) if nil == job { o.Warn("Client %s: NAKing job%d, couldn't find job data", client.Name(), r.id) nack := o.MakeNack(r.id) client.sendNow(nack) } else { job := JobGet(r.id) if job != nil { /* if the job exists, Ack it. */ ack := o.MakeAck(r.id) client.sendNow(ack) } // now, we only accept the results if we were // expecting the results (ie: it was pending) // and expunge the task information from the // pending list so we stop bugging the client for it. task, exists := client.pendingTasks[r.id] if exists { // store the result. if !JobAddResult(client.Player, r) { o.Assert("Couldn't add result for pending task") } // next, work out if the job is a retryable failure or not var didretry bool if r.DidFail() { o.Info("Client %s: reported failure for job%d", client.Name(), r.id) if r.CanRetry() { job := JobGet(r.id) if job.Scope == SCOPE_ONEOF { // right, we're finally deep enough to work out what's going on! JobDisqualifyPlayer(r.id, client.Player) if len(job.Players) >= 1 { // still players left we can try? then go for it! CleanTask(task) DispatchTask(task) didretry = true } } } } if !didretry { // if we didn't retry, the task needs to be marked as finished. task.State = TASK_FINISHED } // update the job state. JobReviewState(r.id) delete(client.pendingTasks, r.id) } } } }
func ServiceRequests() { var sockConfig tls.Config // resolve the bind address bindAddressStr := GetStringOpt("bind address") var bindAddr *net.IPAddr if bindAddressStr != "" { var err error bindAddr, err = net.ResolveIPAddr("ip", bindAddressStr) if err != nil { o.Warn("Ignoring bind address. Couldn't resolve \"%s\": %s", bindAddressStr, err) } else { bindAddr = nil } } // load the x509 certificate and key, then attach it to the tls config. x509CertFilename := GetStringOpt("x509 certificate") x509PrivateKeyFilename := GetStringOpt("x509 private key") serverCert, err := tls.LoadX509KeyPair(x509CertFilename, x509PrivateKeyFilename) o.MightFail(err, "Couldn't load certificates") sockConfig.Certificates = append(sockConfig.Certificates, serverCert) // load the CA certs CACertPool = x509.NewCertPool() caCertNames := GetCACertList() if caCertNames != nil { for _, filename := range caCertNames { fh, err := os.Open(filename) if err != nil { o.Warn("Whilst parsing CA certs, couldn't open %s: %s", filename, err) continue } defer fh.Close() fi, err := fh.Stat() o.MightFail(err, "Couldn't stat CA certificate file: %s", filename) data := make([]byte, fi.Size()) fh.Read(data) CACertPool.AppendCertsFromPEM(data) } } sockConfig.ClientCAs = CACertPool // determine the server hostname. servername := GetStringOpt("server name") if servername != "" { o.Info("Using %s as the server name", servername) sockConfig.ServerName = servername } else { if bindAddr != nil { o.Warn("Probing for FQDN for bind address as none was provided") hostnames, err := net.LookupAddr(bindAddr.String()) o.MightFail(err, "Failed to get full hostname for bind address") sockConfig.ServerName = hostnames[0] } else { o.Warn("Probing for FQDN as no server name was provided") sockConfig.ServerName = o.ProbeHostname() } } // ask the client to authenticate sockConfig.ClientAuth = tls.RequireAndVerifyClientCert if *DontVerifyPeer { sockConfig.ClientAuth = tls.RequestClientCert } /* convert the bindAddress to a string suitable for the Listen call */ var laddr string if bindAddr == nil { laddr = fmt.Sprintf(":%d", o.DefaultMasterPort) } else { laddr = fmt.Sprintf("%s:%d", bindAddr.String(), o.DefaultMasterPort) } o.Info("Binding to %s...", laddr) listener, err := tls.Listen("tcp", laddr, &sockConfig) o.MightFail(err, "Couldn't bind TLS listener") for { o.Info("Waiting for connection...") c, err := listener.Accept() o.MightFail(err, "Couldn't accept TLS connection") o.Info("Connection received from %s", c.RemoteAddr().String()) HandleConnection(c) } }
func doExecution(task *TaskRequest, completionChannel chan<- *TaskResponse) { // we must notify the parent when we exit. defer func(c chan<- *TaskResponse, task *TaskRequest) { c <- task.MyResponse }(completionChannel, task) // first of all, verify that the score exists at all. score, exists := Scores[task.Score] if !exists { o.Warn("job%d: request for unknown score: %s", task.Id, task.Score) task.MyResponse.State = RESP_FAILED_UNKNOWN_SCORE return } si := NewScoreInterface(task) if si == nil { o.Warn("job%d: couldn't initialise score interface", task.Id) task.MyResponse.State = RESP_FAILED_HOST_ERROR return } if !si.Prepare() { o.Warn("job%d: couldn't prepare score interface", task.Id) task.MyResponse.State = RESP_FAILED_HOST_ERROR return } defer si.Cleanup() eenv := si.SetupProcess() task.MyResponse.State = RESP_RUNNING procenv := new(os.ProcAttr) // Build the default environment. procenv.Env = peSetEnv(procenv.Env, "PATH", "/usr/bin:/usr/sbin:/bin:/sbin") procenv.Env = peSetEnv(procenv.Env, "IFS", " \t\n") pwd, err := os.Getwd() if err != nil { task.MyResponse.State = RESP_FAILED_HOST_ERROR o.Warn("job%d: couldn't resolve PWD: %s", task.Id, err) return } procenv.Env = peSetEnv(procenv.Env, "PWD", pwd) // copy in the environment overrides for k, v := range eenv.Environment { procenv.Env = peSetEnv(procenv.Env, k, v) } // attach FDs to procenv. procenv.Files = make([]*os.File, 3) // first off, attach /dev/null to stdin and stdout devNull, err := os.OpenFile(os.DevNull, os.O_RDWR|os.O_APPEND, 0666) o.MightFail(err, "couldn't open DevNull") defer devNull.Close() for i := 0; i < 2; i++ { procenv.Files[i] = devNull } // attach STDERR to to our logger via pipe. lr, lw, err := os.Pipe() o.MightFail(err, "Couldn't create pipe") defer lw.Close() // lr will be closed by the logger. procenv.Files[2] = lw // check the environment's configuration and allow it to override stdin, stdout, and FDs 3+ if nil != eenv.Files { for i := range eenv.Files { if i < 2 { procenv.Files[i] = eenv.Files[i] } else { procenv.Files = append(procenv.Files, eenv.Files[i]) } } } var args []string args = append(args, eenv.Arguments...) o.Info("job%d: executing %s...", task.Id, score.Executable) go batchLogger(task.Id, lr) proc, err := os.StartProcess(score.Executable, args, procenv) if err != nil { o.Warn("job%d: failed to start process", task.Id) task.MyResponse.State = RESP_FAILED_HOST_ERROR return } wm, err := proc.Wait() if err != nil { o.Warn("job%d: error waiting for process", task.Id) task.MyResponse.State = RESP_FAILED_UNKNOWN // Worse of all, we don't even know if we succeeded. return } ws, _ := wm.Sys().(syscall.WaitStatus) if !(ws.Signaled() || ws.Exited()) { o.Assert("Non Terminal notification received when not expected.") return } if ws.Signaled() { o.Warn("job%d: process got signalled", task.Id) task.MyResponse.State = RESP_FAILED_UNKNOWN return } if ws.Exited() { if 0 == ws.ExitStatus() { o.Warn("job%d: process exited OK", task.Id) task.MyResponse.State = RESP_FINISHED } else { o.Warn("job%d: process exited with failure", task.Id) task.MyResponse.State = RESP_FAILED } return } o.Assert("Should never get here.") }