func UnixAudienceListener(sockaddr string) { fi, err := os.Stat(sockaddr) if err == nil { fmode := fi.Mode() if fmode&os.ModeType == os.ModeSocket { o.Warn("Removing stale socket at %s", sockaddr) os.Remove(sockaddr) } else { o.Fail("%s exists and is not a socket", sockaddr) } } laddr, err := net.ResolveUnixAddr("unix", sockaddr) o.MightFail(err, "Couldn't resolve audience socket address") l, err := net.ListenUnix("unix", laddr) o.MightFail(err, "Couldn't start audience unixsock listener") // Fudge the permissions on the unixsock! fi, err = os.Stat(sockaddr) if err == nil { os.Chmod(sockaddr, fi.Mode()|0777) } else { o.Warn("Couldn't fudge permission on audience socket: %s", err) } // make sure we clean up the unix socket when we die. defer l.Close() defer os.Remove(sockaddr) AudienceListener(l) }
// handle the signals. By default, we ignore everything, but the // three terminal signals, HUP, INT, TERM, we want to explicitly // handle. func signalHandler() { for { sig := <-signal.Incoming ux, ok := sig.(os.UnixSignal) if !ok { o.Warn("Couldn't handle signal %s, Coercion failed", sig) continue } switch int(ux) { case syscall.SIGHUP: o.Warn("Reloading Configuration") reloadScores <- 1 case syscall.SIGINT: fmt.Fprintln(os.Stderr, "Interrupt Received - Terminating") //FIXME: Gentle Shutdown os.Exit(1) case syscall.SIGTERM: fmt.Fprintln(os.Stderr, "Terminate Received - Terminating") //FIXME: Gentle Shutdown os.Exit(2) } } }
// handle the signals. By default, we ignore everything, but the // three terminal signals, HUP, INT, TERM, we want to explicitly // handle. func signalHandler() { incoming := make(chan os.Signal) signal.Notify(incoming, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) for { sig := <-incoming ux, ok := sig.(syscall.Signal) if !ok { o.Warn("*BUG* Couldn't handle signal %s, Coercion failed", sig) continue } switch ux { case syscall.SIGHUP: o.Warn("Reloading Configuration") ConfigLoad() case syscall.SIGINT: fmt.Fprintln(os.Stderr, "Interrupt Received - Terminating") //FIXME: Gentle Shutdown SaveState() os.Exit(1) case syscall.SIGTERM: fmt.Fprintln(os.Stderr, "Terminate Received - Terminating") //FIXME: Gentle Shutdown os.Exit(2) } } }
func loadSpoolFiles(dirname string, depth int) { dh, err := os.Open(dirname) o.MightFail(err, "Couldn't open %s", dirname) nodes, err := dh.Readdir(-1) o.MightFail(err, "Couldn't readdir on %s", dirname) if depth > 0 { for _, n := range nodes { abspath := path.Join(dirname, n.Name()) if (n.Mode() & os.ModeType) == os.ModeDir { // if not a single character, it's not a spool node. if len(n.Name()) != 1 { continue } if n.Name() == "." { // we're not interested in . continue } nrunes := []rune(n.Name()) if unicode.Is(unicode.ASCII_Hex_Digit, nrunes[0]) { loadSpoolFiles(abspath, depth-1) } else { o.Warn("Foreign dirent %s found in spool tree", abspath) } } } } else { // depth == 0 - only interested in files. for _, n := range nodes { abspath := path.Join(dirname, n.Name()) if n.Mode()&os.ModeType == 0 { if len(n.Name()) != 16 { shuffleToCorrupted(abspath, "Filename incorrect length") continue } id, err := strconv.ParseUint(n.Name(), 16, 64) if err != nil { shuffleToCorrupted(abspath, "Invalid Filename") continue } fh, err := os.Open(abspath) if err != nil { shuffleToCorrupted(abspath, "Couldn't open") continue } defer fh.Close() jr, err := JobRequestFromReader(fh) if err != nil || jr.Id != id { o.Warn("Couldn't parse?! %s", err) shuffleToCorrupted(abspath, "Parse Failure") continue } // Add the request to the registry directly. if !RestoreJobState(jr) { shuffleToCorrupted(abspath, "Job State Invalid") } } } } }
func LoadScores() { scoreDirectory := GetStringOpt("score directory") dir, err := os.Open(scoreDirectory) o.MightFail(err, "Couldn't open Score directory") defer dir.Close() Scores = make(map[string]*ScoreInfo) files, err := dir.Readdir(-1) for i := range files { name := files[i].Name() // skip ., .. and other dotfiles. if strings.HasPrefix(name, ".") { continue } // emacs backup files. ignore these. if strings.HasSuffix(name, "~") || strings.HasPrefix(name, "#") { continue } // .conf is reserved for score configurations. if strings.HasSuffix(name, ".conf") { continue } // check to see if it's a file or symlink ftype := files[i].Mode() & os.ModeType if ftype != 0 && ftype != os.ModeSymlink { continue } // check for the executionable bit if files[i].Mode()&0111 == 0 { continue } fullpath := path.Join(scoreDirectory, name) conffile := fullpath + ".conf" o.Warn("Considering %s as score", name) si := NewScoreInfo() si.Name = name si.Executable = fullpath conf, err := os.Open(conffile) if err == nil { o.Warn("Parsing configuration for %s", fullpath) ScoreConfigure(si, conf) conf.Close() } else { o.Warn("Couldn't open config file for %s, assuming defaults: %s", name, err) } Scores[name] = si } }
// Can only be used form inside of handlers and the main client loop. func (client *ClientInfo) sendNow(p *o.WirePkt) { _, err := p.Send(client.connection) if err != nil { o.Warn("Client %s: error sending packet: %s", client.Name(), err) client.Abort() } }
// handle the signals. By default, we ignore everything, but the // three terminal signals, HUP, INT, TERM, we want to explicitly // handle. func signalHandler() { c := make(chan os.Signal) signal.Notify(c, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) for { sig := <-c ux, ok := sig.(syscall.Signal) if !ok { o.Warn("Couldn't handle signal %s, coercion failed", sig) continue } switch ux { case syscall.SIGHUP: o.Info("Reloading configuration...") ConfigLoad() case syscall.SIGINT, syscall.SIGTERM: //FIXME: Gentle Shutdown SaveState() os.Exit(0) } } }
func connectMe(initialDelay int64) { var backOff int64 = initialDelay for { // Sleep first. if backOff > 0 { o.Info("Sleeping for %d seconds", backOff/1e9) err := time.Sleep(backOff) o.MightFail(err, "Couldn't Sleep") backOff *= ReconnectDelayScale if backOff > MaximumReconnectDelay { backOff = MaximumReconnectDelay } } else { backOff = InitialReconnectDelay } tconf := &tls.Config{ RootCAs: CACertPool, } tconf.Certificates = append(tconf.Certificates, CertPair) // update our local hostname. LocalHostname = GetStringOpt("player name") if LocalHostname == "" { LocalHostname = o.ProbeHostname() o.Warn("No hostname provided - probed hostname: %s", LocalHostname) } masterHostname := GetStringOpt("master") raddr := fmt.Sprintf("%s:%d", masterHostname, 2258) o.Info("Connecting to %s", raddr) conn, err := tls.Dial("tcp", raddr, tconf) if err == nil { conn.Handshake() err = conn.VerifyHostname(masterHostname) } if err == nil { nc := new(NewConnectionInfo) nc.conn = conn nc.timeout = backOff newConnection <- nc return } o.Warn("Couldn't connect to master: %s", err) } }
// SetSpoolDirectory records spooldir as the spool directory the first time
// it is called. Once a directory is set, a call with a different value only
// logs a warning; a call with the same value does nothing.
func SetSpoolDirectory(spooldir string) {
	switch {
	case spoolDirectory == "":
		spoolDirectory = spooldir
	case spooldir != spoolDirectory:
		o.Warn("Spool Directory Not Changed.")
	}
}
func shuffleToCorrupted(abspath, reason string) { basename := path.Base(abspath) targetname := path.Join(spoolDirectory, "corrupt", basename) // make sure there's nothing in the target name. os.Remove(targetname) err := os.Rename(abspath, targetname) o.MightFail(err, "Couldn't bin corrupt spoolfile %s", abspath) o.Warn("Moved \"%s\" to corrupted spool: %s", abspath, reason) }
// SetSpoolDirectory sets the spool directory to spooldir if none has been
// set yet. Later attempts to switch to a different directory are refused
// with a warning; repeating the current value is a no-op.
func SetSpoolDirectory(spooldir string) {
	if spoolDirectory == "" {
		spoolDirectory = spooldir
		return
	}
	if spooldir != spoolDirectory {
		o.Warn("Refusing to change spool directory as it's already set")
	}
}
func writeIdCheckpoint() { fh, err := os.OpenFile(checkpointPath(), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) if err != nil { o.Warn("Failed to create checkpoint file: %s", err) return } defer fh.Close() fmt.Fprintf(fh, "%d\n", lastId) }
func AudienceListener(l net.Listener) { for { c, err := l.Accept() if err != nil { o.Warn("Accept() failed on Audience Listenter.") break } go handleAudienceRequest(c) } }
func saveLastId() { fh, err := os.OpenFile(savePath(), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) if err != nil { o.Warn("Failed to create last ID save file: %s", err) return } defer fh.Close() fmt.Fprintf(fh, "%d\n", lastId) os.Remove(checkpointPath()) }
func ConfigLoad() { // attempt to open the configuration file. fh, err := os.Open(*ConfigFile) if nil == err { defer fh.Close() // reset the config File data, then reload it. configFile.Reset() ierr := configFile.Read(fh, 1) o.MightFail(ierr, "Couldn't parse configuration") } else { o.Warn("Couldn't open configuration file: %s. Proceeding anyway.", err) } playerpath := strings.TrimSpace(GetStringOpt("player file path")) pfh, err := os.Open(playerpath) o.MightFail(err, "Couldn't open \"%s\"", playerpath) pbr := bufio.NewReader(pfh) ahmap := make(map[string]bool) for err = nil; err == nil; { var lb []byte var prefix bool lb, prefix, err = pbr.ReadLine() if nil == lb { break } if prefix { o.Fail("ConfigLoad: Short Read (prefix only)!") } line := strings.TrimSpace(string(lb)) if line == "" { continue } if line[0] == '#' { continue } ahmap[line] = true } // convert newAuthorisedHosts to a slice authorisedHosts := make([]string, len(ahmap)) idx := 0 for k, _ := range ahmap { authorisedHosts[idx] = k idx++ } ClientUpdateKnown(authorisedHosts) // set the spool directory SetSpoolDirectory(GetStringOpt("conductor state path")) }
// dump the bytestream in buf into the serialisation file for req. func (req *JobRequest) doSerialisation(buf []byte) { // first up, clean up old state. UnlinkNodesForJobId(req.Id) outpath := req.FilenameForSpool() fh, err := os.OpenFile(outpath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0600) if err != nil { o.Warn("Could not create persistence file %s: %s", outpath, err) return } defer fh.Close() fh.Write(buf) }
func regintGetJob(req *registryRequest, resp *registryResponse) { job, exists := jobRegister[req.id] resp.success = exists if exists { resp.jobs = make([]*JobRequest, 1) resp.jobs[0] = job } else { o.Warn("Received request for job%d which is not in memory", req.id) go regintGetJobDeferred(req.id, req.responseChannel) // mask out the responseChannel so the deferred handler can use it. req.responseChannel = nil } }
func handleIdentify(client *ClientInfo, message interface{}) { if client.Player != "" { o.Warn("Client %s: tried to reintroduce itself", client.Name()) client.Abort() return } ic, _ := message.(*o.IdentifyClient) o.Info("Client %s: identified itself as \"%s\"", client.Name(), *ic.Hostname) client.Player = *ic.Hostname if !HostAuthorised(client.Player) { o.Warn("Client %s: not authorised", client.Name()) client.Abort() return } /* if we're TLS, verify the client's certificate given the name it used */ tlsc, ok := client.connection.(*tls.Conn) if ok && !*DontVerifyPeer { cs := tlsc.ConnectionState() if cs.PeerCertificates == nil || cs.PeerCertificates[0] == nil { o.Warn("Client %s: peer didn't provide a certificate", client.Name()) client.Abort() return } err := cs.PeerCertificates[0].VerifyHostname(client.Player) if err != nil { o.Warn("Client %s: couldn't be identified: %s", client.Name(), err) client.Abort() return } } reg := ClientGet(client.Player) if nil == reg { o.Warn("Client %s: couldn't register", client.Name()) client.Abort() return } client.MergeState(reg) }
func sendQueueFailureResponse(reason string, enc *json.Encoder) { resp := make([]interface{}, 2) resperr := new(string) *resperr = "Error" resp[0] = resperr if reason != "" { resp[1] = &reason } err := enc.Encode(resp) if nil != err { o.Warn("Couldn't encode response to audience: %s", err) } }
func ConfigLoad() { // attempt to open the configuration file. fh, err := os.Open(*ConfigFile) if nil == err { defer fh.Close() // reset the config File data, then reload it. configFile.Reset() ierr := configFile.Read(fh, 1) o.MightFail(ierr, "Couldn't parse configuration") } else { o.Warn("Couldn't open configuration file: %s. Proceeding anyway.", err) } // load the x509 certificates x509CertFilename := GetStringOpt("x509 certificate") x509PrivateKeyFilename := GetStringOpt("x509 private key") CertPair, err = tls.LoadX509KeyPair(x509CertFilename, x509PrivateKeyFilename) o.MightFail(err, "Couldn't load certificates") // load the CA Certs CACertPool = x509.NewCertPool() caCertNames := GetCACertList() if caCertNames != nil { for _, filename := range caCertNames { fh, err := os.Open(filename) if err != nil { o.Warn("Whilst parsing CA certs, couldn't open %s: %s", filename, err) continue } defer fh.Close() fi, err := fh.Stat() o.MightFail(err, "Couldn't stat CA certificate file: %s", filename) data := make([]byte, fi.Size()) fh.Read(data) CACertPool.AppendCertsFromPEM(data) } } }
func Reader(conn net.Conn) { defer func(l chan int) { l <- 1 }(lostConnection) for { pkt, err := o.Receive(conn) if err != nil { o.Warn("Error receiving message: %s", err) break } receivedMessage <- pkt } }
func sendResponse(c net.Conn, resp *TaskResponse) { //FIXME: update retry time on Response ptr := resp.Encode() p, err := o.Encode(ptr) o.MightFail(err, "Failed to encode response") _, err = p.Send(c) if err != nil { o.Warn("Transmission error: %s", err) c.Close() prequeueResponse(resp) lostConnection <- 1 } else { appendUnacknowledgedResponse(resp) } }
func clientReceiver(client *ClientInfo) { conn := client.connection loop := true for loop { pkt, err := o.Receive(conn) if nil != err { o.Warn("Client %s: error receiving packet: %s", conn.RemoteAddr().String(), err) client.Abort() client.connection.Close() loop = false } else { client.PktInQ <- pkt } } }
func batchLogger(jobid uint64, errpipe *os.File) { defer errpipe.Close() r := bufio.NewReader(errpipe) for { lb, _, err := r.ReadLine() if err == io.EOF { return } if err != nil { o.Warn("executionLogger failed: %s", err) return } o.Info("job%d: STDERR: %s", jobid, string(lb)) } }
func sendQueueSuccessResponse(job *JobRequest, enc *json.Encoder) { resp := make([]interface{}, 2) resperr := new(string) *resperr = "OK" resp[0] = resperr // this probably looks odd, but all numbers cross through float64 when being json encoded. d'oh! jobid := new(uint64) *jobid = uint64(job.Id) resp[1] = jobid err := enc.Encode(resp) if nil != err { o.Warn("Couldn't encode response to audience: %s", err) } }
func regintGetJobDeferred(jobid uint64, responseChannel chan<- *registryResponse) { resp := new(registryResponse) resp.success = false defer func(resp *registryResponse, rChan chan<- *registryResponse) { rChan <- resp }(resp, responseChannel) req, err := LoadFromFinished(jobid) if err != nil { o.Warn("Couldn't load job%d from disk. Doesn't exist?", jobid) return } // fix up the state, and stuff it back into the system RestoreJobState(req) resp.jobs = make([]*JobRequest, 1) resp.jobs[0] = req resp.success = true }
// pipeListener is the goroutine that sits on the stdout pipe and // processes what it sees. func pipeListener(task *TaskRequest, outpipe *os.File) { defer outpipe.Close() r := bufio.NewReader(outpipe) for { lb, _, err := r.ReadLine() if err == io.EOF { return } if err != nil { o.Warn("pipeListener failed: %s", err) return } linein := string(lb) if strings.Index(linein, "=") >= 0 { bits := strings.SplitN(linein, "=", 2) task.MyResponse.Response[bits[0]] = bits[1] } } }
// handle the signals. By default, we ignore everything, but the // three terminal signals, HUP, INT, TERM, we want to explicitly // handle. func signalHandler() { c := make(chan os.Signal) signal.Notify(c, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM) for { sig := <-c ux, ok := sig.(syscall.Signal) if !ok { o.Warn("Couldn't handle signal %s, coercion failed", sig) continue } switch ux { case syscall.SIGHUP: o.Info("Reloading configuration...") reloadScores <- 1 case syscall.SIGINT, syscall.SIGTERM: os.Exit(0) } } }
func handleAudienceRequest(c net.Conn) { defer c.Close() r, _ := c.(io.Reader) w, _ := c.(io.Writer) dec := json.NewDecoder(r) enc := json.NewEncoder(w) outobj := new(GenericJsonRequest) err := dec.Decode(outobj) if err != nil { o.Warn("Error decoding JSON talking to audience: %s", err) return } if nil == outobj.Op { o.Warn("Malformed JSON message talking to audience. Missing Op") return } switch *(outobj.Op) { case "status": if nil == outobj.Id { o.Warn("Malformed Status message talking to audience. Missing Job ID") return } job := JobGet(*outobj.Id) jresp := new([2]interface{}) if nil != job { jresp[0] = "OK" iresp := NewJsonStatusResponse() iresp.Status = job.State resnames := JobGetResultNames(*outobj.Id) for i := range resnames { tr := JobGetResult(*outobj.Id, resnames[i]) if nil != tr { presp := NewJsonPlayerStatus() presp.Status = tr.State for k, v := range tr.Response { presp.Response[k] = v } iresp.Players[resnames[i]] = presp } } jresp[1] = iresp } else { jresp[0] = "Error" jresp[1] = nil } enc.Encode(jresp) o.Debug("Status...") case "queue": if nil == outobj.Score { o.Warn("Malformed Queue message talking to audience. Missing Score") sendQueueFailureResponse("Missing Score", enc) return } if nil == outobj.Scope { o.Warn("Malformed Queue message talking to audience. Missing Scope") sendQueueFailureResponse("Missing Scope", enc) return } if nil == outobj.Players || len(outobj.Players) < 1 { o.Warn("Malformed Queue message talking to audience. 
Missing Players") sendQueueFailureResponse("Missing Players", enc) return } for _, player := range outobj.Players { if !HostAuthorised(player) { o.Warn("Malformed Queue message - unknown player %s specified.", player) sendQueueFailureResponse("Invalid Player", enc) return } } job := NewRequest() job.Score = *outobj.Score job.Scope = *outobj.Scope job.Players = outobj.Players job.Params = outobj.Params QueueJob(job) sendQueueSuccessResponse(job, enc) default: o.Warn("Unknown operation talking to audience: \"%s\"", *(outobj.Op)) return } _ = enc }
func ProcessingLoop() { var conn net.Conn = nil var nextRetryResp *TaskResponse = nil var taskCompletionChan <-chan *TaskResponse = nil var connectDelay time.Duration var doScoreReload bool = false // kick off a new connection attempt. go connectMe(connectDelay) // and this is where we spin! for { var retryDelay time.Duration = 0 var retryChan <-chan time.Time = nil if conn != nil { for nextRetryResp == nil { nextRetryResp = getNextUnacknowledgedResponse() if nil == nextRetryResp { break } retryDelay = nextRetryResp.RetryTime.Sub(time.Now()) if retryDelay < 0 { sendResponse(conn, nextRetryResp) nextRetryResp = nil } } if nextRetryResp != nil { retryChan = time.After(retryDelay) } } if taskCompletionChan == nil { nextTask := getNextPendingTask() if nextTask != nil { taskCompletionChan = ExecuteTask(nextTask) } else { if conn != nil && !pendingTaskRequest { o.Debug("Asking for trouble") p := o.MakeReadyForTask() p.Send(conn) o.Debug("Sent Request for trouble") pendingTaskRequest = true } } } select { // Currently executing job finishes. case newresp := <-taskCompletionChan: o.Debug("job%d: Completed with State %s\n", newresp.id, newresp.State) // preemptively set a retrytime. newresp.RetryTime = time.Now() // ENOCONN - sub it in as our next retryresponse, and prepend the old one onto the queue. if nil == conn { if nil != nextRetryResp { prequeueResponse(nextRetryResp) } o.Debug("job%d: Queuing Initial Response", newresp.id) nextRetryResp = newresp } else { o.Debug("job%d: Sending Initial Response", newresp.id) sendResponse(conn, newresp) } if doScoreReload { o.Info("Performing Deferred score reload") LoadScores() doScoreReload = false } taskCompletionChan = nil // If the current unacknowledged response needs a retry, send it. case <-retryChan: sendResponse(conn, nextRetryResp) nextRetryResp = nil // New connection. Set up the receiver thread and Introduce ourselves. 
case nci := <-newConnection: if conn != nil { conn.Close() } conn = nci.conn connectDelay = nci.timeout pendingTaskRequest = false // start the reader go Reader(conn) /* Introduce ourself */ p := o.MakeIdentifyClient(LocalHostname, PlayerVersion) p.Send(conn) // Lost connection. Shut downt he connection. case <-lostConnection: o.Warn("Lost Connection to Master") conn.Close() conn = nil // restart the connection attempts go connectMe(connectDelay) // Message received from master. Decode and action. case p := <-receivedMessage: // because the message could possibly be an ACK, push the next retry response back into the queue so acknowledge can find it. if nil != nextRetryResp { prequeueResponse(nextRetryResp) nextRetryResp = nil } var upkt interface{} = nil if p.Length > 0 { var err error upkt, err = p.Decode() o.MightFail(err, "Couldn't decode packet from master") } handler, exists := dispatcher[p.Type] if exists { connectDelay = 0 handler(conn, upkt) } else { o.Fail("Unhandled Pkt Type %d", p.Type) } // Reload scores case <-reloadScores: // fortunately this is actually completely safe as // long as nobody's currently executing. // who'd have thunk it? if taskCompletionChan == nil { o.Info("Reloading scores") LoadScores() } else { o.Info("Deferring score reload (execution in progress)") doScoreReload = true } // Keepalive delay expired. Send Nop. case <-time.After(KeepaliveDelay): if conn == nil { break } o.Debug("Sending NOP") p := o.MakeNop() p.Send(conn) } } }