func UnixAudienceListener(sockaddr string) { fi, err := os.Stat(sockaddr) if err == nil { fmode := fi.Mode() if fmode&os.ModeType == os.ModeSocket { o.Warn("Removing stale socket at %s", sockaddr) os.Remove(sockaddr) } else { o.Fail("%s exists and is not a socket", sockaddr) } } laddr, err := net.ResolveUnixAddr("unix", sockaddr) o.MightFail(err, "Couldn't resolve audience socket address") l, err := net.ListenUnix("unix", laddr) o.MightFail(err, "Couldn't start audience unixsock listener") // Fudge the permissions on the unixsock! fi, err = os.Stat(sockaddr) if err == nil { os.Chmod(sockaddr, fi.Mode()|0777) } else { o.Warn("Couldn't fudge permission on audience socket: %s", err) } // make sure we clean up the unix socket when we die. defer l.Close() defer os.Remove(sockaddr) AudienceListener(l) }
func UnixAudienceListener(sockaddr string, sockmode os.FileMode, sockuid int, sockgid int) { fi, err := os.Stat(sockaddr) if err == nil { if (fi.Mode() & os.ModeSocket) != 0 { os.Remove(sockaddr) } else { o.Fail("%s exists and is not a socket", sockaddr) } } err = os.MkdirAll(path.Dir(sockaddr), 0755) o.MightFail(err, "Couldn't create socket directory") laddr, err := net.ResolveUnixAddr("unix", sockaddr) o.MightFail(err, "Couldn't resolve audience socket address") old_umask := syscall.Umask(0777) defer syscall.Umask(old_umask) l, err := net.ListenUnix("unix", laddr) o.MightFail(err, "Couldn't start audience unixsock listener") if sockuid >= 0 || sockgid >= 0 { err = os.Chown(sockaddr, sockuid, sockgid) o.MightFail(err, "Couldn't chown audience unixsock listener") } err = os.Chmod(sockaddr, sockmode) o.MightFail(err, "Couldn't chmod audience unixsock listener") // make sure we clean up the unix socket when we die. defer l.Close() defer os.Remove(sockaddr) AudienceListener(l) }
func loadSpoolFiles(dirname string, depth int) { dh, err := os.Open(dirname) o.MightFail(err, "Couldn't open %s", dirname) nodes, err := dh.Readdir(-1) o.MightFail(err, "Couldn't readdir on %s", dirname) if depth > 0 { for _, n := range nodes { abspath := path.Join(dirname, n.Name()) if (n.Mode() & os.ModeType) == os.ModeDir { // if not a single character, it's not a spool node. if len(n.Name()) != 1 { continue } if n.Name() == "." { // we're not interested in . continue } nrunes := []rune(n.Name()) if unicode.Is(unicode.ASCII_Hex_Digit, nrunes[0]) { loadSpoolFiles(abspath, depth-1) } else { o.Warn("Foreign dirent %s found in spool tree", abspath) } } } } else { // depth == 0 - only interested in files. for _, n := range nodes { abspath := path.Join(dirname, n.Name()) if n.Mode()&os.ModeType == 0 { if len(n.Name()) != 16 { shuffleToCorrupted(abspath, "Filename incorrect length") continue } id, err := strconv.ParseUint(n.Name(), 16, 64) if err != nil { shuffleToCorrupted(abspath, "Invalid Filename") continue } fh, err := os.Open(abspath) if err != nil { shuffleToCorrupted(abspath, "Couldn't open") continue } defer fh.Close() jr, err := JobRequestFromReader(fh) if err != nil || jr.Id != id { o.Warn("Couldn't parse?! %s", err) shuffleToCorrupted(abspath, "Parse Failure") continue } // Add the request to the registry directly. if !RestoreJobState(jr) { shuffleToCorrupted(abspath, "Job State Invalid") } } } } }
func ConfigLoad() { // attempt to open the configuration file. fh, err := os.Open(*ConfigFile) if nil == err { defer fh.Close() // reset the config File data, then reload it. configFile.Reset() ierr := configFile.Read(fh, 1) o.MightFail(ierr, "Couldn't parse configuration") } else { o.Warn("Couldn't open configuration file: %s. Proceeding anyway.", err) } playerpath := strings.TrimSpace(GetStringOpt("player file path")) pfh, err := os.Open(playerpath) o.MightFail(err, "Couldn't open \"%s\"", playerpath) pbr := bufio.NewReader(pfh) ahmap := make(map[string]bool) for err = nil; err == nil; { var lb []byte var prefix bool lb, prefix, err = pbr.ReadLine() if nil == lb { break } if prefix { o.Fail("ConfigLoad: Short Read (prefix only)!") } line := strings.TrimSpace(string(lb)) if line == "" { continue } if line[0] == '#' { continue } ahmap[line] = true } // convert newAuthorisedHosts to a slice authorisedHosts := make([]string, len(ahmap)) idx := 0 for k, _ := range ahmap { authorisedHosts[idx] = k idx++ } ClientUpdateKnown(authorisedHosts) // set the spool directory SetSpoolDirectory(GetStringOpt("conductor state path")) }
func ScoreConfigure(si *ScoreInfo, r io.Reader) { o.Info("Score: %s (%s)", (*si).Name, (*si).Executable) config := NewScoreInfoConfig() err := config.Read(r, 1) o.MightFail(err, "Error Parsing Score Configuration for %s", si.Name) si.updateFromConfig(config) }
func loadLastId() { fh, err := os.Open(checkpointPath()) if err == nil { defer fh.Close() // we have a checkpoint file. blah. cbio := bufio.NewReader(fh) l, err := cbio.ReadString('\n') lastId, err = strconv.ParseUint(strings.TrimSpace(l), 10, 64) if err != nil { o.Fail("Couldn't read last ID from checkpoint file") } lastId += IdCheckpointSafetySkip } else { if !os.IsNotExist(err) { o.Fail("Found checkpoint file, but couldn't open it: %s", err) } fh, err := os.Open(savePath()) if err != nil { if os.IsNotExist(err) { lastId = 0 return } o.MightFail(err, "Couldn't open last_id file") } defer fh.Close() cbio := bufio.NewReader(fh) l, err := cbio.ReadString('\n') lastId, err = strconv.ParseUint(strings.TrimSpace(l), 10, 64) if err != nil { o.Fail("Couldn't read last ID from last_id") } } writeIdCheckpoint() }
func (client *ClientInfo) SendTask(task *TaskRequest) { tr := task.Encode() p, err := o.Encode(tr) o.MightFail(err, "Couldn't encode task for client") client.Send(p) task.RetryTime = time.Now().Add(RetryDelay) }
func shuffleToCorrupted(abspath, reason string) { basename := path.Base(abspath) targetname := path.Join(spoolDirectory, "corrupt", basename) // make sure there's nothing in the target name. os.Remove(targetname) err := os.Rename(abspath, targetname) o.MightFail(err, "Couldn't bin corrupt spoolfile %s", abspath) o.Warn("Moved \"%s\" to corrupted spool: %s", abspath, reason) }
func LoadScores() { scoreDirectory := GetStringOpt("score directory") dir, err := os.Open(scoreDirectory) o.MightFail(err, "Couldn't open Score directory") defer dir.Close() Scores = make(map[string]*ScoreInfo) files, err := dir.Readdir(-1) for i := range files { name := files[i].Name() // skip ., .. and other dotfiles. if strings.HasPrefix(name, ".") { continue } // emacs backup files. ignore these. if strings.HasSuffix(name, "~") || strings.HasPrefix(name, "#") { continue } // .conf is reserved for score configurations. if strings.HasSuffix(name, ".conf") { continue } // check to see if it's a file or symlink ftype := files[i].Mode() & os.ModeType if ftype != 0 && ftype != os.ModeSymlink { continue } // check for the executionable bit if files[i].Mode()&0111 == 0 { continue } fullpath := path.Join(scoreDirectory, name) conffile := fullpath + ".conf" o.Warn("Considering %s as score", name) si := NewScoreInfo() si.Name = name si.Executable = fullpath conf, err := os.Open(conffile) if err == nil { o.Warn("Parsing configuration for %s", fullpath) ScoreConfigure(si, conf) conf.Close() } else { o.Warn("Couldn't open config file for %s, assuming defaults: %s", name, err) } Scores[name] = si } }
func makeSpoolDirInner(prefix string, depth int) { for i := 0; i < 16; i++ { dirname := path.Join(prefix, fmt.Sprintf("%01X", i)) if depth == 1 { err := os.MkdirAll(dirname, 0700) o.MightFail(err, "Couldn't make directory building spool tree") } else { makeSpoolDirInner(dirname, depth-1) } } }
func ConfigLoad() { // attempt to open the configuration file. fh, err := os.Open(*ConfigFile) if nil == err { defer fh.Close() // reset the config File data, then reload it. configFile.Reset() ierr := configFile.Read(fh, 1) o.MightFail(ierr, "Couldn't parse configuration") } else { o.Warn("Couldn't open configuration file: %s. Proceeding anyway.", err) } // load the x509 certificates x509CertFilename := GetStringOpt("x509 certificate") x509PrivateKeyFilename := GetStringOpt("x509 private key") CertPair, err = tls.LoadX509KeyPair(x509CertFilename, x509PrivateKeyFilename) o.MightFail(err, "Couldn't load certificates") // load the CA Certs CACertPool = x509.NewCertPool() caCertNames := GetCACertList() if caCertNames != nil { for _, filename := range caCertNames { fh, err := os.Open(filename) if err != nil { o.Warn("Whilst parsing CA certs, couldn't open %s: %s", filename, err) continue } defer fh.Close() fi, err := fh.Stat() o.MightFail(err, "Couldn't stat CA certificate file: %s", filename) data := make([]byte, fi.Size()) fh.Read(data) CACertPool.AppendCertsFromPEM(data) } } }
func sendResponse(c net.Conn, resp *TaskResponse) { //FIXME: update retry time on Response ptr := resp.Encode() p, err := o.Encode(ptr) o.MightFail(err, "Failed to encode response") _, err = p.Send(c) if err != nil { o.Warn("Transmission error: %s", err) c.Close() prequeueResponse(resp) lostConnection <- 1 } else { appendUnacknowledgedResponse(resp) } }
func connectMe(initialDelay int64) { var backOff int64 = initialDelay for { // Sleep first. if backOff > 0 { o.Info("Sleeping for %d seconds", backOff/1e9) err := time.Sleep(backOff) o.MightFail(err, "Couldn't Sleep") backOff *= ReconnectDelayScale if backOff > MaximumReconnectDelay { backOff = MaximumReconnectDelay } } else { backOff = InitialReconnectDelay } tconf := &tls.Config{ RootCAs: CACertPool, } tconf.Certificates = append(tconf.Certificates, CertPair) // update our local hostname. LocalHostname = GetStringOpt("player name") if LocalHostname == "" { LocalHostname = o.ProbeHostname() o.Warn("No hostname provided - probed hostname: %s", LocalHostname) } masterHostname := GetStringOpt("master") raddr := fmt.Sprintf("%s:%d", masterHostname, 2258) o.Info("Connecting to %s", raddr) conn, err := tls.Dial("tcp", raddr, tconf) if err == nil { conn.Handshake() err = conn.VerifyHostname(masterHostname) } if err == nil { nc := new(NewConnectionInfo) nc.conn = conn nc.timeout = backOff newConnection <- nc return } o.Warn("Couldn't connect to master: %s", err) } }
func ServiceRequests() { var sockConfig tls.Config // resolve the bind address bindAddressStr := GetStringOpt("bind address") var bindAddr *net.IPAddr if bindAddressStr != "" { var err error bindAddr, err = net.ResolveIPAddr("ip", bindAddressStr) if err != nil { o.Warn("Ignoring bind address. Couldn't resolve \"%s\": %s", bindAddressStr, err) } else { bindAddr = nil } } // load the x509 certificate and key, then attach it to the tls config. x509CertFilename := GetStringOpt("x509 certificate") x509PrivateKeyFilename := GetStringOpt("x509 private key") serverCert, err := tls.LoadX509KeyPair(x509CertFilename, x509PrivateKeyFilename) o.MightFail(err, "Couldn't load certificates") sockConfig.Certificates = append(sockConfig.Certificates, serverCert) // load the CA certs CACertPool = x509.NewCertPool() caCertNames := GetCACertList() if caCertNames != nil { for _, filename := range caCertNames { fh, err := os.Open(filename) if err != nil { o.Warn("Whilst parsing CA certs, couldn't open %s: %s", filename, err) continue } defer fh.Close() fi, err := fh.Stat() o.MightFail(err, "Couldn't stat CA certificate file: %s", filename) data := make([]byte, fi.Size()) fh.Read(data) CACertPool.AppendCertsFromPEM(data) } } sockConfig.ClientCAs = CACertPool // determine the server hostname. 
servername := GetStringOpt("server name") if servername != "" { o.Info("Using %s as the server name", servername) sockConfig.ServerName = servername } else { if bindAddr != nil { o.Warn("Probing for FQDN for bind address as none was provided") hostnames, err := net.LookupAddr(bindAddr.String()) o.MightFail(err, "Failed to get full hostname for bind address") sockConfig.ServerName = hostnames[0] } else { o.Warn("Probing for FQDN as no server name was provided") sockConfig.ServerName = o.ProbeHostname() } } // ask the client to authenticate sockConfig.ClientAuth = tls.RequireAndVerifyClientCert if *DontVerifyPeer { sockConfig.ClientAuth = tls.RequestClientCert } /* convert the bindAddress to a string suitable for the Listen call */ var laddr string if bindAddr == nil { laddr = fmt.Sprintf(":%d", o.DefaultMasterPort) } else { laddr = fmt.Sprintf("%s:%d", bindAddr.String(), o.DefaultMasterPort) } o.Info("Binding to %s...", laddr) listener, err := tls.Listen("tcp", laddr, &sockConfig) o.MightFail(err, "Couldn't bind TLS listener") for { o.Info("Waiting for connection...") c, err := listener.Accept() o.MightFail(err, "Couldn't accept TLS connection") o.Info("Connection received from %s", c.RemoteAddr().String()) HandleConnection(c) } }
// ProcessingLoop is the player's main event loop. It never returns.
//
// It starts a connection attempt via connectMe and then, on every iteration:
//   - while connected, sends any unacknowledged response whose RetryTime has
//     already passed, and arms a timer for the first one that is not yet due;
//   - if nothing is executing, starts the next pending task, or (when
//     connected and no request is outstanding) asks the master for one;
//   - waits for exactly one event: task completion, retry timer, new
//     connection, lost connection, message from the master, score reload
//     request, or keepalive timeout.
//
// State kept across iterations: the current connection (nil when
// disconnected), the response currently awaiting retry, the completion
// channel of the running task (nil when idle), the delay to hand to the next
// connectMe, and whether a score reload was deferred.
func ProcessingLoop() {
	var conn net.Conn = nil
	var nextRetryResp *TaskResponse = nil
	var taskCompletionChan <-chan *TaskResponse = nil
	var connectDelay time.Duration
	var doScoreReload bool = false
	// kick off a new connection attempt.
	go connectMe(connectDelay)

	// and this is where we spin!
	for {
		// Both are reset each iteration. A nil retryChan never becomes
		// ready, which disables the retry case in the select below.
		var retryDelay time.Duration = 0
		var retryChan <-chan time.Time = nil

		if conn != nil {
			// Drain every response that is already overdue; stop at the
			// first one that is not yet due (or when the queue is empty).
			for nextRetryResp == nil {
				nextRetryResp = getNextUnacknowledgedResponse()
				if nil == nextRetryResp {
					break
				}
				retryDelay = nextRetryResp.RetryTime.Sub(time.Now())
				if retryDelay < 0 {
					sendResponse(conn, nextRetryResp)
					nextRetryResp = nil
				}
			}
			if nextRetryResp != nil {
				retryChan = time.After(retryDelay)
			}
		}
		// A nil taskCompletionChan means no task is currently executing.
		if taskCompletionChan == nil {
			nextTask := getNextPendingTask()
			if nextTask != nil {
				taskCompletionChan = ExecuteTask(nextTask)
			} else {
				// Nothing queued locally: ask the master for work, but
				// only once per outstanding request.
				if conn != nil && !pendingTaskRequest {
					o.Debug("Asking for trouble")
					p := o.MakeReadyForTask()
					p.Send(conn)
					o.Debug("Sent Request for trouble")
					pendingTaskRequest = true
				}
			}
		}
		select {
		// Currently executing job finishes.
		case newresp := <-taskCompletionChan:
			o.Debug("job%d: Completed with State %s\n", newresp.id, newresp.State)
			// preemptively set a retrytime.
			newresp.RetryTime = time.Now()
			// ENOCONN - sub it in as our next retryresponse, and prepend the old one onto the queue.
			if nil == conn {
				if nil != nextRetryResp {
					prequeueResponse(nextRetryResp)
				}
				o.Debug("job%d: Queuing Initial Response", newresp.id)
				nextRetryResp = newresp
			} else {
				o.Debug("job%d: Sending Initial Response", newresp.id)
				sendResponse(conn, newresp)
			}
			// A reload requested while the task was running is performed
			// now that nothing is executing.
			if doScoreReload {
				o.Info("Performing Deferred score reload")
				LoadScores()
				doScoreReload = false
			}
			taskCompletionChan = nil
		// If the current unacknowledged response needs a retry, send it.
		case <-retryChan:
			sendResponse(conn, nextRetryResp)
			nextRetryResp = nil
		// New connection. Set up the receiver goroutine and introduce ourselves.
		case nci := <-newConnection:
			if conn != nil {
				conn.Close()
			}
			conn = nci.conn
			connectDelay = nci.timeout
			pendingTaskRequest = false
			// start the reader
			go Reader(conn)

			/* Introduce ourselves */
			p := o.MakeIdentifyClient(LocalHostname, PlayerVersion)
			p.Send(conn)
		// Lost connection. Shut down the connection.
		// NOTE(review): conn is not nil-checked before Close here; if a
		// lostConnection signal can arrive while already disconnected this
		// would panic — confirm against the senders.
		case <-lostConnection:
			o.Warn("Lost Connection to Master")
			conn.Close()
			conn = nil
			// restart the connection attempts
			go connectMe(connectDelay)
		// Message received from master. Decode and action.
		case p := <-receivedMessage:
			// because the message could possibly be an ACK, push the next retry response back into the queue so acknowledge can find it.
			if nil != nextRetryResp {
				prequeueResponse(nextRetryResp)
				nextRetryResp = nil
			}
			// Packets with no payload are dispatched with a nil body.
			var upkt interface{} = nil
			if p.Length > 0 {
				var err error
				upkt, err = p.Decode()
				o.MightFail(err, "Couldn't decode packet from master")
			}
			handler, exists := dispatcher[p.Type]
			if exists {
				// A handled message resets the reconnect delay.
				connectDelay = 0
				handler(conn, upkt)
			} else {
				o.Fail("Unhandled Pkt Type %d", p.Type)
			}
		// Reload scores
		case <-reloadScores:
			// fortunately this is actually completely safe as
			// long as nobody's currently executing.
			// who'd have thunk it?
			if taskCompletionChan == nil {
				o.Info("Reloading scores")
				LoadScores()
			} else {
				o.Info("Deferring score reload (execution in progress)")
				doScoreReload = true
			}
		// Keepalive delay expired. Send Nop.
		case <-time.After(KeepaliveDelay):
			// Nothing to keep alive while disconnected.
			if conn == nil {
				break
			}
			o.Debug("Sending NOP")
			p := o.MakeNop()
			p.Send(conn)
		}
	}
}
func GetModeOpt(key string) os.FileMode { s := GetStringOpt(key) mode, err := strconv.ParseUint(s, 8, 0) o.MightFail(err, "Invalid mode in %s option: %s", key, s) return os.FileMode(mode) }
// doExecution runs the score named by task.Score as a child process, waits
// for it to finish, records the outcome in task.MyResponse.State, and always
// sends task.MyResponse on completionChannel before returning.
//
// Outcome mapping, as set below:
//   - unknown score                          -> RESP_FAILED_UNKNOWN_SCORE
//   - score interface / PWD / start failure  -> RESP_FAILED_HOST_ERROR
//   - wait error or killed by a signal       -> RESP_FAILED_UNKNOWN
//   - exit status 0                          -> RESP_FINISHED
//   - non-zero exit status                   -> RESP_FAILED
//
// The child gets a minimal environment (PATH, IFS, PWD) plus the overrides
// from the score interface, /dev/null on stdin and stdout, and a pipe on
// stderr whose read end is handed to batchLogger.
func doExecution(task *TaskRequest, completionChannel chan<- *TaskResponse) {
	// we must notify the parent when we exit.
	// Registered first, so it runs after every other deferred call below.
	defer func(c chan<- *TaskResponse, task *TaskRequest) { c <- task.MyResponse }(completionChannel, task)

	// first of all, verify that the score exists at all.
	score, exists := Scores[task.Score]
	if !exists {
		o.Warn("job%d: request for unknown score: %s", task.Id, task.Score)
		task.MyResponse.State = RESP_FAILED_UNKNOWN_SCORE
		return
	}
	si := NewScoreInterface(task)
	if si == nil {
		o.Warn("job%d: couldn't initialise score interface", task.Id)
		task.MyResponse.State = RESP_FAILED_HOST_ERROR
		return
	}
	if !si.Prepare() {
		o.Warn("job%d: couldn't prepare score interface", task.Id)
		task.MyResponse.State = RESP_FAILED_HOST_ERROR
		return
	}
	// Cleanup is only registered once Prepare has succeeded.
	defer si.Cleanup()

	eenv := si.SetupProcess()
	task.MyResponse.State = RESP_RUNNING

	procenv := new(os.ProcAttr)
	// Build the default environment.
	procenv.Env = peSetEnv(procenv.Env, "PATH", "/usr/bin:/usr/sbin:/bin:/sbin")
	procenv.Env = peSetEnv(procenv.Env, "IFS", " \t\n")
	pwd, err := os.Getwd()
	if err != nil {
		task.MyResponse.State = RESP_FAILED_HOST_ERROR
		o.Warn("job%d: couldn't resolve PWD: %s", task.Id, err)
		return
	}
	procenv.Env = peSetEnv(procenv.Env, "PWD", pwd)
	// copy in the environment overrides
	for k, v := range eenv.Environment {
		procenv.Env = peSetEnv(procenv.Env, k, v)
	}

	// attach FDs to procenv.
	procenv.Files = make([]*os.File, 3)

	// first off, attach /dev/null to stdin and stdout
	devNull, err := os.OpenFile(os.DevNull, os.O_RDWR|os.O_APPEND, 0666)
	o.MightFail(err, "couldn't open DevNull")
	defer devNull.Close()
	for i := 0; i < 2; i++ {
		procenv.Files[i] = devNull
	}
	// attach STDERR to our logger via pipe.
	lr, lw, err := os.Pipe()
	o.MightFail(err, "Couldn't create pipe")
	// Our copy of the write end is closed on return.
	defer lw.Close()
	// lr will be closed by the logger.
	procenv.Files[2] = lw
	// check the environment's configuration and allow it to override stdin, stdout, and FDs 3+
	// NOTE(review): index 2 of eenv.Files takes the append branch, so it
	// becomes FD 3 rather than replacing stderr — confirm this is the
	// intended layout of eenv.Files.
	if nil != eenv.Files {
		for i := range eenv.Files {
			if i < 2 {
				procenv.Files[i] = eenv.Files[i]
			} else {
				procenv.Files = append(procenv.Files, eenv.Files[i])
			}
		}
	}
	var args []string
	args = append(args, eenv.Arguments...)

	o.Info("job%d: executing %s...", task.Id, score.Executable)
	// The logger is started before the process so the pipe is being read
	// from the moment the child can write to it.
	go batchLogger(task.Id, lr)
	proc, err := os.StartProcess(score.Executable, args, procenv)
	if err != nil {
		o.Warn("job%d: failed to start process", task.Id)
		task.MyResponse.State = RESP_FAILED_HOST_ERROR
		return
	}
	wm, err := proc.Wait()
	if err != nil {
		o.Warn("job%d: error waiting for process", task.Id)
		task.MyResponse.State = RESP_FAILED_UNKNOWN
		// Worst of all, we don't even know if we succeeded.
		return
	}
	// If the assertion fails ws is the zero WaitStatus; the result of the
	// type assertion is deliberately not checked.
	ws, _ := wm.Sys().(syscall.WaitStatus)
	if !(ws.Signaled() || ws.Exited()) {
		o.Assert("Non Terminal notification received when not expected.")
		return
	}
	if ws.Signaled() {
		o.Warn("job%d: process got signalled", task.Id)
		task.MyResponse.State = RESP_FAILED_UNKNOWN
		return
	}
	if ws.Exited() {
		if 0 == ws.ExitStatus() {
			o.Warn("job%d: process exited OK", task.Id)
			task.MyResponse.State = RESP_FINISHED
		} else {
			o.Warn("job%d: process exited with failure", task.Id)
			task.MyResponse.State = RESP_FAILED
		}
		return
	}
	o.Assert("Should never get here.")
}
func ScoreConfigure(si *ScoreInfo, r io.Reader) { config := NewScoreInfoConfig() err := config.Read(r, 1) o.MightFail(err, "Error Parsing Score Configuration for %s", si.Name) si.updateFromConfig(config) }
func (req *JobRequest) UpdateInSpool() { buf, err := json.MarshalIndent(req, "", " ") o.MightFail(err, "Failed to marshal job %d", req.Id) //FIXME: should try to do this out of the registry's thread. req.doSerialisation(buf) }