// runCmd runs the command on the next available server, or creates a new
// server if none are available. NB: we only return an error if we can't start
// the cmd, not if the command fails (schedule() only guarantees that the cmds
// are run count times, not that they are /successful/ that many times).
func (s *opst) runCmd(cmd string, req *Requirements) error {
	// look through space on existing servers to see if we can run cmd on one
	// of them
	s.mutex.Lock()
	var server *cloud.Server
	for sid, thisServer := range s.servers {
		if thisServer.Destroyed() {
			delete(s.servers, sid)
			continue
		}
		if thisServer.HasSpaceFor(req.Cores, req.RAM, req.Disk) > 0 {
			server = thisServer
			break
		}
	}

	// else see if there will be space on a soon-to-be-spawned server
	// *** this is untested
	if server == nil {
		for _, standinServer := range s.standins {
			if standinServer.hasSpaceFor(req) > 0 {
				standinServer.allocate(req)
				s.mutex.Unlock()
				server = standinServer.waitForServer()
				s.mutex.Lock()
				break
			}
		}
	}

	// else spawn the smallest server that can run this cmd, recording our new
	// quota usage
	if server == nil {
		reqForSpawn := req
		if req.RAM < s.config.OSRAM {
			reqForSpawn = &Requirements{
				RAM:   s.config.OSRAM,
				Time:  req.Time,
				Cores: req.Cores,
				Disk:  req.Disk,
				Other: req.Other,
			}
		}

		flavor, err := s.determineFlavor(reqForSpawn)
		if err != nil {
			s.mutex.Unlock()
			return err
		}

		// because spawning can take a while, we record that we're going to use
		// up some of our quota and unlock so other things can proceed
		numSpawning := s.waitingToSpawn + s.spawningNow
		if numSpawning == 0 {
			s.nextSpawnTime = time.Now().Add(10 * time.Second)
			s.spawningNow++
		} else {
			s.waitingToSpawn++
		}
		s.reservedInstances++
		s.reservedCores += flavor.Cores
		s.reservedRAM += flavor.RAM
		standinID := uuid.NewV4().String()
		standinServer := newStandin(standinID, flavor)
		standinServer.allocate(req)
		s.standins[standinID] = standinServer
		s.mutex.Unlock()

		// now spawn, but don't overload the system by trying to spawn too many
		// at once; wait at least 10 seconds between each spawn
		if numSpawning > 0 {
			done := make(chan error)
			go func() {
				ticker := time.NewTicker(1 * time.Second)
				for {
					select {
					case <-ticker.C:
						s.mutex.Lock()
						if time.Now().After(s.nextSpawnTime) {
							s.nextSpawnTime = time.Now().Add(10 * time.Second)
							s.waitingToSpawn--
							s.spawningNow++
							s.mutex.Unlock()
							ticker.Stop()
							done <- nil
							return
						}
						s.mutex.Unlock()
						continue
					case <-s.stopWaitingToSpawn:
						ticker.Stop()
						s.mutex.Lock()
						s.waitingToSpawn--
						// we're never going to spawn for this standin, so also
						// undo our reservation of quota
						s.reservedInstances--
						s.reservedCores -= flavor.Cores
						s.reservedRAM -= flavor.RAM
						standinServer.failed()
						delete(s.standins, standinID)
						s.mutex.Unlock()
						done <- errors.New("giving up waiting to spawn")
						return
					}
				}
			}()
			err = <-done
			if err != nil {
				return err
			}
		}

		server, err = s.provider.Spawn(s.config.OSPrefix, s.config.OSUser, flavor.ID, s.config.ServerKeepTime, false, s.config.PostCreationScript)

		if err == nil {
			// check that the exe of the cmd we're supposed to run exists on
			// the new server, and if not, copy it over *** this is just a hack
			// to get wr working, need to think of a better way of doing this...
			exe := strings.Split(cmd, " ")[0]
			if _, err = os.Stat(exe); err == nil {
				var stdout string
				if stdout, err = server.RunCmd("file "+exe, false); err == nil {
					if strings.Contains(stdout, "No such file") {
						err = server.UploadFile(exe, exe)
						if err == nil {
							server.RunCmd("chmod u+x "+exe, false)
						} else {
							server.Destroy()
						}
					}
				} else {
					server.Destroy()
				}
			} else {
				server.Destroy()
			}
		}

		// handle Spawn() or upload-of-exe errors now, by noting we failed and
		// unreserving resources
		if err != nil {
			s.mutex.Lock()
			s.spawningNow--
			s.reservedInstances--
			s.reservedCores -= flavor.Cores
			s.reservedRAM -= flavor.RAM
			standinServer.failed()
			delete(s.standins, standinID)
			s.mutex.Unlock()
			return err
		}

		s.mutex.Lock()
		s.spawningNow--
		s.reservedInstances--
		s.reservedCores -= flavor.Cores
		s.reservedRAM -= flavor.RAM
		s.servers[server.ID] = server
		standinServer.worked(server)
		delete(s.standins, standinID)
	}

	server.Allocate(req.Cores, req.RAM, req.Disk)
	s.mutex.Unlock()

	// now we have a server, ssh over and run the cmd on it
	var err error
	if server.IP == "127.0.0.1" {
		err = s.local.runCmd(cmd, req)
	} else {
		_, err = server.RunCmd(cmd, false)
	}

	// having run a command, this server is now available for another; signal a
	// runCmd call that is waiting its turn to spawn a new server to give up
	// waiting and potentially get scheduled on us instead
	s.mutex.Lock()
	server.Release(req.Cores, req.RAM, req.Disk)
	if s.waitingToSpawn > 0 && server.IP != "127.0.0.1" {
		s.mutex.Unlock()
		s.stopWaitingToSpawn <- true
	} else {
		s.mutex.Unlock()
	}

	return err
}
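// The spawn throttling in runCmd above combines a 1s ticker with a
// cancellation channel so that at most one spawn is started per 10 seconds.
// Below is a minimal, self-contained sketch of that pattern, kept inside a
// comment so it doesn't affect this package; the names (waitForSlot, stop) are
// illustrative only and not part of wr's API.
/*
package main

import (
	"errors"
	"fmt"
	"sync"
	"time"
)

// waitForSlot blocks until nextSpawnTime has passed (claiming the following
// 10s slot for the caller) or until a signal arrives on stop, mirroring the
// throttling loop in runCmd.
func waitForSlot(mu *sync.Mutex, nextSpawnTime *time.Time, stop chan bool) error {
	ticker := time.NewTicker(1 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			mu.Lock()
			if time.Now().After(*nextSpawnTime) {
				// claim the slot and push the next allowed spawn 10s away
				*nextSpawnTime = time.Now().Add(10 * time.Second)
				mu.Unlock()
				return nil
			}
			mu.Unlock()
		case <-stop:
			return errors.New("giving up waiting to spawn")
		}
	}
}

func main() {
	var mu sync.Mutex
	next := time.Now().Add(2 * time.Second)
	stop := make(chan bool)
	fmt.Println(waitForSlot(&mu, &next, stop)) // prints <nil> after ~2s
}
*/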
// bootstrapOnRemote uploads wr to the given server, creates its config and
// cloud resource files there, and then starts the remote wr manager if it
// isn't already running.
func bootstrapOnRemote(provider *cloud.Provider, server *cloud.Server, exe string, mp int, wp int, wrMayHaveStarted bool) {
	// upload ourselves
	remoteExe := filepath.Join(cloudBinDir, "wr")
	err := server.UploadFile(exe, remoteExe)
	if err != nil && !wrMayHaveStarted {
		provider.TearDown()
		die("failed to upload wr to the server at %s: %s", server.IP, err)
	}

	// create a config file on the remote to have the remote wr work on the
	// same ports that we'd use locally
	err = server.CreateFile(fmt.Sprintf("managerport: \"%d\"\nmanagerweb: \"%d\"\n", mp, wp), wrConfigFileName)
	if err != nil {
		provider.TearDown()
		die("failed to create our config file on the server at %s: %s", server.IP, err)
	}

	_, err = server.RunCmd("chmod u+x "+remoteExe, false)
	if err != nil && !wrMayHaveStarted {
		provider.TearDown()
		die("failed to make remote wr executable: %s", err)
	}

	// copy over our cloud resource details, including our ssh key
	localResourceFile := filepath.Join(config.ManagerDir, "cloud_resources."+providerName+".wr-"+config.Deployment)
	remoteResourceFile := filepath.Join("./.wr_"+config.Deployment, "cloud_resources."+providerName+".wr-"+config.Deployment)
	err = server.UploadFile(localResourceFile, remoteResourceFile)
	if err != nil && !wrMayHaveStarted {
		provider.TearDown()
		die("failed to upload wr cloud resources file to the server at %s: %s", server.IP, err)
	}

	localKeyFile := filepath.Join(config.ManagerDir, "cloud_resources."+providerName+".key")
	err = ioutil.WriteFile(localKeyFile, []byte(provider.PrivateKey()), 0600)
	if err != nil {
		provider.TearDown()
		die("failed to create key file %s: %s", localKeyFile, err)
	}
	remoteKeyFile := filepath.Join("./.wr_"+config.Deployment, "cloud_resources."+providerName+".key")
	err = server.UploadFile(localKeyFile, remoteKeyFile)
	if err != nil && !wrMayHaveStarted {
		provider.TearDown()
		die("failed to upload wr cloud key file to the server at %s: %s", server.IP, err)
	}

	// start up the manager
	var alreadyStarted bool
	if wrMayHaveStarted {
		response, err := server.RunCmd(fmt.Sprintf("%s manager status --deployment %s", remoteExe, config.Deployment), false)
		if err == nil && response == "started\n" {
			alreadyStarted = true
		}
	}

	if !alreadyStarted {
		// build a command prefix that sets all the required env vars for this
		// provider
		envvarPrefix := ""
		envvars, _ := cloud.RequiredEnv(providerName)
		for _, envvar := range envvars {
			envvarPrefix += fmt.Sprintf("%s=\"%s\" ", envvar, os.Getenv(envvar))
		}

		var postCreationArg string
		if postCreationScript != "" {
			// copy over the post creation script to the server so the remote
			// manager can use it
			remoteScriptFile := filepath.Join("./.wr_"+config.Deployment, "cloud_resources."+providerName+".script")
			err = server.UploadFile(postCreationScript, remoteScriptFile)
			if err != nil && !wrMayHaveStarted {
				provider.TearDown()
				die("failed to upload wr cloud script file to the server at %s: %s", server.IP, err)
			}
			postCreationArg = " -p " + remoteScriptFile
		}

		var flavorArg string
		if flavorRegex != "" {
			flavorArg = " -l '" + flavorRegex + "'"
		}

		// get the manager running
		mCmd := fmt.Sprintf("%s%s manager start --deployment %s -s %s -k %d -o '%s' -r %d -m %d -u %s%s%s", envvarPrefix, remoteExe, config.Deployment, providerName, serverKeepAlive, osPrefix, osRAM, maxServers, osUsername, postCreationArg, flavorArg)
		_, err = server.RunCmd(mCmd, false)
		if err != nil {
			provider.TearDown()
			die("failed to start wr manager on the remote server: %s", err)
		}

		// wait a few seconds for the manager to start listening on its ports
		<-time.After(3 * time.Second)
	}
}
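// bootstrapOnRemote sleeps for a fixed 3 seconds after starting the remote
// manager. As a sketch of an alternative (standard library only, kept inside a
// comment; the address and timeout values are hypothetical, not wr defaults),
// one could instead poll the manager port until it accepts TCP connections.
/*
package main

import (
	"fmt"
	"net"
	"time"
)

// waitForPort polls addr until a TCP connection succeeds or timeout elapses.
func waitForPort(addr string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		conn, err := net.DialTimeout("tcp", addr, time.Second)
		if err == nil {
			conn.Close()
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("%s was not reachable within %s: %s", addr, timeout, err)
		}
		time.Sleep(500 * time.Millisecond)
	}
}

func main() {
	// e.g. wait for a (hypothetical) manager port on the remote server
	fmt.Println(waitForPort("192.0.2.10:46407", 10*time.Second))
}
*/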