func timerTicker(timer *WallclockTimer) { for t := range timer.ticker.C { timer.incrementTimer() select { case command := <-timer.command: log.Debugln("Command tick at: ", t) log.Debugf("Command received: %s\n", command) switch command { case "reset": log.Debugln("Received a Reset") timer.tick = false timer.elapsedTime = time.Duration(0) case "start": log.Debugln("Received a start") timer.tick = true timer.elapsedTime = time.Duration(0) timer.previousTime = time.Now() case "stop": log.Debugln("Received a stop") timer.tick = false case "resume": log.Debugln("Received a resume") timer.tick = true timer.previousTime = time.Now() default: log.Errorln("Unknown command received") } default: //log.Debugln("No commands received, just keep ticking") } //log.Debugf("Timer elapsed time is: %v", timer.elapsedTime) //log.Debugf("Timer timeout time is: %v", timer.timeoutTime) if timer.elapsedTime > timer.timeoutTime { log.Debugln("Elapsed time has exceeded timeout time.") timer.done <- errors.New("The timer has timed out.") timer.tick = false // Possible bug? Reset timer to equal timeout timer.elapsedTime = timer.timeoutTime } } }
func main() { if *debugMode { log.SetLevel(log.DebugLevel) } else if *noWarn { log.SetLevel(log.ErrorLevel) } // Create channel for ProcessStats to trigger a sample psChan := make(chan agents.ProcessStatCommand) // Incoming remote command channel (new with each reconnect) rcChan := make(chan chan agents.RemoteControlCommand) args := flag.Args() var cmdArgs []string var cmd string if len(args) > 0 { cmd = args[0] } else { log.Fatal("Did you forget a command to run?") return } log.Debugf("cmd: %s cmdArgs: %q\n", cmd, cmdArgs) done := make(chan error) // Initialize job job, err := agents.NewControlledProcess(cmd, args, done, *stdoutByteLimit) if err != nil { log.Fatalf("Failed to create a NewControlledProcess: %s\n", err) return } log.Debugf("%#v\n", job) signals := make(chan os.Signal, 1) signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGQUIT) timer, err := agents.NewTimer(*wallclockTimeout) if err != nil { log.Warnln("Error returned creating timeout agent", err) } log.Debugf("Starting timer with timeout of: %v\n", *wallclockTimeout) timer.Start() sess := session{ job: job, exchange: *exchange, rcRoutingKey: *rmtKey, psChan: psChan, rcChan: rcChan, amqpConfig: amqp.Config{ Properties: amqp.Table{ "product": "proc_box", "version": "master", }, }, multiRmtKey: *multiRmtKey, } go redial(sess) go monitor(signals, rcChan, job, psChan, timer, done) err = <-done elapsedTime, _ := timer.ElapsedTime() // Print to standard out fmt.Printf("Task elapsed time: %.2f seconds.\n", elapsedTime.Seconds()) if err != nil { if exiterr, ok := err.(*exec.ExitError); ok { // Non-zero exit code if status, ok := exiterr.Sys().(syscall.WaitStatus); ok { exitStatus := status.ExitStatus() log.Debugf("Exit Status: %d\n", exitStatus) os.Exit(exitStatus) } } else { log.Debugf("cmd.Wait: %v\n", err) } } else { os.Exit(0) } }
func monitor( signals chan os.Signal, rcChanChan chan chan agents.RemoteControlCommand, job agents.JobControl, psChan chan agents.ProcessStatCommand, timer agents.Timer, done chan error, ) { // Catch any panics here to ensure we kill the child process before going // to our own doom. defer func() { if e := recover(); e != nil { job.Kill(-9) panic(e) } }() var logSampling <-chan time.Time var err error if *stdoutByteLimit > 0 { ticker := time.NewTicker(100 * time.Millisecond) // Replace the time channel with an actual ticker if this is in use logSampling = ticker.C } var rcChan chan agents.RemoteControlCommand for { select { case rcChan = <-rcChanChan: // Catch incoming signals and operate on them as if they were remote commands case sig := <-signals: switch sig { case syscall.SIGINT: log.Debugln("Caught SIGINT, graceful shutdown") // Initiate non-blocking send select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("SIGINT failed to send a sample msg on the psChan") } err = job.Stop() case syscall.SIGTERM: log.Debugln("Caught SIGTERM, end abruptly") job.Kill(-9) case syscall.SIGHUP: log.Debugln("Caught SIGHUP, emit stats") // Initiate non-blocking send select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("SIGHUP failed to send a sample msg on the psChan") } case syscall.SIGQUIT: log.Debugln("Caught SIGQUIT, graceful shutdown") select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("SIGQUIT failed to send a sample msg on the psChan") } err = job.Stop() } // Process incoming remote commands, toss unknown requests case cmd := <-rcChan: log.Debugf("Got a command %#v\n", cmd) switch cmd.Command { case "suspend": log.Debugln("RemoteCommand: Suspend") job.Suspend() case "resume": log.Debugln("RemoteCommand: Resume") job.Resume() case "kill": log.Debugln("RemoteCommand: Kill") var args int64 if len(cmd.Arguments) == 0 { args = -9 } else { args, err = strconv.ParseInt(cmd.Arguments[0], 10, 32) if err != nil { log.Warnf("Unable to parse kill command argument[0] into int: %s\n", err) args = -9 } } job.Kill(args) case "stop": log.Debugln("RemoteCommand: Stop") select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("RC Stop failed to send a sample msg on the psChan") } if err = job.Stop(); err != nil { log.Fatalf("Error received while stopping sub-process: %s\n", err) } case "sample": log.Debugln("RemoteCommand: Sample") select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("RC Sample failed to send a sample msg on the psChan") } case "change_sample_rate": log.Debugln("RemoteCommand: Change Stats Sample Rate") if len(cmd.Arguments) > 0 { log.Debugf("change_sample_rate arg[0]: %s\n", cmd.Arguments[0]) d, err := time.ParseDuration(cmd.Arguments[0]) if err == nil { select { case psChan <- agents.ProcessStatCommand{ TimeUpdate: true, NewTime: d}: log.Debugln("Sending psChan a msg to update the ticker") default: log.Debugln("RC change_sample_rate failed to send a msg") } } else { log.Warnf("Unparseable duration argument to command change_sample_rate") } } else { log.Warnf("Missing argument to command change_sample_rate") } case "timer_reset": log.Debugln("RemoteCommand: Timer Reset") if err = timer.Reset(); err != nil { log.Fatalf("Error received from timer calling Reset: %s\n", err) } case "timer_start": log.Debugln("RemoteCommand: Timer Start") if err = timer.Start(); err != nil { log.Fatalf("Error received from timer calling Start: %s\n", err) } case "timer_stop": log.Debugln("RemoteCommand: Timer Stop") if err = timer.Stop(); err != nil { log.Fatalf("Error received from timer calling Stop: %s\n", err) } case "timer_resume": log.Debugln("RemoteCommand: Timer Resume") if err = timer.Resume(); err != nil { log.Fatalf("Error received from timer calling Resume: %s\n", err) } default: log.Debugf("Unknown command: %s\n", cmd) } case timeoutMsg := <-timer.Done(): log.Debugf("Timer timeout message: %s\n", timeoutMsg) if err = job.Stop(); err != nil { log.Fatalf("Error received while stopping sub-process: %v\n", err) // If there was an error stopping the process, kill the porcess. job.Kill(-9) } case jobDone := <-job.Done(): log.Debugln("Command exited gracefully; shutting down.") done <- jobDone case _ = <-logSampling: if job.StdoutByteCount() > 2*(*stdoutByteLimit) { err = job.Kill(-9) } else if job.StdoutByteCount() > *stdoutByteLimit { err = job.Stop() } } } }
func (ps *ProcessStatCollector) collectSample() error { var err error job := *ps.job proc := job.Process() stat := ProcessStatSample{} stat.Pid = proc.Pid curTime := time.Now() stat.TimeUTC = curTime.UTC() stat.TimeUnix = curTime.Unix() hostinfo, err := host.HostInfo() var hostnameRoutingKey string if err != nil { log.Warnf("Error encountered collecting host stats: %s", err) hostnameRoutingKey = "" } else { stat.Host = *hostinfo hostnameRoutingKey = fmt.Sprintf(".%s", stat.Host.Hostname) } // Collect for parent stat.aggregateStatForProc(proc) children, err := proc.Children() // Collect on children (if any) if err == nil { for _, cproc := range children { stat.ChildPids = append(stat.ChildPids, cproc.Pid) stat.aggregateStatForProc(cproc) } } stat.StdoutBytes = job.StdoutByteCount() stat.UserData = ps.userJSON log.Debugf("Sample: %#v\n", stat) var body []byte body, err = json.Marshal(stat) routingKey := fmt.Sprintf("%s%s", ps.routingKey, hostnameRoutingKey) err = ps.channel.Publish( ps.exchange, // publish to an exchange routingKey, // routing to queues false, // mandatory false, // immediate amqp.Publishing{ Headers: amqp.Table{}, ContentType: "text/javascript", ContentEncoding: "", Body: body, DeliveryMode: amqp.Transient, // non-persistent Priority: 0, // 0-9 }, ) if err != nil { log.Warnf("Error publishing statistics sample %s", err) } return err }
// NewControlledProcess creates the child proc. func NewControlledProcess(cmd string, arguments []string, doneChan chan error, stdoutLimit int64) (JobControl, error) { var err error j := &Job{ nil, nil, 0, 0, doneChan, stdoutLimit, nil, } // Drop command from cmdline arguments and pass the rest as arguments separately var args []string if len(arguments) > 0 { args = arguments[1:] } j.Cmd = exec.Command(cmd, args...) // Collect stdout from the process to redirect to real stdout stdoutpipe, err := j.Cmd.StdoutPipe() if err != nil { return nil, fmt.Errorf("Failed to acquire stdout: %s", err) } stdout := iocontrol.NewMeasuredReader(stdoutpipe) j.stdoutReader = stdout var wg sync.WaitGroup stdin, err := j.Cmd.StdinPipe() if err != nil { return nil, fmt.Errorf("Failed to acquire stdin: %s", err) } stderr, err := j.Cmd.StderrPipe() if err != nil { return nil, fmt.Errorf("Failed to acquire stderr: %s", err) } // Map all child processes under this tree so Kill really ends everything. j.Cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, // Set process group ID } log.Debugf("%#v\n", j.Cmd) // Start the sub-process but don't wait for completion to pickup the Pid // for resource monitoring. err = j.Cmd.Start() if err != nil { return nil, fmt.Errorf("Failed to execute sub-process: %s\n", err) } j.Pid = j.Cmd.Process.Pid j.Pgid, err = syscall.Getpgid(j.Pid) if err != nil { return nil, fmt.Errorf("Failed syscall.Getpgid: %s\n", err) } j.Proc, err = process.NewProcess(int32(j.Pgid)) if err != nil { return nil, fmt.Errorf("Unable to create process.NewProcess: %s\n", err) } wg.Add(1) go func(wg *sync.WaitGroup, r io.Reader) { defer wg.Done() io.Copy(os.Stdout, r) log.Debugln("child closed stdout") }(&wg, stdout) go func(w io.WriteCloser) { io.Copy(w, os.Stdin) }(stdin) wg.Add(1) go func(wg *sync.WaitGroup, r io.Reader) { defer wg.Done() io.Copy(os.Stderr, r) log.Debugln("child closed stderr") }(&wg, stderr) // Background waiting for the job to finish and emit a done channel message // when complete. go func(wg *sync.WaitGroup, j *Job) { log.Debugln("Waiting on wg.Wait()") wg.Wait() log.Debugln("Waiting on Cmd.Wait()") err := j.Cmd.Wait() log.Debugf("Job finished: %q\n", err) j.done <- err }(&wg, j) return j, nil }