func (p *ProcessStatSample) collectOpenFiles(proc *process.Process) { defer func() { if e := recover(); e != nil { log.Warnf("Recovered from panic on Open Files collection. Maybe unsupported on this platform.") } }() openFiles, err := proc.OpenFiles() if err != nil { log.Warnf("Error encountered collecting open files stats: %s", err) } else { p.OpenFiles = append(p.OpenFiles, openFiles...) } }
func (p *ProcessStatSample) collectNumThreads(proc *process.Process) { defer func() { if e := recover(); e != nil { log.Warnf("Recovered from panic on Number of Threads collection. Maybe unsupported on this platform.") } }() numThreads, err := proc.NumThreads() if err != nil { log.Warnf("Error encountered collecting thread count stats: %s", err) } else { p.NumThreads += numThreads } }
func (p *ProcessStatSample) collectCPUPercent(proc *process.Process) { defer func() { if e := recover(); e != nil { log.Warnf("Recovered from panic on CPU Percent collection. Maybe unsupported on this platform.") } }() // Use 0 interval to get difference since the last call cpuPercent, err := proc.CPUPercent(0 * time.Second) if err != nil { log.Warnf("Error encountered collecting CPU percent: %s", err) } else { p.CPUPercent += cpuPercent } }
func (p *ProcessStatSample) collectCPUTimes(proc *process.Process) { defer func() { if e := recover(); e != nil { log.Warnf("Recovered from panic on CPU times collection. Maybe unsupported on this platform.") } }() cputimes, err := proc.CPUTimes() if err != nil { log.Warnf("Error encountered collecting CPU stats: %s", err) } else { src := reflect.ValueOf(cputimes).Elem() dest := reflect.ValueOf(&p.CPUTimes).Elem() sum(&src, &dest) } }
func (p *ProcessStatSample) collectIOCounters(proc *process.Process) { defer func() { if e := recover(); e != nil { log.Warnf("Recovered from panic on IO counters collection. Maybe unsupported on this platform.") } }() iocnt, err := proc.IOCounters() if err != nil { log.Warnf("Error encountered collecting I/O stats: %s", err) } else { src := reflect.ValueOf(iocnt).Elem() dest := reflect.ValueOf(&p.IOCounters).Elem() sum(&src, &dest) } }
func (p *ProcessStatSample) collectMemInfo(proc *process.Process) { defer func() { if e := recover(); e != nil { log.Warnf("Recovered from panic on memory stats collection. Maybe unsupported on this platform.") } }() meminfo, err := proc.MemoryInfo() if err != nil { log.Warnf("Error encountered collecting memory stats: %s", err) } else { src := reflect.ValueOf(meminfo).Elem() dest := reflect.ValueOf(&p.Memory).Elem() sum(&src, &dest) } }
// Suspend pauses a running process func (j *Job) Suspend() error { if err := syscall.Kill(-j.Pgid, syscall.SIGSTOP); err != nil { log.Warnf("Error received calling suspend on sub-process: %s", err) return err } return nil }
// Resume continues a suspended process func (j *Job) Resume() error { if err := syscall.Kill(-j.Pgid, syscall.SIGCONT); err != nil { log.Warnf("Error received calling resume on sub-process: %s", err) return err } return nil }
// Stop gracefully ends the process func (j *Job) Stop() error { err := syscall.Kill(-j.Pgid, syscall.SIGTERM) if err != nil { log.Warnf("Error received calling stop on sub-process: %s", err) return err } return nil }
func handle(deliveries <-chan amqp.Delivery, done chan error, commands chan RemoteControlCommand) { for d := range deliveries { var cmd RemoteControlCommand err := json.Unmarshal(d.Body, &cmd) if err != nil { log.Warnf("Failed to unmarshal JSON from AMQP message: %s\n", err) } commands <- cmd d.Ack(false) } done <- nil }
// Kill forcefully stops a process func (j *Job) Kill(sig int64) error { var err error switch sig { case -9: log.Debugln("Sending process Kill (-9) signal") err = syscall.Kill(-j.Pgid, syscall.SIGKILL) default: signal := syscall.Signal(sig) err = syscall.Kill(-j.Pgid, signal) } if err != nil { log.Warnf("Error received calling kill on sub-process: %s", err) return err } return nil }
// NewProcessStats establishes a new AMQP channel and configures sampling period func NewProcessStats( amqp *amqp.Connection, routingKey string, exchange string, job *JobControl, interval time.Duration, msgTimeout time.Duration, userJSON string, ) (ProcessStats, error) { var err error var extraJSON map[string]interface{} if len(userJSON) > 0 { err = json.Unmarshal([]byte(userJSON), &extraJSON) if err != nil { log.Warnf("Error encountered trying to unmarshal user provided JSON: %s\n", err) } } psc := &ProcessStatCollector{ amqp, nil, routingKey, exchange, job, nil, msgTimeout, extraJSON, } err = psc.ReinitializeConnection(amqp) if err != nil { return nil, fmt.Errorf("Unable to open a channel on AMQP connection: %s\n", err) } psc.NewTicker(interval) return psc, nil }
func monitor( signals chan os.Signal, rcChanChan chan chan agents.RemoteControlCommand, job agents.JobControl, psChan chan agents.ProcessStatCommand, timer agents.Timer, done chan error, ) { // Catch any panics here to ensure we kill the child process before going // to our own doom. defer func() { if e := recover(); e != nil { job.Kill(-9) panic(e) } }() var logSampling <-chan time.Time var err error if *stdoutByteLimit > 0 { ticker := time.NewTicker(100 * time.Millisecond) // Replace the time channel with an actual ticker if this is in use logSampling = ticker.C } var rcChan chan agents.RemoteControlCommand for { select { case rcChan = <-rcChanChan: // Catch incoming signals and operate on them as if they were remote commands case sig := <-signals: switch sig { case syscall.SIGINT: log.Debugln("Caught SIGINT, graceful shutdown") // Initiate non-blocking send select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("SIGINT failed to send a sample msg on the psChan") } err = job.Stop() case syscall.SIGTERM: log.Debugln("Caught SIGTERM, end abruptly") job.Kill(-9) case syscall.SIGHUP: log.Debugln("Caught SIGHUP, emit stats") // Initiate non-blocking send select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("SIGHUP failed to send a sample msg on the psChan") } case syscall.SIGQUIT: log.Debugln("Caught SIGQUIT, graceful shutdown") select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("SIGQUIT failed to send a sample msg on the psChan") } err = job.Stop() } // Process incoming remote commands, toss unknown requests case cmd := <-rcChan: log.Debugf("Got a command %#v\n", cmd) switch cmd.Command { case "suspend": log.Debugln("RemoteCommand: Suspend") job.Suspend() case "resume": log.Debugln("RemoteCommand: Resume") job.Resume() case "kill": log.Debugln("RemoteCommand: Kill") var args int64 if len(cmd.Arguments) == 0 { args = -9 } else { args, err = strconv.ParseInt(cmd.Arguments[0], 10, 32) if err != nil { log.Warnf("Unable to parse kill command argument[0] into int: %s\n", err) args = -9 } } job.Kill(args) case "stop": log.Debugln("RemoteCommand: Stop") select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("RC Stop failed to send a sample msg on the psChan") } if err = job.Stop(); err != nil { log.Fatalf("Error received while stopping sub-process: %s\n", err) } case "sample": log.Debugln("RemoteCommand: Sample") select { case psChan <- agents.ProcessStatCommand{}: log.Debugln("Sending psChan a msg to sample") default: log.Debugln("RC Sample failed to send a sample msg on the psChan") } case "change_sample_rate": log.Debugln("RemoteCommand: Change Stats Sample Rate") if len(cmd.Arguments) > 0 { log.Debugf("change_sample_rate arg[0]: %s\n", cmd.Arguments[0]) d, err := time.ParseDuration(cmd.Arguments[0]) if err == nil { select { case psChan <- agents.ProcessStatCommand{ TimeUpdate: true, NewTime: d}: log.Debugln("Sending psChan a msg to update the ticker") default: log.Debugln("RC change_sample_rate failed to send a msg") } } else { log.Warnf("Unparseable duration argument to command change_sample_rate") } } else { log.Warnf("Missing argument to command change_sample_rate") } case "timer_reset": log.Debugln("RemoteCommand: Timer Reset") if err = timer.Reset(); err != nil { log.Fatalf("Error received from timer calling Reset: %s\n", err) } case "timer_start": log.Debugln("RemoteCommand: Timer Start") if err = timer.Start(); err != nil { log.Fatalf("Error received from timer calling Start: %s\n", err) } case "timer_stop": log.Debugln("RemoteCommand: Timer Stop") if err = timer.Stop(); err != nil { log.Fatalf("Error received from timer calling Stop: %s\n", err) } case "timer_resume": log.Debugln("RemoteCommand: Timer Resume") if err = timer.Resume(); err != nil { log.Fatalf("Error received from timer calling Resume: %s\n", err) } default: log.Debugf("Unknown command: %s\n", cmd) } case timeoutMsg := <-timer.Done(): log.Debugf("Timer timeout message: %s\n", timeoutMsg) if err = job.Stop(); err != nil { log.Fatalf("Error received while stopping sub-process: %v\n", err) // If there was an error stopping the process, kill the porcess. job.Kill(-9) } case jobDone := <-job.Done(): log.Debugln("Command exited gracefully; shutting down.") done <- jobDone case _ = <-logSampling: if job.StdoutByteCount() > 2*(*stdoutByteLimit) { err = job.Kill(-9) } else if job.StdoutByteCount() > *stdoutByteLimit { err = job.Stop() } } } }
func redial(sess session) { var err error var stats agents.ProcessStats var rc *agents.RemoteControl // Initialize mini-router for incoming stats agent requests go func() { for s := range sess.psChan { if stats == nil { log.Warnln("No stats agents available (yet), dropping request") } else if s.TimeUpdate { stats.NewTicker(s.NewTime) } else { stats.Sample() } } }() rcKeys := []string{*rmtKey} if sess.multiRmtKey { rcKeys = deleteEmpty(strings.Split(*rmtKey, ",")) } for { sess.amqpConn, err = amqp.DialConfig(*uri, sess.amqpConfig) if err != nil { log.Warnf("Failed to connect to AMQP: %q", err) // Rate limit reconnection attempts time.Sleep(5 * time.Second) } else { rc, err = agents.NewRemoteControl(sess.amqpConn, rcKeys, *exchange) if err != nil { log.Warnf("Failed creating NewRemoteControl: %s", err) } else { sess.rcChan <- rc.Commands } if stats == nil { // initial setup stats, err = agents.NewProcessStats( sess.amqpConn, *procStatsKey, *exchange, &sess.job, *statsInterval, *msgTimeout, *userJSON, ) if err != nil { log.Warnf("Failed creating NewProcessStats: %s", err) } } else { err = stats.ReinitializeConnection(sess.amqpConn) if err != nil { log.Warnf("Failed to reinitialize process stats: %s", err) } } closings := sess.amqpConn.NotifyClose(make(chan *amqp.Error)) // Wait for close notification and loop back around to reconnect _ = <-closings log.Debugln("Saw a notification for closed connection, looping") } } }
func (ps *ProcessStatCollector) collectSample() error { var err error job := *ps.job proc := job.Process() stat := ProcessStatSample{} stat.Pid = proc.Pid curTime := time.Now() stat.TimeUTC = curTime.UTC() stat.TimeUnix = curTime.Unix() hostinfo, err := host.HostInfo() var hostnameRoutingKey string if err != nil { log.Warnf("Error encountered collecting host stats: %s", err) hostnameRoutingKey = "" } else { stat.Host = *hostinfo hostnameRoutingKey = fmt.Sprintf(".%s", stat.Host.Hostname) } // Collect for parent stat.aggregateStatForProc(proc) children, err := proc.Children() // Collect on children (if any) if err == nil { for _, cproc := range children { stat.ChildPids = append(stat.ChildPids, cproc.Pid) stat.aggregateStatForProc(cproc) } } stat.StdoutBytes = job.StdoutByteCount() stat.UserData = ps.userJSON log.Debugf("Sample: %#v\n", stat) var body []byte body, err = json.Marshal(stat) routingKey := fmt.Sprintf("%s%s", ps.routingKey, hostnameRoutingKey) err = ps.channel.Publish( ps.exchange, // publish to an exchange routingKey, // routing to queues false, // mandatory false, // immediate amqp.Publishing{ Headers: amqp.Table{}, ContentType: "text/javascript", ContentEncoding: "", Body: body, DeliveryMode: amqp.Transient, // non-persistent Priority: 0, // 0-9 }, ) if err != nil { log.Warnf("Error publishing statistics sample %s", err) } return err }