Пример #1
0
// collectOpenFiles appends the open-file entries reported for proc to
// p.OpenFiles. A lookup error is logged as a warning and leaves the sample
// untouched. Any panic raised during collection is recovered and logged,
// never propagated.
func (p *ProcessStatSample) collectOpenFiles(proc *process.Process) {
	defer func() {
		if recovered := recover(); recovered != nil {
			log.Warnf("Recovered from panic on Open Files collection. Maybe unsupported on this platform.")
		}
	}()

	files, err := proc.OpenFiles()
	if err != nil {
		log.Warnf("Error encountered collecting open files stats: %s", err)
		return
	}
	p.OpenFiles = append(p.OpenFiles, files...)
}
Пример #2
0
// collectNumThreads adds the thread count reported for proc to p.NumThreads,
// so repeated calls accumulate across processes. A lookup error is logged as
// a warning and leaves the sample untouched. Any panic raised during
// collection is recovered and logged, never propagated.
func (p *ProcessStatSample) collectNumThreads(proc *process.Process) {
	defer func() {
		if recovered := recover(); recovered != nil {
			log.Warnf("Recovered from panic on Number of Threads collection. Maybe unsupported on this platform.")
		}
	}()

	threads, err := proc.NumThreads()
	if err != nil {
		log.Warnf("Error encountered collecting thread count stats: %s", err)
		return
	}
	p.NumThreads += threads
}
Пример #3
0
// collectCPUPercent adds the CPU percentage reported for proc to
// p.CPUPercent, so repeated calls accumulate across processes. A lookup error
// is logged as a warning and leaves the sample untouched. Any panic raised
// during collection is recovered and logged, never propagated.
func (p *ProcessStatSample) collectCPUPercent(proc *process.Process) {
	defer func() {
		if recovered := recover(); recovered != nil {
			log.Warnf("Recovered from panic on CPU Percent collection. Maybe unsupported on this platform.")
		}
	}()

	// A zero interval asks for the difference since the previous call
	// instead of waiting for a fresh measurement window.
	const sinceLastCall = 0 * time.Second

	pct, err := proc.CPUPercent(sinceLastCall)
	if err != nil {
		log.Warnf("Error encountered collecting CPU percent: %s", err)
		return
	}
	p.CPUPercent += pct
}
Пример #4
0
// collectCPUTimes folds the CPU times reported for proc into p.CPUTimes by
// handing both values to sum via reflection. A lookup error is logged as a
// warning and leaves the sample untouched. Any panic raised during collection
// (including inside the reflection step) is recovered and logged, never
// propagated.
func (p *ProcessStatSample) collectCPUTimes(proc *process.Process) {
	defer func() {
		if recovered := recover(); recovered != nil {
			log.Warnf("Recovered from panic on CPU times collection. Maybe unsupported on this platform.")
		}
	}()

	times, err := proc.CPUTimes()
	if err != nil {
		log.Warnf("Error encountered collecting CPU stats: %s", err)
		return
	}

	// Elem() dereferences the pointers so sum works on the structs themselves;
	// the destination is taken by address so that it stays settable.
	from := reflect.ValueOf(times).Elem()
	into := reflect.ValueOf(&p.CPUTimes).Elem()
	sum(&from, &into)
}
Пример #5
0
// collectIOCounters folds the I/O counters reported for proc into
// p.IOCounters by handing both values to sum via reflection. A lookup error
// is logged as a warning and leaves the sample untouched. Any panic raised
// during collection (including inside the reflection step) is recovered and
// logged, never propagated.
func (p *ProcessStatSample) collectIOCounters(proc *process.Process) {
	defer func() {
		if recovered := recover(); recovered != nil {
			log.Warnf("Recovered from panic on IO counters collection. Maybe unsupported on this platform.")
		}
	}()

	counters, err := proc.IOCounters()
	if err != nil {
		log.Warnf("Error encountered collecting I/O stats: %s", err)
		return
	}

	// Elem() dereferences the pointers so sum works on the structs themselves;
	// the destination is taken by address so that it stays settable.
	from := reflect.ValueOf(counters).Elem()
	into := reflect.ValueOf(&p.IOCounters).Elem()
	sum(&from, &into)
}
Пример #6
0
// collectMemInfo folds the memory info reported for proc into p.Memory by
// handing both values to sum via reflection. A lookup error is logged as a
// warning and leaves the sample untouched. Any panic raised during collection
// (including inside the reflection step) is recovered and logged, never
// propagated.
func (p *ProcessStatSample) collectMemInfo(proc *process.Process) {
	defer func() {
		if recovered := recover(); recovered != nil {
			log.Warnf("Recovered from panic on memory stats collection. Maybe unsupported on this platform.")
		}
	}()

	mem, err := proc.MemoryInfo()
	if err != nil {
		log.Warnf("Error encountered collecting memory stats: %s", err)
		return
	}

	// Elem() dereferences the pointers so sum works on the structs themselves;
	// the destination is taken by address so that it stays settable.
	from := reflect.ValueOf(mem).Elem()
	into := reflect.ValueOf(&p.Memory).Elem()
	sum(&from, &into)
}
Пример #7
0
// Suspend sends SIGSTOP to the job's process group (the negated Pgid
// addresses the whole group rather than a single pid). A delivery failure is
// logged as a warning and returned to the caller.
func (j *Job) Suspend() error {
	err := syscall.Kill(-j.Pgid, syscall.SIGSTOP)
	if err == nil {
		return nil
	}
	log.Warnf("Error received calling suspend on sub-process: %s", err)
	return err
}
Пример #8
0
// Resume sends SIGCONT to the job's process group (the negated Pgid
// addresses the whole group rather than a single pid). A delivery failure is
// logged as a warning and returned to the caller.
func (j *Job) Resume() error {
	err := syscall.Kill(-j.Pgid, syscall.SIGCONT)
	if err == nil {
		return nil
	}
	log.Warnf("Error received calling resume on sub-process: %s", err)
	return err
}
Пример #9
0
// Stop sends SIGTERM to the job's process group (the negated Pgid addresses
// the whole group rather than a single pid), asking it to shut down. A
// delivery failure is logged as a warning and returned to the caller.
func (j *Job) Stop() error {
	if err := syscall.Kill(-j.Pgid, syscall.SIGTERM); err != nil {
		log.Warnf("Error received calling stop on sub-process: %s", err)
		return err
	}
	return nil
}
Пример #10
0
// handle consumes AMQP deliveries until the deliveries channel is closed,
// decoding each body as a JSON RemoteControlCommand and forwarding it on
// commands. When the channel closes, nil is sent on done.
//
// A delivery whose body cannot be decoded is logged, acknowledged and
// skipped; it is never forwarded, so consumers do not receive a zero-value or
// partially populated command. Acknowledgement failures are logged but do not
// stop the loop.
func handle(deliveries <-chan amqp.Delivery, done chan error, commands chan RemoteControlCommand) {
	for d := range deliveries {
		var cmd RemoteControlCommand
		if err := json.Unmarshal(d.Body, &cmd); err != nil {
			log.Warnf("Failed to unmarshal JSON from AMQP message: %s\n", err)
			// Acknowledge anyway so the undecodable message is not left
			// outstanding, but do not pass a bogus command downstream.
			if ackErr := d.Ack(false); ackErr != nil {
				log.Warnf("Failed to acknowledge AMQP message: %s\n", ackErr)
			}
			continue
		}

		commands <- cmd
		if ackErr := d.Ack(false); ackErr != nil {
			log.Warnf("Failed to acknowledge AMQP message: %s\n", ackErr)
		}
	}
	done <- nil
}
Пример #11
0
// Kill delivers a signal to the job's process group (the negated Pgid
// addresses the whole group rather than a single pid). The special value -9
// is translated to SIGKILL; any other value is converted to a signal number
// as-is. A delivery failure is logged as a warning and returned to the caller.
func (j *Job) Kill(sig int64) error {
	target := syscall.Signal(sig)
	if sig == -9 {
		log.Debugln("Sending process Kill (-9) signal")
		target = syscall.SIGKILL
	}

	if err := syscall.Kill(-j.Pgid, target); err != nil {
		log.Warnf("Error received calling kill on sub-process: %s", err)
		return err
	}
	return nil
}
Пример #12
0
// NewProcessStats builds a ProcessStatCollector for job, opens its AMQP
// channel through ReinitializeConnection and starts a sampling ticker firing
// every interval.
//
// userJSON, when non-empty, is decoded into a map that is stored on the
// collector; a decoding failure is only logged as a warning and does not
// abort construction. An error is returned only when ReinitializeConnection
// fails, in which case the returned ProcessStats is nil.
func NewProcessStats(
	amqp *amqp.Connection,
	routingKey string,
	exchange string,
	job *JobControl,
	interval time.Duration,
	msgTimeout time.Duration,
	userJSON string,
) (ProcessStats, error) {

	var extraJSON map[string]interface{}
	if len(userJSON) > 0 {
		if err := json.Unmarshal([]byte(userJSON), &extraJSON); err != nil {
			log.Warnf("Error encountered trying to unmarshal user provided JSON: %s\n", err)
		}
	}

	// Positional literal: values must stay in the struct's field order.
	collector := &ProcessStatCollector{
		amqp,
		nil,
		routingKey,
		exchange,
		job,
		nil,
		msgTimeout,
		extraJSON,
	}

	if err := collector.ReinitializeConnection(amqp); err != nil {
		return nil, fmt.Errorf("Unable to open a channel on AMQP connection: %s\n", err)
	}

	collector.NewTicker(interval)
	return collector, nil
}
Пример #13
0
// requestSample asks the stats agent for an immediate sample without
// blocking. If psChan is not ready to receive, the request is dropped and a
// debug message prefixed with source is logged instead.
func requestSample(psChan chan agents.ProcessStatCommand, source string) {
	select {
	case psChan <- agents.ProcessStatCommand{}:
		log.Debugln("Sending psChan a msg to sample")
	default:
		log.Debugln(source + " failed to send a sample msg on the psChan")
	}
}

// monitor is the supervising event loop for the child job. It never returns;
// it reacts to:
//
//   - rcChanChan: a replacement remote-control command channel,
//   - signals:    OS signals, translated into stop/kill/sample actions,
//   - rcChan:     remote-control commands (suspend, resume, kill, stop,
//     sample, change_sample_rate, timer_*),
//   - timer.Done: timer expiry, which stops the job,
//   - job.Done:   job completion, whose result is forwarded on done,
//   - a 100ms ticker (only when *stdoutByteLimit > 0) that stops the job once
//     its stdout byte count exceeds the limit and kills it beyond twice the
//     limit.
//
// If the loop panics, the child is killed before the panic is re-raised.
func monitor(
	signals chan os.Signal,
	rcChanChan chan chan agents.RemoteControlCommand,
	job agents.JobControl,
	psChan chan agents.ProcessStatCommand,
	timer agents.Timer,
	done chan error,
) {

	// Catch any panics here to ensure we kill the child process before going
	// to our own doom.
	defer func() {
		if e := recover(); e != nil {
			job.Kill(-9)
			panic(e)
		}
	}()
	var logSampling <-chan time.Time
	var err error

	if *stdoutByteLimit > 0 {
		ticker := time.NewTicker(100 * time.Millisecond)
		// Replace the nil time channel with an actual ticker if this is in
		// use; a nil channel never becomes ready in the select below.
		logSampling = ticker.C
	}

	// Stays nil (and therefore never selected) until a channel arrives on
	// rcChanChan.
	var rcChan chan agents.RemoteControlCommand

	for {
		select {
		case rcChan = <-rcChanChan:
		// Catch incoming signals and operate on them as if they were remote commands
		case sig := <-signals:
			switch sig {
			case syscall.SIGINT:
				log.Debugln("Caught SIGINT, graceful shutdown")
				requestSample(psChan, "SIGINT")
				err = job.Stop()
			case syscall.SIGTERM:
				log.Debugln("Caught SIGTERM, end abruptly")
				job.Kill(-9)
			case syscall.SIGHUP:
				log.Debugln("Caught SIGHUP, emit stats")
				requestSample(psChan, "SIGHUP")
			case syscall.SIGQUIT:
				log.Debugln("Caught SIGQUIT, graceful shutdown")
				requestSample(psChan, "SIGQUIT")
				err = job.Stop()
			}
		// Process incoming remote commands, toss unknown requests
		case cmd := <-rcChan:
			log.Debugf("Got a command %#v\n", cmd)
			switch cmd.Command {
			case "suspend":
				log.Debugln("RemoteCommand: Suspend")
				job.Suspend()
			case "resume":
				log.Debugln("RemoteCommand: Resume")
				job.Resume()
			case "kill":
				log.Debugln("RemoteCommand: Kill")
				// Default to -9 when no argument is given or it cannot be
				// parsed as an integer.
				var args int64
				if len(cmd.Arguments) == 0 {
					args = -9
				} else {
					args, err = strconv.ParseInt(cmd.Arguments[0], 10, 32)
					if err != nil {
						log.Warnf("Unable to parse kill command argument[0] into int: %s\n", err)
						args = -9
					}
				}
				job.Kill(args)
			case "stop":
				log.Debugln("RemoteCommand: Stop")
				requestSample(psChan, "RC Stop")
				if err = job.Stop(); err != nil {
					log.Fatalf("Error received while stopping sub-process: %s\n", err)
				}
			case "sample":
				log.Debugln("RemoteCommand: Sample")
				requestSample(psChan, "RC Sample")
			case "change_sample_rate":
				log.Debugln("RemoteCommand: Change Stats Sample Rate")
				if len(cmd.Arguments) > 0 {
					log.Debugf("change_sample_rate arg[0]: %s\n", cmd.Arguments[0])
					d, err := time.ParseDuration(cmd.Arguments[0])
					if err == nil {
						// Non-blocking send: drop the update if the stats
						// router is not ready to receive.
						select {
						case psChan <- agents.ProcessStatCommand{
							TimeUpdate: true,
							NewTime:    d}:
							log.Debugln("Sending psChan a msg to update the ticker")
						default:
							log.Debugln("RC change_sample_rate failed to send a msg")
						}
					} else {
						log.Warnf("Unparseable duration argument to command change_sample_rate")
					}
				} else {
					log.Warnf("Missing argument to command change_sample_rate")
				}
			case "timer_reset":
				log.Debugln("RemoteCommand: Timer Reset")
				if err = timer.Reset(); err != nil {
					log.Fatalf("Error received from timer calling Reset: %s\n", err)
				}
			case "timer_start":
				log.Debugln("RemoteCommand: Timer Start")
				if err = timer.Start(); err != nil {
					log.Fatalf("Error received from timer calling Start: %s\n", err)
				}
			case "timer_stop":
				log.Debugln("RemoteCommand: Timer Stop")
				if err = timer.Stop(); err != nil {
					log.Fatalf("Error received from timer calling Stop: %s\n", err)
				}
			case "timer_resume":
				log.Debugln("RemoteCommand: Timer Resume")
				if err = timer.Resume(); err != nil {
					log.Fatalf("Error received from timer calling Resume: %s\n", err)
				}
			default:
				log.Debugf("Unknown command: %s\n", cmd)
			}
		case timeoutMsg := <-timer.Done():
			log.Debugf("Timer timeout message: %s\n", timeoutMsg)
			if err = job.Stop(); err != nil {
				// If there was an error stopping the process, kill the
				// process. This must happen BEFORE log.Fatalf, which exits
				// the program and would otherwise leave the child running.
				job.Kill(-9)
				log.Fatalf("Error received while stopping sub-process: %v\n", err)
			}
		case jobDone := <-job.Done():
			log.Debugln("Command exited gracefully; shutting down.")
			done <- jobDone
		case <-logSampling:
			// Escalate: polite stop once over the limit, hard kill once
			// over twice the limit.
			if job.StdoutByteCount() > 2*(*stdoutByteLimit) {
				err = job.Kill(-9)
			} else if job.StdoutByteCount() > *stdoutByteLimit {
				err = job.Stop()
			}
		}
	}
}
Пример #14
0
// redial keeps the AMQP session alive forever. Each pass dials the broker,
// (re)creates the remote-control agent and hands its command channel to
// sess.rcChan, creates the process-stats agent on the first successful pass
// or re-attaches it to the new connection on later ones, and then blocks
// until the connection reports that it closed before dialing again. A failed
// dial is retried after a five second pause.
//
// A background goroutine routes requests from sess.psChan to the stats agent:
// ticker updates when TimeUpdate is set, a sample otherwise. Requests that
// arrive before a stats agent exists are dropped with a warning.
//
// NOTE(review): stats is written by the loop and read by the goroutine
// without synchronisation — confirm whether that is acceptable.
func redial(sess session) {
	var (
		err   error
		stats agents.ProcessStats
		rc    *agents.RemoteControl
	)

	// Initialize mini-router for incoming stats agent requests
	go func() {
		for req := range sess.psChan {
			switch {
			case stats == nil:
				log.Warnln("No stats agents available (yet), dropping request")
			case req.TimeUpdate:
				stats.NewTicker(req.NewTime)
			default:
				stats.Sample()
			}
		}
	}()

	rcKeys := []string{*rmtKey}
	if sess.multiRmtKey {
		rcKeys = deleteEmpty(strings.Split(*rmtKey, ","))
	}

	for {
		sess.amqpConn, err = amqp.DialConfig(*uri, sess.amqpConfig)
		if err != nil {
			log.Warnf("Failed to connect to AMQP: %q", err)
			// Rate limit reconnection attempts
			time.Sleep(5 * time.Second)
			continue
		}

		if rc, err = agents.NewRemoteControl(sess.amqpConn, rcKeys, *exchange); err != nil {
			log.Warnf("Failed creating NewRemoteControl: %s", err)
		} else {
			sess.rcChan <- rc.Commands
		}

		if stats != nil {
			// An agent already exists: just point it at the new connection.
			err = stats.ReinitializeConnection(sess.amqpConn)
			if err != nil {
				log.Warnf("Failed to reinitialize process stats: %s", err)
			}
		} else {
			// initial setup
			stats, err = agents.NewProcessStats(
				sess.amqpConn,
				*procStatsKey,
				*exchange,
				&sess.job,
				*statsInterval,
				*msgTimeout,
				*userJSON,
			)
			if err != nil {
				log.Warnf("Failed creating NewProcessStats: %s", err)
			}
		}

		// Wait for close notification and loop back around to reconnect
		closed := sess.amqpConn.NotifyClose(make(chan *amqp.Error))
		<-closed
		log.Debugln("Saw a notification for closed connection, looping")
	}
}
Пример #15
0
// collectSample gathers one statistics sample for the monitored job and
// publishes it as JSON on the collector's AMQP channel.
//
// The sample contains the parent pid, the current time (UTC and Unix), host
// information when available, aggregated statistics for the parent and each
// of its children, the job's stdout byte count and the user-supplied JSON.
// The routing key is ps.routingKey, suffixed with ".<hostname>" when host
// information could be collected.
//
// It returns the JSON encoding error or the publish error, if any; both are
// also logged as warnings. Nothing is published when encoding fails.
func (ps *ProcessStatCollector) collectSample() error {
	job := *ps.job
	proc := job.Process()

	stat := ProcessStatSample{}

	stat.Pid = proc.Pid

	curTime := time.Now()
	stat.TimeUTC = curTime.UTC()
	stat.TimeUnix = curTime.Unix()

	// Host info is optional: without it the sample is still published, just
	// without the hostname suffix on the routing key.
	hostinfo, err := host.HostInfo()
	var hostnameRoutingKey string
	if err != nil {
		log.Warnf("Error encountered collecting host stats: %s", err)
		hostnameRoutingKey = ""
	} else {
		stat.Host = *hostinfo
		hostnameRoutingKey = fmt.Sprintf(".%s", stat.Host.Hostname)
	}

	// Collect for parent
	stat.aggregateStatForProc(proc)

	// Collect on children (if any). An error here is deliberately ignored
	// and treated as "no children".
	// NOTE(review): presumably the library reports "no children" as an
	// error — confirm before adding logging.
	children, err := proc.Children()
	if err == nil {
		for _, cproc := range children {
			stat.ChildPids = append(stat.ChildPids, cproc.Pid)
			stat.aggregateStatForProc(cproc)
		}
	}

	stat.StdoutBytes = job.StdoutByteCount()

	stat.UserData = ps.userJSON

	log.Debugf("Sample: %#v\n", stat)

	// Do not publish an empty body if the sample cannot be encoded.
	body, err := json.Marshal(stat)
	if err != nil {
		log.Warnf("Error marshalling statistics sample %s", err)
		return err
	}

	routingKey := fmt.Sprintf("%s%s", ps.routingKey, hostnameRoutingKey)
	err = ps.channel.Publish(
		ps.exchange, // publish to an exchange
		routingKey,  // routing to queues
		false,       // mandatory
		false,       // immediate
		amqp.Publishing{
			Headers:         amqp.Table{},
			ContentType:     "text/javascript",
			ContentEncoding: "",
			Body:            body,
			DeliveryMode:    amqp.Transient, // non-persistent
			Priority:        0,              // 0-9
		},
	)
	if err != nil {
		log.Warnf("Error publishing statistics sample %s", err)
	}

	return err
}