Exemplo n.º 1
// SetUpTest builds a private mocked installation for a single test: a fresh
// temporary basedir containing a bin dir, a copy of the prebuilt mocked
// percona-agent binary (s.buildbin) and a copy of the init script.
// NOTE(review): this excerpt stops before the env var setup and the closing
// brace; the rest of the function is not visible here.
func (s *MainTestSuite) SetUpTest(t *C) {
	// Make a new mocked installation for test, copying the already compiled mocked agent.
	// Using the same tmp basedir for all tests could lead to pid files disappearing at any given time. KILL/TERM of
	// processes is async and the mocked agent will remove the pid file while shutting down, making tests fail in the
	// most unexpected ways.
	var err error
	// We can't/shouldn't use /usr/local/percona/ (the default basedir),
	// so use a tmpdir instead with only a bin dir inside
	s.basedir, err = ioutil.TempDir("/tmp", "percona-agent-init-test-")
	t.Assert(err, IsNil)
	binDir := filepath.Join(s.basedir, pct.BIN_DIR)
	err = os.Mkdir(binDir, 0777)
	t.Assert(err, IsNil)

	// Let's copy the mocked and already compiled percona-agent
	s.bin = filepath.Join(binDir, "percona-agent")
	cmd := exec.Command("cp", s.buildbin, s.bin)
	err = cmd.Run()
	t.Assert(err, IsNil, Commentf("Failed to copy mocked percona-agent to tmp dir: %v", err))

	// Copy init script to tmp basedir/bin directory
	initscript, err := filepath.Abs("./percona-agent")
	// Check that resolving the absolute path succeeded
	t.Assert(err, IsNil)
	// Check if init script is there
	t.Assert(pct.FileExists(initscript), Equals, true)
	// The copy is stored under the name "init-script", not "percona-agent",
	// which is already taken by the mocked binary in the same bin dir.
	s.initscript = filepath.Join(s.basedir, pct.BIN_DIR, "init-script")
	cmd = exec.Command("cp", initscript, s.initscript)
	err = cmd.Run()
	t.Assert(err, IsNil, Commentf("Failed to copy init script to tmp dir: %v", err))

	// Set all env vars to default test values
Exemplo n.º 2
// TestDelayedStop starts the service through the init script and then stops
// it while the agent's stop delay (2s) is longer than the script's stop
// timeout (1s). It expects the script to report the timeout, fall back to
// kill -9 on the pid read from the pidfile, and leave no /proc entry behind.
// NOTE(review): the closing brace is missing from this excerpt.
func (s *MainTestSuite) TestDelayedStop(t *C) {
	// Set init script stop timeout to 1 second
	os.Setenv("PCT_TEST_STOP_TIMEOUT", "1")
	// Set percona-agent stop delay to 2 seconds
	os.Setenv("PCT_TEST_STOP_DELAY", "2")
	// Now try to start service
	cmd := exec.Command(s.initscript, "start")
	// NOTE(review): the output of "start" is never inspected; it is
	// overwritten by the output of "stop" below.
	output, err := cmd.Output()
	// start exit code should be 0
	t.Check(err, IsNil)

	// Get the PID from the pidfile
	pid, err := readPidFile(filepath.Join(s.basedir, "percona-agent.pid"))
	// Check if we could read the pidfile
	t.Check(err, IsNil)
	// pid should be non empty
	t.Check(pid, Not(Equals), "")

	stop_cmd := exec.Command(s.initscript, "stop")
	output, err = stop_cmd.Output()
	// stop exit code should be 0
	t.Check(err, IsNil)

	// Script should output the timeout message followed by the kill -9 notice
	t.Check(string(output), Equals, fmt.Sprintf("Stopping percona-agent...\nWaiting for percona-agent to exit...\n"+
		"Time out waiting for percona-agent to exit.  Trying kill -9 %v...\nStopped percona-agent.\n", pid))
	// Make sure the process was killed (its /proc/<pid>/stat file must be gone)
	t.Assert(pct.FileExists(fmt.Sprintf("/proc/%v/stat", pid)), Equals, false)
Exemplo n.º 3
// TestRestart runs a new agent, sends it a "Restart" cmd and checks that the
// agent replies without error, returns from Run() without error within 2s,
// creates the start-lock and start-script files, and makes exactly one
// command (via the mocked cmd factory) that runs the start-script.
// NOTE(review): this excerpt is missing lines (the body of the deferred func,
// several closing braces); comments describe only what is visible.
func (s *AgentTestSuite) TestRestart(t *C) {
	// Stop the default agent.  We need our own to check its return value.

	// Replace the real command factory with a mock so the test can inspect
	// the commands the agent makes.
	cmdFactory := &mock.CmdFactory{}
	pctCmd.Factory = cmdFactory

	defer func() {

	newAgent := agent.NewAgent(s.config, s.logger, s.api, s.client, s.servicesMap)
	// Buffered so the goroutine can deliver Run()'s result even if nobody is
	// receiving yet.
	doneChan := make(chan error, 1)
	go func() {
		doneChan <- newAgent.Run()

	cmd := &proto.Cmd{
		Service: "agent",
		Cmd:     "Restart",
	s.sendChan <- cmd

	replies := test.WaitReply(s.recvChan)
	t.Assert(replies, HasLen, 1)
	t.Check(replies[0].Error, Equals, "")

	// Wait for Run() to return, but give up after 2 seconds.
	var err error
	select {
	case err = <-doneChan:
	case <-time.After(2 * time.Second):
		t.Fatal("Agent did not restart")

	// Agent should return without an error.
	t.Check(err, IsNil)

	// Agent should create the start-lock file and start-script.
	t.Check(pct.FileExists(pct.Basedir.File("start-lock")), Equals, true)
	t.Check(pct.FileExists(pct.Basedir.File("start-script")), Equals, true)

	// Agent should make a command to run the start-script.
	t.Assert(cmdFactory.Cmds, HasLen, 1)
	t.Check(cmdFactory.Cmds[0].Name, Equals, pct.Basedir.File("start-script"))
	t.Check(cmdFactory.Cmds[0].Args, IsNil)
Exemplo n.º 4
// TestRemoveRel sets the test pidfile to a temporary file name, removes it,
// and checks that no file of that name remains under pct.Basedir.Path().
// NOTE(review): presumably "Rel" means the name is relative to the basedir;
// confirm against getTmpFileName. The closing brace is missing here.
func (s *TestSuite) TestRemoveRel(t *C) {
	tmpFileName := getTmpFileName()
	t.Check(s.testPidFile.Set(tmpFileName), Equals, nil)
	// Remove should succeed, pidfile exists
	t.Assert(s.testPidFile.Remove(), Equals, nil)
	absFilePath := filepath.Join(pct.Basedir.Path(), tmpFileName)
	// Check if pidfile was deleted
	t.Assert(pct.FileExists(absFilePath), Equals, false)
Exemplo n.º 5
// TestStartCollectStop starts a system monitor, verifies that it collects
// nothing before a tick, that the first tick yields memory/vmstat/loadavg/disk
// metrics but no CPU metrics, that a second tick yields CPU metrics, and
// finally waits for the monitor to report "Stopped".
// NOTE(review): this excerpt is missing closing braces, comment delimiters
// and apparently the call that stops the monitor; comments describe only
// what is visible.
func (s *ManagerTestSuite) TestStartCollectStop(t *C) {
	// The test needs these /proc files to exist; fail right away if any is missing.
	files := []string{"stat", "meminfo", "vmstat", "loadavg", "diskstats"}
	for _, file := range files {
		if !pct.FileExists("/proc/" + file) {
			t.Fatal("/proc/" + file + " does not exist")

	// Create the monitor.
	m := system.NewMonitor(s.name, &system.Config{}, s.logger)
	if m == nil {
		t.Fatal("Make new system.Monitor")

	// Start the monitor.
	err := m.Start(s.tickChan, s.collectionChan)
	if err != nil {
		t.Fatalf("Start monitor without error, got %s", err)

	// The monitor's status should get the "Idle" prefix once it has started
	// its internals, which should be very fast.
	if ok := test.WaitStatusPrefix(3, m, s.name, "Idle"); !ok {
		t.Fatal("Monitor is ready")

	// The monitor should only collect and send metrics on ticks; we haven't ticked yet.
	got := test.WaitCollection(s.collectionChan, 0)
	if len(got) > 0 {
		// NOTE(review): Fatal is passed a format string containing %+v;
		// Fatalf was probably intended - confirm.
		t.Fatal("No tick, no collection; got %+v", got)

	// Now tick.  This should make monitor collect.
	now := time.Now()
	s.tickChan <- now

	got = test.WaitCollection(s.collectionChan, 1)
	t.Assert(got, Not(HasLen), 0)
	t.Check(got, HasLen, 1)

	// The collection timestamp should be the tick time in Unix seconds.
	c := got[0]
	t.Check(c.Ts, Equals, now.Unix())

	t.Assert(c.Metrics, Not(HasLen), 0)

	// /proc/stat values are relative (current - prev) so there shouldn't be any
	// after one tick.
	haveCPU, _ := haveMetric("cpu/user", c.Metrics)
	t.Check(haveCPU, Equals, false)

	// But other metrics are not relative, so we should have them.
	metrics := []string{"memory/MemTotal", "vmstat/numa_local", "loadavg/running", "disk/sda/reads"}
	for _, metric := range metrics {
		ok, val := haveMetric(metric, c.Metrics)
		t.Check(ok, Equals, true)
		t.Check(val, Not(Equals), 0)

	// Tick a 2nd time and now we should get CPU metrics.
	time.Sleep(200 * time.Millisecond)
	now = time.Now()
	s.tickChan <- now

	got = test.WaitCollection(s.collectionChan, 1)
	t.Assert(got, Not(HasLen), 0)
	t.Check(got, HasLen, 1)
	c = got[0]
	t.Check(c.Ts, Equals, now.Unix())
	t.Assert(c.Metrics, Not(HasLen), 0)

	metrics = []string{"cpu/user", "cpu/nice", "cpu/system", "cpu/idle"}
	for _, metric := range metrics {
		ok, val := haveMetric(metric, c.Metrics)
		t.Check(ok, Equals, true)

		// Running this test requires some CPU so user and idle shouldn't be zero.
		if metric == "cpu/user" || metric == "cpu/idle" {
			t.Check(val, Not(Equals), 0)

	 * Stop the monitor.


	if ok := test.WaitStatus(5, m, s.name, "Stopped"); !ok {
		t.Fatal("Monitor has stopped")
Exemplo n.º 6
 * Tests:
 * - starting monitor
 * - stopping monitor
 * - starting monitor again (restarting monitor)
 * - sneaked in:) unknown cmd test
// TestRestartMonitor drives the mm manager through StartService,
// StopService and StartService again for the same monitor config, checking
// after each step the reply, the monitor status, the clock's added/removed
// tickers and the presence of the mm-mysql-1.conf file in s.configDir. It
// ends by checking that an unknown cmd produces an error reply.
// NOTE(review): this excerpt is missing closing braces, comment delimiters
// and the bodies of some if statements; comments describe only what is
// visible.
func (s *ManagerTestSuite) TestRestartMonitor(t *C) {
	// Create and start mm, no monitors yet.
	m := mm.NewManager(s.logger, s.factory, s.clock, s.spool, s.im)
	t.Assert(m, NotNil)
	err := m.Start()
	t.Assert(err, IsNil)

	// Start a monitor by sending StartService + monitor config.
	// This is the config in test/mm/config/mm-mysql-1.conf.
	mmConfig := &mysql.Config{
		Config: mm.Config{
			ServiceInstance: proto.ServiceInstance{
				Service:    "mysql",
				InstanceId: 1,
			Collect: 1,
			Report:  60,
		Status: map[string]string{
			"threads_connected": "gauge",
			"threads_running":   "gauge",
	mmConfigData, err := json.Marshal(mmConfig)
	t.Assert(err, IsNil)

	// If this were a real monitor, it would decode and set its own config.
	// The mock monitor doesn't have any real config type, so we set it manually.

	// The agent calls mm.Handle() with the cmd (for logging and status) and the config data.
	cmd := &proto.Cmd{
		User:    "******",
		Service: "mm",
		Cmd:     "StartService",
		Data:    mmConfigData,
	reply := m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// The monitor should be running.  The mock monitor returns "Running" if
	// Start() has been called; else it returns "Stopped".
	status := m.Status()
	t.Check(status["monitor"], Equals, "Running")

	// There should be a 1s collect ticker for the monitor.
	if ok, diff := test.IsDeeply(s.clock.Added, []uint{1}); !ok {
		t.Errorf("Make 1s ticker for collect interval\n%s", diff)

	// After starting a monitor, mm should write its config to the dir
	// it learned when mm.LoadConfig() was called.  Next time agent starts,
	// it will have mm start the monitor with this config.
	data, err := ioutil.ReadFile(s.configDir + "/mm-mysql-1.conf")
	t.Check(err, IsNil)
	gotConfig := &mysql.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	// NOTE(review): the body of this if is not visible in this excerpt.
	if same, diff := test.IsDeeply(gotConfig, mmConfig); !same {

	 * Stop the monitor.

	cmd = &proto.Cmd{
		User:    "******",
		Service: "mm",
		Cmd:     "StopService",
		Data:    mmConfigData,

	// Handles StopService without error.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// Stopping a monitor removes it from the manager's list of monitors,
	// so it's no longer present in a status request.
	status = m.Status()
	t.Check(status["monitor"], Equals, "")

	// After stopping the monitor, the manager should remove its tickChan.
	if len(s.clock.Removed) != 1 {
		t.Error("Remove's monitor's tickChan from clock")

	// After stopping a monitor, mm should remove its config file so agent
	// doesn't start it on restart.
	file := s.configDir + "/mm-mysql-1.conf"
	if pct.FileExists(file) {
		t.Error("Stopping monitor removes its config; ", file, " exists")

	 * Start the monitor again (restarting monitor).
	cmd = &proto.Cmd{
		User:    "******",
		Service: "mm",
		Cmd:     "StartService",
		Data:    mmConfigData,

	// If this were a real monitor, it would decode and set its own config.
	// The mock monitor doesn't have any real config type, so we set it manually.

	// The agent calls mm.Handle() with the cmd (for logging and status) and the config data.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// The monitor should be running.  The mock monitor returns "Running" if
	// Start() has been called; else it returns "Stopped".
	status = m.Status()
	t.Check(status["monitor"], Equals, "Running")

	// There should be a 1s collect ticker for the monitor.
	// (Actually two in s.clock.Added, as this is mock and we started monitor twice)
	if ok, diff := test.IsDeeply(s.clock.Added, []uint{1, 1}); !ok {
		t.Errorf("Make 1s ticker for collect interval\n%s", diff)

	// After starting a monitor, mm should write its config to the dir
	// it learned when mm.LoadConfig() was called.  Next time agent starts,
	// it will have mm start the monitor with this config.
	data, err = ioutil.ReadFile(s.configDir + "/mm-mysql-1.conf")
	t.Check(err, IsNil)
	gotConfig = &mysql.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	if same, diff := test.IsDeeply(gotConfig, mmConfig); !same {
		t.Logf("%+v", gotConfig)

	 * While we're all setup and working, let's sneak in an unknown cmd test.

	cmd = &proto.Cmd{
		User:    "******",
		Service: "mm",
		Cmd:     "Pontificate",
		Data:    mmConfigData,

	// Unknown cmd causes error.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Not(Equals), "")
Exemplo n.º 7
// run is the agent's main routine. From what is visible it: initializes the
// basedir, waits for the start-lock, loads and parses the agent config,
// optionally pings the API and exits, optionally sets a PID file, connects to
// the API, starts the log, instance, data, mm, sysconfig and qan managers,
// installs a SIGINT/SIGTERM handler that stops qan, and then runs the agent
// until either the agent returns or a signal arrives. It returns the first
// value received on stopChan.
// NOTE(review): this excerpt is missing closing braces, comment delimiters,
// constructor arguments and several error-return bodies; comments describe
// only what is visible.
func run() error {
	version := fmt.Sprintf("percona-agent %s rev %s", agent.VERSION, agent.REVISION)
	if flagVersion {
		return nil
	golog.Printf("Running %s pid %d\n", version, os.Getpid())

	if err := pct.Basedir.Init(flagBasedir); err != nil {
		return err

	// Start-lock file is used to let agent1 self-update, create start-lock,
	// start updated agent2, exit cleanly, then agent2 starts.  agent1 may
	// not use a PID file, so this special file is required.
	if err := pct.WaitStartLock(); err != nil {
		return err
	// NOTE: This must run last, and defer is LIFO, so it must be declared first.
	defer os.Remove(pct.Basedir.File("start-lock"))

	 * Agent config (require API key and agent UUID)

	if !pct.FileExists(pct.Basedir.ConfigFile("agent")) {
		return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent"))

	bytes, err := agent.LoadConfig()
	if err != nil {
		return fmt.Errorf("Invalid agent config: %s\n", err)
	agentConfig := &agent.Config{}
	if err := json.Unmarshal(bytes, agentConfig); err != nil {
		// NOTE(review): the format string has no verb for err, so fmt will
		// append it as %!(EXTRA ...); a trailing %s was probably intended.
		return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err)

	golog.Println("ApiHostname: " + agentConfig.ApiHostname)
	golog.Println("AgentUuid: " + agentConfig.AgentUuid)

	 * Ping and exit, maybe.

	if flagPing {
		t0 := time.Now()
		code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey)
		d := time.Now().Sub(t0)
		if err != nil || code != 200 {
			// NOTE(review): d is a time.Duration; %d prints it as an integer
			// nanosecond count, whereas the success path below uses %s.
			return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err)
		} else {
			golog.Printf("Ping OK (%s)", d)
			return nil

	 * PID file

	// A PID file is only used when one was requested via flagPidFile.
	if flagPidFile != "" {
		pidFile := pct.NewPidFile()
		if err := pidFile.Set(flagPidFile); err != nil {
		defer pidFile.Remove()


	api, err := ConnectAPI(agentConfig)
	if err != nil {

	 * Log relay

	// All loggers created below send their entries to this channel.
	logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3)

	// Log websocket client, possibly disabled later.
	logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log")
	if err != nil {
	logManager := log.NewManager(
	if err := logManager.Start(); err != nil {
		return fmt.Errorf("Error starting logmanager: %s\n", err)

	 * Instance manager

	itManager := instance.NewManager(
		pct.NewLogger(logChan, "instance-manager"),
	if err := itManager.Start(); err != nil {
		return fmt.Errorf("Error starting instance manager: %s\n", err)

	 * Data spooler and sender

	// NOTE(review): the error from os.Hostname is discarded.
	hostname, _ := os.Hostname()

	dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data")
	if err != nil {
	dataManager := data.NewManager(
		pct.NewLogger(logChan, "data"),
	if err := dataManager.Start(); err != nil {
		return fmt.Errorf("Error starting data manager: %s\n", err)

	 * Collecct/report ticker (master clock)

	// The clock reads the current time as UTC nanoseconds since the Unix epoch.
	nowFunc := func() int64 { return time.Now().UTC().UnixNano() }
	clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc)

	 * Metric and system config monitors

	mmManager := mm.NewManager(
		pct.NewLogger(logChan, "mm"),
		mmMonitor.NewFactory(logChan, itManager.Repo()),
	if err := mmManager.Start(); err != nil {
		return fmt.Errorf("Error starting mm manager: %s\n", err)

	sysconfigManager := sysconfig.NewManager(
		pct.NewLogger(logChan, "sysconfig"),
		sysconfigMonitor.NewFactory(logChan, itManager.Repo()),
	if err := sysconfigManager.Start(); err != nil {
		return fmt.Errorf("Error starting sysconfig manager: %s\n", err)

	 * Query Analytics

	qanManager := qan.NewManager(
		pct.NewLogger(logChan, "qan"),
	if err := qanManager.Start(); err != nil {
		return fmt.Errorf("Error starting qan manager: %s\n", err)

	 * Signal handler

	// Generally the agent has a crash-only design, but QAN is so far the only service
	// which reconfigures MySQL: it enables the slow log, sets long_query_time, etc.
	// It's not terrible to leave slow log on, but it's nicer to turn it off.
	sigChan := make(chan os.Signal, 1)
	// Capacity 2: both the signal handler and the agent goroutine below send
	// to stopChan, but only one value is received.
	stopChan := make(chan error, 2)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	go func() {
		sig := <-sigChan
		golog.Printf("Caught %s signal, shutting down...\n", sig)
		stopChan <- qanManager.Stop()

	 * Agent

	cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd")
	if err != nil {

	// The official list of services known to the agent.  Adding a new service
	// requires a manager, starting the manager as above, and adding the manager
	// to this map.
	services := map[string]pct.ServiceManager{
		"log":       logManager,
		"data":      dataManager,
		"qan":       qanManager,
		"mm":        mmManager,
		"instance":  itManager,
		"sysconfig": sysconfigManager,

	agent := agent.NewAgent(
		pct.NewLogger(logChan, "agent"),

	 * Run agent, wait for it to stop or signal.

	go func() {
		stopChan <- agent.Run()
	stopErr := <-stopChan // agent or signal
	golog.Println("Agent stopped, shutting down...")
	qanManager.Stop()           // see Signal handler ^
	time.Sleep(2 * time.Second) // wait for final replies and log entries
	return stopErr
Exemplo n.º 8
// TestStartStopMonitor starts a sysconfig monitor through the manager with a
// StartService cmd, checks the mock monitor's status, the clock's tickers and
// the written sysconfig-mysql-1.conf file, then stops the monitor with
// StopService and checks that all of that is undone. It ends by checking that
// an unknown cmd produces an error reply.
// NOTE(review): this excerpt is missing closing braces, comment delimiters
// and the final clean-up code; comments describe only what is visible.
func (s *ManagerTestSuite) TestStartStopMonitor(t *C) {
	m := sysconfig.NewManager(s.logger, s.factory, s.clock, s.spool, s.im)
	t.Assert(m, NotNil)

	err := m.Start()
	t.Assert(err, IsNil)

	// Starting a monitor is like starting the manager: it requires
	// a "StartService" cmd and the monitor's config.  This is the
	// config in configDir/db1-mysql-monitor.conf.
	sysconfigConfig := &mysql.Config{
		Config: sysconfig.Config{
			ServiceInstance: proto.ServiceInstance{
				Service:    "mysql",
				InstanceId: 1,
			Report: 3600,
	sysconfigConfigData, err := json.Marshal(sysconfigConfig)
	t.Assert(err, IsNil)

	cmd := &proto.Cmd{
		User:    "******",
		Service: "sysconfig",
		Cmd:     "StartService",
		Data:    sysconfigConfigData,

	// If this were a real monitor, it would decode and set its own config.
	// The mock monitor doesn't have any real config type, so we set it manually.

	// The agent calls sysconfig.Handle() with the cmd (for logging and status) and the config data.
	reply := m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// The monitor should be running.  The mock monitor returns "Running" if
	// Start() has been called; else it returns "Stopped".
	status := s.mockMonitor.Status()
	if status["monitor"] != "Running" {
		t.Error("Monitor running")

	// There should be a single 3600s ticker for the monitor, matching the
	// Report value in the config above.
	// NOTE(review): the failure message below says "collect interval"
	// although the config field is Report - confirm the wording.
	if ok, diff := test.IsDeeply(s.clock.Added, []uint{3600}); !ok {
		t.Errorf("Make 3600s ticker for collect interval\n%s", diff)

	// After starting a monitor, sysconfig should write its config to the dir
	// it learned when sysconfig.LoadConfig() was called.  Next time agent starts,
	// it will have sysconfig start the monitor with this config.
	data, err := ioutil.ReadFile(s.configDir + "/sysconfig-mysql-1.conf")
	t.Check(err, IsNil)
	gotConfig := &mysql.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	if same, diff := test.IsDeeply(gotConfig, sysconfigConfig); !same {
		t.Logf("%+v", gotConfig)

	 * Stop the monitor.

	cmd = &proto.Cmd{
		User:    "******",
		Service: "sysconfig",
		Cmd:     "StopService",
		Data:    sysconfigConfigData,

	// Handles StopService without error.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// The mock monitor should report "Stopped" again.
	status = s.mockMonitor.Status()
	if status["monitor"] != "Stopped" {
		t.Error("Monitor stopped")

	// After stopping the monitor, the manager should remove its tickChan.
	if len(s.clock.Removed) != 1 {
		t.Error("Remove's monitor's tickChan from clock")

	// After stopping a monitor, sysconfig should remove its config file so agent
	// doesn't start it on restart.
	file := s.configDir + "/sysconfig-mysql-1.conf"
	if pct.FileExists(file) {
		t.Error("Stopping monitor removes its config; ", file, " exists")

	 * While we're all setup and working, let's sneak in an unknown cmd test.

	cmd = &proto.Cmd{
		User:    "******",
		Service: "sysconfig",
		Cmd:     "Pontificate",
		Data:    sysconfigConfigData,

	// Unknown cmd causes error.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	if reply.Error == "" {
		t.Fatalf("Unknown Cmd to Handle() causes error")

	 * Clean up
Exemplo n.º 9
// TestRejectData spools one log entry, rejects the resulting file and checks
// that it moved from the data dir to the "data" dir under the trash dir. It
// then starts a second spooler on the same dirs, spools again and checks that
// Files() yields exactly one file and never the rejected one.
// NOTE(review): this excerpt is missing closing braces and comment
// delimiters; comments describe only what is visible.
func (s *DiskvSpoolerTestSuite) TestRejectData(t *C) {
	sz := data.NewJsonSerializer()

	// Create and start the spooler.
	spool := data.NewDiskvSpooler(s.logger, s.dataDir, s.trashDir, "localhost", s.limits)
	t.Assert(spool, NotNil)

	err := spool.Start(sz)
	t.Assert(err, IsNil)

	// Spooler should create the bad data dir.
	badDataDir := path.Join(s.trashDir, "data")
	ok := pct.FileExists(badDataDir)
	t.Assert(ok, Equals, true)

	// Spool any data...
	now := time.Now()
	logEntry := &proto.LogEntry{
		Ts:      now,
		Level:   1,
		Service: "mm",
		Msg:     "hello world",
	err = spool.Write("log", logEntry)
	t.Check(err, IsNil)

	// Wait for spooler to write data to disk.
	files := test.WaitFiles(s.dataDir, 1)
	t.Assert(files, HasLen, 1)

	// Get the file name the spooler saved the data as.
	gotFiles := []string{}
	filesChan := spool.Files()
	for file := range filesChan {
		gotFiles = append(gotFiles, file)
	t.Assert(gotFiles, HasLen, 1)

	// Reject the file.  The spooler should move it to the bad data dir
	// then remove it from the list.
	err = spool.Reject(gotFiles[0])
	t.Check(err, IsNil)

	// The file must be gone from the data dir...
	ok = pct.FileExists(path.Join(s.dataDir, gotFiles[0]))
	t.Assert(ok, Equals, false)

	// ...and present in the bad data dir.
	badFile := path.Join(badDataDir, gotFiles[0])
	// NOTE(review): path.Join with a single argument only cleans the path;
	// badFile was already built with path.Join, so the call looks redundant.
	ok = pct.FileExists(path.Join(badFile))
	t.Assert(ok, Equals, true)


	 * Start another spooler now that we have data/bad/file to ensure
	 * that the spooler does not read/index/cache bad files.

	spool = data.NewDiskvSpooler(s.logger, s.dataDir, s.trashDir, "localhost", s.limits)
	t.Assert(spool, NotNil)
	err = spool.Start(sz)
	t.Assert(err, IsNil)
	// NOTE(review): unlike the first Write above, the error returned here is
	// not checked.
	spool.Write("log", logEntry)
	files = test.WaitFiles(s.dataDir, 1)
	t.Assert(files, HasLen, 1)

	// There should only be 1 new file in the spool.
	gotFiles = []string{}
	filesChan = spool.Files()
	for file := range filesChan {
		t.Check(file, Not(Equals), badFile)
		gotFiles = append(gotFiles, file)
	t.Assert(gotFiles, HasLen, 1)

Exemplo n.º 10
// Run performs the interactive installation. From what is visible it:
// upgrades from pt-agent when /root/.pt-agent.conf exists (reusing its API
// key, agent UUID and DSN), prompts for an API key if none is set, verifies
// the key by pinging the API (offering a retry on failure), optionally
// creates server and MySQL instances, collects default service configs, and
// then either updates the existing agent (pt-agent upgrade) or creates a new
// one and writes its configs. It returns nil on success and an error as soon
// as a step fails or the user declines to continue.
// NOTE(review): this excerpt is missing closing braces, comment delimiters,
// some Printf calls/arguments and the VERIFY_API_KEY label; comments describe
// only what is visible.
func (i *Installer) Run() error {

	 * Check for pt-agent, upgrade if found.

	var ptagentDSN *mysql.DSN
	ptagentUpgrade := false
	ptagentConf := "/root/.pt-agent.conf"
	if pct.FileExists(ptagentConf) {
		fmt.Println("Found pt-agent, upgrading and removing because it is no longer supported...")
		ptagentUpgrade = true

		// Stop pt-agent
		if err := StopPTAgent(); err != nil {
			// Stopping failed: ask the user to confirm that pt-agent has
			// been stopped manually before going on.
			fmt.Printf("Error stopping pt-agent: %s\n\n", err)
			fmt.Println("WARNING: pt-agent must be stopped before installing percona-agent.  " +
				"Please verify that pt-agent is not running and has been removed from cron.  " +
				"Enter 'Y' to confirm and continue installing percona-agent.")
			ok, err := i.term.PromptBool("pt-agent has stopped?", "N")
			if err != nil {
				return err
			if !ok {
				return fmt.Errorf("Failed to stop pt-agent")

		// Get its settings (API key, UUID, etc.).
		agent, dsn, err := GetPTAgentSettings(ptagentConf)
		if err != nil {
			return fmt.Errorf("Error upgrading pt-agent: %s", err)
		if agent.ApiKey != "" {
			i.agentConfig.ApiKey = agent.ApiKey
		if agent.AgentUuid != "" {
			i.agentConfig.AgentUuid = agent.AgentUuid
			fmt.Printf("Upgrading pt-agent %s...\n", agent.AgentUuid)
		ptagentDSN = dsn

	 * Get the API key.

	fmt.Printf("API host: %s\n", i.agentConfig.ApiHostname)

	// Keep prompting until a non-empty API key has been entered.
	for i.agentConfig.ApiKey == "" {
		apiKey, err := i.term.PromptString("API key", "")
		if err != nil {
			return err
		if apiKey == "" {
			fmt.Println("API key is required, please try again.")
		i.agentConfig.ApiKey = apiKey

	 * Verify the API key by pinging the API.

	for {
		startTime := time.Now()
		fmt.Printf("Verifying API key %s...\n", i.agentConfig.ApiKey)
		code, err := pct.Ping(i.agentConfig.ApiHostname, i.agentConfig.ApiKey)
		elapsedTime := time.Since(startTime)
		// Integer division: this is the elapsed time in whole seconds.
		elapsedTimeInSeconds := elapsedTime / time.Second

		// A ping error counts as a timeout when it is a *url.Error that
		// wraps a *net.OpError reporting Timeout().
		timeout := false
		if urlErr, ok := err.(*url.Error); ok {
			if netOpErr, ok := urlErr.Err.(*net.OpError); ok && netOpErr.Timeout() {
				timeout = true
		if i.flags["debug"] {
			log.Printf("code=%d\n", code)
			log.Printf("err=%s\n", err)
		// Classify the ping result; only a 200 without error is accepted.
		ok := false
		if timeout {
				"Error: API connection timeout (%ds): %s\n"+
					"Before you try again, please check your connection and DNS configuration.\n",
		} else if err != nil {
			fmt.Printf("Error: %s\n", err)
		} else if code >= 500 {
			fmt.Printf("Sorry, there's an API problem (status code %d). "+
				"Please try to install again. If the problem continues, contact Percona.\n",
		} else if code == 401 {
			return fmt.Errorf("Access denied.  Check the API key and try again.")
		} else if code >= 300 {
			fmt.Printf("Sorry, there's an installer problem (status code %d). "+
				"Please try to install again. If the problem continues, contact Percona.\n",
		} else if code != 200 {
			fmt.Printf("Sorry, there's an installer problem (status code %d). "+
				"Please try to install again. If the problem continues, contact Percona.\n",
		} else {
			ok = true

		if !ok {
			again, err := i.term.PromptBool("Try again?", "Y")
			if err != nil {
				return err
			if !again {
				return fmt.Errorf("Failed to verify API key")
			// NOTE(review): the VERIFY_API_KEY label is not visible in this
			// excerpt; presumably it labels the enclosing for loop.
			continue VERIFY_API_KEY

		fmt.Printf("API key %s is OK\n", i.agentConfig.ApiKey)

		// NOTE(review): this condition holds for any non-negative elapsed
		// time, so the slow-connection warning would be shown after every
		// successful ping, while the message speaks of "more than 1s" -
		// confirm the intended threshold.
		if elapsedTimeInSeconds >= 0 {
				"WARNING: We have detected that request to api took %d second(-s) while usually it shouldn't take more than 1s.\n"+
					"This might be due to connection problems or slow DNS resolution.\n"+
					"Before you continue please check your connection and DNS configuration as this might impact performance of percona-agent.\n"+
					"If you are using CentOS or Fedora 19+ in a vagrant box then you might be interested in this bug report:\n"+
			proceed, err := i.term.PromptBool("Continue anyway?", "Y")
			if err != nil {
				return err
			if !proceed {
				return fmt.Errorf("Failed because of slow connection")


	// Both stay nil unless the corresponding create-*-instance flag is set.
	var si *proto.ServerInstance
	var mi *proto.MySQLInstance

	 * Create new service instances.

	var err error

	if i.flags["create-server-instance"] {
		si, err = i.createServerInstance()
		if err != nil {
			return err
		fmt.Printf("Created server instance: hostname=%s id=%d\n", si.Hostname, si.Id)
	} else {
		fmt.Println("Not creating server instance (-create-server-instance=false)")

	if i.flags["create-mysql-instance"] {
		// Create MySQL user for agent, or using existing one, then verify MySQL connection.
		agentDSN, err := i.doMySQL(ptagentDSN)
		if err != nil {
			return err
		// Create MySQL instance in API.
		mi, err = i.createMySQLInstance(agentDSN)
		if err != nil {
			return err
		// NOTE(review): this prints si.Id on the MySQL instance line; mi.Id
		// was probably intended. si is also nil here when
		// create-server-instance is false, in which case si.Id would panic -
		// confirm.
		fmt.Printf("Created MySQL instance: dsn=%s hostname=%s id=%d\n", mi.DSN, mi.Hostname, si.Id)
	} else {
		fmt.Println("Not creating MySQL instance (-create-mysql-instance=false)")

	if err := i.writeInstances(si, mi); err != nil {
		return fmt.Errorf("Created agent but failed to write service instances: %s", err)

	 * Get default configs for all services.

	configs := []proto.AgentConfig{}

	// A config that cannot be fetched only produces a warning; the
	// installation carries on without that service.
	if i.flags["start-services"] {
		// Server metrics monitor
		config, err := i.getMmServerConfig(si)
		if err != nil {
			fmt.Println("WARNING: cannot start server metrics monitor")
		} else {
			configs = append(configs, *config)

		if i.flags["start-mysql-services"] {
			// MySQL metrics tracker
			config, err = i.getMmMySQLConfig(mi)
			if err != nil {
				fmt.Println("WARNING: cannot start MySQL metrics monitor")
			} else {
				configs = append(configs, *config)

			// MySQL config tracker
			config, err = i.getSysconfigMySQLConfig(mi)
			if err != nil {
				fmt.Println("WARNING: cannot start MySQL configuration monitor")
			} else {
				configs = append(configs, *config)

			// QAN
			// MySQL is local if the server hostname == MySQL hostname without port number.
			// NOTE(review): mi is nil when create-mysql-instance is false,
			// so mi.Hostname would panic in that flag combination - confirm.
			if i.hostname == portNumberRe.ReplaceAllLiteralString(mi.Hostname, "") {
				if i.flags["debug"] {
					log.Printf("MySQL is local")
				config, err := i.getQanConfig(mi)
				if err != nil {
					fmt.Println("WARNING: cannot start Query Analytics")
				} else {
					configs = append(configs, *config)
		} else {
			fmt.Println("Not starting MySQL services (-start-mysql-services=false)")
	} else {
		fmt.Println("Not starting default services (-start-services=false)")

	 * Create agent with initial service configs.

	// When upgrading from pt-agent the existing agent (same UUID) is updated;
	// otherwise a new agent is created if the create-agent flag is set.
	if ptagentUpgrade {
		agent, err := i.updateAgent(i.agentConfig.AgentUuid)
		if err != nil {
			return err
		fmt.Println("pt-agent upgraded to percona-agent")
		if err := i.writeConfigs(agent, configs); err != nil {
			return fmt.Errorf("Upgraded pt-agent but failed to write percona-agent configs: %s", err)
	} else if i.flags["create-agent"] {
		agent, err := i.createAgent(configs)
		if err != nil {
			return err
		fmt.Printf("Created agent: uuid=%s\n", agent.Uuid)

		if err := i.writeConfigs(agent, configs); err != nil {
			return fmt.Errorf("Created agent but failed to write configs: %s", err)
	} else {
		fmt.Println("Not creating agent (-create-agent=false)")

	 * Remove pt-agent if upgrading.

	if ptagentUpgrade {
		fmt.Println("pt-agent removed")

	return nil // success
Exemplo n.º 11
// run is the agent's main routine. It loads and parses the agent config,
// handles the version, ping and status flags (each of which returns early),
// connects to the API, starts the service managers, creates the agent, and
// then blocks until the agent stops or a shutdown signal is received.
//
// It returns nil on the early-exit success paths, an error if a startup step
// fails, and otherwise the value received on stopChan (stopErr).
//
// NOTE(review): in this listing the closing braces, block-comment delimiters
// and several multi-line call arguments are not visible; the comments below
// describe only what the visible lines show.
func run() error {
	version := fmt.Sprintf("percona-agent %s%s rev %s", agent.VERSION, agent.REL, agent.REVISION)
	// NOTE(review): on the flagVersion path the visible code returns without
	// printing the version string built above; presumably a print statement
	// belongs here. Confirm against the original.
	if flagVersion {
		return nil
	golog.Printf("Running %s pid %d\n", version, os.Getpid())

	if err := pct.Basedir.Init(flagBasedir); err != nil {
		return err

	// Start-lock file is used to let agent1 self-update, create start-lock,
	// start updated agent2, exit cleanly, then agent2 starts.  agent1 may
	// not use a PID file, so this special file is required.
	if err := pct.WaitStartLock(); err != nil {
		return err
	// NOTE: This must run last, and defer is LIFO, so it must be declared first.
	defer os.Remove(pct.Basedir.File("start-lock"))

	 * Agent config (require API key and agent UUID)

	if !pct.FileExists(pct.Basedir.ConfigFile("agent")) {
		return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent"))

	// LoadConfig yields the raw config bytes, which are JSON-decoded into
	// agentConfig just below.
	bytes, err := agent.LoadConfig()
	if err != nil {
		return fmt.Errorf("Invalid agent config: %s\n", err)
	agentConfig := &agent.Config{}
	if err := json.Unmarshal(bytes, agentConfig); err != nil {
		// NOTE(review): the format string has no verb for err, so err is
		// passed as an extra operand and will not be rendered as intended;
		// a trailing "%s" looks like what was meant.
		return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err)

	golog.Println("ApiHostname: " + agentConfig.ApiHostname)
	golog.Println("AgentUuid: " + agentConfig.AgentUuid)

	 * Ping and exit, maybe.

	// Set for all connections to API.  X-Percona-API-Key is set automatically
	// using the pct.APIConnector.
	headers := map[string]string{
		"X-Percona-Agent-Version": agent.VERSION,

	// With flagPing set, time a single pct.Ping call and return: an error
	// unless the call succeeded with code 200, nil otherwise.
	if flagPing {
		t0 := time.Now()
		code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey, headers)
		d := time.Now().Sub(t0)
		if err != nil || code != 200 {
			// NOTE(review): d is a time.Duration formatted with %d here
			// (integer nanoseconds) but with %s on the success path, and
			// err can be nil on this branch when only code != 200.
			return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err)
		} else {
			golog.Printf("Ping OK (%s)", d)
			return nil

	 * PID file

	// A non-empty flagPidFile takes precedence over the PidFile from the
	// agent config; if both are empty no PID file is set.
	pidFilePath := agentConfig.PidFile
	if flagPidFile != "" {
		pidFilePath = flagPidFile
	if pidFilePath != "" {
		pidFile := pct.NewPidFile()
		// NOTE(review): the body of this error branch is not visible here.
		if err := pidFile.Set(pidFilePath); err != nil {
		defer pidFile.Remove()


	// retry is passed to ConnectAPI; it is 1 when only the status is being
	// queried and -1 otherwise.
	retry := -1 // unlimited
	if flagStatus {
		retry = 1
	api, err := ConnectAPI(agentConfig, retry)
	// NOTE(review): the handling of a ConnectAPI error is not visible here.
	if err != nil {

	// Get agent status via API and exit.
	if flagStatus {
		code, bytes, err := api.Get(agentConfig.ApiKey, api.AgentLink("self")+"/status")
		if err != nil {
			return err
		if code == 404 {
			return fmt.Errorf("Agent not found")
		// NOTE(review): the response is decoded into status, but no use of
		// status is visible before the return; presumably it is printed.
		status := make(map[string]string)
		if err := json.Unmarshal(bytes, &status); err != nil {
			return err
		return nil

	 * Connection factory
	connFactory := &mysql.RealConnectionFactory{}

	 * Log relay

	// logChan is the buffered channel handed to every pct.NewLogger call
	// below; its capacity is three times log.BUFFER_SIZE.
	logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3)

	// Log websocket client, possibly disabled later.
	logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log", headers)
	if err != nil {
	logManager := log.NewManager(
	if err := logManager.Start(); err != nil {
		return fmt.Errorf("Error starting logmanager: %s\n", err)

	 * MRMS (MySQL Restart Monitoring Service)
	mrm := mrmsMonitor.NewMonitor(
		pct.NewLogger(logChan, "mrms-monitor"),
	mrmsManager := mrms.NewManager(
		pct.NewLogger(logChan, "mrms-manager"),
	if err := mrmsManager.Start(); err != nil {
		return fmt.Errorf("Error starting mrms manager: %s\n", err)

	 * Instance manager
	itManager := instance.NewManager(
		pct.NewLogger(logChan, "instance-manager"),
	if err := itManager.Start(); err != nil {
		return fmt.Errorf("Error starting instance manager: %s\n", err)

	 * Data spooler and sender

	// NOTE(review): the error from os.Hostname is discarded, and no use of
	// hostname is visible in this listing.
	hostname, _ := os.Hostname()

	dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data", headers)
	if err != nil {
	dataManager := data.NewManager(
		pct.NewLogger(logChan, "data"),
	if err := dataManager.Start(); err != nil {
		return fmt.Errorf("Error starting data manager: %s\n", err)

	 * Collecct/report ticker (master clock)

	// nowFunc reports the current UTC time as Unix nanoseconds; it is the
	// time source given to the clock.
	nowFunc := func() int64 { return time.Now().UTC().UnixNano() }
	clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc)

	 * Metric and system config monitors

	// The mm monitor factory is built from the instance manager's repo and
	// the MRMS monitor created above.
	mmManager := mm.NewManager(
		pct.NewLogger(logChan, "mm"),
		mmMonitor.NewFactory(logChan, itManager.Repo(), mrm),
	if err := mmManager.Start(); err != nil {
		return fmt.Errorf("Error starting mm manager: %s\n", err)

	sysconfigManager := sysconfig.NewManager(
		pct.NewLogger(logChan, "sysconfig"),
		sysconfigMonitor.NewFactory(logChan, itManager.Repo()),
	if err := sysconfigManager.Start(); err != nil {
		return fmt.Errorf("Error starting sysconfig manager: %s\n", err)

	 * Query service (real-time EXPLAIN, SHOW CREATE TABLE, etc.)

	queryManager := query.NewManager(
		pct.NewLogger(logChan, "query"),
	if err := queryManager.Start(); err != nil {
		return fmt.Errorf("Error starting query manager: %s\n", err)

	 * Query Analytics

	qanManager := qan.NewManager(
		pct.NewLogger(logChan, "qan"),

	if err := qanManager.Start(); err != nil {
		return fmt.Errorf("Error starting qan manager: %s\n", err)

	 * Sysinfo
	sysinfoManager := sysinfo.NewManager(
		pct.NewLogger(logChan, "sysinfo"),

	// MySQL Sysinfo
	mysqlSysinfoService := mysqlSysinfo.NewMySQL(
		pct.NewLogger(logChan, "sysinfo-mysql"),
	if err := sysinfoManager.RegisterService("MySQLSummary", mysqlSysinfoService); err != nil {
		return fmt.Errorf("Error registering Mysql Sysinfo service: %s\n", err)

	// System Sysinfo
	systemSysinfoService := systemSysinfo.NewSystem(
		pct.NewLogger(logChan, "sysinfo-system"),
	if err := sysinfoManager.RegisterService("SystemSummary", systemSysinfoService); err != nil {
		return fmt.Errorf("Error registering System Sysinfo service: %s\n", err)

	// Start Sysinfo manager
	if err := sysinfoManager.Start(); err != nil {
		return fmt.Errorf("Error starting Sysinfo manager: %s\n", err)

	 * Signal handler

	// Generally the agent has a crash-only design, but QAN is so far the only service
	// which reconfigures MySQL: it enables the slow log, sets long_query_time, etc.
	// It's not terrible to leave slow log on, but it's nicer to turn it off.
	sigChan := make(chan os.Signal, 1)
	// stopChan receives from two goroutines: the signal handler below and the
	// agent runner further down. Its capacity of 2 presumably lets both send
	// without blocking - confirm.
	stopChan := make(chan error, 2)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	// Wait for one interrupt/SIGTERM, stop QAN, and forward the result of
	// qanManager.Stop() to the wait loop via stopChan.
	go func() {
		sig := <-sigChan
		golog.Printf("Caught %s signal, shutting down...\n", sig)
		stopChan <- qanManager.Stop()

	 * Agent

	cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd", headers)
	if err != nil {

	// The official list of services known to the agent.  Adding a new service
	// requires a manager, starting the manager as above, and adding the manager
	// to this map.
	services := map[string]pct.ServiceManager{
		"log":       logManager,
		"data":      dataManager,
		"qan":       qanManager,
		"mm":        mmManager,
		"instance":  itManager,
		"mrms":      mrmsManager,
		"sysconfig": sysconfigManager,
		"query":     queryManager,
		"sysinfo":   sysinfoManager,

	// Set the global pct/cmd.Factory, used for the Restart cmd.
	pctCmd.Factory = &pctCmd.RealCmdFactory{}

	agentLogger := pct.NewLogger(logChan, "agent")

	// From here on the local variable agent shadows the agent package, so
	// agent.Run and agent.AllStatus below are calls on this value.
	agent := agent.NewAgent(

	 * Run agent, wait for it to stop, signal, or crash.

	var stopErr error
	// Run the agent in its own goroutine. A panic inside it is recovered and
	// reported on stopChan as an "Agent crashed: ..." error; otherwise the
	// return value of agent.Run is sent.
	go func() {
		defer func() {
			if err := recover(); err != nil {
				errMsg := fmt.Sprintf("Agent crashed: %s", err)
				stopChan <- fmt.Errorf("%s", errMsg)
		stopChan <- agent.Run()

	// Wait for agent to stop, or for signals.
	agentRunning := true
	statusSigChan := make(chan os.Signal, 1)
	signal.Notify(statusSigChan, syscall.SIGUSR1) // kill -USR1 PID
	reconnectSigChan := make(chan os.Signal, 1)
	signal.Notify(reconnectSigChan, syscall.SIGHUP) // kill -HUP PID
	for agentRunning {
		select {
		case stopErr = <-stopChan: // agent or signal
			golog.Println("Agent stopped, shutting down...")
			agentLogger.Info("Agent stopped")
			agentRunning = false
		case <-statusSigChan:
			// SIGUSR1: log the agent's current status and keep running.
			status := agent.AllStatus()
			golog.Printf("Status: %+v\n", status)
		case <-reconnectSigChan:
			// SIGHUP: build a "Reconnect" command addressed to the agent
			// service. How cmd is delivered is not visible in this listing.
			// NOTE(review): the error from user.Current is discarded; if the
			// lookup fails, u may be nil and u.Username would panic.
			u, _ := user.Current()
			cmd := &proto.Cmd{
				Ts:        time.Now().UTC(),
				User:      u.Username + " (SIGHUP)",
				AgentUuid: agentConfig.AgentUuid,
				Service:   "agent",
				Cmd:       "Reconnect",

	qanManager.Stop()           // see Signal handler ^
	time.Sleep(2 * time.Second) // wait for final replies and log entries
	return stopErr