func (s *MainTestSuite) SetUpTest(t *C) {
	// Make a new mocked installation for each test, copying the already compiled mocked agent.
	// Using the same tmp basedir for all tests could lead to pid files disappearing at any given time:
	// KILL/TERM of processes is async and the mocked agent will remove the pid file while shutting down,
	// making tests fail in the most unexpected ways.
	var err error
	// We can't/shouldn't use /usr/local/percona/ (the default basedir),
	// so use a tmpdir instead with only a bin dir inside.
	s.basedir, err = ioutil.TempDir("/tmp", "percona-agent-init-test-")
	t.Assert(err, IsNil)
	binDir := filepath.Join(s.basedir, pct.BIN_DIR)
	err = os.Mkdir(binDir, 0777)
	t.Assert(err, IsNil)

	// Let's copy the mocked, already compiled percona-agent.
	s.bin = filepath.Join(binDir, "percona-agent")
	cmd := exec.Command("cp", s.buildbin, s.bin)
	err = cmd.Run()
	t.Assert(err, IsNil, Commentf("Failed to copy mocked percona-agent to tmp dir: %v", err))

	// Copy the init script to the tmp basedir/bin directory.
	initscript, err := filepath.Abs("./percona-agent")
	// Check if absolute path resolving succeeded.
	t.Assert(err, IsNil)
	// Check if the init script is there.
	t.Assert(pct.FileExists(initscript), Equals, true)
	s.initscript = filepath.Join(s.basedir, pct.BIN_DIR, "init-script")
	cmd = exec.Command("cp", initscript, s.initscript)
	err = cmd.Run()
	t.Assert(err, IsNil, Commentf("Failed to copy init script to tmp dir: %v", err))

	// Set all env vars to default test values.
	resetTestEnvVars(s)
}
func (s *MainTestSuite) TestDelayedStop(t *C) {
	// Set init script stop timeout to 1 second.
	os.Setenv("PCT_TEST_STOP_TIMEOUT", "1")
	// Set percona-agent stop delay to 2 seconds.
	os.Setenv("PCT_TEST_STOP_DELAY", "2")

	// Now try to start the service.
	cmd := exec.Command(s.initscript, "start")
	output, err := cmd.Output()
	// start exit code should be 0
	t.Check(err, IsNil)

	// Get the PID from the pidfile.
	pid, err := readPidFile(filepath.Join(s.basedir, "percona-agent.pid"))
	// Check if we could read the pidfile.
	t.Check(err, IsNil)
	// pid should be non-empty
	t.Check(pid, Not(Equals), "")

	stop_cmd := exec.Command(s.initscript, "stop")
	output, err = stop_cmd.Output()
	// stop exit code should be 0
	t.Check(err, IsNil)
	// Script should output this message.
	t.Check(string(output), Equals, fmt.Sprintf("Stopping percona-agent...\nWaiting for percona-agent to exit...\n"+
		"Time out waiting for percona-agent to exit. Trying kill -9 %v...\nStopped percona-agent.\n", pid))
	// Make sure the process was killed.
	t.Assert(pct.FileExists(fmt.Sprintf("/proc/%v/stat", pid)), Equals, false)
}
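// The readPidFile helper used above is not shown in this excerpt. A minimal
// sketch of what it might look like, assuming it simply reads the pidfile and
// returns its trimmed contents (the name exists in the test package, but this
// body is an assumption, not the repo's actual implementation; it assumes
// "io/ioutil" and "strings" are imported):
func readPidFile(path string) (string, error) {
	// Read the whole pidfile; it should contain just the PID and a trailing newline.
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(data)), nil
}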
func (s *AgentTestSuite) TestRestart(t *C) {
	// Stop the default agent. We need our own to check its return value.
	s.TearDownTest(t)

	cmdFactory := &mock.CmdFactory{}
	pctCmd.Factory = cmdFactory

	defer func() {
		os.Remove(pct.Basedir.File("start-lock"))
		os.Remove(pct.Basedir.File("start-script"))
	}()

	newAgent := agent.NewAgent(s.config, s.logger, s.api, s.client, s.servicesMap)
	doneChan := make(chan error, 1)
	go func() {
		doneChan <- newAgent.Run()
	}()

	cmd := &proto.Cmd{
		Service: "agent",
		Cmd:     "Restart",
	}
	s.sendChan <- cmd
	replies := test.WaitReply(s.recvChan)
	t.Assert(replies, HasLen, 1)
	t.Check(replies[0].Error, Equals, "")

	var err error
	select {
	case err = <-doneChan:
	case <-time.After(2 * time.Second):
		t.Fatal("Agent did not restart")
	}

	// Agent should return without an error.
	t.Check(err, IsNil)

	// Agent should create the start-lock file and start-script.
	t.Check(pct.FileExists(pct.Basedir.File("start-lock")), Equals, true)
	t.Check(pct.FileExists(pct.Basedir.File("start-script")), Equals, true)

	// Agent should make a command to run the start-script.
	t.Assert(cmdFactory.Cmds, HasLen, 1)
	t.Check(cmdFactory.Cmds[0].Name, Equals, pct.Basedir.File("start-script"))
	t.Check(cmdFactory.Cmds[0].Args, IsNil)
}
func (s *TestSuite) TestRemoveRel(t *C) {
	tmpFileName := getTmpFileName()
	t.Check(s.testPidFile.Set(tmpFileName), Equals, nil)

	// Remove should succeed, pidfile exists.
	t.Assert(s.testPidFile.Remove(), Equals, nil)

	absFilePath := filepath.Join(pct.Basedir.Path(), tmpFileName)
	// Check if pidfile was deleted.
	t.Assert(pct.FileExists(absFilePath), Equals, false)
}
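// getTmpFileName is another helper not included in this excerpt. The test uses
// it to produce a pidfile name relative to the basedir, so a plausible sketch
// (an assumption, not the actual helper; it assumes "fmt" and "time" are imported)
// is a unique, basedir-relative name:
func getTmpFileName() string {
	// A relative name keeps the pidfile under pct.Basedir.Path(), which is what
	// TestRemoveRel checks when it joins the basedir and this name.
	return fmt.Sprintf("pidfile-test-%d.pid", time.Now().UnixNano())
}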
func (s *ManagerTestSuite) TestStartCollectStop(t *C) {
	files := []string{"stat", "meminfo", "vmstat", "loadavg", "diskstats"}
	for _, file := range files {
		if !pct.FileExists("/proc/" + file) {
			t.Fatal("/proc/" + file + " does not exist")
		}
	}

	// Create the monitor.
	m := system.NewMonitor(s.name, &system.Config{}, s.logger)
	if m == nil {
		t.Fatal("Make new system.Monitor")
	}

	// Start the monitor.
	err := m.Start(s.tickChan, s.collectionChan)
	if err != nil {
		t.Fatalf("Start monitor without error, got %s", err)
	}

	// system-monitor should report "Idle" once it has started its internals,
	// which should be very fast.
	if ok := test.WaitStatusPrefix(3, m, s.name, "Idle"); !ok {
		t.Fatal("Monitor is ready")
	}

	// The monitor should only collect and send metrics on ticks; we haven't ticked yet.
	got := test.WaitCollection(s.collectionChan, 0)
	if len(got) > 0 {
		t.Fatalf("No tick, no collection; got %+v", got)
	}

	// Now tick. This should make the monitor collect.
	now := time.Now()
	s.tickChan <- now

	got = test.WaitCollection(s.collectionChan, 1)
	t.Assert(got, Not(HasLen), 0)
	t.Check(got, HasLen, 1)

	c := got[0]
	t.Check(c.Ts, Equals, now.Unix())
	t.Assert(c.Metrics, Not(HasLen), 0)

	// /proc/stat values are relative (current - prev), so there shouldn't be any
	// CPU metrics after only one tick.
	haveCPU, _ := haveMetric("cpu/user", c.Metrics)
	t.Check(haveCPU, Equals, false)

	// But other metrics are not relative, so we should have them.
	metrics := []string{"memory/MemTotal", "vmstat/numa_local", "loadavg/running", "disk/sda/reads"}
	for _, metric := range metrics {
		ok, val := haveMetric(metric, c.Metrics)
		t.Check(ok, Equals, true)
		t.Check(val, Not(Equals), 0)
	}

	// Tick a 2nd time and now we should get CPU metrics.
	time.Sleep(200 * time.Millisecond)
	now = time.Now()
	s.tickChan <- now

	got = test.WaitCollection(s.collectionChan, 1)
	t.Assert(got, Not(HasLen), 0)
	t.Check(got, HasLen, 1)

	c = got[0]
	t.Check(c.Ts, Equals, now.Unix())
	t.Assert(c.Metrics, Not(HasLen), 0)

	metrics = []string{"cpu/user", "cpu/nice", "cpu/system", "cpu/idle"}
	for _, metric := range metrics {
		ok, val := haveMetric(metric, c.Metrics)
		t.Check(ok, Equals, true)
		// Running this test requires some CPU, so user and idle shouldn't be zero.
		if metric == "cpu/user" || metric == "cpu/idle" {
			t.Check(val, Not(Equals), 0)
		}
	}

	/**
	 * Stop the monitor.
	 */
	m.Stop()
	if ok := test.WaitStatus(5, m, s.name, "Stopped"); !ok {
		t.Fatal("Monitor has stopped")
	}
}
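// haveMetric, used throughout TestStartCollectStop, is defined elsewhere in the
// test package. A minimal sketch of the assumed behavior: scan the collected
// metrics for one with the given name and return whether it was found along
// with its numeric value. The mm.Metric field names and the float64 return type
// are assumptions based on how the helper is called above, not the repo's code:
func haveMetric(name string, metrics []mm.Metric) (bool, float64) {
	for _, m := range metrics {
		if m.Name == name {
			return true, m.Number
		}
	}
	return false, 0
}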
/**
 * Tests:
 *   - starting a monitor
 *   - stopping a monitor
 *   - starting the monitor again (restarting it)
 *   - sneaked in :) an unknown-cmd test
 */
func (s *ManagerTestSuite) TestRestartMonitor(t *C) {
	// Create and start mm, no monitors yet.
	m := mm.NewManager(s.logger, s.factory, s.clock, s.spool, s.im)
	t.Assert(m, NotNil)
	err := m.Start()
	t.Assert(err, IsNil)

	// Start a monitor by sending StartService + monitor config.
	// This is the config in test/mm/config/mm-mysql-1.conf.
	mmConfig := &mysql.Config{
		Config: mm.Config{
			ServiceInstance: proto.ServiceInstance{
				Service:    "mysql",
				InstanceId: 1,
			},
			Collect: 1,
			Report:  60,
		},
		Status: map[string]string{
			"threads_connected": "gauge",
			"threads_running":   "gauge",
		},
	}
	mmConfigData, err := json.Marshal(mmConfig)
	t.Assert(err, IsNil)

	// If this were a real monitor, it would decode and set its own config.
	// The mock monitor doesn't have any real config type, so we set it manually.
	s.mysqlMonitor.SetConfig(mmConfig)

	// The agent calls mm.Handle() with the cmd (for logging and status) and the config data.
	cmd := &proto.Cmd{
		User:    "******",
		Service: "mm",
		Cmd:     "StartService",
		Data:    mmConfigData,
	}
	reply := m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// The monitor should be running. The mock monitor returns "Running" if
	// Start() has been called; else it returns "Stopped".
	status := m.Status()
	t.Check(status["monitor"], Equals, "Running")

	// There should be a 1s collect ticker for the monitor.
	if ok, diff := test.IsDeeply(s.clock.Added, []uint{1}); !ok {
		t.Errorf("Make 1s ticker for collect interval\n%s", diff)
	}

	// After starting a monitor, mm should write its config to the dir
	// it learned when mm.LoadConfig() was called. Next time the agent starts,
	// it will have mm start the monitor with this config.
	data, err := ioutil.ReadFile(s.configDir + "/mm-mysql-1.conf")
	t.Check(err, IsNil)
	gotConfig := &mysql.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	if same, diff := test.IsDeeply(gotConfig, mmConfig); !same {
		test.Dump(gotConfig)
		t.Error(diff)
	}

	/**
	 * Stop the monitor.
	 */
	cmd = &proto.Cmd{
		User:    "******",
		Service: "mm",
		Cmd:     "StopService",
		Data:    mmConfigData,
	}

	// Handles StopService without error.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// Stopping a monitor removes it from the manager's list of monitors,
	// so it's no longer present in a status request.
	status = m.Status()
	t.Check(status["monitor"], Equals, "")

	// After stopping the monitor, the manager should remove its tickChan.
	if len(s.clock.Removed) != 1 {
		t.Error("Removes monitor's tickChan from clock")
	}

	// After stopping a monitor, mm should remove its config file so the agent
	// doesn't start it on restart.
	file := s.configDir + "/mm-mysql-1.conf"
	if pct.FileExists(file) {
		t.Error("Stopping monitor removes its config; ", file, " exists")
	}

	/**
	 * Start the monitor again (restarting monitor).
	 */
	cmd = &proto.Cmd{
		User:    "******",
		Service: "mm",
		Cmd:     "StartService",
		Data:    mmConfigData,
	}

	// If this were a real monitor, it would decode and set its own config.
	// The mock monitor doesn't have any real config type, so we set it manually.
	s.mysqlMonitor.SetConfig(mmConfig)

	// The agent calls mm.Handle() with the cmd (for logging and status) and the config data.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// The monitor should be running. The mock monitor returns "Running" if
	// Start() has been called; else it returns "Stopped".
	status = m.Status()
	t.Check(status["monitor"], Equals, "Running")

	// There should be a 1s collect ticker for the monitor.
	// (Actually two entries in s.clock.Added, because this is a mock and we started the monitor twice.)
	if ok, diff := test.IsDeeply(s.clock.Added, []uint{1, 1}); !ok {
		t.Errorf("Make 1s ticker for collect interval\n%s", diff)
	}

	// After starting a monitor, mm should write its config to the dir
	// it learned when mm.LoadConfig() was called. Next time the agent starts,
	// it will have mm start the monitor with this config.
	data, err = ioutil.ReadFile(s.configDir + "/mm-mysql-1.conf")
	t.Check(err, IsNil)
	gotConfig = &mysql.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	if same, diff := test.IsDeeply(gotConfig, mmConfig); !same {
		t.Logf("%+v", gotConfig)
		t.Error(diff)
	}

	/**
	 * While we're all set up and working, let's sneak in an unknown-cmd test.
	 */
	cmd = &proto.Cmd{
		User:    "******",
		Service: "mm",
		Cmd:     "Pontificate",
		Data:    mmConfigData,
	}

	// Unknown cmd causes an error.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Not(Equals), "")
}
func run() error {
	version := fmt.Sprintf("percona-agent %s rev %s", agent.VERSION, agent.REVISION)
	if flagVersion {
		fmt.Println(version)
		return nil
	}
	golog.Printf("Running %s pid %d\n", version, os.Getpid())

	if err := pct.Basedir.Init(flagBasedir); err != nil {
		return err
	}

	// Start-lock file is used to let agent1 self-update, create start-lock,
	// start updated agent2, exit cleanly, then agent2 starts.  agent1 may
	// not use a PID file, so this special file is required.
	if err := pct.WaitStartLock(); err != nil {
		return err
	}
	// NOTE: This must run last, and defers are LIFO, so it must be declared first.
	defer os.Remove(pct.Basedir.File("start-lock"))

	/**
	 * Agent config (requires API key and agent UUID)
	 */

	if !pct.FileExists(pct.Basedir.ConfigFile("agent")) {
		return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent"))
	}

	bytes, err := agent.LoadConfig()
	if err != nil {
		return fmt.Errorf("Invalid agent config: %s\n", err)
	}
	agentConfig := &agent.Config{}
	if err := json.Unmarshal(bytes, agentConfig); err != nil {
		return fmt.Errorf("Error parsing %s: %s", pct.Basedir.ConfigFile("agent"), err)
	}

	golog.Println("ApiHostname: " + agentConfig.ApiHostname)
	golog.Println("AgentUuid: " + agentConfig.AgentUuid)

	/**
	 * Ping and exit, maybe.
	 */

	if flagPing {
		t0 := time.Now()
		code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey)
		d := time.Now().Sub(t0)
		if err != nil || code != 200 {
			return fmt.Errorf("Ping FAIL (%s %d %s)", d, code, err)
		} else {
			golog.Printf("Ping OK (%s)", d)
			return nil
		}
	}

	/**
	 * PID file
	 */

	if flagPidFile != "" {
		pidFile := pct.NewPidFile()
		if err := pidFile.Set(flagPidFile); err != nil {
			golog.Fatalln(err)
		}
		defer pidFile.Remove()
	}

	/**
	 * REST API
	 */

	api, err := ConnectAPI(agentConfig)
	if err != nil {
		golog.Fatal(err)
	}

	/**
	 * Log relay
	 */

	logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3)

	// Log websocket client, possibly disabled later.
	logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log")
	if err != nil {
		golog.Fatalln(err)
	}
	logManager := log.NewManager(
		logClient,
		logChan,
	)
	if err := logManager.Start(); err != nil {
		return fmt.Errorf("Error starting log manager: %s\n", err)
	}

	/**
	 * Instance manager
	 */

	itManager := instance.NewManager(
		pct.NewLogger(logChan, "instance-manager"),
		pct.Basedir.Dir("config"),
		api,
	)
	if err := itManager.Start(); err != nil {
		return fmt.Errorf("Error starting instance manager: %s\n", err)
	}

	/**
	 * Data spooler and sender
	 */

	hostname, _ := os.Hostname()
	dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data")
	if err != nil {
		golog.Fatalln(err)
	}
	dataManager := data.NewManager(
		pct.NewLogger(logChan, "data"),
		pct.Basedir.Dir("data"),
		hostname,
		dataClient,
	)
	if err := dataManager.Start(); err != nil {
		return fmt.Errorf("Error starting data manager: %s\n", err)
	}

	/**
	 * Collect/report ticker (master clock)
	 */

	nowFunc := func() int64 { return time.Now().UTC().UnixNano() }
	clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc)

	/**
	 * Metric and system config monitors
	 */

	mmManager := mm.NewManager(
		pct.NewLogger(logChan, "mm"),
		mmMonitor.NewFactory(logChan, itManager.Repo()),
		clock,
		dataManager.Spooler(),
		itManager.Repo(),
	)
	if err := mmManager.Start(); err != nil {
		return fmt.Errorf("Error starting mm manager: %s\n", err)
	}

	sysconfigManager := sysconfig.NewManager(
		pct.NewLogger(logChan, "sysconfig"),
		sysconfigMonitor.NewFactory(logChan, itManager.Repo()),
		clock,
		dataManager.Spooler(),
		itManager.Repo(),
	)
	if err := sysconfigManager.Start(); err != nil {
		return fmt.Errorf("Error starting sysconfig manager: %s\n", err)
	}

	/**
	 * Query Analytics
	 */

	qanManager := qan.NewManager(
		pct.NewLogger(logChan, "qan"),
		&mysql.RealConnectionFactory{},
		clock,
		qan.NewFileIntervalIterFactory(logChan),
		qan.NewSlowLogWorkerFactory(logChan),
		dataManager.Spooler(),
		itManager.Repo(),
	)
	if err := qanManager.Start(); err != nil {
		return fmt.Errorf("Error starting qan manager: %s\n", err)
	}

	/**
	 * Signal handler
	 */

	// Generally the agent has a crash-only design, but QAN is so far the only service
	// which reconfigures MySQL: it enables the slow log, sets long_query_time, etc.
	// It's not terrible to leave the slow log on, but it's nicer to turn it off.
	sigChan := make(chan os.Signal, 1)
	stopChan := make(chan error, 2)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	go func() {
		sig := <-sigChan
		golog.Printf("Caught %s signal, shutting down...\n", sig)
		stopChan <- qanManager.Stop()
	}()

	/**
	 * Agent
	 */

	cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd")
	if err != nil {
		golog.Fatal(err)
	}

	// The official list of services known to the agent. Adding a new service
	// requires a manager, starting the manager as above, and adding the manager
	// to this map.
	services := map[string]pct.ServiceManager{
		"log":       logManager,
		"data":      dataManager,
		"qan":       qanManager,
		"mm":        mmManager,
		"instance":  itManager,
		"sysconfig": sysconfigManager,
	}

	agent := agent.NewAgent(
		agentConfig,
		pct.NewLogger(logChan, "agent"),
		api,
		cmdClient,
		services,
	)

	/**
	 * Run the agent, wait for it to stop or for a signal.
	 */

	go func() {
		stopChan <- agent.Run()
	}()
	stopErr := <-stopChan // agent or signal
	golog.Println("Agent stopped, shutting down...")
	qanManager.Stop()           // see Signal handler ^
	time.Sleep(2 * time.Second) // wait for final replies and log entries
	return stopErr
}
func (s *ManagerTestSuite) TestStartStopMonitor(t *C) {
	m := sysconfig.NewManager(s.logger, s.factory, s.clock, s.spool, s.im)
	t.Assert(m, NotNil)

	err := m.Start()
	t.Assert(err, IsNil)

	// Starting a monitor is like starting the manager: it requires
	// a "StartService" cmd and the monitor's config.  This is the
	// config in configDir/db1-mysql-monitor.conf.
	sysconfigConfig := &mysql.Config{
		Config: sysconfig.Config{
			ServiceInstance: proto.ServiceInstance{
				Service:    "mysql",
				InstanceId: 1,
			},
			Report: 3600,
		},
	}
	sysconfigConfigData, err := json.Marshal(sysconfigConfig)
	t.Assert(err, IsNil)

	cmd := &proto.Cmd{
		User:    "******",
		Service: "sysconfig",
		Cmd:     "StartService",
		Data:    sysconfigConfigData,
	}

	// If this were a real monitor, it would decode and set its own config.
	// The mock monitor doesn't have any real config type, so we set it manually.
	s.mockMonitor.SetConfig(sysconfigConfig)

	// The agent calls sysconfig.Handle() with the cmd (for logging and status) and the config data.
	reply := m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	// The monitor should be running. The mock monitor returns "Running" if
	// Start() has been called; else it returns "Stopped".
	status := s.mockMonitor.Status()
	if status["monitor"] != "Running" {
		t.Error("Monitor running")
	}

	// There should be a 3600s report ticker for the monitor.
	if ok, diff := test.IsDeeply(s.clock.Added, []uint{3600}); !ok {
		t.Errorf("Make 3600s ticker for collect interval\n%s", diff)
	}

	// After starting a monitor, sysconfig should write its config to the dir
	// it learned when sysconfig.LoadConfig() was called. Next time the agent starts,
	// it will have sysconfig start the monitor with this config.
	data, err := ioutil.ReadFile(s.configDir + "/sysconfig-mysql-1.conf")
	t.Check(err, IsNil)
	gotConfig := &mysql.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	if same, diff := test.IsDeeply(gotConfig, sysconfigConfig); !same {
		t.Logf("%+v", gotConfig)
		t.Error(diff)
	}

	/**
	 * Stop the monitor.
	 */

	cmd = &proto.Cmd{
		User:    "******",
		Service: "sysconfig",
		Cmd:     "StopService",
		Data:    sysconfigConfigData,
	}

	// Handles StopService without error.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	t.Check(reply.Error, Equals, "")

	status = s.mockMonitor.Status()
	if status["monitor"] != "Stopped" {
		t.Error("Monitor stopped")
	}

	// After stopping the monitor, the manager should remove its tickChan.
	if len(s.clock.Removed) != 1 {
		t.Error("Removes monitor's tickChan from clock")
	}

	// After stopping a monitor, sysconfig should remove its config file so the agent
	// doesn't start it on restart.
	file := s.configDir + "/sysconfig-mysql-1.conf"
	if pct.FileExists(file) {
		t.Error("Stopping monitor removes its config; ", file, " exists")
	}

	/**
	 * While we're all set up and working, let's sneak in an unknown-cmd test.
	 */

	cmd = &proto.Cmd{
		User:    "******",
		Service: "sysconfig",
		Cmd:     "Pontificate",
		Data:    sysconfigConfigData,
	}

	// Unknown cmd causes an error.
	reply = m.Handle(cmd)
	t.Assert(reply, NotNil)
	if reply.Error == "" {
		t.Fatalf("Unknown Cmd to Handle() causes error")
	}

	/**
	 * Clean up.
	 */
	m.Stop()
}
func (s *DiskvSpoolerTestSuite) TestRejectData(t *C) {
	sz := data.NewJsonSerializer()

	// Create and start the spooler.
	spool := data.NewDiskvSpooler(s.logger, s.dataDir, s.trashDir, "localhost", s.limits)
	t.Assert(spool, NotNil)

	err := spool.Start(sz)
	t.Assert(err, IsNil)

	// Spooler should create the bad data dir.
	badDataDir := path.Join(s.trashDir, "data")
	ok := pct.FileExists(badDataDir)
	t.Assert(ok, Equals, true)

	// Spool any data...
	now := time.Now()
	logEntry := &proto.LogEntry{
		Ts:      now,
		Level:   1,
		Service: "mm",
		Msg:     "hello world",
	}
	err = spool.Write("log", logEntry)
	t.Check(err, IsNil)

	// Wait for the spooler to write the data to disk.
	files := test.WaitFiles(s.dataDir, 1)
	t.Assert(files, HasLen, 1)

	// Get the file name the spooler saved the data as.
	gotFiles := []string{}
	filesChan := spool.Files()
	for file := range filesChan {
		gotFiles = append(gotFiles, file)
	}
	t.Assert(gotFiles, HasLen, 1)

	// Reject the file. The spooler should move it to the bad data dir,
	// then remove it from the list.
	err = spool.Reject(gotFiles[0])
	t.Check(err, IsNil)

	ok = pct.FileExists(path.Join(s.dataDir, gotFiles[0]))
	t.Assert(ok, Equals, false)

	badFile := path.Join(badDataDir, gotFiles[0])
	ok = pct.FileExists(badFile)
	t.Assert(ok, Equals, true)

	spool.Stop()

	/**
	 * Start another spooler now that we have data/bad/file to ensure
	 * that the spooler does not read/index/cache bad files.
	 */
	spool = data.NewDiskvSpooler(s.logger, s.dataDir, s.trashDir, "localhost", s.limits)
	t.Assert(spool, NotNil)

	err = spool.Start(sz)
	t.Assert(err, IsNil)

	spool.Write("log", logEntry)
	files = test.WaitFiles(s.dataDir, 1)
	t.Assert(files, HasLen, 1)

	// There should only be 1 new file in the spool.
	gotFiles = []string{}
	filesChan = spool.Files()
	for file := range filesChan {
		t.Check(file, Not(Equals), badFile)
		gotFiles = append(gotFiles, file)
	}
	t.Assert(gotFiles, HasLen, 1)

	spool.Stop()
}
func (i *Installer) Run() error {
	/**
	 * Check for pt-agent, upgrade if found.
	 */

	var ptagentDSN *mysql.DSN
	ptagentUpgrade := false
	ptagentConf := "/root/.pt-agent.conf"
	if pct.FileExists(ptagentConf) {
		fmt.Println("Found pt-agent, upgrading and removing because it is no longer supported...")
		ptagentUpgrade = true

		// Stop pt-agent.
		if err := StopPTAgent(); err != nil {
			fmt.Printf("Error stopping pt-agent: %s\n\n", err)
			fmt.Println("WARNING: pt-agent must be stopped before installing percona-agent. " +
				"Please verify that pt-agent is not running and has been removed from cron. " +
				"Enter 'Y' to confirm and continue installing percona-agent.")
			ok, err := i.term.PromptBool("pt-agent has stopped?", "N")
			if err != nil {
				return err
			}
			if !ok {
				return fmt.Errorf("Failed to stop pt-agent")
			}
		}

		// Get its settings (API key, UUID, etc.).
		agent, dsn, err := GetPTAgentSettings(ptagentConf)
		if err != nil {
			return fmt.Errorf("Error upgrading pt-agent: %s", err)
		}
		if agent.ApiKey != "" {
			i.agentConfig.ApiKey = agent.ApiKey
		}
		if agent.AgentUuid != "" {
			i.agentConfig.AgentUuid = agent.AgentUuid
			fmt.Printf("Upgrading pt-agent %s...\n", agent.AgentUuid)
		}
		ptagentDSN = dsn
	}

	/**
	 * Get the API key.
	 */

	fmt.Printf("API host: %s\n", i.agentConfig.ApiHostname)

	for i.agentConfig.ApiKey == "" {
		apiKey, err := i.term.PromptString("API key", "")
		if err != nil {
			return err
		}
		if apiKey == "" {
			fmt.Println("API key is required, please try again.")
			continue
		}
		i.agentConfig.ApiKey = apiKey
		break
	}

	/**
	 * Verify the API key by pinging the API.
	 */

VERIFY_API_KEY:
	for {
		startTime := time.Now()
		fmt.Printf("Verifying API key %s...\n", i.agentConfig.ApiKey)
		code, err := pct.Ping(i.agentConfig.ApiHostname, i.agentConfig.ApiKey)
		elapsedTime := time.Since(startTime)
		elapsedTimeInSeconds := elapsedTime / time.Second

		timeout := false
		if urlErr, ok := err.(*url.Error); ok {
			if netOpErr, ok := urlErr.Err.(*net.OpError); ok && netOpErr.Timeout() {
				timeout = true
			}
		}
		if i.flags["debug"] {
			log.Printf("code=%d\n", code)
			log.Printf("err=%s\n", err)
		}

		ok := false
		if timeout {
			fmt.Printf(
				"Error: API connection timeout (%ds): %s\n"+
					"Before you try again, please check your connection and DNS configuration.\n",
				elapsedTimeInSeconds, err,
			)
		} else if err != nil {
			fmt.Printf("Error: %s\n", err)
		} else if code >= 500 {
			fmt.Printf("Sorry, there's an API problem (status code %d). "+
				"Please try to install again. If the problem continues, contact Percona.\n",
				code)
		} else if code == 401 {
			return fmt.Errorf("Access denied. Check the API key and try again.")
		} else if code >= 300 {
			fmt.Printf("Sorry, there's an installer problem (status code %d). "+
				"Please try to install again. If the problem continues, contact Percona.\n",
				code)
		} else if code != 200 {
			fmt.Printf("Sorry, there's an installer problem (status code %d). "+
				"Please try to install again. If the problem continues, contact Percona.\n",
				code)
		} else {
			ok = true
		}

		if !ok {
			again, err := i.term.PromptBool("Try again?", "Y")
			if err != nil {
				return err
			}
			if !again {
				return fmt.Errorf("Failed to verify API key")
			}
			continue VERIFY_API_KEY
		}

		fmt.Printf("API key %s is OK\n", i.agentConfig.ApiKey)

		// Warn if the API call was slow; it usually shouldn't take more than 1s.
		if elapsedTimeInSeconds >= 1 {
			fmt.Printf(
				"WARNING: The request to the API took %d second(s), but it usually shouldn't take more than 1s.\n"+
					"This might be due to connection problems or slow DNS resolution.\n"+
					"Before you continue, please check your connection and DNS configuration, as this might impact the performance of percona-agent.\n"+
					"If you are using CentOS or Fedora 19+ in a Vagrant box, then you might be interested in this bug report:\n"+
					"https://github.com/mitchellh/vagrant/issues/1172\n",
				elapsedTimeInSeconds,
			)
			proceed, err := i.term.PromptBool("Continue anyway?", "Y")
			if err != nil {
				return err
			}
			if !proceed {
				return fmt.Errorf("Failed because of slow connection")
			}
		}

		break
	}

	var si *proto.ServerInstance
	var mi *proto.MySQLInstance

	/**
	 * Create new service instances.
	 */

	var err error

	if i.flags["create-server-instance"] {
		si, err = i.createServerInstance()
		if err != nil {
			return err
		}
		fmt.Printf("Created server instance: hostname=%s id=%d\n", si.Hostname, si.Id)
	} else {
		fmt.Println("Not creating server instance (-create-server-instance=false)")
	}

	if i.flags["create-mysql-instance"] {
		// Create a MySQL user for the agent, or use an existing one, then verify the MySQL connection.
		agentDSN, err := i.doMySQL(ptagentDSN)
		if err != nil {
			return err
		}

		// Create MySQL instance in API.
		mi, err = i.createMySQLInstance(agentDSN)
		if err != nil {
			return err
		}
		fmt.Printf("Created MySQL instance: dsn=%s hostname=%s id=%d\n", mi.DSN, mi.Hostname, mi.Id)
	} else {
		fmt.Println("Not creating MySQL instance (-create-mysql-instance=false)")
	}

	if err := i.writeInstances(si, mi); err != nil {
		return fmt.Errorf("Created agent but failed to write service instances: %s", err)
	}

	/**
	 * Get default configs for all services.
	 */

	configs := []proto.AgentConfig{}

	if i.flags["start-services"] {
		// Server metrics monitor
		config, err := i.getMmServerConfig(si)
		if err != nil {
			fmt.Println(err)
			fmt.Println("WARNING: cannot start server metrics monitor")
		} else {
			configs = append(configs, *config)
		}

		if i.flags["start-mysql-services"] {
			// MySQL metrics tracker
			config, err = i.getMmMySQLConfig(mi)
			if err != nil {
				fmt.Println(err)
				fmt.Println("WARNING: cannot start MySQL metrics monitor")
			} else {
				configs = append(configs, *config)
			}

			// MySQL config tracker
			config, err = i.getSysconfigMySQLConfig(mi)
			if err != nil {
				fmt.Println(err)
				fmt.Println("WARNING: cannot start MySQL configuration monitor")
			} else {
				configs = append(configs, *config)
			}

			// QAN
			// MySQL is local if the server hostname == MySQL hostname without the port number.
			if i.hostname == portNumberRe.ReplaceAllLiteralString(mi.Hostname, "") {
				if i.flags["debug"] {
					log.Printf("MySQL is local")
				}
				config, err := i.getQanConfig(mi)
				if err != nil {
					fmt.Println(err)
					fmt.Println("WARNING: cannot start Query Analytics")
				} else {
					configs = append(configs, *config)
				}
			}
		} else {
			fmt.Println("Not starting MySQL services (-start-mysql-services=false)")
		}
	} else {
		fmt.Println("Not starting default services (-start-services=false)")
	}

	/**
	 * Create agent with initial service configs.
	 */

	if ptagentUpgrade {
		agent, err := i.updateAgent(i.agentConfig.AgentUuid)
		if err != nil {
			return err
		}
		fmt.Println("pt-agent upgraded to percona-agent")
		if err := i.writeConfigs(agent, configs); err != nil {
			return fmt.Errorf("Upgraded pt-agent but failed to write percona-agent configs: %s", err)
		}
	} else if i.flags["create-agent"] {
		agent, err := i.createAgent(configs)
		if err != nil {
			return err
		}
		fmt.Printf("Created agent: uuid=%s\n", agent.Uuid)

		if err := i.writeConfigs(agent, configs); err != nil {
			return fmt.Errorf("Created agent but failed to write configs: %s", err)
		}
	} else {
		fmt.Println("Not creating agent (-create-agent=false)")
	}

	/**
	 * Remove pt-agent if upgrading.
	 */

	if ptagentUpgrade {
		RemovePTAgent(ptagentConf)
		fmt.Println("pt-agent removed")
	}

	return nil // success
}
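// portNumberRe is referenced in Run() above but not defined in this excerpt.
// A plausible sketch (an assumption, not the repo's actual definition) that
// strips a trailing ":port" from a MySQL hostname, matching how
// ReplaceAllLiteralString is used to compare hostnames:
var portNumberRe = regexp.MustCompile(`:\d+$`)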
func run() error { version := fmt.Sprintf("percona-agent %s%s rev %s", agent.VERSION, agent.REL, agent.REVISION) if flagVersion { fmt.Println(version) return nil } golog.Printf("Running %s pid %d\n", version, os.Getpid()) if err := pct.Basedir.Init(flagBasedir); err != nil { return err } // Start-lock file is used to let agent1 self-update, create start-lock, // start updated agent2, exit cleanly, then agent2 starts. agent1 may // not use a PID file, so this special file is required. if err := pct.WaitStartLock(); err != nil { return err } // NOTE: This must run last, and defer if LIFO, so it must be declared first. defer os.Remove(pct.Basedir.File("start-lock")) /** * Agent config (require API key and agent UUID) */ if !pct.FileExists(pct.Basedir.ConfigFile("agent")) { return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent")) } bytes, err := agent.LoadConfig() if err != nil { return fmt.Errorf("Invalid agent config: %s\n", err) } agentConfig := &agent.Config{} if err := json.Unmarshal(bytes, agentConfig); err != nil { return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err) } golog.Println("ApiHostname: " + agentConfig.ApiHostname) golog.Println("AgentUuid: " + agentConfig.AgentUuid) /** * Ping and exit, maybe. */ // Set for all connections to API. X-Percona-API-Key is set automatically // using the pct.APIConnector. headers := map[string]string{ "X-Percona-Agent-Version": agent.VERSION, } if flagPing { t0 := time.Now() code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey, headers) d := time.Now().Sub(t0) if err != nil || code != 200 { return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err) } else { golog.Printf("Ping OK (%s)", d) return nil } } /** * PID file */ pidFilePath := agentConfig.PidFile if flagPidFile != "" { pidFilePath = flagPidFile } if pidFilePath != "" { pidFile := pct.NewPidFile() if err := pidFile.Set(pidFilePath); err != nil { golog.Fatalln(err) } defer pidFile.Remove() } /** * REST API */ retry := -1 // unlimited if flagStatus { retry = 1 } api, err := ConnectAPI(agentConfig, retry) if err != nil { golog.Fatal(err) } // Get agent status via API and exit. if flagStatus { code, bytes, err := api.Get(agentConfig.ApiKey, api.AgentLink("self")+"/status") if err != nil { return err } if code == 404 { return fmt.Errorf("Agent not found") } status := make(map[string]string) if err := json.Unmarshal(bytes, &status); err != nil { return err } golog.Println(status) return nil } /** * Connection factory */ connFactory := &mysql.RealConnectionFactory{} /** * Log relay */ logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3) // Log websocket client, possibly disabled later. 
logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log", headers) if err != nil { golog.Fatalln(err) } logManager := log.NewManager( logClient, logChan, ) if err := logManager.Start(); err != nil { return fmt.Errorf("Error starting logmanager: %s\n", err) } /** * MRMS (MySQL Restart Monitoring Service) */ mrm := mrmsMonitor.NewMonitor( pct.NewLogger(logChan, "mrms-monitor"), connFactory, ) mrmsManager := mrms.NewManager( pct.NewLogger(logChan, "mrms-manager"), mrm, ) if err := mrmsManager.Start(); err != nil { return fmt.Errorf("Error starting mrms manager: %s\n", err) } /** * Instance manager */ itManager := instance.NewManager( pct.NewLogger(logChan, "instance-manager"), pct.Basedir.Dir("config"), api, mrm, ) if err := itManager.Start(); err != nil { return fmt.Errorf("Error starting instance manager: %s\n", err) } /** * Data spooler and sender */ hostname, _ := os.Hostname() dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data", headers) if err != nil { golog.Fatalln(err) } dataManager := data.NewManager( pct.NewLogger(logChan, "data"), pct.Basedir.Dir("data"), pct.Basedir.Dir("trash"), hostname, dataClient, ) if err := dataManager.Start(); err != nil { return fmt.Errorf("Error starting data manager: %s\n", err) } /** * Collecct/report ticker (master clock) */ nowFunc := func() int64 { return time.Now().UTC().UnixNano() } clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc) /** * Metric and system config monitors */ mmManager := mm.NewManager( pct.NewLogger(logChan, "mm"), mmMonitor.NewFactory(logChan, itManager.Repo(), mrm), clock, dataManager.Spooler(), itManager.Repo(), mrm, ) if err := mmManager.Start(); err != nil { return fmt.Errorf("Error starting mm manager: %s\n", err) } sysconfigManager := sysconfig.NewManager( pct.NewLogger(logChan, "sysconfig"), sysconfigMonitor.NewFactory(logChan, itManager.Repo()), clock, dataManager.Spooler(), itManager.Repo(), ) if err := sysconfigManager.Start(); err != nil { return fmt.Errorf("Error starting sysconfig manager: %s\n", err) } /** * Query service (real-time EXPLAIN, SHOW CREATE TABLE, etc.) 
*/ queryManager := query.NewManager( pct.NewLogger(logChan, "query"), itManager.Repo(), &mysql.RealConnectionFactory{}, ) if err := queryManager.Start(); err != nil { return fmt.Errorf("Error starting query manager: %s\n", err) } /** * Query Analytics */ qanManager := qan.NewManager( pct.NewLogger(logChan, "qan"), clock, itManager.Repo(), mrm, connFactory, qanFactory.NewRealAnalyzerFactory( logChan, qanFactory.NewRealIntervalIterFactory(logChan), slowlog.NewRealWorkerFactory(logChan), perfschema.NewRealWorkerFactory(logChan), dataManager.Spooler(), clock, ), ) if err := qanManager.Start(); err != nil { return fmt.Errorf("Error starting qan manager: %s\n", err) } /** * Sysinfo */ sysinfoManager := sysinfo.NewManager( pct.NewLogger(logChan, "sysinfo"), ) // MySQL Sysinfo mysqlSysinfoService := mysqlSysinfo.NewMySQL( pct.NewLogger(logChan, "sysinfo-mysql"), itManager.Repo(), ) if err := sysinfoManager.RegisterService("MySQLSummary", mysqlSysinfoService); err != nil { return fmt.Errorf("Error registering Mysql Sysinfo service: %s\n", err) } // System Sysinfo systemSysinfoService := systemSysinfo.NewSystem( pct.NewLogger(logChan, "sysinfo-system"), ) if err := sysinfoManager.RegisterService("SystemSummary", systemSysinfoService); err != nil { return fmt.Errorf("Error registering System Sysinfo service: %s\n", err) } // Start Sysinfo manager if err := sysinfoManager.Start(); err != nil { return fmt.Errorf("Error starting Sysinfo manager: %s\n", err) } /** * Signal handler */ // Generally the agent has a crash-only design, but QAN is so far the only service // which reconfigures MySQL: it enables the slow log, sets long_query_time, etc. // It's not terrible to leave slow log on, but it's nicer to turn it off. sigChan := make(chan os.Signal, 1) stopChan := make(chan error, 2) signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) go func() { sig := <-sigChan golog.Printf("Caught %s signal, shutting down...\n", sig) stopChan <- qanManager.Stop() }() /** * Agent */ cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd", headers) if err != nil { golog.Fatal(err) } // The official list of services known to the agent. Adding a new service // requires a manager, starting the manager as above, and adding the manager // to this map. services := map[string]pct.ServiceManager{ "log": logManager, "data": dataManager, "qan": qanManager, "mm": mmManager, "instance": itManager, "mrms": mrmsManager, "sysconfig": sysconfigManager, "query": queryManager, "sysinfo": sysinfoManager, } // Set the global pct/cmd.Factory, used for the Restart cmd. pctCmd.Factory = &pctCmd.RealCmdFactory{} agentLogger := pct.NewLogger(logChan, "agent") agent := agent.NewAgent( agentConfig, agentLogger, api, cmdClient, services, ) /** * Run agent, wait for it to stop, signal, or crash. */ var stopErr error go func() { defer func() { if err := recover(); err != nil { errMsg := fmt.Sprintf("Agent crashed: %s", err) golog.Println(errMsg) agentLogger.Error(errMsg) stopChan <- fmt.Errorf("%s", errMsg) } }() stopChan <- agent.Run() }() // Wait for agent to stop, or for signals. 
agentRunning := true statusSigChan := make(chan os.Signal, 1) signal.Notify(statusSigChan, syscall.SIGUSR1) // kill -USER1 PID reconnectSigChan := make(chan os.Signal, 1) signal.Notify(reconnectSigChan, syscall.SIGHUP) // kill -HUP PID for agentRunning { select { case stopErr = <-stopChan: // agent or signal golog.Println("Agent stopped, shutting down...") agentLogger.Info("Agent stopped") agentRunning = false case <-statusSigChan: status := agent.AllStatus() golog.Printf("Status: %+v\n", status) case <-reconnectSigChan: u, _ := user.Current() cmd := &proto.Cmd{ Ts: time.Now().UTC(), User: u.Username + " (SIGHUP)", AgentUuid: agentConfig.AgentUuid, Service: "agent", Cmd: "Reconnect", } agent.Handle(cmd) } } qanManager.Stop() // see Signal handler ^ time.Sleep(2 * time.Second) // wait for final replies and log entries return stopErr }