// TestGetConfig verifies that a GetConfig cmd returns exactly one
// proto.AgentConfig describing the running qan service after StartService.
// NOTE(review): this function has the same name as the later TestGetConfig
// in this file; they appear to be two versions of the suite — only one can
// compile per package.
func (s *ManagerTestSuite) TestGetConfig(t *C) {
	// Make a qan.Manager backed by a mock (null) MySQL connection.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	t.Assert(m, NotNil)

	// Build a realistic qan config and start the service with it.
	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Interval:        300,
		MaxSlowLogSize:  1000,
		MaxWorkers:      3,
		WorkerRunTime:   300,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
	}
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		Ts:   time.Now(),
		Cmd:  "StartService",
		Data: qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
	test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle")
	s.nullmysql.Reset()

	// Ask the running service for its config.
	cmd = &proto.Cmd{
		Cmd:     "GetConfig",
		Service: "qan",
	}
	reply = m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
	t.Assert(reply.Data, NotNil)
	gotConfig := []proto.AgentConfig{}
	if err := json.Unmarshal(reply.Data, &gotConfig); err != nil {
		t.Fatal(err)
	}
	// Reply should echo back the exact config we started with, marked running.
	expectConfig := []proto.AgentConfig{
		{
			InternalService: "qan",
			Config:          string(qanConfig),
			Running:         true,
		},
	}
	if same, diff := test.IsDeeply(gotConfig, expectConfig); !same {
		test.Dump(gotConfig)
		t.Error(diff)
	}

	// Stop manager
	reply = m.Handle(&proto.Cmd{Cmd: "StopService"})
	t.Assert(reply.Error, Equals, "")
}
// TestGetConfig verifies that GetConfig() returns exactly one AgentConfig
// describing the running analyzer after the manager is started from a
// qan.conf on disk.
func (s *ManagerTestSuite) TestGetConfig(t *C) {
	// Make a qan.Manager with mock factories.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	a := mock.NewQanAnalyzer()
	f := mock.NewQanAnalyzerFactory(a)
	m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f)
	t.Assert(m, NotNil)

	// Write a realistic qan.conf config to disk.
	config := qan.Config{
		ServiceInstance: s.mysqlInstance,
		CollectFrom:     "slowlog",
		Interval:        300,
		MaxWorkers:      1,
		WorkerRunTime:   600,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
	}
	err := pct.Basedir.WriteConfig("qan", &config)
	t.Assert(err, IsNil)
	qanConfig, err := json.Marshal(config)
	t.Assert(err, IsNil)

	// Start the manager and analyzer.
	err = m.Start()
	t.Check(err, IsNil)
	test.WaitStatus(1, m, "qan", "Running")

	// Get the manager config which should be just the analyzer config.
	got, errs := m.GetConfig()
	t.Assert(errs, HasLen, 0)
	t.Assert(got, HasLen, 1)
	expect := []proto.AgentConfig{
		{
			InternalService: "qan",
			Config:          string(qanConfig),
			Running:         true,
		},
	}
	if same, diff := IsDeeply(got, expect); !same {
		Dump(got)
		t.Error(diff)
	}

	// Stop the manager.
	err = m.Stop()
	t.Assert(err, IsNil)
}
// TestStart verifies that qan.Manager.Start() is effectively a no-op when
// no qan.conf exists on disk, and that a second Start() after writing
// qan.conf reads and uses it; Stop() leaves the config file in place.
func (s *ManagerTestSuite) TestStart(t *C) {
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	t.Assert(m, NotNil)

	// Starting qan without a config does nothing but start the qan manager.
	err := m.Start()
	t.Check(err, IsNil)
	status := m.Status()
	t.Check(status["qan-log-parser"], Equals, "")
	t.Check(status["qan-last-interval"], Equals, "")
	t.Check(status["qan-next-interval"], Equals, "")

	// Write a qan config to disk.
	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Interval:        300,
		MaxWorkers:      1,
		WorkerRunTime:   600,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
	}
	err = pct.Basedir.WriteConfig("qan", config)
	t.Assert(err, IsNil)

	// qan.Start() should read and use config on disk.
	err = m.Start()
	t.Check(err, IsNil)
	if !test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") {
		t.Error("WaitStatusPrefix(qan-log-parser, Idle) failed")
	}
	status = m.Status()
	t.Check(status["qan-log-parser"], Equals, "Idle (0 of 1 running)")
	t.Check(status["qan-last-interval"], Equals, "")
	t.Check(status["qan-next-interval"], Not(Equals), "")

	// Stopping qan.Stop() should leave config file on disk.
	err = m.Stop()
	t.Assert(err, IsNil)
	t.Check(test.FileExists(pct.Basedir.ConfigFile("qan")), Equals, true)
}
func (s *ManagerTestSuite) TestBadCmd(t *C) { mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} a := mock.NewQanAnalyzer() f := mock.NewQanAnalyzerFactory(a) m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f) t.Assert(m, NotNil) err := m.Start() t.Check(err, IsNil) defer m.Stop() test.WaitStatus(1, m, "qan", "Running") cmd := &proto.Cmd{ User: "******", Ts: time.Now(), AgentUuid: "123", Service: "qan", Cmd: "foo", // bad cmd } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "Unknown command: foo") }
func (s *ManagerTestSuite) TestStarNoConfig(t *C) { // Make a qan.Manager with mock factories. mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} a := mock.NewQanAnalyzer() f := mock.NewQanAnalyzerFactory(a) m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f) t.Assert(m, NotNil) // qan.Manager should be able to start without a qan.conf, i.e. no analyzer. err := m.Start() t.Check(err, IsNil) // Wait for qan.Manager.Start() to finish. test.WaitStatus(1, m, "qan", "Running") // No analyzer is configured, so the mock analyzer should not be started. select { case <-a.StartChan: t.Error("Analyzer.Start() called") default: } // And the mock analyzer's status should not be reported. status := m.Status() t.Check(status["qan"], Equals, "Running") // Stop the manager. err = m.Stop() t.Assert(err, IsNil) // No analyzer is configured, so the mock analyzer should not be stop. select { case <-a.StartChan: t.Error("Analyzer.Stop() called") default: } }
func run() error { version := fmt.Sprintf("percona-agent %s rev %s", agent.VERSION, agent.REVISION) if flagVersion { fmt.Println(version) return nil } golog.Printf("Running %s pid %d\n", version, os.Getpid()) if err := pct.Basedir.Init(flagBasedir); err != nil { return err } // Start-lock file is used to let agent1 self-update, create start-lock, // start updated agent2, exit cleanly, then agent2 starts. agent1 may // not use a PID file, so this special file is required. if err := pct.WaitStartLock(); err != nil { return err } // NOTE: This must run last, and defer if LIFO, so it must be declared first. defer os.Remove(pct.Basedir.File("start-lock")) /** * Agent config (require API key and agent UUID) */ if !pct.FileExists(pct.Basedir.ConfigFile("agent")) { return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent")) } bytes, err := agent.LoadConfig() if err != nil { return fmt.Errorf("Invalid agent config: %s\n", err) } agentConfig := &agent.Config{} if err := json.Unmarshal(bytes, agentConfig); err != nil { return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err) } golog.Println("ApiHostname: " + agentConfig.ApiHostname) golog.Println("AgentUuid: " + agentConfig.AgentUuid) /** * Ping and exit, maybe. */ if flagPing { t0 := time.Now() code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey) d := time.Now().Sub(t0) if err != nil || code != 200 { return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err) } else { golog.Printf("Ping OK (%s)", d) return nil } } /** * PID file */ if flagPidFile != "" { pidFile := pct.NewPidFile() if err := pidFile.Set(flagPidFile); err != nil { golog.Fatalln(err) } defer pidFile.Remove() } /** * REST API */ api, err := ConnectAPI(agentConfig) if err != nil { golog.Fatal(err) } /** * Log relay */ logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3) // Log websocket client, possibly disabled later. 
logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log") if err != nil { golog.Fatalln(err) } logManager := log.NewManager( logClient, logChan, ) if err := logManager.Start(); err != nil { return fmt.Errorf("Error starting logmanager: %s\n", err) } /** * Instance manager */ itManager := instance.NewManager( pct.NewLogger(logChan, "instance-manager"), pct.Basedir.Dir("config"), api, ) if err := itManager.Start(); err != nil { return fmt.Errorf("Error starting instance manager: %s\n", err) } /** * Data spooler and sender */ hostname, _ := os.Hostname() dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data") if err != nil { golog.Fatalln(err) } dataManager := data.NewManager( pct.NewLogger(logChan, "data"), pct.Basedir.Dir("data"), hostname, dataClient, ) if err := dataManager.Start(); err != nil { return fmt.Errorf("Error starting data manager: %s\n", err) } /** * Collecct/report ticker (master clock) */ nowFunc := func() int64 { return time.Now().UTC().UnixNano() } clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc) /** * Metric and system config monitors */ mmManager := mm.NewManager( pct.NewLogger(logChan, "mm"), mmMonitor.NewFactory(logChan, itManager.Repo()), clock, dataManager.Spooler(), itManager.Repo(), ) if err := mmManager.Start(); err != nil { return fmt.Errorf("Error starting mm manager: %s\n", err) } sysconfigManager := sysconfig.NewManager( pct.NewLogger(logChan, "sysconfig"), sysconfigMonitor.NewFactory(logChan, itManager.Repo()), clock, dataManager.Spooler(), itManager.Repo(), ) if err := sysconfigManager.Start(); err != nil { return fmt.Errorf("Error starting sysconfig manager: %s\n", err) } /** * Query Analytics */ qanManager := qan.NewManager( pct.NewLogger(logChan, "qan"), &mysql.RealConnectionFactory{}, clock, qan.NewFileIntervalIterFactory(logChan), qan.NewSlowLogWorkerFactory(logChan), dataManager.Spooler(), itManager.Repo(), ) if err := qanManager.Start(); err != 
nil { return fmt.Errorf("Error starting qan manager: %s\n", err) } /** * Signal handler */ // Generally the agent has a crash-only design, but QAN is so far the only service // which reconfigures MySQL: it enables the slow log, sets long_query_time, etc. // It's not terrible to leave slow log on, but it's nicer to turn it off. sigChan := make(chan os.Signal, 1) stopChan := make(chan error, 2) signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) go func() { sig := <-sigChan golog.Printf("Caught %s signal, shutting down...\n", sig) stopChan <- qanManager.Stop() }() /** * Agent */ cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd") if err != nil { golog.Fatal(err) } // The official list of services known to the agent. Adding a new service // requires a manager, starting the manager as above, and adding the manager // to this map. services := map[string]pct.ServiceManager{ "log": logManager, "data": dataManager, "qan": qanManager, "mm": mmManager, "instance": itManager, "sysconfig": sysconfigManager, } agent := agent.NewAgent( agentConfig, pct.NewLogger(logChan, "agent"), api, cmdClient, services, ) /** * Run agent, wait for it to stop or signal. */ go func() { stopChan <- agent.Run() }() stopErr := <-stopChan // agent or signal golog.Println("Agent stopped, shutting down...") qanManager.Stop() // see Signal handler ^ time.Sleep(2 * time.Second) // wait for final replies and log entries return stopErr }
// TestStartService verifies the full StartService/StopService command cycle:
// StartService writes qan.conf to disk and starts an analyzer; a duplicate
// StartService is rejected; StopService stops the analyzer, removes
// qan.conf, and is idempotent.
func (s *ManagerTestSuite) TestStartService(t *C) {
	// Make and start a qan.Manager with mock factories, no analyzer yet.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	a := mock.NewQanAnalyzer()
	f := mock.NewQanAnalyzerFactory(a)
	m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f)
	t.Assert(m, NotNil)
	err := m.Start()
	t.Check(err, IsNil)
	test.WaitStatus(1, m, "qan", "Running")

	// Create the qan config.
	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.123"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
		Interval:          300,        // 5 min
		MaxSlowLogSize:    1073741824, // 1 GiB
		RemoveOldSlowLogs: true,
		ExampleQueries:    true,
		MaxWorkers:        2,
		WorkerRunTime:     600, // 10 min
		CollectFrom:       "slowlog",
	}

	// Send a StartService cmd with the qan config to start an analyzer.
	now := time.Now()
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "agent",
		Cmd:       "StartService",
		Data:      qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	// The manager writes the qan config to disk.
	data, err := ioutil.ReadFile(pct.Basedir.ConfigFile("qan"))
	t.Check(err, IsNil)
	gotConfig := &qan.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	if same, diff := IsDeeply(gotConfig, config); !same {
		Dump(gotConfig)
		t.Error(diff)
	}

	// Now the manager and analyzer should be running.
	status := m.Status()
	t.Check(status["qan"], Equals, "Running")
	t.Check(status["qan-analyzer"], Equals, "ok")

	// Try to start the same analyzer again. It results in an error because
	// double tooling is not allowed.
	reply = m.Handle(cmd)
	t.Check(reply.Error, Equals, "qan-analyzer service is running")

	// Send a StopService cmd to stop the analyzer.
	// todo-1.1: send Data with analyzer instance to stop.
	now = time.Now()
	cmd = &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "qan",
		Cmd:       "StopService",
	}
	reply = m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	// Now the manager is still running, but the analyzer is not.
	status = m.Status()
	t.Check(status["qan"], Equals, "Running")

	// And the manager has removed the qan config from disk so next time
	// the agent starts the analyzer is not started.
	t.Check(test.FileExists(pct.Basedir.ConfigFile("qan")), Equals, false)

	// StopService should be idempotent, so send it again and expect no error.
	reply = m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	// Stop the manager.
	err = m.Stop()
	t.Assert(err, IsNil)
}
// TestStartWithConfig verifies, for each collect-from type, that qan.Start()
// reads qan.conf from disk and starts an analyzer via the factory with the
// expected args, and that qan.Stop() stops the analyzer while leaving
// qan.conf on disk.
func (s *ManagerTestSuite) TestStartWithConfig(t *C) {
	for _, analyzerType := range []string{"slowlog", "perfschema"} {
		// Make a qan.Manager with mock factories.
		mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
		a := mock.NewQanAnalyzer()
		f := mock.NewQanAnalyzerFactory(a)
		m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f)
		t.Assert(m, NotNil)

		// Write a realistic qan.conf config to disk.
		config := qan.Config{
			ServiceInstance: s.mysqlInstance,
			CollectFrom:     analyzerType,
			Interval:        300,
			MaxWorkers:      1,
			WorkerRunTime:   600,
			Start: []mysql.Query{
				mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
				mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
				mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
			},
			Stop: []mysql.Query{
				mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
				mysql.Query{Set: "SET GLOBAL long_query_time=10"},
			},
		}
		err := pct.Basedir.WriteConfig("qan", &config)
		t.Assert(err, IsNil)

		// qan.Start() reads qan.conf from disk and starts an analyzer for it.
		err = m.Start()
		t.Check(err, IsNil)

		// Wait until qan.Start() calls analyzer.Start().
		if !test.WaitState(a.StartChan) {
			t.Fatal("Timeout waiting for <-a.StartChan")
		}

		// After starting, the manager's status should be Running and the analyzer's
		// status should be reported too.
		status := m.Status()
		t.Check(status["qan"], Equals, "Running")
		t.Check(status["qan-analyzer"], Equals, "ok")

		// Check the args passed by the manager to the analyzer factory.
		if len(f.Args) == 0 {
			t.Error("len(f.Args) == 0, expected 1")
		} else {
			t.Check(f.Args, HasLen, 1)
			t.Check(f.Args[0].Config, DeepEquals, config)
			t.Check(f.Args[0].Name, Equals, "qan-analyzer")
		}

		// qan.Stop() stops the analyzer and leaves qan.conf on disk.
		err = m.Stop()
		t.Assert(err, IsNil)

		// Wait until qan.Stop() calls analyzer.Stop().
		if !test.WaitState(a.StopChan) {
			t.Fatal("Timeout waiting for <-a.StopChan")
		}

		// qan.conf still exists after qan.Stop().
		t.Check(test.FileExists(pct.Basedir.ConfigFile("qan")), Equals, true)

		// The analyzer is no longer reported in the status because it was stopped
		// and removed when the manager was stopped.
		status = m.Status()
		t.Check(status["qan"], Equals, "Stopped")
	}
}
// TestRecoverWorkerPanic verifies that a panic inside a worker does not
// crash the manager: the lost interval is reported as an error-level log
// entry and the manager keeps running.
func (s *ManagerTestSuite) TestRecoverWorkerPanic(t *C) {
	// Create and start manager with mock workers.
	w1StopChan := make(chan bool)
	// Last arg true = this mock worker panics when run.
	w1 := mock.NewQanWorker("qan-worker-1", w1StopChan, nil, nil, true)
	f := mock.NewQanWorkerFactory([]*mock.QanWorker{w1})
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, f, s.spool, s.im)
	t.Assert(m, NotNil)

	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		MaxSlowLogSize:  1000,
		MaxWorkers:      2,
		Interval:        60,
		WorkerRunTime:   60,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
	}
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		Ts:   time.Now(),
		Cmd:  "StartService",
		Data: qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
	test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle")
	test.DrainLogChan(s.logChan)

	// Start mock worker. All it does is panic, much like fipar.
	now := time.Now()
	i1 := &qan.Interval{
		Filename:    "slow.log",
		StartOffset: 0,
		EndOffset:   100,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i1
	<-w1.Running() // wait for manager to run worker

	// For now, worker panic only results in error to log.  Scan the log
	// chan for the "Lost interval" error, giving up after 200ms.
	var gotError *proto.LogEntry
	timeout := time.After(200 * time.Millisecond)
GET_LOG:
	for {
		select {
		case l := <-s.logChan:
			if l.Level == 3 && strings.HasPrefix(l.Msg, "Lost interval 0 slow.log") {
				gotError = l
				break GET_LOG
			}
		case <-timeout:
			break GET_LOG
		}
	}
	t.Check(gotError, NotNil)

	// Stop manager
	reply = m.Handle(&proto.Cmd{Cmd: "StopService"})
	t.Assert(reply.Error, Equals, "")
}
// TestWaitRemoveSlowLog is like TestRotateAndRemoveSlowLog but uses mock
// workers to verify that a rotated slow log is not removed until every
// worker that may still be parsing it has finished, and that MaxWorkers
// is enforced.
func (s *ManagerTestSuite) TestWaitRemoveSlowLog(t *C) {
	// Same as TestRotateAndRemoveSlowLog, but we use mock workers so we can
	// test that slow log is not removed until previous workers are done.

	// Mock worker factory will return our mock workers when manager calls Make().
	w1StopChan := make(chan bool)
	w1 := mock.NewQanWorker("qan-worker-1", w1StopChan, nil, nil, false)

	w2StopChan := make(chan bool)
	w2 := mock.NewQanWorker("qan-worker-2", w2StopChan, nil, nil, false)

	// Let's take this time to also test that MaxWorkers is enforced.
	w3 := mock.NewQanWorker("qan-worker-3", nil, nil, nil, false)

	f := mock.NewQanWorkerFactory([]*mock.QanWorker{w1, w2, w3})

	// Clean up files that may interfere with test. Then copy the test log.
	slowlog := "slow006.log"
	files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*")
	for _, file := range files {
		os.Remove(file)
	}
	cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog)
	cp.Run()

	// Create and start manager with mock workers.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, f, s.spool, s.im)
	if m == nil {
		t.Fatal("Create qan.Manager")
	}
	config := &qan.Config{
		ServiceInstance:   s.mysqlInstance,
		MaxSlowLogSize:    1000,
		RemoveOldSlowLogs: true, // done after w2 and w1 done
		MaxWorkers:        2,    // w1 and w2 but not w3
		Interval:          60,
		WorkerRunTime:     60,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
	}
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		Ts:   time.Now(),
		Cmd:  "StartService",
		Data: qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
	test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle")

	// Start first mock worker (w1) with interval 0 - 736. The worker's Run()
	// func won't return until we send true to its stop chan, so manager will
	// think worker is still running until then.
	now := time.Now()
	i1 := &qan.Interval{
		Filename:    "/tmp/" + slowlog,
		StartOffset: 0,
		EndOffset:   736,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i1
	<-w1.Running() // wait for manager to run worker

	// Start 2nd mock worker (w2) with interval 736 - 1833. Manager will rotate
	// but not remove original slow log because w1 is still running.
	i2 := &qan.Interval{
		Filename:    "/tmp/" + slowlog,
		StartOffset: 736,
		EndOffset:   1833,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i2
	<-w2.Running()

	test.WaitStatus(1, m, "qan-log-parser", "Idle (2 of 2 running)")

	/**
	 * Worker status test
	 */

	// Workers should have status and QAN manager should report them all.
	status := m.Status()
	t.Check(status["qan-worker-1"], Equals, "ok")
	t.Check(status["qan-worker-2"], Equals, "ok")
	t.Check(status["qan-worker-3"], Equals, "") // not running due to MaxWorkers

	/**
	 * Quick side test: qan.Config.MaxWorkers is enforced.
	 */
	test.DrainLogChan(s.logChan)
	s.intervalChan <- i2
	logs := test.WaitLogChan(s.logChan, 3)
	test.WaitStatus(1, m, "qan-log-parser", "Idle (2 of 2 running)")
	gotWarning := false
	for _, log := range logs {
		if log.Level == proto.LOG_WARNING && strings.Contains(log.Msg, "All workers busy") {
			gotWarning = true
			break
		}
	}
	if !gotWarning {
		t.Error("Too many workers causes \"All workers busy\" warning")
	}

	// Original slow log should no longer exist; it was rotated away, but...
	if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) {
		t.Error("/tmp/" + slowlog + " no longer exists")
	}

	// ...old slow log should exist because w1 is still running.
	files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*")
	if len(files) != 1 {
		t.Errorf("w1 running so old slow log not removed, got %+v", files)
	}
	defer func() {
		for _, file := range files {
			os.Remove(file)
		}
	}()

	// Stop w2, which is "holding" the "lock" on removing the old
	// slow log (figuratively speaking; there are no real locks). Because
	// w1 is still running, manager should not remove the old log yet because
	// w1 could still be parsing it.
	w2StopChan <- true
	test.WaitStatus(1, m, "qan-log-parser", "Idle (1 of 2 running)")
	if _, err := os.Stat(files[0]); os.IsNotExist(err) {
		t.Errorf("w1 still running so old slow log not removed")
	}

	// Stop w1 and now, even though slow log was rotated for w2, manager
	// should remove old slow log.
	w1StopChan <- true
	test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)")
	if _, err := os.Stat(files[0]); !os.IsNotExist(err) {
		t.Errorf("w1 done running so old slow log removed")
	}

	// Stop manager
	reply = m.Handle(&proto.Cmd{Cmd: "StopService"})
	t.Assert(reply.Error, Equals, "")
}
func (s *ManagerTestSuite) TestRotateSlowLog(t *C) { // Same as TestRotateAndRemoveSlowLog, but with qan.Config.RemoveOldSlowLogs=false // and testing that Start and Stop queries were executed. slowlog := "slow006.log" files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*") for _, file := range files { os.Remove(file) } mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im) if m == nil { t.Fatal("Create qan.Manager") } config := &qan.Config{ ServiceInstance: s.mysqlInstance, Interval: 300, MaxSlowLogSize: 1000, RemoveOldSlowLogs: false, // <-- HERE ExampleQueries: false, MaxWorkers: 2, WorkerRunTime: 600, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=0.456"}, mysql.Query{Set: "SET GLOBAL slow_query_log=ON"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=10"}, }, } qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ Ts: time.Now(), Cmd: "StartService", Data: qanConfig, } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "") test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") s.nullmysql.Reset() cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog) cp.Run() // First interval: 0 - 736 now := time.Now() i1 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 0, EndOffset: 736, StartTime: now, StopTime: now, } s.intervalChan <- i1 resultData := <-s.dataChan report := *resultData.(*qan.Report) if report.Global.TotalQueries != 2 { t.Error("First interval has 2 queries, got ", report.Global.TotalQueries) } if report.Global.UniqueQueries != 1 { t.Error("First interval has 1 unique query, got ", report.Global.UniqueQueries) } // Second interval: 736 - 1833, but will actually go to end: 2200, if not // the next two test will fail. 
i2 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 736, EndOffset: 1833, StartTime: now, StopTime: now, } s.intervalChan <- i2 resultData = <-s.dataChan report = *resultData.(*qan.Report) if report.Global.TotalQueries != 4 { t.Error("Second interval has 2 queries, got ", report.Global.TotalQueries) } if report.Global.UniqueQueries != 2 { t.Error("Second interval has 2 unique queries, got ", report.Global.UniqueQueries) } test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)") // Original slow log should no longer exist; it was rotated away. if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) { t.Error("/tmp/" + slowlog + " no longer exists") } // The original slow log should NOT have been removed. files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*") if len(files) != 1 { t.Errorf("Old slow log not removed, got %+v", files) } defer func() { for _, file := range files { os.Remove(file) } }() expect := []mysql.Query{} for _, q := range config.Stop { expect = append(expect, q) } for _, q := range config.Start { expect = append(expect, q) } if same, diff := test.IsDeeply(s.nullmysql.GetSet(), expect); !same { t.Logf("%+v", s.nullmysql.GetSet()) t.Logf("%+v", expect) t.Error(diff) } // Stop manager reply = m.Handle(&proto.Cmd{Cmd: "StopService"}) t.Assert(reply.Error, Equals, "") }
func (s *ManagerTestSuite) TestRotateAndRemoveSlowLog(t *C) { // Clean up files that may interfere with test. slowlog := "slow006.log" files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*") for _, file := range files { os.Remove(file) } /** * slow006.log is 2200 bytes large. Rotation happens when manager * see interval.EndOffset >= MaxSlowLogSize. So we'll use these * intervals, * 0 - 736 * 736 - 1833 * 1833 - 2200 * and set MaxSlowLogSize=1000 which should make manager rotate the log * after the 2nd interval. When manager rotates log, it 1) renames log * to NAME-TS where NAME is the original name and TS is the current Unix * timestamp (UTC); and 2) it sets interval.StopOff = file size of NAME-TS * to finish parsing the log. Therefore, results for 2nd interval should * include our 3rd interval. -- Manager also calls Start and Stop so the * nullmysql conn should record the queries being set. */ // See TestStartService() for description of these startup tasks. mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im) if m == nil { t.Fatal("Create qan.Manager") } config := &qan.Config{ ServiceInstance: s.mysqlInstance, Interval: 300, MaxSlowLogSize: 1000, // <-- HERE RemoveOldSlowLogs: true, // <-- HERE too ExampleQueries: false, MaxWorkers: 2, WorkerRunTime: 600, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=0.456"}, mysql.Query{Set: "SET GLOBAL slow_query_log=ON"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=10"}, }, } qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ Ts: time.Now(), Cmd: "StartService", Data: qanConfig, } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "") test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") // Make copy of slow log because test will mv/rename it. 
cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog) cp.Run() // First interval: 0 - 736 now := time.Now() i1 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 0, EndOffset: 736, StartTime: now, StopTime: now, } s.intervalChan <- i1 resultData := <-s.dataChan report := *resultData.(*qan.Report) if report.Global.TotalQueries != 2 { t.Error("First interval has 2 queries, got ", report.Global.TotalQueries) } if report.Global.UniqueQueries != 1 { t.Error("First interval has 1 unique query, got ", report.Global.UniqueQueries) } // Second interval: 736 - 1833, but will actually go to end: 2200, if not // the next two test will fail. i2 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 736, EndOffset: 1833, StartTime: now, StopTime: now, } s.intervalChan <- i2 resultData = <-s.dataChan report = *resultData.(*qan.Report) if report.Global.TotalQueries != 4 { t.Error("Second interval has 2 queries, got ", report.Global.TotalQueries) } if report.Global.UniqueQueries != 2 { t.Error("Second interval has 2 unique queries, got ", report.Global.UniqueQueries) } test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)") // Original slow log should no longer exist; it was rotated away. if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) { t.Error("/tmp/" + slowlog + " no longer exists") } // The original slow log should have been renamed to slow006-TS, parsed, and removed. 
files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*") if len(files) != 0 { t.Errorf("Old slow log removed, got %+v", files) } defer func() { for _, file := range files { os.Remove(file) } }() // https://jira.percona.com/browse/PCT-466 // Old slow log removed but space not freed in filesystem pid := fmt.Sprintf("%d", os.Getpid()) out, err := exec.Command("lsof", "-p", pid).Output() if err != nil { t.Fatal(err) } if strings.Contains(string(out), "/tmp/"+slowlog+"-") { t.Logf("%s\n", string(out)) t.Error("Old slow log removed but not freed in filesystem (PCT-466)") } // Stop manager reply = m.Handle(&proto.Cmd{Cmd: "StopService"}) t.Assert(reply.Error, Equals, "") }
// TestStartServiceFast verifies that when the next clock tick is far away
// (mock clock Eta=180s), run() primes the iter's tick chan immediately so
// the first interval starts right away instead of waiting ~3 minutes.
func (s *ManagerTestSuite) TestStartServiceFast(t *C) {
	/**
	 * Like TestStartService but we simulate the next tick being 3m away
	 * (mock.clock.Eta = 180) so that run() sends the first tick on the
	 * tick chan, causing the first interval to start immediately.
	 */
	s.clock.Eta = 180
	defer func() { s.clock.Eta = 0 }()

	m := qan.NewManager(s.logger, &mysql.RealConnectionFactory{}, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	t.Assert(m, NotNil)
	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
		Interval:       300,        // 5 min
		MaxSlowLogSize: 1073741824, // 1 GiB
		MaxWorkers:     1,
		WorkerRunTime:  600, // 10 min
	}
	now := time.Now()
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "qan",
		Cmd:       "StartService",
		Data:      qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
	test.WaitStatus(1, m, "qan-log-parser", "Starting")

	tickChan := s.iterFactory.TickChans[s.iter]
	t.Assert(tickChan, NotNil)

	// run() should prime the tickChan with the 1st tick immediately. This makes
	// the interval iter start the interval immediately. Then run() continues
	// waiting for the iter to send an interval which happens when the real ticker
	// (the clock) sends the 2nd tick which is synced to the interval, thus ending
	// the first interval started by run() and starting the 2nd interval as normal.
	var tick time.Time
	select {
	case tick = <-tickChan:
	case <-time.After(1 * time.Second):
	}
	// A zero tick means the timeout hit, i.e. run() never primed the chan.
	t.Assert(tick.IsZero(), Not(Equals), true)

	status := m.Status()
	t.Check(status["qan-next-interval"], Equals, "180.0s")

	// Stop QAN.
	cmd = &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "",
		Cmd:       "StopService",
	}
	reply = m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
}
func (s *ManagerTestSuite) TestStartService(t *C) { /** * Create and start manager. */ m := qan.NewManager(s.logger, &mysql.RealConnectionFactory{}, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im) t.Assert(m, NotNil) // Create the qan config. tmpFile := fmt.Sprintf("/tmp/qan_test.TestStartService.%d", os.Getpid()) defer func() { os.Remove(tmpFile) }() config := &qan.Config{ ServiceInstance: s.mysqlInstance, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=0.123"}, mysql.Query{Set: "SET GLOBAL slow_query_log=ON"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=10"}, }, Interval: 300, // 5 min MaxSlowLogSize: 1073741824, // 1 GiB RemoveOldSlowLogs: true, ExampleQueries: true, MaxWorkers: 2, WorkerRunTime: 600, // 10 min } // Create the StartService cmd which contains the qan config. now := time.Now() qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ User: "******", Ts: now, AgentUuid: "123", Service: "agent", Cmd: "StartService", Data: qanConfig, } // Have the service manager start the qa service reply := m.Handle(cmd) // It should start without error. t.Assert(reply.Error, Equals, "") // It should write the config to disk. data, err := ioutil.ReadFile(pct.Basedir.ConfigFile("qan")) t.Check(err, IsNil) gotConfig := &qan.Config{} err = json.Unmarshal(data, gotConfig) t.Check(err, IsNil) if same, diff := test.IsDeeply(gotConfig, config); !same { test.Dump(gotConfig) t.Error(diff) } // And status should be "Running" and "Idle". test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)") status := m.Status() t.Check(status["qan"], Equals, "Running") t.Check(status["qan-log-parser"], Equals, "Idle (0 of 2 running)") // It should have enabled the slow log. 
slowLog := s.realmysql.GetGlobalVarNumber("slow_query_log") t.Assert(slowLog, Equals, float64(1)) longQueryTime := s.realmysql.GetGlobalVarNumber("long_query_time") t.Assert(longQueryTime, Equals, 0.123) // Starting an already started service should result in a ServiceIsRunningError. reply = m.Handle(cmd) t.Check(reply.Error, Not(Equals), "") // It should add a tickChan for the interval iter. t.Check(s.clock.Added, HasLen, 1) t.Check(s.clock.Removed, HasLen, 0) /** * Have manager run a worker, parse, and send data. */ interv := &qan.Interval{ Filename: testlog.Sample + "slow001.log", StartOffset: 0, EndOffset: 524, StartTime: now, StopTime: now, } s.intervalChan <- interv v := test.WaitData(s.dataChan) t.Assert(v, HasLen, 1) report := v[0].(*qan.Report) result := &qan.Result{ StopOffset: report.StopOffset, Global: report.Global, Classes: report.Class, } test.WriteData(result, tmpFile) t.Check(tmpFile, testlog.FileEquals, sample+"slow001.json") /** * Send StopService cmd to stop qan/qan-log-parser. */ now = time.Now() cmd = &proto.Cmd{ User: "******", Ts: now, AgentUuid: "123", Service: "agent", Cmd: "StopService", } // Have the service manager start the qa service reply = m.Handle(cmd) // It should start without error. t.Assert(reply.Error, Equals, "") // It should disable the slow log. slowLog = s.realmysql.GetGlobalVarNumber("slow_query_log") t.Assert(slowLog, Equals, float64(0)) longQueryTime = s.realmysql.GetGlobalVarNumber("long_query_time") t.Assert(longQueryTime, Equals, 10.0) // It should remove the tickChan (and not have added others). t.Check(s.clock.Added, HasLen, 1) t.Check(s.clock.Removed, HasLen, 1) // qan still running, but qan-log-parser stopped. test.WaitStatus(1, m, "qan-log-parser", "Stopped") status = m.Status() t.Check(status["qan"], Equals, "Running") t.Check(status["qan-log-parser"], Equals, "Stopped") }
func run() error { version := fmt.Sprintf("percona-agent %s%s rev %s", agent.VERSION, agent.REL, agent.REVISION) if flagVersion { fmt.Println(version) return nil } golog.Printf("Running %s pid %d\n", version, os.Getpid()) if err := pct.Basedir.Init(flagBasedir); err != nil { return err } // Start-lock file is used to let agent1 self-update, create start-lock, // start updated agent2, exit cleanly, then agent2 starts. agent1 may // not use a PID file, so this special file is required. if err := pct.WaitStartLock(); err != nil { return err } // NOTE: This must run last, and defer if LIFO, so it must be declared first. defer os.Remove(pct.Basedir.File("start-lock")) /** * Agent config (require API key and agent UUID) */ if !pct.FileExists(pct.Basedir.ConfigFile("agent")) { return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent")) } bytes, err := agent.LoadConfig() if err != nil { return fmt.Errorf("Invalid agent config: %s\n", err) } agentConfig := &agent.Config{} if err := json.Unmarshal(bytes, agentConfig); err != nil { return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err) } golog.Println("ApiHostname: " + agentConfig.ApiHostname) golog.Println("AgentUuid: " + agentConfig.AgentUuid) /** * Ping and exit, maybe. */ // Set for all connections to API. X-Percona-API-Key is set automatically // using the pct.APIConnector. 
headers := map[string]string{ "X-Percona-Agent-Version": agent.VERSION, } if flagPing { t0 := time.Now() code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey, headers) d := time.Now().Sub(t0) if err != nil || code != 200 { return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err) } else { golog.Printf("Ping OK (%s)", d) return nil } } /** * PID file */ pidFilePath := agentConfig.PidFile if flagPidFile != "" { pidFilePath = flagPidFile } if pidFilePath != "" { pidFile := pct.NewPidFile() if err := pidFile.Set(pidFilePath); err != nil { golog.Fatalln(err) } defer pidFile.Remove() } /** * REST API */ retry := -1 // unlimited if flagStatus { retry = 1 } api, err := ConnectAPI(agentConfig, retry) if err != nil { golog.Fatal(err) } // Get agent status via API and exit. if flagStatus { code, bytes, err := api.Get(agentConfig.ApiKey, api.AgentLink("self")+"/status") if err != nil { return err } if code == 404 { return fmt.Errorf("Agent not found") } status := make(map[string]string) if err := json.Unmarshal(bytes, &status); err != nil { return err } golog.Println(status) return nil } /** * Connection factory */ connFactory := &mysql.RealConnectionFactory{} /** * Log relay */ logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3) // Log websocket client, possibly disabled later. 
logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log", headers) if err != nil { golog.Fatalln(err) } logManager := log.NewManager( logClient, logChan, ) if err := logManager.Start(); err != nil { return fmt.Errorf("Error starting logmanager: %s\n", err) } /** * MRMS (MySQL Restart Monitoring Service) */ mrm := mrmsMonitor.NewMonitor( pct.NewLogger(logChan, "mrms-monitor"), connFactory, ) mrmsManager := mrms.NewManager( pct.NewLogger(logChan, "mrms-manager"), mrm, ) if err := mrmsManager.Start(); err != nil { return fmt.Errorf("Error starting mrms manager: %s\n", err) } /** * Instance manager */ itManager := instance.NewManager( pct.NewLogger(logChan, "instance-manager"), pct.Basedir.Dir("config"), api, mrm, ) if err := itManager.Start(); err != nil { return fmt.Errorf("Error starting instance manager: %s\n", err) } /** * Data spooler and sender */ hostname, _ := os.Hostname() dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data", headers) if err != nil { golog.Fatalln(err) } dataManager := data.NewManager( pct.NewLogger(logChan, "data"), pct.Basedir.Dir("data"), pct.Basedir.Dir("trash"), hostname, dataClient, ) if err := dataManager.Start(); err != nil { return fmt.Errorf("Error starting data manager: %s\n", err) } /** * Collecct/report ticker (master clock) */ nowFunc := func() int64 { return time.Now().UTC().UnixNano() } clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc) /** * Metric and system config monitors */ mmManager := mm.NewManager( pct.NewLogger(logChan, "mm"), mmMonitor.NewFactory(logChan, itManager.Repo(), mrm), clock, dataManager.Spooler(), itManager.Repo(), mrm, ) if err := mmManager.Start(); err != nil { return fmt.Errorf("Error starting mm manager: %s\n", err) } sysconfigManager := sysconfig.NewManager( pct.NewLogger(logChan, "sysconfig"), sysconfigMonitor.NewFactory(logChan, itManager.Repo()), clock, dataManager.Spooler(), itManager.Repo(), ) if err := 
sysconfigManager.Start(); err != nil { return fmt.Errorf("Error starting sysconfig manager: %s\n", err) } /** * Query service (real-time EXPLAIN, SHOW CREATE TABLE, etc.) */ queryManager := query.NewManager( pct.NewLogger(logChan, "query"), itManager.Repo(), &mysql.RealConnectionFactory{}, ) if err := queryManager.Start(); err != nil { return fmt.Errorf("Error starting query manager: %s\n", err) } /** * Query Analytics */ qanManager := qan.NewManager( pct.NewLogger(logChan, "qan"), clock, itManager.Repo(), mrm, connFactory, qanFactory.NewRealAnalyzerFactory( logChan, qanFactory.NewRealIntervalIterFactory(logChan), slowlog.NewRealWorkerFactory(logChan), perfschema.NewRealWorkerFactory(logChan), dataManager.Spooler(), clock, ), ) if err := qanManager.Start(); err != nil { return fmt.Errorf("Error starting qan manager: %s\n", err) } /** * Sysinfo */ sysinfoManager := sysinfo.NewManager( pct.NewLogger(logChan, "sysinfo"), ) // MySQL Sysinfo mysqlSysinfoService := mysqlSysinfo.NewMySQL( pct.NewLogger(logChan, "sysinfo-mysql"), itManager.Repo(), ) if err := sysinfoManager.RegisterService("MySQLSummary", mysqlSysinfoService); err != nil { return fmt.Errorf("Error registering Mysql Sysinfo service: %s\n", err) } // System Sysinfo systemSysinfoService := systemSysinfo.NewSystem( pct.NewLogger(logChan, "sysinfo-system"), ) if err := sysinfoManager.RegisterService("SystemSummary", systemSysinfoService); err != nil { return fmt.Errorf("Error registering System Sysinfo service: %s\n", err) } // Start Sysinfo manager if err := sysinfoManager.Start(); err != nil { return fmt.Errorf("Error starting Sysinfo manager: %s\n", err) } /** * Signal handler */ // Generally the agent has a crash-only design, but QAN is so far the only service // which reconfigures MySQL: it enables the slow log, sets long_query_time, etc. // It's not terrible to leave slow log on, but it's nicer to turn it off. 
sigChan := make(chan os.Signal, 1) stopChan := make(chan error, 2) signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) go func() { sig := <-sigChan golog.Printf("Caught %s signal, shutting down...\n", sig) stopChan <- qanManager.Stop() }() /** * Agent */ cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd", headers) if err != nil { golog.Fatal(err) } // The official list of services known to the agent. Adding a new service // requires a manager, starting the manager as above, and adding the manager // to this map. services := map[string]pct.ServiceManager{ "log": logManager, "data": dataManager, "qan": qanManager, "mm": mmManager, "instance": itManager, "mrms": mrmsManager, "sysconfig": sysconfigManager, "query": queryManager, "sysinfo": sysinfoManager, } // Set the global pct/cmd.Factory, used for the Restart cmd. pctCmd.Factory = &pctCmd.RealCmdFactory{} agentLogger := pct.NewLogger(logChan, "agent") agent := agent.NewAgent( agentConfig, agentLogger, api, cmdClient, services, ) /** * Run agent, wait for it to stop, signal, or crash. */ var stopErr error go func() { defer func() { if err := recover(); err != nil { errMsg := fmt.Sprintf("Agent crashed: %s", err) golog.Println(errMsg) agentLogger.Error(errMsg) stopChan <- fmt.Errorf("%s", errMsg) } }() stopChan <- agent.Run() }() // Wait for agent to stop, or for signals. 
agentRunning := true statusSigChan := make(chan os.Signal, 1) signal.Notify(statusSigChan, syscall.SIGUSR1) // kill -USER1 PID reconnectSigChan := make(chan os.Signal, 1) signal.Notify(reconnectSigChan, syscall.SIGHUP) // kill -HUP PID for agentRunning { select { case stopErr = <-stopChan: // agent or signal golog.Println("Agent stopped, shutting down...") agentLogger.Info("Agent stopped") agentRunning = false case <-statusSigChan: status := agent.AllStatus() golog.Printf("Status: %+v\n", status) case <-reconnectSigChan: u, _ := user.Current() cmd := &proto.Cmd{ Ts: time.Now().UTC(), User: u.Username + " (SIGHUP)", AgentUuid: agentConfig.AgentUuid, Service: "agent", Cmd: "Reconnect", } agent.Handle(cmd) } } qanManager.Stop() // see Signal handler ^ time.Sleep(2 * time.Second) // wait for final replies and log entries return stopErr }