// TestApiDisconnect: if using the direct interface, Recv() should return an
// error when the API disconnects.
func (s *TestSuite) TestApiDisconnect(t *C) {
	ws, err := client.NewWebsocketClient(s.logger, s.api, "agent", nil)
	t.Assert(err, IsNil)

	ws.Connect()
	c := <-mock.ClientConnectChan // mock server side of the connection
	<-ws.ConnectChan()            // connect ack

	// No error yet.
	got := test.WaitErr(ws.ErrorChan())
	t.Assert(len(got), Equals, 0)

	// Simulate the API dropping the connection.
	mock.DisconnectClient(c)

	/**
	 * I cannot provoke an error on websocket.Send(), only Receive().
	 * Perhaps errors (e.g. ws closed) are only reported on recv?
	 * This only affects the logger since it's ws send-only: it will
	 * need a goroutine blocking on Receive() that, upon error, notifies
	 * the sending goroutine to reconnect.
	 */
	var data interface{}
	err = ws.Recv(data, 5)
	t.Assert(err, NotNil) // EOF due to disconnect.
}
/**
 * TestDialTimeout simulates a dial timeout by listening on a port that does nothing.
 * The TCP connection completes, but the TLS handshake times out because the
 * little goroutine below does nothing after net.Listen() (normally code would
 * net.Accept() after listening). To simulate a lower-level dial timeout would
 * require a very low-level handling of the network socket: having the port open
 * but not completing the TCP syn-syn+ack-ack handshake; this is too complicated,
 * so breaking the TLS handshake is close enough.
 */
func (s *TestSuite) TestDialTimeout(t *C) {
	addr := "localhost:9443"
	url := "wss://" + addr + "/"
	links := map[string]string{"agent": url}
	api := mock.NewAPI("http://localhost", url, "apikey", "uuid", links)
	wss, err := client.NewWebsocketClient(s.logger, api, "agent", nil)
	t.Assert(err, IsNil)

	// Hold the port open without ever accepting, until the test is done.
	// NOTE(review): log.Fatal here kills the whole test process on a bind
	// failure (e.g. port already in use) — confirm that is intended.
	doneChan := make(chan bool, 1)
	go func() {
		l, err := net.Listen("tcp", addr)
		if err != nil {
			log.Fatal(err)
		}
		defer l.Close()
		<-doneChan
	}()
	time.Sleep(1 * time.Second) // give the listener time to bind

	// Connect should fail: TCP succeeds, TLS handshake times out after 2s.
	err = wss.ConnectOnce(2)
	t.Check(err, NotNil)

	doneChan <- true // release the listener goroutine
}
func (s *TestSuite) TestConnectBackoff(t *C) { /** * Connect() should wait between attempts, using pct.Backoff (pct/backoff.go). */ ws, err := client.NewWebsocketClient(s.logger, s.api, "agent", nil) t.Assert(err, IsNil) ws.Connect() c := <-mock.ClientConnectChan <-ws.ConnectChan() defer ws.Disconnect() // 0s wait, connect, err="Lost connection", // 1s wait, connect, err="Lost connection", // 3s wait, connect, ok t0 := time.Now() for i := 0; i < 2; i++ { mock.DisconnectClient(c) ws.Connect() c = <-mock.ClientConnectChan <-ws.ConnectChan() // connect ack } d := time.Now().Sub(t0) if d < time.Duration(3*time.Second) { t.Errorf("Exponential backoff wait time between connect attempts: %s\n", d) } }
// TestChannelsAfterReconnect: client send/recv chans should work after
// disconnect and reconnect.
func (s *TestSuite) TestChannelsAfterReconnect(t *C) {
	ws, err := client.NewWebsocketClient(s.logger, s.api, "agent", nil)
	t.Assert(err, IsNil)

	ws.Start()
	defer ws.Stop()
	defer ws.Disconnect()
	ws.Connect()
	c := <-mock.ClientConnectChan
	<-ws.ConnectChan() // connect ack

	// Send cmd and wait for reply to ensure we're fully connected.
	cmd := &proto.Cmd{
		User: "******",
		Ts:   time.Now(),
		Cmd:  "Status",
	}
	c.SendChan <- cmd
	got := test.WaitCmd(ws.RecvChan())
	t.Assert(len(got), Equals, 1)
	reply := cmd.Reply(nil, nil)
	ws.SendChan() <- reply
	data := test.WaitData(c.RecvChan)
	t.Assert(len(data), Equals, 1)

	// Disconnect client.
	mock.DisconnectClient(c)
	<-ws.ConnectChan() // disconnect ack

	// Reconnect client and send/recv again: the same chans must still work.
	ws.Connect()
	c = <-mock.ClientConnectChan
	<-ws.ConnectChan() // connect ack
	c.SendChan <- cmd
	got = test.WaitCmd(ws.RecvChan())
	t.Assert(len(got), Equals, 1)
	reply = cmd.Reply(nil, nil)
	ws.SendChan() <- reply
	data = test.WaitData(c.RecvChan)
	t.Assert(len(data), Equals, 1)
}
// TestChannels: the agent uses send/recv channels instead of the "direct"
// (Send/Recv) interface; verify a full Cmd -> Reply round trip over them.
func (s *TestSuite) TestChannels(t *C) {
	ws, err := client.NewWebsocketClient(s.logger, s.api, "agent")
	t.Assert(err, IsNil)

	// Start send/recv chans, but idle until successful Connect.
	ws.Start()
	defer ws.Stop()
	ws.Connect()
	c := <-mock.ClientConnectChan
	<-ws.ConnectChan()

	// API sends Cmd to client.
	cmd := &proto.Cmd{
		User: "******",
		Ts:   time.Now(),
		Cmd:  "Status",
	}
	c.SendChan <- cmd

	// If client's recvChan is working, it will receive the Cmd.
	got := test.WaitCmd(ws.RecvChan())
	t.Assert(len(got), Equals, 1)
	t.Assert(got[0], DeepEquals, *cmd)

	// Client sends Reply in response to Cmd.
	reply := cmd.Reply(nil, nil)
	ws.SendChan() <- reply

	// If client's sendChan is working, we/API will receive the Reply.
	data := test.WaitData(c.RecvChan)
	t.Assert(len(data), Equals, 1)

	// We're dealing with generic data again (JSON decoded into a map).
	m := data[0].(map[string]interface{})
	t.Assert(m["Cmd"], Equals, "Status")
	t.Assert(m["Error"], Equals, "")

	err = ws.Disconnect()
	t.Assert(err, IsNil)
}
// TestSendBytes: SendBytes() should transmit raw (pre-encoded) bytes which
// the server receives as decoded JSON.
func (s *TestSuite) TestSendBytes(t *C) {
	ws, err := client.NewWebsocketClient(s.logger, s.api, "agent", nil)
	t.Assert(err, IsNil)

	ws.ConnectOnce(5)
	c := <-mock.ClientConnectChan

	// Send a pre-encoded JSON array with a 5s timeout.
	data := []byte(`["Hello"]`)
	err = ws.SendBytes(data, 5)
	t.Assert(err, IsNil)

	// Recv what we just sent.
	got := test.WaitData(c.RecvChan)
	t.Assert(len(got), Equals, 1)
	gotData := got[0].([]interface{})
	t.Check(gotData[0].(string), Equals, "Hello")

	ws.DisconnectOnce()
}
// TestErrorChan: when the client disconnects due to a send or recv error,
// it should send the error on its ErrorChan().
func (s *TestSuite) TestErrorChan(t *C) {
	ws, err := client.NewWebsocketClient(s.logger, s.api, "agent")
	t.Assert(err, IsNil)

	ws.Start()
	defer ws.Stop()
	ws.Connect()
	c := <-mock.ClientConnectChan
	<-ws.ConnectChan()

	// No error yet.
	got := test.WaitErr(ws.ErrorChan())
	t.Assert(len(got), Equals, 0)

	// API sends Cmd to client.
	cmd := &proto.Cmd{
		User: "******",
		Ts:   time.Now(),
		Cmd:  "Status",
	}
	c.SendChan <- cmd

	// Still no error: a normal recv must not produce one.
	got = test.WaitErr(ws.ErrorChan())
	t.Assert(len(got), Equals, 0)

	// Disconnect the client.
	mock.DisconnectClient(c)

	// Client should send error from disconnect.
	got = test.WaitErr(ws.ErrorChan())
	t.Assert(len(got), Equals, 1)
	t.Assert(got[0], NotNil)

	err = ws.Disconnect()
	t.Assert(err, IsNil)
}
/**
 * TestWssConnection ensures our slightly customized wss connection handling
 * works, i.e. that TLS works. Only drawback is: client disables cert
 * verification because the mock ws server uses a self-signed cert, but this
 * only happens when the remote addr is localhost:8443, so it shouldn't affect
 * real connections.
 */
func (s *TestSuite) TestWssConnection(t *C) {
	ws, err := client.NewWebsocketClient(s.logger, s.apiWss, "agent", nil)
	t.Assert(err, IsNil)

	// Client sends state of connection (true=connected, false=disconnected)
	// on its ConnectChan.
	connected := false
	doneChan := make(chan bool)
	go func() {
		connected = <-ws.ConnectChan()
		doneChan <- true
	}()

	// Wait for connection in mock ws server.
	ws.Connect()
	c := <-mock.ClientConnectChanWss
	<-doneChan
	t.Check(connected, Equals, true)

	// Send a log entry.
	logEntry := &proto.LogEntry{
		Level:   2,
		Service: "qan",
		Msg:     "Hello",
	}
	err = ws.Send(logEntry, 5)
	t.Assert(err, IsNil)

	// Recv what we just sent.
	got := test.WaitData(c.RecvChanWss)
	t.Assert(len(got), Equals, 1)

	ws.Conn().Close()
}
func (s *TestSuite) TestChannelsApiDisconnect(t *C) { /** * If using chnanel interface, ErrorChan() should return error if API disconnects. */ ws, err := client.NewWebsocketClient(s.logger, s.api, "agent", nil) t.Assert(err, IsNil) var gotErr error doneChan := make(chan bool) go func() { gotErr = <-ws.ErrorChan() doneChan <- true }() ws.Start() defer ws.Stop() defer ws.Disconnect() ws.Connect() c := <-mock.ClientConnectChan <-ws.ConnectChan() // connect ack // No error yet. select { case <-doneChan: t.Error("No error yet") default: } mock.DisconnectClient(c) // Wait for error. select { case <-doneChan: t.Check(gotErr, NotNil) // EOF due to disconnect. case <-time.After(1 * time.Second): t.Error("Get error") } }
// TestCloseTimeout: Disconnect() must still succeed after a Send() timeout
// has already elapsed. Regression test for:
// https://jira.percona.com/browse/PCT-1045
func (s *TestSuite) TestCloseTimeout(t *C) {
	ws, err := client.NewWebsocketClient(s.logger, s.api, "agent", nil)
	t.Assert(err, IsNil)

	// Client sends state of connection (true=connected) on its ConnectChan.
	connected := false
	doneChan := make(chan bool)
	go func() {
		connected = <-ws.ConnectChan()
		doneChan <- true
	}()

	// Wait for connection in mock ws server.
	ws.Connect()
	c := <-mock.ClientConnectChan
	<-doneChan
	t.Check(connected, Equals, true)

	// Send a log entry with a 1s timeout.
	logEntry := &proto.LogEntry{
		Level:   2,
		Service: "qan",
		Msg:     "Hello",
	}
	err = ws.Send(logEntry, 1)
	t.Assert(err, IsNil)

	// Recv what we just sent.
	got := test.WaitData(c.RecvChan)
	t.Assert(len(got), Equals, 1)

	// Sleep a little longer than that ^ 1s Send timeout so it has expired
	// before we disconnect.
	time.Sleep(1400 * time.Millisecond)

	err = ws.Disconnect()
	t.Check(err, IsNil)
}
// TestSend: LogRelay (logrelay/) uses the "direct" interface (Send/Recv),
// not the send/recv chans; verify Send(), Conn(), Status(), and the
// connect/disconnect notifications.
func (s *TestSuite) TestSend(t *C) {
	ws, err := client.NewWebsocketClient(s.logger, s.api, "agent", nil)
	t.Assert(err, IsNil)

	// Client sends state of connection (true=connected, false=disconnected)
	// on its ConnectChan.
	connected := false
	doneChan := make(chan bool)
	go func() {
		connected = <-ws.ConnectChan()
		doneChan <- true
	}()

	// Wait for connection in mock ws server.
	ws.Connect()
	c := <-mock.ClientConnectChan
	<-doneChan
	t.Check(connected, Equals, true)

	// Send a log entry.
	logEntry := &proto.LogEntry{
		Level:   2,
		Service: "qan",
		Msg:     "Hello",
	}
	err = ws.Send(logEntry, 5)
	t.Assert(err, IsNil)

	// Recv what we just sent.
	got := test.WaitData(c.RecvChan)
	t.Assert(len(got), Equals, 1)

	// We're dealing with generic data (JSON numbers decode as float64).
	m := got[0].(map[string]interface{})
	t.Check(m["Level"], Equals, float64(2))
	t.Check(m["Service"], Equals, "qan")
	t.Check(m["Msg"], Equals, "Hello")

	// Quick check that Conn() works.
	conn := ws.Conn()
	t.Check(conn, NotNil)

	// Status should report connected to the proper link.
	status := ws.Status()
	t.Check(status, DeepEquals, map[string]string{
		"ws":      "Connected " + URL,
		"ws-link": URL,
	})

	ws.Disconnect()
	select {
	case connected = <-ws.ConnectChan():
	case <-time.After(1 * time.Second):
		t.Error("No connected=false notify on Disconnect()")
	}

	// Status should report disconnected and still the proper link.
	status = ws.Status()
	t.Check(status, DeepEquals, map[string]string{
		"ws":      "Disconnected",
		"ws-link": URL,
	})
}
func run() error { version := fmt.Sprintf("percona-agent %s rev %s", agent.VERSION, agent.REVISION) if flagVersion { fmt.Println(version) return nil } golog.Printf("Running %s pid %d\n", version, os.Getpid()) if err := pct.Basedir.Init(flagBasedir); err != nil { return err } // Start-lock file is used to let agent1 self-update, create start-lock, // start updated agent2, exit cleanly, then agent2 starts. agent1 may // not use a PID file, so this special file is required. if err := pct.WaitStartLock(); err != nil { return err } // NOTE: This must run last, and defer if LIFO, so it must be declared first. defer os.Remove(pct.Basedir.File("start-lock")) /** * Agent config (require API key and agent UUID) */ if !pct.FileExists(pct.Basedir.ConfigFile("agent")) { return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent")) } bytes, err := agent.LoadConfig() if err != nil { return fmt.Errorf("Invalid agent config: %s\n", err) } agentConfig := &agent.Config{} if err := json.Unmarshal(bytes, agentConfig); err != nil { return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err) } golog.Println("ApiHostname: " + agentConfig.ApiHostname) golog.Println("AgentUuid: " + agentConfig.AgentUuid) /** * Ping and exit, maybe. */ if flagPing { t0 := time.Now() code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey) d := time.Now().Sub(t0) if err != nil || code != 200 { return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err) } else { golog.Printf("Ping OK (%s)", d) return nil } } /** * PID file */ if flagPidFile != "" { pidFile := pct.NewPidFile() if err := pidFile.Set(flagPidFile); err != nil { golog.Fatalln(err) } defer pidFile.Remove() } /** * REST API */ api, err := ConnectAPI(agentConfig) if err != nil { golog.Fatal(err) } /** * Log relay */ logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3) // Log websocket client, possibly disabled later. 
logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log") if err != nil { golog.Fatalln(err) } logManager := log.NewManager( logClient, logChan, ) if err := logManager.Start(); err != nil { return fmt.Errorf("Error starting logmanager: %s\n", err) } /** * Instance manager */ itManager := instance.NewManager( pct.NewLogger(logChan, "instance-manager"), pct.Basedir.Dir("config"), api, ) if err := itManager.Start(); err != nil { return fmt.Errorf("Error starting instance manager: %s\n", err) } /** * Data spooler and sender */ hostname, _ := os.Hostname() dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data") if err != nil { golog.Fatalln(err) } dataManager := data.NewManager( pct.NewLogger(logChan, "data"), pct.Basedir.Dir("data"), hostname, dataClient, ) if err := dataManager.Start(); err != nil { return fmt.Errorf("Error starting data manager: %s\n", err) } /** * Collecct/report ticker (master clock) */ nowFunc := func() int64 { return time.Now().UTC().UnixNano() } clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc) /** * Metric and system config monitors */ mmManager := mm.NewManager( pct.NewLogger(logChan, "mm"), mmMonitor.NewFactory(logChan, itManager.Repo()), clock, dataManager.Spooler(), itManager.Repo(), ) if err := mmManager.Start(); err != nil { return fmt.Errorf("Error starting mm manager: %s\n", err) } sysconfigManager := sysconfig.NewManager( pct.NewLogger(logChan, "sysconfig"), sysconfigMonitor.NewFactory(logChan, itManager.Repo()), clock, dataManager.Spooler(), itManager.Repo(), ) if err := sysconfigManager.Start(); err != nil { return fmt.Errorf("Error starting sysconfig manager: %s\n", err) } /** * Query Analytics */ qanManager := qan.NewManager( pct.NewLogger(logChan, "qan"), &mysql.RealConnectionFactory{}, clock, qan.NewFileIntervalIterFactory(logChan), qan.NewSlowLogWorkerFactory(logChan), dataManager.Spooler(), itManager.Repo(), ) if err := qanManager.Start(); err != 
nil { return fmt.Errorf("Error starting qan manager: %s\n", err) } /** * Signal handler */ // Generally the agent has a crash-only design, but QAN is so far the only service // which reconfigures MySQL: it enables the slow log, sets long_query_time, etc. // It's not terrible to leave slow log on, but it's nicer to turn it off. sigChan := make(chan os.Signal, 1) stopChan := make(chan error, 2) signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) go func() { sig := <-sigChan golog.Printf("Caught %s signal, shutting down...\n", sig) stopChan <- qanManager.Stop() }() /** * Agent */ cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd") if err != nil { golog.Fatal(err) } // The official list of services known to the agent. Adding a new service // requires a manager, starting the manager as above, and adding the manager // to this map. services := map[string]pct.ServiceManager{ "log": logManager, "data": dataManager, "qan": qanManager, "mm": mmManager, "instance": itManager, "sysconfig": sysconfigManager, } agent := agent.NewAgent( agentConfig, pct.NewLogger(logChan, "agent"), api, cmdClient, services, ) /** * Run agent, wait for it to stop or signal. */ go func() { stopChan <- agent.Run() }() stopErr := <-stopChan // agent or signal golog.Println("Agent stopped, shutting down...") qanManager.Stop() // see Signal handler ^ time.Sleep(2 * time.Second) // wait for final replies and log entries return stopErr }
func run() error { version := fmt.Sprintf("percona-agent %s%s rev %s", agent.VERSION, agent.REL, agent.REVISION) if flagVersion { fmt.Println(version) return nil } golog.Printf("Running %s pid %d\n", version, os.Getpid()) if err := pct.Basedir.Init(flagBasedir); err != nil { return err } // Start-lock file is used to let agent1 self-update, create start-lock, // start updated agent2, exit cleanly, then agent2 starts. agent1 may // not use a PID file, so this special file is required. if err := pct.WaitStartLock(); err != nil { return err } // NOTE: This must run last, and defer if LIFO, so it must be declared first. defer os.Remove(pct.Basedir.File("start-lock")) /** * Agent config (require API key and agent UUID) */ if !pct.FileExists(pct.Basedir.ConfigFile("agent")) { return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent")) } bytes, err := agent.LoadConfig() if err != nil { return fmt.Errorf("Invalid agent config: %s\n", err) } agentConfig := &agent.Config{} if err := json.Unmarshal(bytes, agentConfig); err != nil { return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err) } golog.Println("ApiHostname: " + agentConfig.ApiHostname) golog.Println("AgentUuid: " + agentConfig.AgentUuid) /** * Ping and exit, maybe. */ // Set for all connections to API. X-Percona-API-Key is set automatically // using the pct.APIConnector. 
headers := map[string]string{ "X-Percona-Agent-Version": agent.VERSION, } if flagPing { t0 := time.Now() code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey, headers) d := time.Now().Sub(t0) if err != nil || code != 200 { return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err) } else { golog.Printf("Ping OK (%s)", d) return nil } } /** * PID file */ pidFilePath := agentConfig.PidFile if flagPidFile != "" { pidFilePath = flagPidFile } if pidFilePath != "" { pidFile := pct.NewPidFile() if err := pidFile.Set(pidFilePath); err != nil { golog.Fatalln(err) } defer pidFile.Remove() } /** * REST API */ retry := -1 // unlimited if flagStatus { retry = 1 } api, err := ConnectAPI(agentConfig, retry) if err != nil { golog.Fatal(err) } // Get agent status via API and exit. if flagStatus { code, bytes, err := api.Get(agentConfig.ApiKey, api.AgentLink("self")+"/status") if err != nil { return err } if code == 404 { return fmt.Errorf("Agent not found") } status := make(map[string]string) if err := json.Unmarshal(bytes, &status); err != nil { return err } golog.Println(status) return nil } /** * Connection factory */ connFactory := &mysql.RealConnectionFactory{} /** * Log relay */ logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3) // Log websocket client, possibly disabled later. 
logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log", headers) if err != nil { golog.Fatalln(err) } logManager := log.NewManager( logClient, logChan, ) if err := logManager.Start(); err != nil { return fmt.Errorf("Error starting logmanager: %s\n", err) } /** * MRMS (MySQL Restart Monitoring Service) */ mrm := mrmsMonitor.NewMonitor( pct.NewLogger(logChan, "mrms-monitor"), connFactory, ) mrmsManager := mrms.NewManager( pct.NewLogger(logChan, "mrms-manager"), mrm, ) if err := mrmsManager.Start(); err != nil { return fmt.Errorf("Error starting mrms manager: %s\n", err) } /** * Instance manager */ itManager := instance.NewManager( pct.NewLogger(logChan, "instance-manager"), pct.Basedir.Dir("config"), api, mrm, ) if err := itManager.Start(); err != nil { return fmt.Errorf("Error starting instance manager: %s\n", err) } /** * Data spooler and sender */ hostname, _ := os.Hostname() dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data", headers) if err != nil { golog.Fatalln(err) } dataManager := data.NewManager( pct.NewLogger(logChan, "data"), pct.Basedir.Dir("data"), pct.Basedir.Dir("trash"), hostname, dataClient, ) if err := dataManager.Start(); err != nil { return fmt.Errorf("Error starting data manager: %s\n", err) } /** * Collecct/report ticker (master clock) */ nowFunc := func() int64 { return time.Now().UTC().UnixNano() } clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc) /** * Metric and system config monitors */ mmManager := mm.NewManager( pct.NewLogger(logChan, "mm"), mmMonitor.NewFactory(logChan, itManager.Repo(), mrm), clock, dataManager.Spooler(), itManager.Repo(), mrm, ) if err := mmManager.Start(); err != nil { return fmt.Errorf("Error starting mm manager: %s\n", err) } sysconfigManager := sysconfig.NewManager( pct.NewLogger(logChan, "sysconfig"), sysconfigMonitor.NewFactory(logChan, itManager.Repo()), clock, dataManager.Spooler(), itManager.Repo(), ) if err := 
sysconfigManager.Start(); err != nil { return fmt.Errorf("Error starting sysconfig manager: %s\n", err) } /** * Query service (real-time EXPLAIN, SHOW CREATE TABLE, etc.) */ queryManager := query.NewManager( pct.NewLogger(logChan, "query"), itManager.Repo(), &mysql.RealConnectionFactory{}, ) if err := queryManager.Start(); err != nil { return fmt.Errorf("Error starting query manager: %s\n", err) } /** * Query Analytics */ qanManager := qan.NewManager( pct.NewLogger(logChan, "qan"), clock, itManager.Repo(), mrm, connFactory, qanFactory.NewRealAnalyzerFactory( logChan, qanFactory.NewRealIntervalIterFactory(logChan), slowlog.NewRealWorkerFactory(logChan), perfschema.NewRealWorkerFactory(logChan), dataManager.Spooler(), clock, ), ) if err := qanManager.Start(); err != nil { return fmt.Errorf("Error starting qan manager: %s\n", err) } /** * Sysinfo */ sysinfoManager := sysinfo.NewManager( pct.NewLogger(logChan, "sysinfo"), ) // MySQL Sysinfo mysqlSysinfoService := mysqlSysinfo.NewMySQL( pct.NewLogger(logChan, "sysinfo-mysql"), itManager.Repo(), ) if err := sysinfoManager.RegisterService("MySQLSummary", mysqlSysinfoService); err != nil { return fmt.Errorf("Error registering Mysql Sysinfo service: %s\n", err) } // System Sysinfo systemSysinfoService := systemSysinfo.NewSystem( pct.NewLogger(logChan, "sysinfo-system"), ) if err := sysinfoManager.RegisterService("SystemSummary", systemSysinfoService); err != nil { return fmt.Errorf("Error registering System Sysinfo service: %s\n", err) } // Start Sysinfo manager if err := sysinfoManager.Start(); err != nil { return fmt.Errorf("Error starting Sysinfo manager: %s\n", err) } /** * Signal handler */ // Generally the agent has a crash-only design, but QAN is so far the only service // which reconfigures MySQL: it enables the slow log, sets long_query_time, etc. // It's not terrible to leave slow log on, but it's nicer to turn it off. 
sigChan := make(chan os.Signal, 1) stopChan := make(chan error, 2) signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) go func() { sig := <-sigChan golog.Printf("Caught %s signal, shutting down...\n", sig) stopChan <- qanManager.Stop() }() /** * Agent */ cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd", headers) if err != nil { golog.Fatal(err) } // The official list of services known to the agent. Adding a new service // requires a manager, starting the manager as above, and adding the manager // to this map. services := map[string]pct.ServiceManager{ "log": logManager, "data": dataManager, "qan": qanManager, "mm": mmManager, "instance": itManager, "mrms": mrmsManager, "sysconfig": sysconfigManager, "query": queryManager, "sysinfo": sysinfoManager, } // Set the global pct/cmd.Factory, used for the Restart cmd. pctCmd.Factory = &pctCmd.RealCmdFactory{} agentLogger := pct.NewLogger(logChan, "agent") agent := agent.NewAgent( agentConfig, agentLogger, api, cmdClient, services, ) /** * Run agent, wait for it to stop, signal, or crash. */ var stopErr error go func() { defer func() { if err := recover(); err != nil { errMsg := fmt.Sprintf("Agent crashed: %s", err) golog.Println(errMsg) agentLogger.Error(errMsg) stopChan <- fmt.Errorf("%s", errMsg) } }() stopChan <- agent.Run() }() // Wait for agent to stop, or for signals. 
agentRunning := true statusSigChan := make(chan os.Signal, 1) signal.Notify(statusSigChan, syscall.SIGUSR1) // kill -USER1 PID reconnectSigChan := make(chan os.Signal, 1) signal.Notify(reconnectSigChan, syscall.SIGHUP) // kill -HUP PID for agentRunning { select { case stopErr = <-stopChan: // agent or signal golog.Println("Agent stopped, shutting down...") agentLogger.Info("Agent stopped") agentRunning = false case <-statusSigChan: status := agent.AllStatus() golog.Printf("Status: %+v\n", status) case <-reconnectSigChan: u, _ := user.Current() cmd := &proto.Cmd{ Ts: time.Now().UTC(), User: u.Username + " (SIGHUP)", AgentUuid: agentConfig.AgentUuid, Service: "agent", Cmd: "Reconnect", } agent.Handle(cmd) } } qanManager.Stop() // see Signal handler ^ time.Sleep(2 * time.Second) // wait for final replies and log entries return stopErr }