Example #1
func (s *ManagerTestSuite) TestGetConfig(t *C) {
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	t.Assert(m, NotNil)

	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Interval:        300,
		MaxSlowLogSize:  1000,
		MaxWorkers:      3,
		WorkerRunTime:   300,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
	}
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		Ts:   time.Now(),
		Cmd:  "StartService",
		Data: qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
	test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle")

	s.nullmysql.Reset()

	cmd = &proto.Cmd{
		Cmd:     "GetConfig",
		Service: "qan",
	}
	reply = m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
	t.Assert(reply.Data, NotNil)
	gotConfig := []proto.AgentConfig{}
	if err := json.Unmarshal(reply.Data, &gotConfig); err != nil {
		t.Fatal(err)
	}
	expectConfig := []proto.AgentConfig{
		{
			InternalService: "qan",
			Config:          string(qanConfig),
			Running:         true,
		},
	}
	if same, diff := test.IsDeeply(gotConfig, expectConfig); !same {
		test.Dump(gotConfig)
		t.Error(diff)
	}

	// Stop manager
	reply = m.Handle(&proto.Cmd{Cmd: "StopService"})
	t.Assert(reply.Error, Equals, "")
}
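For context, these tests drive the manager through its command interface: the service config travels as raw JSON in the command's Data field, and Handle dispatches on the command name. A minimal, self-contained sketch of that pattern follows; all names here are illustrative stand-ins, not the actual percona-agent types.

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Illustrative stand-ins for proto.Cmd and proto.Reply.
type Cmd struct {
	Ts   time.Time
	Cmd  string
	Data json.RawMessage // service-specific config, e.g. a qan.Config
}

type Reply struct {
	Error string
}

type Config struct {
	Interval      uint
	WorkerRunTime uint
}

type Manager struct {
	running bool
	config  Config
}

// Handle dispatches on the command name, as the tests above expect.
func (m *Manager) Handle(cmd *Cmd) *Reply {
	switch cmd.Cmd {
	case "StartService":
		if err := json.Unmarshal(cmd.Data, &m.config); err != nil {
			return &Reply{Error: err.Error()}
		}
		m.running = true
		return &Reply{}
	case "StopService":
		m.running = false
		return &Reply{}
	default:
		return &Reply{Error: "Unknown command: " + cmd.Cmd}
	}
}

func main() {
	data, _ := json.Marshal(Config{Interval: 300, WorkerRunTime: 600})
	m := &Manager{}
	reply := m.Handle(&Cmd{Ts: time.Now(), Cmd: "StartService", Data: data})
	fmt.Println(reply.Error == "") // true
}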
Example #2
func (s *ManagerTestSuite) TestGetConfig(t *C) {
	// Make a qan.Manager with mock factories.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	a := mock.NewQanAnalyzer()
	f := mock.NewQanAnalyzerFactory(a)
	m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f)
	t.Assert(m, NotNil)

	// Write a realistic qan.conf config to disk.
	config := qan.Config{
		ServiceInstance: s.mysqlInstance,
		CollectFrom:     "slowlog",
		Interval:        300,
		MaxWorkers:      1,
		WorkerRunTime:   600,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
	}
	err := pct.Basedir.WriteConfig("qan", &config)
	t.Assert(err, IsNil)

	qanConfig, err := json.Marshal(config)
	t.Assert(err, IsNil)

	// Start the manager and analyzer.
	err = m.Start()
	t.Check(err, IsNil)
	test.WaitStatus(1, m, "qan", "Running")

	// Get the manager config which should be just the analyzer config.
	got, errs := m.GetConfig()
	t.Assert(errs, HasLen, 0)
	t.Assert(got, HasLen, 1)
	expect := []proto.AgentConfig{
		{
			InternalService: "qan",
			Config:          string(qanConfig),
			Running:         true,
		},
	}
	if same, diff := IsDeeply(got, expect); !same {
		Dump(got)
		t.Error(diff)
	}

	// Stop the manager.
	err = m.Stop()
	t.Assert(err, IsNil)
}
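Example #2's expected output shows the shape of GetConfig: wrap the analyzer's JSON-encoded config in a single AgentConfig entry. A hedged sketch of that shape, reusing the illustrative Manager and Config stand-ins from the sketch after Example #1 (not the real percona-agent API):

// Illustrative stand-in for proto.AgentConfig.
type AgentConfig struct {
	InternalService string
	Config          string // JSON-encoded service config
	Running         bool
}

// GetConfig reports the running analyzer's config, if any.
func (m *Manager) GetConfig() ([]AgentConfig, []error) {
	if !m.running {
		return nil, nil // no analyzer configured
	}
	data, err := json.Marshal(m.config)
	if err != nil {
		return nil, []error{err}
	}
	return []AgentConfig{{
		InternalService: "qan",
		Config:          string(data),
		Running:         true,
	}}, nil
}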
Example #3
func (s *ManagerTestSuite) TestStart(t *C) {
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	t.Assert(m, NotNil)

	// Starting qan without a config does nothing but start the qan manager.
	err := m.Start()
	t.Check(err, IsNil)

	status := m.Status()
	t.Check(status["qan-log-parser"], Equals, "")
	t.Check(status["qan-last-interval"], Equals, "")
	t.Check(status["qan-next-interval"], Equals, "")

	// Write a qan config to disk.
	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Interval:        300,
		MaxWorkers:      1,
		WorkerRunTime:   600,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
	}
	err = pct.Basedir.WriteConfig("qan", config)
	t.Assert(err, IsNil)

	// qan.Start() should read and use config on disk.
	err = m.Start()
	t.Check(err, IsNil)

	if !test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") {
		t.Error("WaitStatusPrefix(qan-log-parser, Idle) failed")
	}

	status = m.Status()
	t.Check(status["qan-log-parser"], Equals, "Idle (0 of 1 running)")
	t.Check(status["qan-last-interval"], Equals, "")
	t.Check(status["qan-next-interval"], Not(Equals), "")

	// qan.Stop() should leave the config file on disk.
	err = m.Stop()
	t.Assert(err, IsNil)
	t.Check(test.FileExists(pct.Basedir.ConfigFile("qan")), Equals, true)
}
Example #4
func (s *ManagerTestSuite) TestBadCmd(t *C) {
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	a := mock.NewQanAnalyzer()
	f := mock.NewQanAnalyzerFactory(a)
	m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f)
	t.Assert(m, NotNil)
	err := m.Start()
	t.Check(err, IsNil)
	defer m.Stop()
	test.WaitStatus(1, m, "qan", "Running")
	cmd := &proto.Cmd{
		User:      "******",
		Ts:        time.Now(),
		AgentUuid: "123",
		Service:   "qan",
		Cmd:       "foo", // bad cmd
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "Unknown command: foo")
}
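test.WaitStatus and test.WaitStatusPrefix appear throughout these tests. A plausible version of the former, shown only to make explicit what the tests wait for (the real helper in percona-agent's test package may differ): poll Status() until the key matches, or time out. A prefix variant would compare with strings.HasPrefix instead of equality.

// StatusReporter is anything exposing Status(), e.g. a qan.Manager.
type StatusReporter interface {
	Status() map[string]string
}

// WaitStatus polls for up to timeout seconds until status[proc] == state.
func WaitStatus(timeout int, r StatusReporter, proc, state string) bool {
	deadline := time.Now().Add(time.Duration(timeout) * time.Second)
	for time.Now().Before(deadline) {
		if r.Status()[proc] == state {
			return true
		}
		time.Sleep(50 * time.Millisecond)
	}
	return false
}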
Example #5
func (s *ManagerTestSuite) TestStartNoConfig(t *C) {
	// Make a qan.Manager with mock factories.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	a := mock.NewQanAnalyzer()
	f := mock.NewQanAnalyzerFactory(a)
	m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f)
	t.Assert(m, NotNil)

	// qan.Manager should be able to start without a qan.conf, i.e. no analyzer.
	err := m.Start()
	t.Check(err, IsNil)

	// Wait for qan.Manager.Start() to finish.
	test.WaitStatus(1, m, "qan", "Running")

	// No analyzer is configured, so the mock analyzer should not be started.
	select {
	case <-a.StartChan:
		t.Error("Analyzer.Start() called")
	default:
	}

	// And the mock analyzer's status should not be reported.
	status := m.Status()
	t.Check(status["qan"], Equals, "Running")

	// Stop the manager.
	err = m.Stop()
	t.Assert(err, IsNil)

	// No analyzer is configured, so the mock analyzer should not be stopped.
	select {
	case <-a.StopChan:
		t.Error("Analyzer.Stop() called")
	default:
	}
}
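The select-with-default above is the standard non-blocking way to assert that a mock's channel was never signaled. A minimal, self-contained illustration of the pattern (the mock here is hypothetical, not mock.QanAnalyzer):

package main

import "fmt"

type MockAnalyzer struct {
	StartChan chan bool
}

func (a *MockAnalyzer) Start() error {
	a.StartChan <- true // signal the test that Start() was called
	return nil
}

func main() {
	a := &MockAnalyzer{StartChan: make(chan bool, 1)}
	// ... exercise code that may or may not call a.Start() ...
	select {
	case <-a.StartChan:
		fmt.Println("Analyzer.Start() called")
	default:
		fmt.Println("Analyzer.Start() not called")
	}
}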
Example #6
func run() error {
	version := fmt.Sprintf("percona-agent %s rev %s", agent.VERSION, agent.REVISION)
	if flagVersion {
		fmt.Println(version)
		return nil
	}
	golog.Printf("Running %s pid %d\n", version, os.Getpid())

	if err := pct.Basedir.Init(flagBasedir); err != nil {
		return err
	}

	// Start-lock file is used to let agent1 self-update, create start-lock,
	// start updated agent2, exit cleanly, then agent2 starts.  agent1 may
	// not use a PID file, so this special file is required.
	if err := pct.WaitStartLock(); err != nil {
		return err
	}
	// NOTE: This must run last, and defers are LIFO, so it must be declared first.
	defer os.Remove(pct.Basedir.File("start-lock"))

	/**
	 * Agent config (require API key and agent UUID)
	 */

	if !pct.FileExists(pct.Basedir.ConfigFile("agent")) {
		return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent"))
	}

	bytes, err := agent.LoadConfig()
	if err != nil {
		return fmt.Errorf("Invalid agent config: %s\n", err)
	}
	agentConfig := &agent.Config{}
	if err := json.Unmarshal(bytes, agentConfig); err != nil {
		return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err)
	}

	golog.Println("ApiHostname: " + agentConfig.ApiHostname)
	golog.Println("AgentUuid: " + agentConfig.AgentUuid)

	/**
	 * Ping and exit, maybe.
	 */

	if flagPing {
		t0 := time.Now()
		code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey)
		d := time.Now().Sub(t0)
		if err != nil || code != 200 {
			return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err)
		} else {
			golog.Printf("Ping OK (%s)", d)
			return nil
		}
	}

	/**
	 * PID file
	 */

	if flagPidFile != "" {
		pidFile := pct.NewPidFile()
		if err := pidFile.Set(flagPidFile); err != nil {
			golog.Fatalln(err)
		}
		defer pidFile.Remove()
	}

	/**
	 * REST API
	 */

	api, err := ConnectAPI(agentConfig)
	if err != nil {
		golog.Fatal(err)
	}

	/**
	 * Log relay
	 */

	logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3)

	// Log websocket client, possibly disabled later.
	logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log")
	if err != nil {
		golog.Fatalln(err)
	}
	logManager := log.NewManager(
		logClient,
		logChan,
	)
	if err := logManager.Start(); err != nil {
		return fmt.Errorf("Error starting logmanager: %s\n", err)
	}

	/**
	 * Instance manager
	 */

	itManager := instance.NewManager(
		pct.NewLogger(logChan, "instance-manager"),
		pct.Basedir.Dir("config"),
		api,
	)
	if err := itManager.Start(); err != nil {
		return fmt.Errorf("Error starting instance manager: %s\n", err)
	}

	/**
	 * Data spooler and sender
	 */

	hostname, _ := os.Hostname()

	dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data")
	if err != nil {
		golog.Fatalln(err)
	}
	dataManager := data.NewManager(
		pct.NewLogger(logChan, "data"),
		pct.Basedir.Dir("data"),
		hostname,
		dataClient,
	)
	if err := dataManager.Start(); err != nil {
		return fmt.Errorf("Error starting data manager: %s\n", err)
	}

	/**
	 * Collect/report ticker (master clock)
	 */

	nowFunc := func() int64 { return time.Now().UTC().UnixNano() }
	clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc)

	/**
	 * Metric and system config monitors
	 */

	mmManager := mm.NewManager(
		pct.NewLogger(logChan, "mm"),
		mmMonitor.NewFactory(logChan, itManager.Repo()),
		clock,
		dataManager.Spooler(),
		itManager.Repo(),
	)
	if err := mmManager.Start(); err != nil {
		return fmt.Errorf("Error starting mm manager: %s\n", err)
	}

	sysconfigManager := sysconfig.NewManager(
		pct.NewLogger(logChan, "sysconfig"),
		sysconfigMonitor.NewFactory(logChan, itManager.Repo()),
		clock,
		dataManager.Spooler(),
		itManager.Repo(),
	)
	if err := sysconfigManager.Start(); err != nil {
		return fmt.Errorf("Error starting sysconfig manager: %s\n", err)
	}

	/**
	 * Query Analytics
	 */

	qanManager := qan.NewManager(
		pct.NewLogger(logChan, "qan"),
		&mysql.RealConnectionFactory{},
		clock,
		qan.NewFileIntervalIterFactory(logChan),
		qan.NewSlowLogWorkerFactory(logChan),
		dataManager.Spooler(),
		itManager.Repo(),
	)
	if err := qanManager.Start(); err != nil {
		return fmt.Errorf("Error starting qan manager: %s\n", err)
	}

	/**
	 * Signal handler
	 */

	// Generally the agent has a crash-only design, but QAN is so far the only service
	// which reconfigures MySQL: it enables the slow log, sets long_query_time, etc.
	// It's not terrible to leave slow log on, but it's nicer to turn it off.
	sigChan := make(chan os.Signal, 1)
	stopChan := make(chan error, 2)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	go func() {
		sig := <-sigChan
		golog.Printf("Caught %s signal, shutting down...\n", sig)
		stopChan <- qanManager.Stop()
	}()

	/**
	 * Agent
	 */

	cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd")
	if err != nil {
		golog.Fatal(err)
	}

	// The official list of services known to the agent.  Adding a new service
	// requires a manager, starting the manager as above, and adding the manager
	// to this map.
	services := map[string]pct.ServiceManager{
		"log":       logManager,
		"data":      dataManager,
		"qan":       qanManager,
		"mm":        mmManager,
		"instance":  itManager,
		"sysconfig": sysconfigManager,
	}

	agent := agent.NewAgent(
		agentConfig,
		pct.NewLogger(logChan, "agent"),
		api,
		cmdClient,
		services,
	)

	/**
	 * Run agent, wait for it to stop or signal.
	 */

	go func() {
		stopChan <- agent.Run()
	}()
	stopErr := <-stopChan // agent or signal
	golog.Println("Agent stopped, shutting down...")
	qanManager.Stop()           // see Signal handler ^
	time.Sleep(2 * time.Second) // wait for final replies and log entries
	return stopErr
}
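The start-lock comment in run() describes a self-update handoff: agent1 creates the lock, launches the updated agent2, and exits; agent2 must not proceed until the old process is done. One plausible shape for that wait, assuming the lock is a plain file under the basedir; the real pct.WaitStartLock may behave differently.

// waitStartLock blocks until lockFile disappears or the timeout expires.
// Illustrative only; not the actual pct implementation.
func waitStartLock(lockFile string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		if _, err := os.Stat(lockFile); os.IsNotExist(err) {
			return nil // previous agent is gone; safe to start
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("timeout waiting for %s to be removed", lockFile)
		}
		time.Sleep(100 * time.Millisecond)
	}
}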
Example #7
func (s *ManagerTestSuite) TestStartService(t *C) {
	// Make and start a qan.Manager with mock factories, no analyzer yet.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	a := mock.NewQanAnalyzer()
	f := mock.NewQanAnalyzerFactory(a)
	m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f)
	t.Assert(m, NotNil)
	err := m.Start()
	t.Check(err, IsNil)
	test.WaitStatus(1, m, "qan", "Running")

	// Create the qan config.
	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.123"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
		Interval:          300,        // 5 min
		MaxSlowLogSize:    1073741824, // 1 GiB
		RemoveOldSlowLogs: true,
		ExampleQueries:    true,
		MaxWorkers:        2,
		WorkerRunTime:     600, // 10 min
		CollectFrom:       "slowlog",
	}

	// Send a StartService cmd with the qan config to start an analyzer.
	now := time.Now()
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "agent",
		Cmd:       "StartService",
		Data:      qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	// The manager writes the qan config to disk.
	data, err := ioutil.ReadFile(pct.Basedir.ConfigFile("qan"))
	t.Check(err, IsNil)
	gotConfig := &qan.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	if same, diff := IsDeeply(gotConfig, config); !same {
		Dump(gotConfig)
		t.Error(diff)
	}

	// Now the manager and analyzer should be running.
	status := m.Status()
	t.Check(status["qan"], Equals, "Running")
	t.Check(status["qan-analyzer"], Equals, "ok")

	// Try to start the same analyzer again. This results in an error because
	// running a duplicate analyzer is not allowed.
	reply = m.Handle(cmd)
	t.Check(reply.Error, Equals, "qan-analyzer service is running")

	// Send a StopService cmd to stop the analyzer.
	// todo-1.1: send Data with analyzer instance to stop.
	now = time.Now()
	cmd = &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "qan",
		Cmd:       "StopService",
	}
	reply = m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	// Now the manager is still running, but the analyzer is not.
	status = m.Status()
	t.Check(status["qan"], Equals, "Running")

	// And the manager has removed the qan config from disk so the next time
	// the agent starts, the analyzer is not started.
	t.Check(test.FileExists(pct.Basedir.ConfigFile("qan")), Equals, false)

	// StopService should be idempotent, so send it again and expect no error.
	reply = m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	// Stop the manager.
	err = m.Stop()
	t.Assert(err, IsNil)
}
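The test above asserts that StopService is idempotent. A sketch of how a manager achieves that, reusing the illustrative Manager stand-in from the sketch after Example #1 (the config-file handling is an assumption, not the real code):

// stopService treats "nothing running" as success, making the command
// safe to repeat, and removes the config from disk so the analyzer is
// not restarted on the next agent start.
func (m *Manager) stopService(configFile string) error {
	if !m.running {
		return nil // already stopped: StopService is idempotent
	}
	m.running = false
	if err := os.Remove(configFile); err != nil && !os.IsNotExist(err) {
		return err
	}
	return nil
}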
Example #8
func (s *ManagerTestSuite) TestStartWithConfig(t *C) {
	for _, analyzerType := range []string{"slowlog", "perfschema"} {
		// Make a qan.Manager with mock factories.
		mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
		a := mock.NewQanAnalyzer()
		f := mock.NewQanAnalyzerFactory(a)
		m := qan.NewManager(s.logger, s.clock, s.im, s.mrmsMonitor, mockConnFactory, f)
		t.Assert(m, NotNil)

		// Write a realistic qan.conf config to disk.
		config := qan.Config{
			ServiceInstance: s.mysqlInstance,
			CollectFrom:     analyzerType,
			Interval:        300,
			MaxWorkers:      1,
			WorkerRunTime:   600,
			Start: []mysql.Query{
				mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
				mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
				mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
			},
			Stop: []mysql.Query{
				mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
				mysql.Query{Set: "SET GLOBAL long_query_time=10"},
			},
		}
		err := pct.Basedir.WriteConfig("qan", &config)
		t.Assert(err, IsNil)

		// qan.Start() reads qan.conf from disk and starts an analyzer for it.
		err = m.Start()
		t.Check(err, IsNil)

		// Wait until qan.Start() calls analyzer.Start().
		if !test.WaitState(a.StartChan) {
			t.Fatal("Timeout waiting for <-a.StartChan")
		}

		// After starting, the manager's status should be Running and the analyzer's
		// status should be reported too.
		status := m.Status()
		t.Check(status["qan"], Equals, "Running")
		t.Check(status["qan-analyzer"], Equals, "ok")

		// Check the args passed by the manager to the analyzer factory.
		if len(f.Args) == 0 {
			t.Error("len(f.Args) == 0, expected 1")
		} else {
			t.Check(f.Args, HasLen, 1)
			t.Check(f.Args[0].Config, DeepEquals, config)
			t.Check(f.Args[0].Name, Equals, "qan-analyzer")
		}

		// qan.Stop() stops the analyzer and leaves qan.conf on disk.
		err = m.Stop()
		t.Assert(err, IsNil)

		// Wait until qan.Stop() calls analyzer.Stop().
		if !test.WaitState(a.StopChan) {
			t.Fatal("Timeout waiting for <-a.StopChan")
		}

		// qan.conf still exists after qan.Stop().
		t.Check(test.FileExists(pct.Basedir.ConfigFile("qan")), Equals, true)

		// The analyzer is no longer reported in the status because it was stopped
		// and removed when the manager was stopped.
		status = m.Status()
		t.Check(status["qan"], Equals, "Stopped")
	}
}
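test.WaitState waits for a mock to signal on a channel. A plausible version, assuming the channel carries bool (the real helper's timeout may differ):

// WaitState returns true if the channel signals within one second.
func WaitState(c chan bool) bool {
	select {
	case <-c:
		return true
	case <-time.After(1 * time.Second):
		return false
	}
}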
Example #9
func (s *ManagerTestSuite) TestRecoverWorkerPanic(t *C) {
	// Create and start manager with mock workers.
	w1StopChan := make(chan bool)
	w1 := mock.NewQanWorker("qan-worker-1", w1StopChan, nil, nil, true)
	f := mock.NewQanWorkerFactory([]*mock.QanWorker{w1})
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, f, s.spool, s.im)
	t.Assert(m, NotNil)

	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		MaxSlowLogSize:  1000,
		MaxWorkers:      2,
		Interval:        60,
		WorkerRunTime:   60,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
	}
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		Ts:   time.Now(),
		Cmd:  "StartService",
		Data: qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle")
	test.DrainLogChan(s.logChan)

	// Start mock worker.  All it does is panic, much like fipar.
	now := time.Now()
	i1 := &qan.Interval{
		Filename:    "slow.log",
		StartOffset: 0,
		EndOffset:   100,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i1
	<-w1.Running() // wait for manager to run worker

	// For now, a worker panic only results in an error in the log.
	var gotError *proto.LogEntry
	timeout := time.After(200 * time.Millisecond)
GET_LOG:
	for {
		select {
		case l := <-s.logChan:
			if l.Level == 3 && strings.HasPrefix(l.Msg, "Lost interval 0 slow.log") {
				gotError = l
				break GET_LOG
			}
		case <-timeout:
			break GET_LOG
		}
	}
	t.Check(gotError, NotNil)

	// Stop manager
	reply = m.Handle(&proto.Cmd{Cmd: "StopService"})
	t.Assert(reply.Error, Equals, "")
}
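Example #9's worker does nothing but panic, and the test verifies the manager merely logs a "Lost interval ..." error. The standard Go containment pattern behind such behavior, sketched with hypothetical names:

// Worker is a stand-in for the qan worker interface.
type Worker interface {
	Run()
}

// runWorker converts a worker panic into a logged error so that one
// bad interval cannot take down the whole manager.
func runWorker(w Worker, interval string, logf func(format string, args ...interface{})) {
	defer func() {
		if err := recover(); err != nil {
			logf("Lost interval %s: worker panic: %v", interval, err)
		}
	}()
	w.Run()
}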
Example #10
func (s *ManagerTestSuite) TestWaitRemoveSlowLog(t *C) {

	// Same as TestRotateAndRemoveSlowLog, but we use mock workers so we can
	// test that the slow log is not removed until previous workers are done.
	// The mock worker factory returns our mock workers when the manager calls Make().
	w1StopChan := make(chan bool)
	w1 := mock.NewQanWorker("qan-worker-1", w1StopChan, nil, nil, false)

	w2StopChan := make(chan bool)
	w2 := mock.NewQanWorker("qan-worker-2", w2StopChan, nil, nil, false)

	// Let's take this time to also test that MaxWorkers is enforced.
	w3 := mock.NewQanWorker("qan-worker-3", nil, nil, nil, false)

	f := mock.NewQanWorkerFactory([]*mock.QanWorker{w1, w2, w3})

	// Clean up files that may interfere with test.  Then copy the test log.
	slowlog := "slow006.log"
	files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*")
	for _, file := range files {
		os.Remove(file)
	}
	cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog)
	cp.Run()

	// Create and start manager with mock workers.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, f, s.spool, s.im)
	if m == nil {
		t.Fatal("Create qan.Manager")
	}
	config := &qan.Config{
		ServiceInstance:   s.mysqlInstance,
		MaxSlowLogSize:    1000,
		RemoveOldSlowLogs: true, // done after w2 and w1 done
		MaxWorkers:        2,    // w1 and w2 but not w3
		Interval:          60,
		WorkerRunTime:     60,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
	}
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		Ts:   time.Now(),
		Cmd:  "StartService",
		Data: qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle")

	// Start the first mock worker (w1) with interval 0 - 736.  The worker's Run()
	// func won't return until we send true to its stop chan, so the manager will
	// think the worker is still running until then.
	now := time.Now()
	i1 := &qan.Interval{
		Filename:    "/tmp/" + slowlog,
		StartOffset: 0,
		EndOffset:   736,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i1
	<-w1.Running() // wait for manager to run worker

	// Start 2nd mock worker (w2) with interval 736 - 1833.  Manager will rotate
	// but not remove original slow log because w1 is still running.
	i2 := &qan.Interval{
		Filename:    "/tmp/" + slowlog,
		StartOffset: 736,
		EndOffset:   1833,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i2
	<-w2.Running()

	test.WaitStatus(1, m, "qan-log-parser", "Idle (2 of 2 running)")

	/**
	 * Worker status test
	 */

	// Workers should have status and QAN manager should report them all.
	status := m.Status()
	t.Check(status["qan-worker-1"], Equals, "ok")
	t.Check(status["qan-worker-2"], Equals, "ok")
	t.Check(status["qan-worker-3"], Equals, "") // not running due to MaxWorkers

	/**
	 * Quick side test: qan.Config.MaxWorkers is enforced.
	 */
	test.DrainLogChan(s.logChan)
	s.intervalChan <- i2
	logs := test.WaitLogChan(s.logChan, 3)
	test.WaitStatus(1, m, "qan-log-parser", "Idle (2 of 2 running)")
	gotWarning := false
	for _, log := range logs {
		if log.Level == proto.LOG_WARNING && strings.Contains(log.Msg, "All workers busy") {
			gotWarning = true
			break
		}
	}
	if !gotWarning {
		t.Error("Too many workers causes \"All workers busy\" warning")
	}

	// Original slow log should no longer exist; it was rotated away, but...
	if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) {
		t.Error("/tmp/" + slowlog + " no longer exists")
	}

	// ...old slow log should exist because w1 is still running.
	files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*")
	if len(files) != 1 {
		t.Errorf("w1 running so old slow log not removed, got %+v", files)
	}
	defer func() {
		for _, file := range files {
			os.Remove(file)
		}
	}()

	// Stop w2, which is holding the "lock" on removing the old
	// slow log (figuratively speaking; there are no real locks).  Because
	// w1 is still running, the manager should not remove the old log yet:
	// w1 could still be parsing it.
	w2StopChan <- true
	test.WaitStatus(1, m, "qan-log-parser", "Idle (1 of 2 running)")
	if _, err := os.Stat(files[0]); os.IsNotExist(err) {
		t.Errorf("w1 still running so old slow log not removed")
	}

	// Stop w1 and now, even though slow log was rotated for w2, manager
	// should remove old slow log.
	w1StopChan <- true
	test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)")
	if _, err := os.Stat(files[0]); !os.IsNotExist(err) {
		t.Errorf("w1 done running so old slow log removed")
	}

	// Stop manager
	reply = m.Handle(&proto.Cmd{Cmd: "StopService"})
	t.Assert(reply.Error, Equals, "")
}
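The wait-to-remove behavior verified above amounts to reference counting: a rotated slow log may be deleted only after every worker that might still be parsing it has finished. A compact sketch of that bookkeeping, assuming the manager serializes these calls in its event loop (hypothetical names, not the real manager's internals):

type oldSlowLog struct {
	file    string
	readers int // workers that may still be parsing this file
}

// workerDone is called as each worker finishes; the file is removed
// only when the last reader is done.
func workerDone(l *oldSlowLog) error {
	l.readers--
	if l.readers == 0 {
		return os.Remove(l.file)
	}
	return nil
}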
Example #11
func (s *ManagerTestSuite) TestRotateSlowLog(t *C) {

	// Same as TestRotateAndRemoveSlowLog, but with qan.Config.RemoveOldSlowLogs=false
	// and testing that Start and Stop queries were executed.

	slowlog := "slow006.log"
	files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*")
	for _, file := range files {
		os.Remove(file)
	}

	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	if m == nil {
		t.Fatal("Create qan.Manager")
	}
	config := &qan.Config{
		ServiceInstance:   s.mysqlInstance,
		Interval:          300,
		MaxSlowLogSize:    1000,
		RemoveOldSlowLogs: false, // <-- HERE
		ExampleQueries:    false,
		MaxWorkers:        2,
		WorkerRunTime:     600,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
	}
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		Ts:   time.Now(),
		Cmd:  "StartService",
		Data: qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle")
	s.nullmysql.Reset()
	cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog)
	cp.Run()

	// First interval: 0 - 736
	now := time.Now()
	i1 := &qan.Interval{
		Filename:    "/tmp/" + slowlog,
		StartOffset: 0,
		EndOffset:   736,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i1
	resultData := <-s.dataChan
	report := *resultData.(*qan.Report)
	if report.Global.TotalQueries != 2 {
		t.Error("First interval has 2 queries, got ", report.Global.TotalQueries)
	}
	if report.Global.UniqueQueries != 1 {
		t.Error("First interval has 1 unique query, got ", report.Global.UniqueQueries)
	}

	// Second interval: 736 - 1833, but it will actually go to the end (2200);
	// otherwise the next two tests would fail.
	i2 := &qan.Interval{
		Filename:    "/tmp/" + slowlog,
		StartOffset: 736,
		EndOffset:   1833,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i2
	resultData = <-s.dataChan
	report = *resultData.(*qan.Report)
	if report.Global.TotalQueries != 4 {
		t.Error("Second interval has 2 queries, got ", report.Global.TotalQueries)
	}
	if report.Global.UniqueQueries != 2 {
		t.Error("Second interval has 2 unique queries, got ", report.Global.UniqueQueries)
	}

	test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)")

	// Original slow log should no longer exist; it was rotated away.
	if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) {
		t.Error("/tmp/" + slowlog + " no longer exists")
	}

	// The original slow log should NOT have been removed.
	files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*")
	if len(files) != 1 {
		t.Errorf("Old slow log not removed, got %+v", files)
	}
	defer func() {
		for _, file := range files {
			os.Remove(file)
		}
	}()

	expect := []mysql.Query{}
	for _, q := range config.Stop {
		expect = append(expect, q)
	}
	for _, q := range config.Start {
		expect = append(expect, q)
	}
	if same, diff := test.IsDeeply(s.nullmysql.GetSet(), expect); !same {
		t.Logf("%+v", s.nullmysql.GetSet())
		t.Logf("%+v", expect)
		t.Error(diff)
	}

	// Stop manager
	reply = m.Handle(&proto.Cmd{Cmd: "StopService"})
	t.Assert(reply.Error, Equals, "")
}
Example #12
func (s *ManagerTestSuite) TestRotateAndRemoveSlowLog(t *C) {

	// Clean up files that may interfere with test.
	slowlog := "slow006.log"
	files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*")
	for _, file := range files {
		os.Remove(file)
	}

	/**
	 * slow006.log is 2200 bytes.  Rotation happens when the manager
	 * sees interval.EndOffset >= MaxSlowLogSize.  So we'll use these
	 * intervals,
	 *      0 -  736
	 *    736 - 1833
	 *   1833 - 2200
	 * and set MaxSlowLogSize=1000 which should make manager rotate the log
	 * after the 2nd interval.  When manager rotates log, it 1) renames log
	 * to NAME-TS where NAME is the original name and TS is the current Unix
	 * timestamp (UTC); and 2) it sets interval.StopOff = file size of NAME-TS
	 * to finish parsing the log.  Therefore, results for 2nd interval should
	 * include our 3rd interval. -- Manager also calls Start and Stop so the
	 * nullmysql conn should record the queries being set.
	 */

	// See TestStartService() for description of these startup tasks.
	mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql}
	m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	if m == nil {
		t.Fatal("Create qan.Manager")
	}
	config := &qan.Config{
		ServiceInstance:   s.mysqlInstance,
		Interval:          300,
		MaxSlowLogSize:    1000, // <-- HERE
		RemoveOldSlowLogs: true, // <-- HERE too
		ExampleQueries:    false,
		MaxWorkers:        2,
		WorkerRunTime:     600,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.456"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
	}
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		Ts:   time.Now(),
		Cmd:  "StartService",
		Data: qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")

	test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle")

	// Make copy of slow log because test will mv/rename it.
	cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog)
	cp.Run()

	// First interval: 0 - 736
	now := time.Now()
	i1 := &qan.Interval{
		Filename:    "/tmp/" + slowlog,
		StartOffset: 0,
		EndOffset:   736,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i1
	resultData := <-s.dataChan
	report := *resultData.(*qan.Report)
	if report.Global.TotalQueries != 2 {
		t.Error("First interval has 2 queries, got ", report.Global.TotalQueries)
	}
	if report.Global.UniqueQueries != 1 {
		t.Error("First interval has 1 unique query, got ", report.Global.UniqueQueries)
	}

	// Second interval: 736 - 1833, but it will actually go to the end (2200);
	// otherwise the next two tests would fail.
	i2 := &qan.Interval{
		Filename:    "/tmp/" + slowlog,
		StartOffset: 736,
		EndOffset:   1833,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- i2
	resultData = <-s.dataChan
	report = *resultData.(*qan.Report)
	if report.Global.TotalQueries != 4 {
		t.Error("Second interval has 2 queries, got ", report.Global.TotalQueries)
	}
	if report.Global.UniqueQueries != 2 {
		t.Error("Second interval has 2 unique queries, got ", report.Global.UniqueQueries)
	}

	test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)")

	// Original slow log should no longer exist; it was rotated away.
	if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) {
		t.Error("/tmp/" + slowlog + " no longer exists")
	}

	// The original slow log should have been renamed to slow006-TS, parsed, and removed.
	files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*")
	if len(files) != 0 {
		t.Errorf("Old slow log removed, got %+v", files)
	}
	defer func() {
		for _, file := range files {
			os.Remove(file)
		}
	}()

	// https://jira.percona.com/browse/PCT-466
	// Old slow log removed but space not freed in filesystem
	pid := fmt.Sprintf("%d", os.Getpid())
	out, err := exec.Command("lsof", "-p", pid).Output()
	if err != nil {
		t.Fatal(err)
	}
	if strings.Contains(string(out), "/tmp/"+slowlog+"-") {
		t.Logf("%s\n", string(out))
		t.Error("Old slow log removed but not freed in filesystem (PCT-466)")
	}

	// Stop manager
	reply = m.Handle(&proto.Cmd{Cmd: "StopService"})
	t.Assert(reply.Error, Equals, "")
}
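The long comment at the top of this test spells out the rotation rule: rename the log to NAME-TS and extend the in-flight interval's end offset to the rotated file's size so the tail still gets parsed. A sketch of that step in isolation (illustrative, not the actual manager code):

// rotateSlowLog renames path to path-<unix ts> and returns the rotated
// file's size, which becomes the current interval's new end offset.
func rotateSlowLog(path string) (rotated string, size int64, err error) {
	rotated = fmt.Sprintf("%s-%d", path, time.Now().UTC().Unix())
	if err = os.Rename(path, rotated); err != nil {
		return "", 0, err
	}
	info, err := os.Stat(rotated)
	if err != nil {
		return "", 0, err
	}
	return rotated, info.Size(), nil
}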
Example #13
func (s *ManagerTestSuite) TestStartServiceFast(t *C) {
	/**
	 * Like TestStartService but we simulate the next tick being 3m away
	 * (mock.clock.Eta = 180) so that run() sends the first tick on the
	 * tick chan, causing the first interval to start immediately.
	 */

	s.clock.Eta = 180
	defer func() { s.clock.Eta = 0 }()

	m := qan.NewManager(s.logger, &mysql.RealConnectionFactory{}, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	t.Assert(m, NotNil)

	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
		},
		Interval:       300,        // 5 min
		MaxSlowLogSize: 1073741824, // 1 GiB
		MaxWorkers:     1,
		WorkerRunTime:  600, // 10 min
	}
	now := time.Now()
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "qan",
		Cmd:       "StartService",
		Data:      qanConfig,
	}
	reply := m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
	test.WaitStatus(1, m, "qan-log-parser", "Starting")
	tickChan := s.iterFactory.TickChans[s.iter]
	t.Assert(tickChan, NotNil)

	// run() should prime the tickChan with the 1st tick immediately.  This makes
	// the interval iter start the interval immediately.  Then run() continues
	// waiting for the iter to send an interval which happens when the real ticker
	// (the clock) sends the 2nd tick which is synced to the interval, thus ending
	// the first interval started by run() and starting the 2nd interval as normal.
	var tick time.Time
	select {
	case tick = <-tickChan:
	case <-time.After(1 * time.Second):
	}
	t.Assert(tick.IsZero(), Not(Equals), true)

	status := m.Status()
	t.Check(status["qan-next-interval"], Equals, "180.0s")

	// Stop QAN.
	cmd = &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "",
		Cmd:       "StopService",
	}
	reply = m.Handle(cmd)
	t.Assert(reply.Error, Equals, "")
}
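The priming described in this test's comments can be isolated to a few lines: if the next scheduled tick is far away, send one synthetic tick immediately so the first interval starts now instead of minutes later. A sketch under that assumption (the threshold and channel shape are illustrative):

// primeTick sends an immediate tick when the next real tick is more
// than threshold away, so the first interval starts without waiting.
func primeTick(tickChan chan time.Time, eta, threshold time.Duration) {
	if eta > threshold {
		select {
		case tickChan <- time.Now():
		default: // never block the caller if the channel is full
		}
	}
}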
Example #14
func (s *ManagerTestSuite) TestStartService(t *C) {

	/**
	 * Create and start manager.
	 */

	m := qan.NewManager(s.logger, &mysql.RealConnectionFactory{}, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im)
	t.Assert(m, NotNil)

	// Create the qan config.
	tmpFile := fmt.Sprintf("/tmp/qan_test.TestStartService.%d", os.Getpid())
	defer func() { os.Remove(tmpFile) }()
	config := &qan.Config{
		ServiceInstance: s.mysqlInstance,
		Start: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=0.123"},
			mysql.Query{Set: "SET GLOBAL slow_query_log=ON"},
		},
		Stop: []mysql.Query{
			mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"},
			mysql.Query{Set: "SET GLOBAL long_query_time=10"},
		},
		Interval:          300,        // 5 min
		MaxSlowLogSize:    1073741824, // 1 GiB
		RemoveOldSlowLogs: true,
		ExampleQueries:    true,
		MaxWorkers:        2,
		WorkerRunTime:     600, // 10 min
	}

	// Create the StartService cmd which contains the qan config.
	now := time.Now()
	qanConfig, _ := json.Marshal(config)
	cmd := &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "agent",
		Cmd:       "StartService",
		Data:      qanConfig,
	}

	// Have the service manager start the qan service
	reply := m.Handle(cmd)

	// It should start without error.
	t.Assert(reply.Error, Equals, "")

	// It should write the config to disk.
	data, err := ioutil.ReadFile(pct.Basedir.ConfigFile("qan"))
	t.Check(err, IsNil)
	gotConfig := &qan.Config{}
	err = json.Unmarshal(data, gotConfig)
	t.Check(err, IsNil)
	if same, diff := test.IsDeeply(gotConfig, config); !same {
		test.Dump(gotConfig)
		t.Error(diff)
	}

	// And status should be "Running" and "Idle".
	test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)")
	status := m.Status()
	t.Check(status["qan"], Equals, "Running")
	t.Check(status["qan-log-parser"], Equals, "Idle (0 of 2 running)")

	// It should have enabled the slow log.
	slowLog := s.realmysql.GetGlobalVarNumber("slow_query_log")
	t.Assert(slowLog, Equals, float64(1))

	longQueryTime := s.realmysql.GetGlobalVarNumber("long_query_time")
	t.Assert(longQueryTime, Equals, 0.123)

	// Starting an already started service should result in a ServiceIsRunningError.
	reply = m.Handle(cmd)
	t.Check(reply.Error, Not(Equals), "")

	// It should add a tickChan for the interval iter.
	t.Check(s.clock.Added, HasLen, 1)
	t.Check(s.clock.Removed, HasLen, 0)

	/**
	 * Have manager run a worker, parse, and send data.
	 */

	interv := &qan.Interval{
		Filename:    testlog.Sample + "slow001.log",
		StartOffset: 0,
		EndOffset:   524,
		StartTime:   now,
		StopTime:    now,
	}
	s.intervalChan <- interv

	v := test.WaitData(s.dataChan)
	t.Assert(v, HasLen, 1)
	report := v[0].(*qan.Report)

	result := &qan.Result{
		StopOffset: report.StopOffset,
		Global:     report.Global,
		Classes:    report.Class,
	}
	test.WriteData(result, tmpFile)
	t.Check(tmpFile, testlog.FileEquals, sample+"slow001.json")

	/**
	 * Send StopService cmd to stop qan/qan-log-parser.
	 */

	now = time.Now()
	cmd = &proto.Cmd{
		User:      "******",
		Ts:        now,
		AgentUuid: "123",
		Service:   "agent",
		Cmd:       "StopService",
	}

	// Have the service manager stop the qan service
	reply = m.Handle(cmd)

	// It should stop without error.
	t.Assert(reply.Error, Equals, "")

	// It should disable the slow log.
	slowLog = s.realmysql.GetGlobalVarNumber("slow_query_log")
	t.Assert(slowLog, Equals, float64(0))

	longQueryTime = s.realmysql.GetGlobalVarNumber("long_query_time")
	t.Assert(longQueryTime, Equals, 10.0)

	// It should remove the tickChan (and not have added others).
	t.Check(s.clock.Added, HasLen, 1)
	t.Check(s.clock.Removed, HasLen, 1)

	// qan still running, but qan-log-parser stopped.
	test.WaitStatus(1, m, "qan-log-parser", "Stopped")
	status = m.Status()
	t.Check(status["qan"], Equals, "Running")
	t.Check(status["qan-log-parser"], Equals, "Stopped")
}
Example #15
func run() error {
	version := fmt.Sprintf("percona-agent %s%s rev %s", agent.VERSION, agent.REL, agent.REVISION)
	if flagVersion {
		fmt.Println(version)
		return nil
	}
	golog.Printf("Running %s pid %d\n", version, os.Getpid())

	if err := pct.Basedir.Init(flagBasedir); err != nil {
		return err
	}

	// Start-lock file is used to let agent1 self-update, create start-lock,
	// start updated agent2, exit cleanly, then agent2 starts.  agent1 may
	// not use a PID file, so this special file is required.
	if err := pct.WaitStartLock(); err != nil {
		return err
	}
	// NOTE: This must run last, and defers are LIFO, so it must be declared first.
	defer os.Remove(pct.Basedir.File("start-lock"))

	/**
	 * Agent config (require API key and agent UUID)
	 */

	if !pct.FileExists(pct.Basedir.ConfigFile("agent")) {
		return fmt.Errorf("Agent config file %s does not exist", pct.Basedir.ConfigFile("agent"))
	}

	bytes, err := agent.LoadConfig()
	if err != nil {
		return fmt.Errorf("Invalid agent config: %s\n", err)
	}
	agentConfig := &agent.Config{}
	if err := json.Unmarshal(bytes, agentConfig); err != nil {
		return fmt.Errorf("Error parsing "+pct.Basedir.ConfigFile("agent")+": ", err)
	}

	golog.Println("ApiHostname: " + agentConfig.ApiHostname)
	golog.Println("AgentUuid: " + agentConfig.AgentUuid)

	/**
	 * Ping and exit, maybe.
	 */

	// Set for all connections to API.  X-Percona-API-Key is set automatically
	// using the pct.APIConnector.
	headers := map[string]string{
		"X-Percona-Agent-Version": agent.VERSION,
	}

	if flagPing {
		t0 := time.Now()
		code, err := pct.Ping(agentConfig.ApiHostname, agentConfig.ApiKey, headers)
		d := time.Now().Sub(t0)
		if err != nil || code != 200 {
			return fmt.Errorf("Ping FAIL (%d %d %s)", d, code, err)
		} else {
			golog.Printf("Ping OK (%s)", d)
			return nil
		}
	}

	/**
	 * PID file
	 */

	pidFilePath := agentConfig.PidFile
	if flagPidFile != "" {
		pidFilePath = flagPidFile
	}
	if pidFilePath != "" {
		pidFile := pct.NewPidFile()
		if err := pidFile.Set(pidFilePath); err != nil {
			golog.Fatalln(err)
		}
		defer pidFile.Remove()
	}

	/**
	 * REST API
	 */

	retry := -1 // unlimited
	if flagStatus {
		retry = 1
	}
	api, err := ConnectAPI(agentConfig, retry)
	if err != nil {
		golog.Fatal(err)
	}

	// Get agent status via API and exit.
	if flagStatus {
		code, bytes, err := api.Get(agentConfig.ApiKey, api.AgentLink("self")+"/status")
		if err != nil {
			return err
		}
		if code == 404 {
			return fmt.Errorf("Agent not found")
		}
		status := make(map[string]string)
		if err := json.Unmarshal(bytes, &status); err != nil {
			return err
		}
		golog.Println(status)
		return nil
	}

	/**
	 * Connection factory
	 */
	connFactory := &mysql.RealConnectionFactory{}

	/**
	 * Log relay
	 */

	logChan := make(chan *proto.LogEntry, log.BUFFER_SIZE*3)

	// Log websocket client, possibly disabled later.
	logClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "log-ws"), api, "log", headers)
	if err != nil {
		golog.Fatalln(err)
	}
	logManager := log.NewManager(
		logClient,
		logChan,
	)
	if err := logManager.Start(); err != nil {
		return fmt.Errorf("Error starting logmanager: %s\n", err)
	}

	/**
	 * MRMS (MySQL Restart Monitoring Service)
	 */
	mrm := mrmsMonitor.NewMonitor(
		pct.NewLogger(logChan, "mrms-monitor"),
		connFactory,
	)
	mrmsManager := mrms.NewManager(
		pct.NewLogger(logChan, "mrms-manager"),
		mrm,
	)
	if err := mrmsManager.Start(); err != nil {
		return fmt.Errorf("Error starting mrms manager: %s\n", err)
	}

	/**
	 * Instance manager
	 */
	itManager := instance.NewManager(
		pct.NewLogger(logChan, "instance-manager"),
		pct.Basedir.Dir("config"),
		api,
		mrm,
	)
	if err := itManager.Start(); err != nil {
		return fmt.Errorf("Error starting instance manager: %s\n", err)
	}

	/**
	 * Data spooler and sender
	 */

	hostname, _ := os.Hostname()

	dataClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "data-ws"), api, "data", headers)
	if err != nil {
		golog.Fatalln(err)
	}
	dataManager := data.NewManager(
		pct.NewLogger(logChan, "data"),
		pct.Basedir.Dir("data"),
		pct.Basedir.Dir("trash"),
		hostname,
		dataClient,
	)
	if err := dataManager.Start(); err != nil {
		return fmt.Errorf("Error starting data manager: %s\n", err)
	}

	/**
	 * Collect/report ticker (master clock)
	 */

	nowFunc := func() int64 { return time.Now().UTC().UnixNano() }
	clock := ticker.NewClock(&ticker.RealTickerFactory{}, nowFunc)

	/**
	 * Metric and system config monitors
	 */

	mmManager := mm.NewManager(
		pct.NewLogger(logChan, "mm"),
		mmMonitor.NewFactory(logChan, itManager.Repo(), mrm),
		clock,
		dataManager.Spooler(),
		itManager.Repo(),
		mrm,
	)
	if err := mmManager.Start(); err != nil {
		return fmt.Errorf("Error starting mm manager: %s\n", err)
	}

	sysconfigManager := sysconfig.NewManager(
		pct.NewLogger(logChan, "sysconfig"),
		sysconfigMonitor.NewFactory(logChan, itManager.Repo()),
		clock,
		dataManager.Spooler(),
		itManager.Repo(),
	)
	if err := sysconfigManager.Start(); err != nil {
		return fmt.Errorf("Error starting sysconfig manager: %s\n", err)
	}

	/**
	 * Query service (real-time EXPLAIN, SHOW CREATE TABLE, etc.)
	 */

	queryManager := query.NewManager(
		pct.NewLogger(logChan, "query"),
		itManager.Repo(),
		&mysql.RealConnectionFactory{},
	)
	if err := queryManager.Start(); err != nil {
		return fmt.Errorf("Error starting query manager: %s\n", err)
	}

	/**
	 * Query Analytics
	 */

	qanManager := qan.NewManager(
		pct.NewLogger(logChan, "qan"),
		clock,
		itManager.Repo(),
		mrm,
		connFactory,
		qanFactory.NewRealAnalyzerFactory(
			logChan,
			qanFactory.NewRealIntervalIterFactory(logChan),
			slowlog.NewRealWorkerFactory(logChan),
			perfschema.NewRealWorkerFactory(logChan),
			dataManager.Spooler(),
			clock,
		),
	)

	if err := qanManager.Start(); err != nil {
		return fmt.Errorf("Error starting qan manager: %s\n", err)
	}

	/**
	 * Sysinfo
	 */
	sysinfoManager := sysinfo.NewManager(
		pct.NewLogger(logChan, "sysinfo"),
	)

	// MySQL Sysinfo
	mysqlSysinfoService := mysqlSysinfo.NewMySQL(
		pct.NewLogger(logChan, "sysinfo-mysql"),
		itManager.Repo(),
	)
	if err := sysinfoManager.RegisterService("MySQLSummary", mysqlSysinfoService); err != nil {
		return fmt.Errorf("Error registering Mysql Sysinfo service: %s\n", err)
	}

	// System Sysinfo
	systemSysinfoService := systemSysinfo.NewSystem(
		pct.NewLogger(logChan, "sysinfo-system"),
	)
	if err := sysinfoManager.RegisterService("SystemSummary", systemSysinfoService); err != nil {
		return fmt.Errorf("Error registering System Sysinfo service: %s\n", err)
	}

	// Start Sysinfo manager
	if err := sysinfoManager.Start(); err != nil {
		return fmt.Errorf("Error starting Sysinfo manager: %s\n", err)
	}

	/**
	 * Signal handler
	 */

	// Generally the agent has a crash-only design, but QAN is so far the only service
	// which reconfigures MySQL: it enables the slow log, sets long_query_time, etc.
	// It's not terrible to leave slow log on, but it's nicer to turn it off.
	sigChan := make(chan os.Signal, 1)
	stopChan := make(chan error, 2)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	go func() {
		sig := <-sigChan
		golog.Printf("Caught %s signal, shutting down...\n", sig)
		stopChan <- qanManager.Stop()
	}()

	/**
	 * Agent
	 */

	cmdClient, err := client.NewWebsocketClient(pct.NewLogger(logChan, "agent-ws"), api, "cmd", headers)
	if err != nil {
		golog.Fatal(err)
	}

	// The official list of services known to the agent.  Adding a new service
	// requires a manager, starting the manager as above, and adding the manager
	// to this map.
	services := map[string]pct.ServiceManager{
		"log":       logManager,
		"data":      dataManager,
		"qan":       qanManager,
		"mm":        mmManager,
		"instance":  itManager,
		"mrms":      mrmsManager,
		"sysconfig": sysconfigManager,
		"query":     queryManager,
		"sysinfo":   sysinfoManager,
	}

	// Set the global pct/cmd.Factory, used for the Restart cmd.
	pctCmd.Factory = &pctCmd.RealCmdFactory{}

	agentLogger := pct.NewLogger(logChan, "agent")

	agent := agent.NewAgent(
		agentConfig,
		agentLogger,
		api,
		cmdClient,
		services,
	)

	/**
	 * Run agent, wait for it to stop, signal, or crash.
	 */

	var stopErr error
	go func() {
		defer func() {
			if err := recover(); err != nil {
				errMsg := fmt.Sprintf("Agent crashed: %s", err)
				golog.Println(errMsg)
				agentLogger.Error(errMsg)
				stopChan <- fmt.Errorf("%s", errMsg)
			}
		}()
		stopChan <- agent.Run()
	}()

	// Wait for agent to stop, or for signals.
	agentRunning := true
	statusSigChan := make(chan os.Signal, 1)
	signal.Notify(statusSigChan, syscall.SIGUSR1) // kill -USR1 PID
	reconnectSigChan := make(chan os.Signal, 1)
	signal.Notify(reconnectSigChan, syscall.SIGHUP) // kill -HUP PID
	for agentRunning {
		select {
		case stopErr = <-stopChan: // agent or signal
			golog.Println("Agent stopped, shutting down...")
			agentLogger.Info("Agent stopped")
			agentRunning = false
		case <-statusSigChan:
			status := agent.AllStatus()
			golog.Printf("Status: %+v\n", status)
		case <-reconnectSigChan:
			u, _ := user.Current()
			cmd := &proto.Cmd{
				Ts:        time.Now().UTC(),
				User:      u.Username + " (SIGHUP)",
				AgentUuid: agentConfig.AgentUuid,
				Service:   "agent",
				Cmd:       "Reconnect",
			}
			agent.Handle(cmd)
		}
	}

	qanManager.Stop()           // see Signal handler ^
	time.Sleep(2 * time.Second) // wait for final replies and log entries
	return stopErr
}