func (s *ManagerTestSuite) TestWaitRemoveSlowLog(t *C) { // Same as TestRotateAndRemoveSlowLog, but we use mock workers so we can // test that slow log is not removed until previous workers are done. // Mock worker factory will return our mock workers when manager calls Make(). w1StopChan := make(chan bool) w1 := mock.NewQanWorker("qan-worker-1", w1StopChan, nil, nil, false) w2StopChan := make(chan bool) w2 := mock.NewQanWorker("qan-worker-2", w2StopChan, nil, nil, false) // Let's take this time to also test that MaxWorkers is enforced. w3 := mock.NewQanWorker("qan-worker-3", nil, nil, nil, false) f := mock.NewQanWorkerFactory([]*mock.QanWorker{w1, w2, w3}) // Clean up files that may interfere with test. Then copy the test log. slowlog := "slow006.log" files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*") for _, file := range files { os.Remove(file) } cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog) cp.Run() // Create and start manager with mock workers. mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, f, s.spool, s.im) if m == nil { t.Fatal("Create qan.Manager") } config := &qan.Config{ ServiceInstance: s.mysqlInstance, MaxSlowLogSize: 1000, RemoveOldSlowLogs: true, // done after w2 and w1 done MaxWorkers: 2, // w1 and w2 but not w3 Interval: 60, WorkerRunTime: 60, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, }, } qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ Ts: time.Now(), Cmd: "StartService", Data: qanConfig, } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "") test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") // Start first mock worker (w1) with interval 0 - 736. The worker's Run() // func won't return until we send true to its stop chan, so manager will // think worker is still running until then. now := time.Now() i1 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 0, EndOffset: 736, StartTime: now, StopTime: now, } s.intervalChan <- i1 <-w1.Running() // wait for manager to run worker // Start 2nd mock worker (w2) with interval 736 - 1833. Manager will rotate // but not remove original slow log because w1 is still running. i2 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 736, EndOffset: 1833, StartTime: now, StopTime: now, } s.intervalChan <- i2 <-w2.Running() test.WaitStatus(1, m, "qan-log-parser", "Idle (2 of 2 running)") /** * Worker status test */ // Workers should have status and QAN manager should report them all. status := m.Status() t.Check(status["qan-worker-1"], Equals, "ok") t.Check(status["qan-worker-2"], Equals, "ok") t.Check(status["qan-worker-3"], Equals, "") // not running due to MaxWorkers /** * Quick side test: qan.Config.MaxWorkers is enforced. */ test.DrainLogChan(s.logChan) s.intervalChan <- i2 logs := test.WaitLogChan(s.logChan, 3) test.WaitStatus(1, m, "qan-log-parser", "Idle (2 of 2 running)") gotWarning := false for _, log := range logs { if log.Level == proto.LOG_WARNING && strings.Contains(log.Msg, "All workers busy") { gotWarning = true break } } if !gotWarning { t.Error("Too many workers causes \"All workers busy\" warning") } // Original slow log should no longer exist; it was rotated away, but... if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) { t.Error("/tmp/" + slowlog + " no longer exists") } // ...old slow log should exist because w1 is still running. files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*") if len(files) != 1 { t.Errorf("w1 running so old slow log not removed, got %+v", files) } defer func() { for _, file := range files { os.Remove(file) } }() // Stop w2 which is holding "holding" the "lock" on removing the old // slog log (figuratively speaking; there are no real locks). Because // w1 is still running, manager should not remove the old log yet because // w1 could still be parsing it. w2StopChan <- true test.WaitStatus(1, m, "qan-log-parser", "Idle (1 of 2 running)") if _, err := os.Stat(files[0]); os.IsNotExist(err) { t.Errorf("w1 still running so old slow log not removed") } // Stop w1 and now, even though slow log was rotated for w2, manager // should remove old slow log. w1StopChan <- true test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)") if _, err := os.Stat(files[0]); !os.IsNotExist(err) { t.Errorf("w1 done running so old slow log removed") } // Stop manager reply = m.Handle(&proto.Cmd{Cmd: "StopService"}) t.Assert(reply.Error, Equals, "") }
func (s *ManagerTestSuite) TestRotateSlowLog(t *C) { // Same as TestRotateAndRemoveSlowLog, but with qan.Config.RemoveOldSlowLogs=false // and testing that Start and Stop queries were executed. slowlog := "slow006.log" files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*") for _, file := range files { os.Remove(file) } mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im) if m == nil { t.Fatal("Create qan.Manager") } config := &qan.Config{ ServiceInstance: s.mysqlInstance, Interval: 300, MaxSlowLogSize: 1000, RemoveOldSlowLogs: false, // <-- HERE ExampleQueries: false, MaxWorkers: 2, WorkerRunTime: 600, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=0.456"}, mysql.Query{Set: "SET GLOBAL slow_query_log=ON"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=10"}, }, } qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ Ts: time.Now(), Cmd: "StartService", Data: qanConfig, } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "") test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") s.nullmysql.Reset() cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog) cp.Run() // First interval: 0 - 736 now := time.Now() i1 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 0, EndOffset: 736, StartTime: now, StopTime: now, } s.intervalChan <- i1 resultData := <-s.dataChan report := *resultData.(*qan.Report) if report.Global.TotalQueries != 2 { t.Error("First interval has 2 queries, got ", report.Global.TotalQueries) } if report.Global.UniqueQueries != 1 { t.Error("First interval has 1 unique query, got ", report.Global.UniqueQueries) } // Second interval: 736 - 1833, but will actually go to end: 2200, if not // the next two test will fail. i2 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 736, EndOffset: 1833, StartTime: now, StopTime: now, } s.intervalChan <- i2 resultData = <-s.dataChan report = *resultData.(*qan.Report) if report.Global.TotalQueries != 4 { t.Error("Second interval has 2 queries, got ", report.Global.TotalQueries) } if report.Global.UniqueQueries != 2 { t.Error("Second interval has 2 unique queries, got ", report.Global.UniqueQueries) } test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)") // Original slow log should no longer exist; it was rotated away. if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) { t.Error("/tmp/" + slowlog + " no longer exists") } // The original slow log should NOT have been removed. files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*") if len(files) != 1 { t.Errorf("Old slow log not removed, got %+v", files) } defer func() { for _, file := range files { os.Remove(file) } }() expect := []mysql.Query{} for _, q := range config.Stop { expect = append(expect, q) } for _, q := range config.Start { expect = append(expect, q) } if same, diff := test.IsDeeply(s.nullmysql.GetSet(), expect); !same { t.Logf("%+v", s.nullmysql.GetSet()) t.Logf("%+v", expect) t.Error(diff) } // Stop manager reply = m.Handle(&proto.Cmd{Cmd: "StopService"}) t.Assert(reply.Error, Equals, "") }
func (s *ManagerTestSuite) TestStartServiceFast(t *C) { /** * Like TestStartService but we simulate the next tick being 3m away * (mock.clock.Eta = 180) so that run() sends the first tick on the * tick chan, causing the first interval to start immediately. */ s.clock.Eta = 180 defer func() { s.clock.Eta = 0 }() m := qan.NewManager(s.logger, &mysql.RealConnectionFactory{}, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im) t.Assert(m, NotNil) config := &qan.Config{ ServiceInstance: s.mysqlInstance, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, }, Interval: 300, // 5 min MaxSlowLogSize: 1073741824, // 1 GiB MaxWorkers: 1, WorkerRunTime: 600, // 10 min } now := time.Now() qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ User: "******", Ts: now, AgentUuid: "123", Service: "qan", Cmd: "StartService", Data: qanConfig, } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "") test.WaitStatus(1, m, "qan-log-parser", "Starting") tickChan := s.iterFactory.TickChans[s.iter] t.Assert(tickChan, NotNil) // run() should prime the tickChan with the 1st tick immediately. This makes // the interval iter start the interval immediately. Then run() continues // waiting for the iter to send an interval which happens when the real ticker // (the clock) sends the 2nd tick which is synced to the interval, thus ending // the first interval started by run() and starting the 2nd interval as normal. var tick time.Time select { case tick = <-tickChan: case <-time.After(1 * time.Second): } t.Assert(tick.IsZero(), Not(Equals), true) status := m.Status() t.Check(status["qan-next-interval"], Equals, "180.0s") // Stop QAN. cmd = &proto.Cmd{ User: "******", Ts: now, AgentUuid: "123", Service: "", Cmd: "StopService", } reply = m.Handle(cmd) t.Assert(reply.Error, Equals, "") }
func (s *ManagerTestSuite) TestRotateAndRemoveSlowLog(t *C) { // Clean up files that may interfere with test. slowlog := "slow006.log" files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*") for _, file := range files { os.Remove(file) } /** * slow006.log is 2200 bytes large. Rotation happens when manager * see interval.EndOffset >= MaxSlowLogSize. So we'll use these * intervals, * 0 - 736 * 736 - 1833 * 1833 - 2200 * and set MaxSlowLogSize=1000 which should make manager rotate the log * after the 2nd interval. When manager rotates log, it 1) renames log * to NAME-TS where NAME is the original name and TS is the current Unix * timestamp (UTC); and 2) it sets interval.StopOff = file size of NAME-TS * to finish parsing the log. Therefore, results for 2nd interval should * include our 3rd interval. -- Manager also calls Start and Stop so the * nullmysql conn should record the queries being set. */ // See TestStartService() for description of these startup tasks. mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im) if m == nil { t.Fatal("Create qan.Manager") } config := &qan.Config{ ServiceInstance: s.mysqlInstance, Interval: 300, MaxSlowLogSize: 1000, // <-- HERE RemoveOldSlowLogs: true, // <-- HERE too ExampleQueries: false, MaxWorkers: 2, WorkerRunTime: 600, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=0.456"}, mysql.Query{Set: "SET GLOBAL slow_query_log=ON"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=10"}, }, } qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ Ts: time.Now(), Cmd: "StartService", Data: qanConfig, } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "") test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") // Make copy of slow log because test will mv/rename it. cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog) cp.Run() // First interval: 0 - 736 now := time.Now() i1 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 0, EndOffset: 736, StartTime: now, StopTime: now, } s.intervalChan <- i1 resultData := <-s.dataChan report := *resultData.(*qan.Report) if report.Global.TotalQueries != 2 { t.Error("First interval has 2 queries, got ", report.Global.TotalQueries) } if report.Global.UniqueQueries != 1 { t.Error("First interval has 1 unique query, got ", report.Global.UniqueQueries) } // Second interval: 736 - 1833, but will actually go to end: 2200, if not // the next two test will fail. i2 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 736, EndOffset: 1833, StartTime: now, StopTime: now, } s.intervalChan <- i2 resultData = <-s.dataChan report = *resultData.(*qan.Report) if report.Global.TotalQueries != 4 { t.Error("Second interval has 2 queries, got ", report.Global.TotalQueries) } if report.Global.UniqueQueries != 2 { t.Error("Second interval has 2 unique queries, got ", report.Global.UniqueQueries) } test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)") // Original slow log should no longer exist; it was rotated away. if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) { t.Error("/tmp/" + slowlog + " no longer exists") } // The original slow log should have been renamed to slow006-TS, parsed, and removed. files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*") if len(files) != 0 { t.Errorf("Old slow log removed, got %+v", files) } defer func() { for _, file := range files { os.Remove(file) } }() // https://jira.percona.com/browse/PCT-466 // Old slow log removed but space not freed in filesystem pid := fmt.Sprintf("%d", os.Getpid()) out, err := exec.Command("lsof", "-p", pid).Output() if err != nil { t.Fatal(err) } if strings.Contains(string(out), "/tmp/"+slowlog+"-") { t.Logf("%s\n", string(out)) t.Error("Old slow log removed but not freed in filesystem (PCT-466)") } // Stop manager reply = m.Handle(&proto.Cmd{Cmd: "StopService"}) t.Assert(reply.Error, Equals, "") }
func (s *ManagerTestSuite) TestStartService(t *C) { /** * Create and start manager. */ m := qan.NewManager(s.logger, &mysql.RealConnectionFactory{}, s.clock, s.iterFactory, s.workerFactory, s.spool, s.im) t.Assert(m, NotNil) // Create the qan config. tmpFile := fmt.Sprintf("/tmp/qan_test.TestStartService.%d", os.Getpid()) defer func() { os.Remove(tmpFile) }() config := &qan.Config{ ServiceInstance: s.mysqlInstance, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=0.123"}, mysql.Query{Set: "SET GLOBAL slow_query_log=ON"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, mysql.Query{Set: "SET GLOBAL long_query_time=10"}, }, Interval: 300, // 5 min MaxSlowLogSize: 1073741824, // 1 GiB RemoveOldSlowLogs: true, ExampleQueries: true, MaxWorkers: 2, WorkerRunTime: 600, // 10 min } // Create the StartService cmd which contains the qan config. now := time.Now() qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ User: "******", Ts: now, AgentUuid: "123", Service: "agent", Cmd: "StartService", Data: qanConfig, } // Have the service manager start the qa service reply := m.Handle(cmd) // It should start without error. t.Assert(reply.Error, Equals, "") // It should write the config to disk. data, err := ioutil.ReadFile(pct.Basedir.ConfigFile("qan")) t.Check(err, IsNil) gotConfig := &qan.Config{} err = json.Unmarshal(data, gotConfig) t.Check(err, IsNil) if same, diff := test.IsDeeply(gotConfig, config); !same { test.Dump(gotConfig) t.Error(diff) } // And status should be "Running" and "Idle". test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)") status := m.Status() t.Check(status["qan"], Equals, "Running") t.Check(status["qan-log-parser"], Equals, "Idle (0 of 2 running)") // It should have enabled the slow log. slowLog := s.realmysql.GetGlobalVarNumber("slow_query_log") t.Assert(slowLog, Equals, float64(1)) longQueryTime := s.realmysql.GetGlobalVarNumber("long_query_time") t.Assert(longQueryTime, Equals, 0.123) // Starting an already started service should result in a ServiceIsRunningError. reply = m.Handle(cmd) t.Check(reply.Error, Not(Equals), "") // It should add a tickChan for the interval iter. t.Check(s.clock.Added, HasLen, 1) t.Check(s.clock.Removed, HasLen, 0) /** * Have manager run a worker, parse, and send data. */ interv := &qan.Interval{ Filename: testlog.Sample + "slow001.log", StartOffset: 0, EndOffset: 524, StartTime: now, StopTime: now, } s.intervalChan <- interv v := test.WaitData(s.dataChan) t.Assert(v, HasLen, 1) report := v[0].(*qan.Report) result := &qan.Result{ StopOffset: report.StopOffset, Global: report.Global, Classes: report.Class, } test.WriteData(result, tmpFile) t.Check(tmpFile, testlog.FileEquals, sample+"slow001.json") /** * Send StopService cmd to stop qan/qan-log-parser. */ now = time.Now() cmd = &proto.Cmd{ User: "******", Ts: now, AgentUuid: "123", Service: "agent", Cmd: "StopService", } // Have the service manager start the qa service reply = m.Handle(cmd) // It should start without error. t.Assert(reply.Error, Equals, "") // It should disable the slow log. slowLog = s.realmysql.GetGlobalVarNumber("slow_query_log") t.Assert(slowLog, Equals, float64(0)) longQueryTime = s.realmysql.GetGlobalVarNumber("long_query_time") t.Assert(longQueryTime, Equals, 10.0) // It should remove the tickChan (and not have added others). t.Check(s.clock.Added, HasLen, 1) t.Check(s.clock.Removed, HasLen, 1) // qan still running, but qan-log-parser stopped. test.WaitStatus(1, m, "qan-log-parser", "Stopped") status = m.Status() t.Check(status["qan"], Equals, "Running") t.Check(status["qan-log-parser"], Equals, "Stopped") }