func (s *ManagerTestSuite) TestRecoverWorkerPanic(t *C) { // Create and start manager with mock workers. w1StopChan := make(chan bool) w1 := mock.NewQanWorker("qan-worker-1", w1StopChan, nil, nil, true) f := mock.NewQanWorkerFactory([]*mock.QanWorker{w1}) mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, f, s.spool, s.im) t.Assert(m, NotNil) config := &qan.Config{ ServiceInstance: s.mysqlInstance, MaxSlowLogSize: 1000, MaxWorkers: 2, Interval: 60, WorkerRunTime: 60, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, }, } qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ Ts: time.Now(), Cmd: "StartService", Data: qanConfig, } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "") test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") test.DrainLogChan(s.logChan) // Start mock worker. All it does is panic, much like fipar. now := time.Now() i1 := &qan.Interval{ Filename: "slow.log", StartOffset: 0, EndOffset: 100, StartTime: now, StopTime: now, } s.intervalChan <- i1 <-w1.Running() // wait for manager to run worker // For now, worker panic only results in error to log. var gotError *proto.LogEntry timeout := time.After(200 * time.Millisecond) GET_LOG: for { select { case l := <-s.logChan: if l.Level == 3 && strings.HasPrefix(l.Msg, "Lost interval 0 slow.log") { gotError = l break GET_LOG } case <-timeout: break GET_LOG } } t.Check(gotError, NotNil) // Stop manager reply = m.Handle(&proto.Cmd{Cmd: "StopService"}) t.Assert(reply.Error, Equals, "") }
func (s *ManagerTestSuite) TestWaitRemoveSlowLog(t *C) { // Same as TestRotateAndRemoveSlowLog, but we use mock workers so we can // test that slow log is not removed until previous workers are done. // Mock worker factory will return our mock workers when manager calls Make(). w1StopChan := make(chan bool) w1 := mock.NewQanWorker("qan-worker-1", w1StopChan, nil, nil, false) w2StopChan := make(chan bool) w2 := mock.NewQanWorker("qan-worker-2", w2StopChan, nil, nil, false) // Let's take this time to also test that MaxWorkers is enforced. w3 := mock.NewQanWorker("qan-worker-3", nil, nil, nil, false) f := mock.NewQanWorkerFactory([]*mock.QanWorker{w1, w2, w3}) // Clean up files that may interfere with test. Then copy the test log. slowlog := "slow006.log" files, _ := filepath.Glob("/tmp/" + slowlog + "-[0-9]*") for _, file := range files { os.Remove(file) } cp := exec.Command("cp", testlog.Sample+slowlog, "/tmp/"+slowlog) cp.Run() // Create and start manager with mock workers. mockConnFactory := &mock.ConnectionFactory{Conn: s.nullmysql} m := qan.NewManager(s.logger, mockConnFactory, s.clock, s.iterFactory, f, s.spool, s.im) if m == nil { t.Fatal("Create qan.Manager") } config := &qan.Config{ ServiceInstance: s.mysqlInstance, MaxSlowLogSize: 1000, RemoveOldSlowLogs: true, // done after w2 and w1 done MaxWorkers: 2, // w1 and w2 but not w3 Interval: 60, WorkerRunTime: 60, Start: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, }, Stop: []mysql.Query{ mysql.Query{Set: "SET GLOBAL slow_query_log=OFF"}, }, } qanConfig, _ := json.Marshal(config) cmd := &proto.Cmd{ Ts: time.Now(), Cmd: "StartService", Data: qanConfig, } reply := m.Handle(cmd) t.Assert(reply.Error, Equals, "") test.WaitStatusPrefix(1, m, "qan-log-parser", "Idle") // Start first mock worker (w1) with interval 0 - 736. The worker's Run() // func won't return until we send true to its stop chan, so manager will // think worker is still running until then. now := time.Now() i1 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 0, EndOffset: 736, StartTime: now, StopTime: now, } s.intervalChan <- i1 <-w1.Running() // wait for manager to run worker // Start 2nd mock worker (w2) with interval 736 - 1833. Manager will rotate // but not remove original slow log because w1 is still running. i2 := &qan.Interval{ Filename: "/tmp/" + slowlog, StartOffset: 736, EndOffset: 1833, StartTime: now, StopTime: now, } s.intervalChan <- i2 <-w2.Running() test.WaitStatus(1, m, "qan-log-parser", "Idle (2 of 2 running)") /** * Worker status test */ // Workers should have status and QAN manager should report them all. status := m.Status() t.Check(status["qan-worker-1"], Equals, "ok") t.Check(status["qan-worker-2"], Equals, "ok") t.Check(status["qan-worker-3"], Equals, "") // not running due to MaxWorkers /** * Quick side test: qan.Config.MaxWorkers is enforced. */ test.DrainLogChan(s.logChan) s.intervalChan <- i2 logs := test.WaitLogChan(s.logChan, 3) test.WaitStatus(1, m, "qan-log-parser", "Idle (2 of 2 running)") gotWarning := false for _, log := range logs { if log.Level == proto.LOG_WARNING && strings.Contains(log.Msg, "All workers busy") { gotWarning = true break } } if !gotWarning { t.Error("Too many workers causes \"All workers busy\" warning") } // Original slow log should no longer exist; it was rotated away, but... if _, err := os.Stat("/tmp/" + slowlog); !os.IsNotExist(err) { t.Error("/tmp/" + slowlog + " no longer exists") } // ...old slow log should exist because w1 is still running. files, _ = filepath.Glob("/tmp/" + slowlog + "-[0-9]*") if len(files) != 1 { t.Errorf("w1 running so old slow log not removed, got %+v", files) } defer func() { for _, file := range files { os.Remove(file) } }() // Stop w2 which is holding "holding" the "lock" on removing the old // slog log (figuratively speaking; there are no real locks). Because // w1 is still running, manager should not remove the old log yet because // w1 could still be parsing it. w2StopChan <- true test.WaitStatus(1, m, "qan-log-parser", "Idle (1 of 2 running)") if _, err := os.Stat(files[0]); os.IsNotExist(err) { t.Errorf("w1 still running so old slow log not removed") } // Stop w1 and now, even though slow log was rotated for w2, manager // should remove old slow log. w1StopChan <- true test.WaitStatus(1, m, "qan-log-parser", "Idle (0 of 2 running)") if _, err := os.Stat(files[0]); !os.IsNotExist(err) { t.Errorf("w1 done running so old slow log removed") } // Stop manager reply = m.Handle(&proto.Cmd{Cmd: "StopService"}) t.Assert(reply.Error, Equals, "") }