func (server *ServerMonitor) rejoin() error { if readonly { dbhelper.SetReadOnly(server.Conn, true) } cm := "CHANGE MASTER TO master_host='" + master.IP + "', master_port=" + master.Port + ", master_user='******', master_password='******', MASTER_USE_GTID=CURRENT_POS" _, err := server.Conn.Exec(cm) dbhelper.StartSlave(server.Conn) return err }
/* Handles write freeze and existing transactions on a server */ func (server *ServerMonitor) freeze() bool { err := dbhelper.SetReadOnly(server.Conn, true) if err != nil { logprintf("WARN : Could not set %s as read-only: %s", server.URL, err) return false } for i := *waitKill; i > 0; i -= 500 { threads := dbhelper.CheckLongRunningWrites(server.Conn, 0) if threads == 0 { break } logprintf("INFO : Waiting for %d write threads to complete on %s", threads, server.URL) time.Sleep(500 * time.Millisecond) } logprintf("INFO : Terminating all threads on %s", server.URL) dbhelper.KillThreads(server.Conn) return true }
/* Triggers a master failover. Returns the new master's URL and key */ func (master *ServerMonitor) failover() (string, int) { log.Println("INFO : Starting failover and electing a new master") var nmUrl string key := master.electCandidate(slaves) if key == -1 { return "", -1 } nmUrl = slaves[key].URL log.Printf("INFO : Slave %s has been elected as a new master", nmUrl) newMaster, err := newServerMonitor(nmUrl) if *preScript != "" { log.Printf("INFO : Calling pre-failover script") out, err := exec.Command(*preScript, master.Host, newMaster.Host).CombinedOutput() if err != nil { log.Println("ERROR:", err) } log.Println("INFO : Post-failover script complete:", string(out)) } log.Println("INFO : Switching master") log.Println("INFO : Stopping slave thread on new master") err = dbhelper.StopSlave(newMaster.Conn) if err != nil { log.Println("WARN : Stopping slave failed on new master") } cm := "CHANGE MASTER TO master_host='" + newMaster.IP + "', master_port=" + newMaster.Port + ", master_user='******', master_password='******'" log.Println("INFO : Resetting slave on new master and set read/write mode on") err = dbhelper.ResetSlave(newMaster.Conn, true) if err != nil { log.Println("WARN : Reset slave failed on new master") } err = dbhelper.SetReadOnly(newMaster.Conn, false) if err != nil { log.Println("ERROR: Could not set new master as read-write") } log.Println("INFO : Switching other slaves to the new master") for _, sl := range slaves { log.Printf("INFO : Change master on slave %s", sl.URL) err := dbhelper.StopSlave(sl.Conn) if err != nil { log.Printf("WARN : Could not stop slave on server %s, %s", sl.URL, err) } _, err = sl.Conn.Exec(cm) if err != nil { log.Printf("ERROR: Change master failed on slave %s, %s", sl.URL, err) } err = dbhelper.StartSlave(sl.Conn) if err != nil { log.Printf("ERROR: could not start slave on server %s, %s", sl.URL, err) } if *readonly { err = dbhelper.SetReadOnly(sl.Conn, true) if err != nil { log.Printf("ERROR: Could not set slave %s as read-only, %s", sl.URL, err) } } } if *postScript != "" { log.Printf("INFO : Calling post-failover script") out, err := exec.Command(*postScript, master.Host, newMaster.Host).CombinedOutput() if err != nil { log.Println("ERROR:", err) } log.Println("INFO : Post-failover script complete", string(out)) } log.Println("INFO : Failover complete") return newMaster.URL, key }
/* Triggers a master switchover. Returns the new master's URL */ func (master *ServerMonitor) switchover() (string, int) { logprint("INFO : Starting switchover") // Phase 1: Cleanup and election logprintf("INFO : Flushing tables on %s (master)", master.URL) err := dbhelper.FlushTablesNoLog(master.Conn) if err != nil { logprintf("WARN : Could not flush tables on master", err) } logprint("INFO : Checking long running updates on master") if dbhelper.CheckLongRunningWrites(master.Conn, 10) > 0 { logprint("ERROR: Long updates running on master. Cannot switchover") return "", -1 } logprint("INFO : Electing a new master") var nmUrl string key := master.electCandidate(slaves) if key == -1 { return "", -1 } nmUrl = slaves[key].URL logprintf("INFO : Slave %s has been elected as a new master", nmUrl) newMaster, err := newServerMonitor(nmUrl) if *preScript != "" { logprintf("INFO : Calling pre-failover script") out, err := exec.Command(*preScript, master.Host, newMaster.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Pre-failover script complete:", string(out)) } // Phase 2: Reject updates and sync slaves master.freeze() logprintf("INFO : Rejecting updates on %s (old master)", master.URL) err = dbhelper.FlushTablesWithReadLock(master.Conn) if err != nil { logprintf("WARN : Could not lock tables on %s (old master) %s", master.URL, err) } logprint("INFO : Switching master") logprint("INFO : Waiting for candidate master to synchronize") masterGtid := dbhelper.GetVariableByName(master.Conn, "GTID_BINLOG_POS") if *verbose { logprintf("DEBUG: Syncing on master GTID Current Pos [%s]", masterGtid) master.log() } dbhelper.MasterPosWait(newMaster.Conn, masterGtid) if *verbose { logprint("DEBUG: MASTER_POS_WAIT executed.") newMaster.log() } // Phase 3: Prepare new master logprint("INFO : Stopping slave thread on new master") err = dbhelper.StopSlave(newMaster.Conn) if err != nil { logprint("WARN : Stopping slave failed on new master") } // Call post-failover script before unlocking the old master. if *postScript != "" { logprintf("INFO : Calling post-failover script") out, err := exec.Command(*postScript, master.Host, newMaster.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Post-failover script complete", string(out)) } logprint("INFO : Resetting slave on new master and set read/write mode on") err = dbhelper.ResetSlave(newMaster.Conn, true) if err != nil { logprint("WARN : Reset slave failed on new master") } err = dbhelper.SetReadOnly(newMaster.Conn, false) if err != nil { logprint("ERROR: Could not set new master as read-write") } newGtid := dbhelper.GetVariableByName(master.Conn, "GTID_BINLOG_POS") // Insert a bogus transaction in order to have a new GTID pos on master err = dbhelper.FlushTables(newMaster.Conn) if err != nil { logprint("WARN : Could not flush tables on new master", err) } // Phase 4: Demote old master to slave cm := "CHANGE MASTER TO master_host='" + newMaster.IP + "', master_port=" + newMaster.Port + ", master_user='******', master_password='******'" logprint("INFO : Switching old master as a slave") err = dbhelper.UnlockTables(master.Conn) if err != nil { logprint("WARN : Could not unlock tables on old master", err) } dbhelper.StopSlave(master.Conn) // This is helpful because in some cases the old master can have an old configuration running _, err = master.Conn.Exec("SET GLOBAL gtid_slave_pos='" + newGtid + "'") if err != nil { logprint("WARN : Could not set gtid_slave_pos on old master", err) } _, err = master.Conn.Exec(cm + ", master_use_gtid=slave_pos") if err != nil { logprint("WARN : Change master failed on old master", err) } err = dbhelper.StartSlave(master.Conn) if err != nil { logprint("WARN : Start slave failed on old master", err) } if *readonly { err = dbhelper.SetReadOnly(master.Conn, true) if err != nil { logprintf("ERROR: Could not set old master as read-only, %s", err) } } // Phase 5: Switch slaves to new master logprint("INFO : Switching other slaves to the new master") var oldMasterKey int for k, sl := range slaves { if sl.URL == newMaster.URL { slaves[k].URL = master.URL oldMasterKey = k if *verbose { logprintf("DEBUG: New master %s found in slave slice at key %d, reinstancing URL to %s", sl.URL, k, master.URL) } continue } logprintf("INFO : Waiting for slave %s to sync", sl.URL) dbhelper.MasterPosWait(sl.Conn, masterGtid) if *verbose { sl.log() } logprintf("INFO : Change master on slave %s", sl.URL) err := dbhelper.StopSlave(sl.Conn) if err != nil { logprintf("WARN : Could not stop slave on server %s, %s", sl.URL, err) } _, err = sl.Conn.Exec("SET GLOBAL gtid_slave_pos='" + newGtid + "'") if err != nil { logprintf("WARN : Could not set gtid_slave_pos on slave %s, %s", sl.URL, err) } _, err = sl.Conn.Exec(cm) if err != nil { logprintf("ERROR: Change master failed on slave %s, %s", sl.URL, err) } err = dbhelper.StartSlave(sl.Conn) if err != nil { logprintf("ERROR: could not start slave on server %s, %s", sl.URL, err) } if *readonly { err = dbhelper.SetReadOnly(sl.Conn, true) if err != nil { logprintf("ERROR: Could not set slave %s as read-only, %s", sl.URL, err) } } } logprint("INFO : Switchover complete") return newMaster.URL, oldMasterKey }
func main() { var errLog = mysql.Logger(log.New(ioutil.Discard, "", 0)) mysql.SetLogger(errLog) flag.Parse() if version == true { fmt.Println("MariaDB Replication Manager version", repmgrVersion) os.Exit(0) } if logfile != "" { var err error logPtr, err = os.Create(logfile) if err != nil { log.Println("ERROR: Error opening logfile, disabling for the rest of the session.") logfile = "" } } // if slaves option has been supplied, split into a slice. if hosts != "" { hostList = strings.Split(hosts, ",") } else { log.Fatal("ERROR: No hosts list specified.") } // validate users. if user == "" { log.Fatal("ERROR: No master user/pair specified.") } dbUser, dbPass = splitPair(user) if rpluser == "" { log.Fatal("ERROR: No replication user/pair specified.") } rplUser, rplPass = splitPair(rpluser) // Check that failover and switchover modes are set correctly. if switchover == "" && failover == "" { log.Fatal("ERROR: None of the switchover or failover modes are set.") } if switchover != "" && failover != "" { log.Fatal("ERROR: Both switchover and failover modes are set.") } if !contains(failOptions, failover) && failover != "" { log.Fatalf("ERROR: Incorrect failover mode: %s", failover) } if !contains(switchOptions, switchover) && switchover != "" { log.Fatalf("ERROR: Incorrect switchover mode: %s", switchover) } // Forced failover implies interactive == false if failover == "force" && interactive == true { interactive = false } if ignoreSrv != "" { ignoreList = strings.Split(ignoreSrv, ",") } // Create a connection to each host and build list of slaves. hostCount := len(hostList) servers = make([]*ServerMonitor, hostCount) slaveCount := 0 for k, url := range hostList { var err error servers[k], err = newServerMonitor(url) if verbose { log.Printf("DEBUG: Creating new server: %v", servers[k].URL) } if err != nil { if driverErr, ok := err.(*mysql.MySQLError); ok { if driverErr.Number == 1045 { log.Fatalln("ERROR: Database access denied:", err.Error()) } } if verbose { log.Println("ERROR:", err) } log.Printf("INFO : Server %s is dead.", servers[k].URL) servers[k].State = stateFailed continue } defer servers[k].Conn.Close() if verbose { log.Printf("DEBUG: Checking if server %s is slave", servers[k].URL) } servers[k].refresh() if servers[k].UsingGtid != "" { if verbose { log.Printf("DEBUG: Server %s is configured as a slave", servers[k].URL) } servers[k].State = stateSlave slaves = append(slaves, servers[k]) slaveCount++ } else { if verbose { log.Printf("DEBUG: Server %s is not a slave. Setting aside", servers[k].URL) } servers[k].State = stateUnconn } } // If no slaves are detected, then bail out if len(slaves) == 0 { log.Fatal("ERROR: No slaves were detected.") } // Check that all slave servers have the same master. for _, sl := range slaves { if sl.hasSiblings(slaves) == false { log.Fatalln("ERROR: Multi-master topologies are not yet supported.") } } // Check user privileges on live servers for _, sv := range servers { if sv.State != stateFailed { priv, err := dbhelper.GetPrivileges(sv.Conn, dbUser, sv.Host) if err != nil { log.Fatalf("ERROR: Error getting privileges for user %s on host %s: %s", dbUser, sv.Host, err) } if priv.Repl_client_priv == "N" { log.Fatalln("ERROR: User must have REPLICATION_CLIENT privilege") } else if priv.Repl_slave_priv == "N" { log.Fatalln("ERROR: User must have REPLICATION_SLAVE privilege") } else if priv.Super_priv == "N" { log.Fatalln("ERROR: User must have SUPER privilege") } } } // Depending if we are doing a failover or a switchover, we will find the master in the list of // dead hosts or unconnected hosts. if switchover != "" || failover == "monitor" { // First of all, get a server id from the slaves slice, they should be all the same sid := slaves[0].MasterServerID for k, s := range servers { if s.State == stateUnconn { if s.ServerID == sid { master = servers[k] master.State = stateMaster if verbose { log.Printf("DEBUG: Server %s was autodetected as a master", s.URL) } break } } } } else { // Slave master_host variable must point to dead master smh := slaves[0].MasterHost for k, s := range servers { if s.State == stateFailed { if s.Host == smh || s.IP == smh { master = servers[k] master.State = stateMaster if verbose { log.Printf("DEBUG: Server %s was autodetected as a master", s.URL) } break } } } } // Final check if master has been found if master == nil { if switchover != "" || failover == "monitor" { log.Fatalln("ERROR: Could not autodetect a master!") } else { log.Fatalln("ERROR: Could not autodetect a failed master!") } } for _, sl := range slaves { if verbose { log.Printf("DEBUG: Checking if server %s is a slave of server %s", sl.Host, master.Host) } if dbhelper.IsSlaveof(sl.Conn, sl.Host, master.IP) == false { log.Printf("WARN : Server %s is not a slave of declared master %s", master.URL, master.Host) } } // Check if preferred master is included in Host List ret := func() bool { for _, v := range hostList { if v == prefMaster { return true } } return false } if ret() == false && prefMaster != "" { log.Fatal("ERROR: Preferred master is not included in the hosts option") } // Do failover or switchover manually, or start the interactive monitor. if failover == "force" { masterFailover(true) } else if switchover != "" && interactive == false { masterFailover(false) } else { err := termbox.Init() if err != nil { log.Fatalln("Termbox initialization error", err) } _, termlength = termbox.Size() loglen := termlength - 9 - (hostCount * 3) tlog = NewTermLog(loglen) if failover != "" { tlog.Add("Monitor started in failover mode") } else { tlog.Add("Monitor started in switchover mode") } termboxChan := newTbChan() interval := time.Second ticker := time.NewTicker(interval * 1) for exit == false { select { case <-ticker.C: display() case event := <-termboxChan: switch event.Type { case termbox.EventKey: if event.Key == termbox.KeyCtrlS { if master.State != stateFailed || failCount > 0 { masterFailover(false) } else { logprint("ERROR: Master failed, cannot initiate switchover") } } if event.Key == termbox.KeyCtrlF { if master.State == stateFailed { masterFailover(true) } else { logprint("ERROR: Master not failed, cannot initiate failover") } } if event.Key == termbox.KeyCtrlD { for k, v := range servers { logprint("Servers", k, v) } logprint("Master", master) for k, v := range slaves { logprint("Slaves", k, v) } } if event.Key == termbox.KeyCtrlR { logprint("INFO: Setting slaves read-only") for _, sl := range slaves { dbhelper.SetReadOnly(sl.Conn, true) } } if event.Key == termbox.KeyCtrlW { logprint("INFO: Setting slaves read-write") for _, sl := range slaves { dbhelper.SetReadOnly(sl.Conn, false) } } if event.Key == termbox.KeyCtrlQ { exit = true } } switch event.Ch { case 's': termbox.Sync() } } if master.State == stateFailed && interactive == false { rem := (failoverTs + failtime) - time.Now().Unix() if (failtime == 0) || (failtime > 0 && (rem <= 0 || failoverCtr == 0)) { masterFailover(true) if failoverCtr == faillimit { exitMsg = "INFO : Failover limit reached. Exiting on failover completion." exit = true } } else if failtime > 0 && rem%10 == 0 { logprintf("WARN : Failover time limit enforced. Next failover available in %d seconds.", rem) } } } termbox.Close() if exitMsg != "" { log.Println(exitMsg) } } }
/* Triggers a master switchover. Returns the new master's URL */ func masterFailover(fail bool) { logprint("INFO : Starting master switch") // Phase 1: Cleanup and election var err error if fail == false { logprintf("INFO : Flushing tables on %s (master)", master.URL) err = dbhelper.FlushTablesNoLog(master.Conn) if err != nil { logprintf("WARN : Could not flush tables on master", err) } logprint("INFO : Checking long running updates on master") if dbhelper.CheckLongRunningWrites(master.Conn, 10) > 0 { logprint("ERROR: Long updates running on master. Cannot switchover") return } } logprint("INFO : Electing a new master") key := master.electCandidate(slaves) if key == -1 { return } logprintf("INFO : Slave %s [%d] has been elected as a new master", slaves[key].URL, key) // Shuffle the server list oldMaster := master var skey int for k, server := range servers { if slaves[key].URL == server.URL { skey = k break } } master = servers[skey] master.State = stateMaster slaves[key].delete(&slaves) // Call pre-failover script if preScript != "" { logprintf("INFO : Calling pre-failover script") out, err := exec.Command(preScript, oldMaster.Host, master.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Pre-failover script complete:", string(out)) } // Phase 2: Reject updates and sync slaves if fail == false { oldMaster.freeze() logprintf("INFO : Rejecting updates on %s (old master)", oldMaster.URL) err = dbhelper.FlushTablesWithReadLock(oldMaster.Conn) if err != nil { logprintf("WARN : Could not lock tables on %s (old master) %s", oldMaster.URL, err) } } logprint("INFO : Switching master") if fail == false { logprint("INFO : Waiting for candidate Master to synchronize") oldMaster.refresh() if verbose { logprintf("DEBUG: Syncing on master GTID Binlog Pos [%s]", oldMaster.BinlogPos) oldMaster.log() } dbhelper.MasterPosWait(master.Conn, oldMaster.BinlogPos) if verbose { logprint("DEBUG: MASTER_POS_WAIT executed.") master.log() } } // Phase 3: Prepare new master logprint("INFO : Stopping slave thread on new master") err = dbhelper.StopSlave(master.Conn) if err != nil { logprint("WARN : Stopping slave failed on new master") } // Call post-failover script before unlocking the old master. if postScript != "" { logprintf("INFO : Calling post-failover script") out, err := exec.Command(postScript, oldMaster.Host, master.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Post-failover script complete", string(out)) } logprint("INFO : Resetting slave on new master and set read/write mode on") err = dbhelper.ResetSlave(master.Conn, true) if err != nil { logprint("WARN : Reset slave failed on new master") } err = dbhelper.SetReadOnly(master.Conn, false) if err != nil { logprint("ERROR: Could not set new master as read-write") } cm := "CHANGE MASTER TO master_host='" + master.IP + "', master_port=" + master.Port + ", master_user='******', master_password='******'" if fail == false { // Get latest GTID pos oldMaster.refresh() // Insert a bogus transaction in order to have a new GTID pos on master err = dbhelper.FlushTables(master.Conn) if err != nil { logprint("WARN : Could not flush tables on new master", err) } // Phase 4: Demote old master to slave logprint("INFO : Switching old master as a slave") err = dbhelper.UnlockTables(oldMaster.Conn) if err != nil { logprint("WARN : Could not unlock tables on old master", err) } dbhelper.StopSlave(oldMaster.Conn) // This is helpful because in some cases the old master can have an old configuration running _, err = oldMaster.Conn.Exec("SET GLOBAL gtid_slave_pos='" + oldMaster.BinlogPos + "'") if err != nil { logprint("WARN : Could not set gtid_slave_pos on old master", err) } _, err = oldMaster.Conn.Exec(cm + ", master_use_gtid=slave_pos") if err != nil { logprint("WARN : Change master failed on old master", err) } err = dbhelper.StartSlave(oldMaster.Conn) if err != nil { logprint("WARN : Start slave failed on old master", err) } if readonly { err = dbhelper.SetReadOnly(oldMaster.Conn, true) if err != nil { logprintf("ERROR: Could not set old master as read-only, %s", err) } } // Add the old master to the slaves list oldMaster.State = stateSlave slaves = append(slaves, oldMaster) } // Phase 5: Switch slaves to new master logprint("INFO : Switching other slaves to the new master") for _, sl := range slaves { if fail == false { logprintf("INFO : Waiting for slave %s to sync", sl.URL) dbhelper.MasterPosWait(sl.Conn, oldMaster.BinlogPos) if verbose { sl.log() } } logprint("INFO : Change master on slave", sl.URL) err := dbhelper.StopSlave(sl.Conn) if err != nil { logprintf("WARN : Could not stop slave on server %s, %s", sl.URL, err) } if fail == false { _, err = sl.Conn.Exec("SET GLOBAL gtid_slave_pos='" + oldMaster.BinlogPos + "'") if err != nil { logprintf("WARN : Could not set gtid_slave_pos on slave %s, %s", sl.URL, err) } } _, err = sl.Conn.Exec(cm) if err != nil { logprintf("ERROR: Change master failed on slave %s, %s", sl.URL, err) } err = dbhelper.StartSlave(sl.Conn) if err != nil { logprintf("ERROR: could not start slave on server %s, %s", sl.URL, err) } if readonly { err = dbhelper.SetReadOnly(sl.Conn, true) if err != nil { logprintf("ERROR: Could not set slave %s as read-only, %s", sl.URL, err) } } } if postScript != "" { logprintf("INFO : Calling post-failover script") out, err := exec.Command(postScript, oldMaster.Host, master.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Post-failover script complete", string(out)) } logprintf("INFO : Master switch on %s complete", master.URL) failCount = 0 if fail == true { failoverCtr++ failoverTs = time.Now().Unix() } return }