/* Triggers a master switchover. Returns the new master's URL */ func (master *ServerMonitor) switchover() (string, int) { logprint("INFO : Starting switchover") // Phase 1: Cleanup and election logprintf("INFO : Flushing tables on %s (master)", master.URL) err := dbhelper.FlushTablesNoLog(master.Conn) if err != nil { logprintf("WARN : Could not flush tables on master", err) } logprint("INFO : Checking long running updates on master") if dbhelper.CheckLongRunningWrites(master.Conn, 10) > 0 { logprint("ERROR: Long updates running on master. Cannot switchover") return "", -1 } logprint("INFO : Electing a new master") var nmUrl string key := master.electCandidate(slaves) if key == -1 { return "", -1 } nmUrl = slaves[key].URL logprintf("INFO : Slave %s has been elected as a new master", nmUrl) newMaster, err := newServerMonitor(nmUrl) if *preScript != "" { logprintf("INFO : Calling pre-failover script") out, err := exec.Command(*preScript, master.Host, newMaster.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Pre-failover script complete:", string(out)) } // Phase 2: Reject updates and sync slaves master.freeze() logprintf("INFO : Rejecting updates on %s (old master)", master.URL) err = dbhelper.FlushTablesWithReadLock(master.Conn) if err != nil { logprintf("WARN : Could not lock tables on %s (old master) %s", master.URL, err) } logprint("INFO : Switching master") logprint("INFO : Waiting for candidate master to synchronize") masterGtid := dbhelper.GetVariableByName(master.Conn, "GTID_BINLOG_POS") if *verbose { logprintf("DEBUG: Syncing on master GTID Current Pos [%s]", masterGtid) master.log() } dbhelper.MasterPosWait(newMaster.Conn, masterGtid) if *verbose { logprint("DEBUG: MASTER_POS_WAIT executed.") newMaster.log() } // Phase 3: Prepare new master logprint("INFO : Stopping slave thread on new master") err = dbhelper.StopSlave(newMaster.Conn) if err != nil { logprint("WARN : Stopping slave failed on new master") } // Call post-failover script before unlocking the old master. if *postScript != "" { logprintf("INFO : Calling post-failover script") out, err := exec.Command(*postScript, master.Host, newMaster.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Post-failover script complete", string(out)) } logprint("INFO : Resetting slave on new master and set read/write mode on") err = dbhelper.ResetSlave(newMaster.Conn, true) if err != nil { logprint("WARN : Reset slave failed on new master") } err = dbhelper.SetReadOnly(newMaster.Conn, false) if err != nil { logprint("ERROR: Could not set new master as read-write") } newGtid := dbhelper.GetVariableByName(master.Conn, "GTID_BINLOG_POS") // Insert a bogus transaction in order to have a new GTID pos on master err = dbhelper.FlushTables(newMaster.Conn) if err != nil { logprint("WARN : Could not flush tables on new master", err) } // Phase 4: Demote old master to slave cm := "CHANGE MASTER TO master_host='" + newMaster.IP + "', master_port=" + newMaster.Port + ", master_user='******', master_password='******'" logprint("INFO : Switching old master as a slave") err = dbhelper.UnlockTables(master.Conn) if err != nil { logprint("WARN : Could not unlock tables on old master", err) } dbhelper.StopSlave(master.Conn) // This is helpful because in some cases the old master can have an old configuration running _, err = master.Conn.Exec("SET GLOBAL gtid_slave_pos='" + newGtid + "'") if err != nil { logprint("WARN : Could not set gtid_slave_pos on old master", err) } _, err = master.Conn.Exec(cm + ", master_use_gtid=slave_pos") if err != nil { logprint("WARN : Change master failed on old master", err) } err = dbhelper.StartSlave(master.Conn) if err != nil { logprint("WARN : Start slave failed on old master", err) } if *readonly { err = dbhelper.SetReadOnly(master.Conn, true) if err != nil { logprintf("ERROR: Could not set old master as read-only, %s", err) } } // Phase 5: Switch slaves to new master logprint("INFO : Switching other slaves to the new master") var oldMasterKey int for k, sl := range slaves { if sl.URL == newMaster.URL { slaves[k].URL = master.URL oldMasterKey = k if *verbose { logprintf("DEBUG: New master %s found in slave slice at key %d, reinstancing URL to %s", sl.URL, k, master.URL) } continue } logprintf("INFO : Waiting for slave %s to sync", sl.URL) dbhelper.MasterPosWait(sl.Conn, masterGtid) if *verbose { sl.log() } logprintf("INFO : Change master on slave %s", sl.URL) err := dbhelper.StopSlave(sl.Conn) if err != nil { logprintf("WARN : Could not stop slave on server %s, %s", sl.URL, err) } _, err = sl.Conn.Exec("SET GLOBAL gtid_slave_pos='" + newGtid + "'") if err != nil { logprintf("WARN : Could not set gtid_slave_pos on slave %s, %s", sl.URL, err) } _, err = sl.Conn.Exec(cm) if err != nil { logprintf("ERROR: Change master failed on slave %s, %s", sl.URL, err) } err = dbhelper.StartSlave(sl.Conn) if err != nil { logprintf("ERROR: could not start slave on server %s, %s", sl.URL, err) } if *readonly { err = dbhelper.SetReadOnly(sl.Conn, true) if err != nil { logprintf("ERROR: Could not set slave %s as read-only, %s", sl.URL, err) } } } logprint("INFO : Switchover complete") return newMaster.URL, oldMasterKey }
/* Triggers a master failover. Returns the new master's URL and key */ func (master *ServerMonitor) failover() (string, int) { log.Println("INFO : Starting failover and electing a new master") var nmUrl string key := master.electCandidate(slaves) if key == -1 { return "", -1 } nmUrl = slaves[key].URL log.Printf("INFO : Slave %s has been elected as a new master", nmUrl) newMaster, err := newServerMonitor(nmUrl) if *preScript != "" { log.Printf("INFO : Calling pre-failover script") out, err := exec.Command(*preScript, master.Host, newMaster.Host).CombinedOutput() if err != nil { log.Println("ERROR:", err) } log.Println("INFO : Post-failover script complete:", string(out)) } log.Println("INFO : Switching master") log.Println("INFO : Stopping slave thread on new master") err = dbhelper.StopSlave(newMaster.Conn) if err != nil { log.Println("WARN : Stopping slave failed on new master") } cm := "CHANGE MASTER TO master_host='" + newMaster.IP + "', master_port=" + newMaster.Port + ", master_user='******', master_password='******'" log.Println("INFO : Resetting slave on new master and set read/write mode on") err = dbhelper.ResetSlave(newMaster.Conn, true) if err != nil { log.Println("WARN : Reset slave failed on new master") } err = dbhelper.SetReadOnly(newMaster.Conn, false) if err != nil { log.Println("ERROR: Could not set new master as read-write") } log.Println("INFO : Switching other slaves to the new master") for _, sl := range slaves { log.Printf("INFO : Change master on slave %s", sl.URL) err := dbhelper.StopSlave(sl.Conn) if err != nil { log.Printf("WARN : Could not stop slave on server %s, %s", sl.URL, err) } _, err = sl.Conn.Exec(cm) if err != nil { log.Printf("ERROR: Change master failed on slave %s, %s", sl.URL, err) } err = dbhelper.StartSlave(sl.Conn) if err != nil { log.Printf("ERROR: could not start slave on server %s, %s", sl.URL, err) } if *readonly { err = dbhelper.SetReadOnly(sl.Conn, true) if err != nil { log.Printf("ERROR: Could not set slave %s as read-only, %s", sl.URL, err) } } } if *postScript != "" { log.Printf("INFO : Calling post-failover script") out, err := exec.Command(*postScript, master.Host, newMaster.Host).CombinedOutput() if err != nil { log.Println("ERROR:", err) } log.Println("INFO : Post-failover script complete", string(out)) } log.Println("INFO : Failover complete") return newMaster.URL, key }
/* Triggers a master switchover. Returns the new master's URL */ func masterFailover(fail bool) { logprint("INFO : Starting master switch") // Phase 1: Cleanup and election var err error if fail == false { logprintf("INFO : Flushing tables on %s (master)", master.URL) err = dbhelper.FlushTablesNoLog(master.Conn) if err != nil { logprintf("WARN : Could not flush tables on master", err) } logprint("INFO : Checking long running updates on master") if dbhelper.CheckLongRunningWrites(master.Conn, 10) > 0 { logprint("ERROR: Long updates running on master. Cannot switchover") return } } logprint("INFO : Electing a new master") key := master.electCandidate(slaves) if key == -1 { return } logprintf("INFO : Slave %s [%d] has been elected as a new master", slaves[key].URL, key) // Shuffle the server list oldMaster := master var skey int for k, server := range servers { if slaves[key].URL == server.URL { skey = k break } } master = servers[skey] master.State = stateMaster slaves[key].delete(&slaves) // Call pre-failover script if preScript != "" { logprintf("INFO : Calling pre-failover script") out, err := exec.Command(preScript, oldMaster.Host, master.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Pre-failover script complete:", string(out)) } // Phase 2: Reject updates and sync slaves if fail == false { oldMaster.freeze() logprintf("INFO : Rejecting updates on %s (old master)", oldMaster.URL) err = dbhelper.FlushTablesWithReadLock(oldMaster.Conn) if err != nil { logprintf("WARN : Could not lock tables on %s (old master) %s", oldMaster.URL, err) } } logprint("INFO : Switching master") if fail == false { logprint("INFO : Waiting for candidate Master to synchronize") oldMaster.refresh() if verbose { logprintf("DEBUG: Syncing on master GTID Binlog Pos [%s]", oldMaster.BinlogPos) oldMaster.log() } dbhelper.MasterPosWait(master.Conn, oldMaster.BinlogPos) if verbose { logprint("DEBUG: MASTER_POS_WAIT executed.") master.log() } } // Phase 3: Prepare new master logprint("INFO : Stopping slave thread on new master") err = dbhelper.StopSlave(master.Conn) if err != nil { logprint("WARN : Stopping slave failed on new master") } // Call post-failover script before unlocking the old master. if postScript != "" { logprintf("INFO : Calling post-failover script") out, err := exec.Command(postScript, oldMaster.Host, master.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Post-failover script complete", string(out)) } logprint("INFO : Resetting slave on new master and set read/write mode on") err = dbhelper.ResetSlave(master.Conn, true) if err != nil { logprint("WARN : Reset slave failed on new master") } err = dbhelper.SetReadOnly(master.Conn, false) if err != nil { logprint("ERROR: Could not set new master as read-write") } cm := "CHANGE MASTER TO master_host='" + master.IP + "', master_port=" + master.Port + ", master_user='******', master_password='******'" if fail == false { // Get latest GTID pos oldMaster.refresh() // Insert a bogus transaction in order to have a new GTID pos on master err = dbhelper.FlushTables(master.Conn) if err != nil { logprint("WARN : Could not flush tables on new master", err) } // Phase 4: Demote old master to slave logprint("INFO : Switching old master as a slave") err = dbhelper.UnlockTables(oldMaster.Conn) if err != nil { logprint("WARN : Could not unlock tables on old master", err) } dbhelper.StopSlave(oldMaster.Conn) // This is helpful because in some cases the old master can have an old configuration running _, err = oldMaster.Conn.Exec("SET GLOBAL gtid_slave_pos='" + oldMaster.BinlogPos + "'") if err != nil { logprint("WARN : Could not set gtid_slave_pos on old master", err) } _, err = oldMaster.Conn.Exec(cm + ", master_use_gtid=slave_pos") if err != nil { logprint("WARN : Change master failed on old master", err) } err = dbhelper.StartSlave(oldMaster.Conn) if err != nil { logprint("WARN : Start slave failed on old master", err) } if readonly { err = dbhelper.SetReadOnly(oldMaster.Conn, true) if err != nil { logprintf("ERROR: Could not set old master as read-only, %s", err) } } // Add the old master to the slaves list oldMaster.State = stateSlave slaves = append(slaves, oldMaster) } // Phase 5: Switch slaves to new master logprint("INFO : Switching other slaves to the new master") for _, sl := range slaves { if fail == false { logprintf("INFO : Waiting for slave %s to sync", sl.URL) dbhelper.MasterPosWait(sl.Conn, oldMaster.BinlogPos) if verbose { sl.log() } } logprint("INFO : Change master on slave", sl.URL) err := dbhelper.StopSlave(sl.Conn) if err != nil { logprintf("WARN : Could not stop slave on server %s, %s", sl.URL, err) } if fail == false { _, err = sl.Conn.Exec("SET GLOBAL gtid_slave_pos='" + oldMaster.BinlogPos + "'") if err != nil { logprintf("WARN : Could not set gtid_slave_pos on slave %s, %s", sl.URL, err) } } _, err = sl.Conn.Exec(cm) if err != nil { logprintf("ERROR: Change master failed on slave %s, %s", sl.URL, err) } err = dbhelper.StartSlave(sl.Conn) if err != nil { logprintf("ERROR: could not start slave on server %s, %s", sl.URL, err) } if readonly { err = dbhelper.SetReadOnly(sl.Conn, true) if err != nil { logprintf("ERROR: Could not set slave %s as read-only, %s", sl.URL, err) } } } if postScript != "" { logprintf("INFO : Calling post-failover script") out, err := exec.Command(postScript, oldMaster.Host, master.Host).CombinedOutput() if err != nil { logprint("ERROR:", err) } logprint("INFO : Post-failover script complete", string(out)) } logprintf("INFO : Master switch on %s complete", master.URL) failCount = 0 if fail == true { failoverCtr++ failoverTs = time.Now().Unix() } return }