func clearDb(dbName string, swarm *swarm.Swarm) error { cmd := []string{} cmd = append(cmd, "mysql") cmd = append(cmd, "-u"+"cup_dba") cmd = append(cmd, "-p"+"111111") cmd = append(cmd, "-S"+"/DBAASDAT/upsql.sock") cmd = append(cmd, "-e drop database qjrtest1") res, err := swarm.Exec(cmd, dbName) if err != nil { log.WithFields(log.Fields{ "DbName": dbName, "err": err.Error(), }).Error("drop qjrtest1 fail") return err } res.Close() cmd = []string{} cmd = append(cmd, "mysql") cmd = append(cmd, "-u"+"cup_dba") cmd = append(cmd, "-p"+"111111") cmd = append(cmd, "-S"+"/DBAASDAT/upsql.sock") cmd = append(cmd, "-e drop database qjrtest2") res1, err := swarm.Exec(cmd, dbName) if err != nil { log.WithFields(log.Fields{ "DbName": dbName, "err": err.Error(), }).Error("drop qjrtest2 fail") return err } res1.Close() return nil }
func clearMaster(dbName string, swm *swarm.Swarm) error { err := swm.MasterReset(dbName) if err != nil { return err } return nil }
func voteSl(swm *swarm.Swarm, topology *structs.Topology) (string, error) { maxMasterLogFileNum := 0 maxReadMasterLogPos := 0 ch := make(chan []interface{}) maxDbName := "" for k, v := range topology.DataNodeGroup["default"] { if v.Type == consts.Slave && v.Status == consts.Normal { dbName := k go func() { masterLogFileNum, readMasterLogPos := swm.PositionGet(dbName) log.WithFields(log.Fields{ "SlaveName": dbName, "MasterLogFileNum": masterLogFileNum, "ReadMasterLogPos": readMasterLogPos, }).Debug("Sl vote") pos := []interface{}{dbName, masterLogFileNum, readMasterLogPos} ch <- pos }() } } for i := 0; i < topology.DataNodeGroupNormalCount["default"]-2; i++ { pos := <-ch dbName := pos[0].(string) masterLogFileNum := pos[1].(int) readMasterLogPos := pos[2].(int) if masterLogFileNum > maxMasterLogFileNum { if readMasterLogPos > maxReadMasterLogPos { maxMasterLogFileNum = masterLogFileNum maxReadMasterLogPos = readMasterLogPos maxDbName = dbName } } } // all sl unavaliable if maxDbName == "" { return "", errors.New( "when isolate M|Sb, all slave are unavilable for vote, isolate failed and return to hope health check enter sl auto isolate") } maxDbInfo := topology.DataNodeGroup["default"][maxDbName] log.WithFields(log.Fields{ "winer": maxDbName + "(" + maxDbInfo.Ip + ":" + strconv.Itoa(maxDbInfo.Port) + ")", }).Debug("Sl vote") return maxDbName, nil }
func createDb(dbName, dbString string, swarm *swarm.Swarm) error { cmd := []string{} cmd = append(cmd, "mysql") cmd = append(cmd, "-u"+"cup_dba") cmd = append(cmd, "-p"+"111111") cmd = append(cmd, "-S"+"/DBAASDAT/upsql.sock") cmd = append(cmd, "-e create database "+dbString) res, err := swarm.Exec(cmd, dbName) if err != nil { log.WithFields(log.Fields{ "DbName": dbName, "err": err.Error(), "Database": dbString, }).Error("create test database fail") return err } res.Close() return nil }
func clearSlave(dbName string, swm *swarm.Swarm) error { slaveStatus, err := swm.SlaveShowStatus(dbName) if err != nil { return err } if slaveStatus.MasterIp == "" { return nil } err = swm.SlaveStop(dbName) if err != nil { return err } err = swm.SlaveReset(dbName) if err != nil { return err } return nil }
func changeMaster(swm *swarm.Swarm, fromName, toIp string, toPort int) error { slaveStatus, err := swm.SlaveShowStatus(fromName) if err != nil { return err } log.WithFields(log.Fields{ "DbName": fromName, "SlaveStatus": slaveStatus, }).Debug("change master") if slaveStatus.MasterIp == toIp && slaveStatus.MasterPort == toPort { if slaveStatus.SlaveIoRunning == "No" && slaveStatus.SlaveSqlRunning == "No" { // 1.4 start slave err = swm.SlaveStart(fromName) if err != nil { return err } } return nil } if slaveStatus.SlaveIoRunning == "Yes" || slaveStatus.SlaveSqlRunning == "Yes" { // 1.1 stop slave err = swm.SlaveStop(fromName) if err != nil { return err } } if slaveStatus.MasterIp != "" { // 1.2 reset slave err = swm.SlaveReset(fromName) if err != nil { return err } } // 1.3 get gtid_executed gtidExecuted, err := swm.GetGtidExecuted(fromName) if err != nil { return err } if gtidExecuted != "" { // 1.4 reset master err = swm.MasterReset(fromName) if err != nil { return err } // 1.5 set gtid_purged err = swm.SetGtidPurged(fromName, gtidExecuted) if err != nil { return err } } // 1.6 change master err = swm.ChangeMaster(fromName, toIp, toPort) if err != nil { return err } // 1.7 start slave err = swm.SlaveStart(fromName) if err != nil { return err } return nil }
// no need RLock func RecoverDb(dbName string, rwMutex *sync.RWMutex, topology *structs.Topology, swm *swarm.Swarm) (error, []string) { var err error var steps []string defer func() { if err != nil { log.WithFields(log.Fields{ "DbName": dbName, "err": err.Error(), }).Error("recover fail") } else { log.WithFields(log.Fields{ "DbName": dbName, }).Info("recover success") } }() var recoverDbInfo *structs.DatabaseInfo for k, v := range topology.DataNodeGroup["default"] { if k == dbName { recoverDbInfo = v break } } if recoverDbInfo == nil { return errors.New("recover db node not found"), steps } // 1. check m slave status if recoverDbInfo.Type == consts.StandBy || recoverDbInfo.Type == consts.Slave { mName, _ := getMaster(topology) var masterSlStatus structs.SlaveStatus masterSlStatus, err = swm.SlaveShowStatus(mName) if err != nil { steps = append(steps, fmt.Sprintf("Step1. Clear M status fail: %s\n", err.Error())) return err, steps } if masterSlStatus.MasterIp != "" { // 1.1 stop m slave err = swm.SlaveStop(mName) if err != nil { steps = append(steps, fmt.Sprintf("Step1. Clear M status fail: %s\n", err.Error())) return err, steps } // 1.2 reset m slave err = swm.SlaveReset(mName) if err != nil { steps = append(steps, fmt.Sprintf("Step1. Clear M status fail: %s\n", err.Error())) return err, steps } } } steps = append(steps, "Step1. Clear M status success\n") // 2. recover change master if recoverDbInfo.Type == consts.StandBy { mName, mDbInfo := getMaster(topology) err = changeMaster(swm, dbName, mDbInfo.Ip, mDbInfo.Port) if err != nil { log.WithFields(log.Fields{ "Sb": dbName + "(" + recoverDbInfo.Ip + ":" + strconv.Itoa(recoverDbInfo.Port) + ")", "M": mName + "(" + mDbInfo.Ip + ":" + strconv.Itoa(mDbInfo.Port) + ")", "err:": err.Error(), }).Error("change master fail, return") steps = append(steps, fmt.Sprintf("Step2. Sb change master to M fail: %s\n", err.Error())) return err, steps } log.WithFields(log.Fields{ "Sb": dbName + "(" + recoverDbInfo.Ip + ":" + strconv.Itoa(recoverDbInfo.Port) + ")", "M": mName + "(" + mDbInfo.Ip + ":" + strconv.Itoa(mDbInfo.Port) + ")", }).Debug("change master success") steps = append(steps, "Step2. Sb change master to M success\n") } else if recoverDbInfo.Type == consts.Slave { sbName, sbDbInfo := getStandBy(topology) err = changeMaster(swm, dbName, sbDbInfo.Ip, sbDbInfo.Port) if err != nil { log.WithFields(log.Fields{ "Sl": dbName + "(" + recoverDbInfo.Ip + ":" + strconv.Itoa(recoverDbInfo.Port) + ")", "Sb": sbName + "(" + sbDbInfo.Ip + ":" + strconv.Itoa(sbDbInfo.Port) + ")", "err:": err.Error(), }).Error("change master fail, return") steps = append(steps, fmt.Sprintf("Step2. Sl change master to Sb fail: %s\n", err.Error())) return err, steps } log.WithFields(log.Fields{ "Sl": dbName + "(" + recoverDbInfo.Ip + ":" + strconv.Itoa(recoverDbInfo.Port) + ")", "Sb": sbName + "(" + sbDbInfo.Ip + ":" + strconv.Itoa(sbDbInfo.Port) + ")", }).Info("change master success") steps = append(steps, "Step2. Sl change master to Sb success\n") } // 3. update topology rwMutex.Lock() dbInfo := getDbInfo(dbName, topology) dbInfo.Status = consts.Normal updateVersionAndNormalCount(topology, 1) rwMutex.Unlock() return nil, steps }
// no need RLock func IsolateDB(dbName string, rwMutex *sync.RWMutex, topology *structs.Topology, swm *swarm.Swarm) (error, []string) { var err error steps := []string{} defer func() { if err != nil && err.Error() != "" { log.WithFields(log.Fields{ "DbName": dbName, "err": err.Error(), }).Error("isolate fail") } else { log.WithFields(log.Fields{ "DbName": dbName, }).Info("isolate success") } }() var isolateType string for k, v := range topology.DataNodeGroup["default"] { if k == dbName { isolateType = v.Type break } } if isolateType == "" { err = errors.New("isolate db node not found") return err, steps } btopology, _ := json.MarshalIndent(topology, "", " ") log.WithFields(log.Fields{ "IsolateDbType": isolateType, "DbName": dbName, "topology": string(btopology), }).Debug("isolate db") // only m normal || // isolate sb && only m sb normal || // isolate sl // no need swarm, update topology and return if topology.DataNodeGroupNormalCount["default"] == 1 || (isolateType == consts.StandBy && topology.DataNodeGroupNormalCount["default"] == 2) || isolateType == consts.Slave { rwMutex.Lock() topology.DataNodeGroup["default"][dbName].Status = consts.Abnormal updateVersionAndNormalCount(topology, -1) rwMutex.Unlock() steps = append(steps, fmt.Sprintf("Step1. Isolate %s success\n", dbName)) return nil, steps } if isolateType == consts.Master { // 1.sb checkGtidDiff sbName, sbDbInfo := getStandBy(topology) err = swm.IsDelay(sbName) if err != nil { log.WithFields(log.Fields{ "DbName": sbName, "err": err.Error(), }).Warn("Sb delay check fail, continue") err = nil } // 1. end ******************** // m sb normal // update topology and return if topology.DataNodeGroupNormalCount["default"] == 2 { // 3.update topology _, mDbInfo := getMaster(topology) rwMutex.Lock() sbDbInfo.Type = consts.Master // sb -> m mDbInfo.Type = consts.StandBy // m -> sb mDbInfo.Status = consts.Abnormal updateVersionAndNormalCount(topology, -1) steps = append(steps, fmt.Sprintf("Step1. Isolate %s success\n", dbName)) rwMutex.Unlock() return nil, steps // ******************** } // has normal sl // 2.vote sb var maxGtidDbName string maxGtidDbName, err = voteSl(swm, topology) if err != nil { steps = append(steps, fmt.Sprintf("Step2. Vote new Sb fail: %s\n", err.Error())) return err, steps } steps = append(steps, fmt.Sprintf("Step2. Vote new Sb success:%s\n", maxGtidDbName)) // 2. end ******************** // 3.update topology rwMutex.Lock() mName, mDbInfo := getMaster(topology) mDbInfo.Type = consts.Slave // m -> sl mDbInfo.Status = consts.Abnormal sbDbInfo.Type = consts.Master // sb -> master slDbInfo := topology.DataNodeGroup["default"][maxGtidDbName] slDbInfo.Type = consts.StandBy // maxGtidSl -> sb updateVersionAndNormalCount(topology, -1) rwMutex.Unlock() // 3. end ******************** // 4.for other sl(except origin m) changeMaster to newSb errs := otherSlChangeMasterParallel(topology, mName, maxGtidDbName, slDbInfo, swm) if len(errs) != 0 { steps = append(steps, fmt.Sprintf("Step3. Other Sl change master to new Sb fail:%v\n", errs)) } else { steps = append(steps, "Step3. Other Sl change master to new Sb success\n") } // 4. end ******************** } else if isolateType == consts.StandBy { sbName, sbDbInfo := getStandBy(topology) // has normal sl // 1.vote sb var maxGtidDbName string maxGtidDbName, err = voteSl(swm, topology) if err != nil { return err, steps } // 1. end ******************** // no need check gtidDiff // 2. newsb change master to master mName, mDbInfo := getMaster(topology) slDbInfo := topology.DataNodeGroup["default"][maxGtidDbName] err = changeMaster(swm, maxGtidDbName, mDbInfo.Ip, mDbInfo.Port) if err != nil { log.WithFields(log.Fields{ "Sb": maxGtidDbName + ":" + slDbInfo.Ip + strconv.Itoa(slDbInfo.Port), "M": mName + ":" + mDbInfo.Ip + strconv.Itoa(mDbInfo.Port), "err:": err.Error(), }).Error("new Sb change master to M fail, return") steps = append(steps, fmt.Sprintf("Step1. New Sb change master to M fail:%s\n", err.Error())) return err, steps } log.WithFields(log.Fields{ "Sb": maxGtidDbName + ":" + slDbInfo.Ip + strconv.Itoa(slDbInfo.Port), "M": mName + ":" + mDbInfo.Ip + strconv.Itoa(mDbInfo.Port), "IsolateType": isolateType, }).Debug("new Sb change master to M success") steps = append(steps, "Step1. New Sb change master to M success\n") // 2. end ******************** // 3.update topology rwMutex.Lock() sbDbInfo.Type = consts.Slave // sb -> sl sbDbInfo.Status = consts.Abnormal slDbInfo.Type = consts.StandBy // maxGtidSl -> sb updateVersionAndNormalCount(topology, -1) rwMutex.Unlock() // 3. end ******************** // 4.for other sl(except origin sb) changeMaster to newSb errs := otherSlChangeMasterParallel(topology, sbName, maxGtidDbName, slDbInfo, swm) if len(errs) != 0 { steps = append(steps, fmt.Sprintf("Step2. Other Sl change master to new Sb fail:%v\n", errs)) } else { steps = append(steps, "Step2. Other Sl change master to new Sb success\n") } // 4. end ******************** } else { return errors.New("Unhandle isolate logic"), steps } return nil, steps }
func checkTopo(swarm *swarm.Swarm, topology *structs.Topology, mCheck bool, t *testing.T) error { log.Println("***CheckTopo start***") var mName, mIp, sbName, sbIp string var mPort, sbPort int for k, v := range topology.DataNodeGroup["default"] { if v.Type == consts.Master && v.Status == consts.Normal { mName = k mIp = v.Ip mPort = v.Port } else if v.Type == consts.StandBy && v.Status == consts.Normal { sbName = k sbIp = v.Ip sbPort = v.Port } } if mCheck { // check m mSlaveStatus, err := swarm.SlaveShowStatus(mName) if err != nil { return fmt.Errorf(err.Error()) } if mSlaveStatus.MasterIp != "" || mSlaveStatus.MasterPort != 0 { return fmt.Errorf("master %s init/isolate/recover err, slave status:%#v\n", mName, mSlaveStatus) } } if sbName == "" { log.Println("***CheckTopo end***") return nil } // check sb sbSlaveStatus, err := swarm.SlaveShowStatus(sbName) if err != nil { return fmt.Errorf(err.Error()) } if sbSlaveStatus.MasterIp != mIp || sbSlaveStatus.MasterPort != mPort || sbSlaveStatus.SlaveIoRunning != "Yes" || sbSlaveStatus.SlaveSqlRunning != "Yes" { return fmt.Errorf("standby %s init/isolate/recover err, slave status:%#v\n", sbName, sbSlaveStatus) } // check sl if topology.DataNodeGroupNormalCount["default"] > 2 { for k, v := range topology.DataNodeGroup["default"] { if v.Type == consts.Slave && v.Status == consts.Normal { slSlaveStatus, err := swarm.SlaveShowStatus(k) if err != nil { return fmt.Errorf(err.Error()) } if slSlaveStatus.MasterIp != sbIp || slSlaveStatus.MasterPort != sbPort || slSlaveStatus.SlaveIoRunning != "Yes" || slSlaveStatus.SlaveSqlRunning != "Yes" { return fmt.Errorf("slave %s init/isolate/recover err, slave status:%#v\n", k, sbSlaveStatus) } } } } log.Println("***CheckTopo end***") return nil }