func BeforePromote(oldMaster string) error {
	conn, err := zkhelper.ConnectToZk(*zkAddr)
	if err != nil {
		log.Errorf("connect to zk error %v, give up failover", err)
		return failover.ErrGiveupFailover
	}
	defer conn.Close()

	groups, err := models.ServerGroups(conn, *productName)
	if err != nil {
		log.Errorf("get server groups error %v, give up failover", err)
		return failover.ErrGiveupFailover
	}

	found := false
	for _, group := range groups {
		for _, server := range group.Servers {
			if server.Addr == oldMaster {
				found = true
				break
			}
		}
		if found {
			break
		}
	}
	if !found {
		log.Errorf("can not find %s in any group, give up failover", oldMaster)
		return failover.ErrGiveupFailover
	}
	return nil
}
func (c *Conn) Run() {
	defer func() {
		r := recover()
		if err, ok := r.(error); ok {
			const size = 4096
			buf := make([]byte, size)
			buf = buf[:runtime.Stack(buf, false)]
			log.Errorf("lastCmd %s, %v, %s", c.lastCmd, err, buf)
		}
		c.Close()
	}()

	for {
		c.alloc.Reset()
		data, err := c.readPacket()
		if err != nil {
			if err.Error() != io.EOF.Error() {
				log.Info(err)
			}
			return
		}

		if err := c.dispatch(data); err != nil {
			log.Errorf("dispatch error %s, %s", errors.ErrorStack(err), c)
			if err != mysql.ErrBadConn { //todo: fix this
				c.writeError(err)
			}
		}

		c.pkg.Sequence = 0
	}
}
func handleCrashedServer(s *models.Server) error {
	switch s.Type {
	case models.SERVER_TYPE_MASTER:
		// get a slave and promote it
		slave, err := getSlave(s)
		if err != nil {
			log.Warning(errors.ErrorStack(err))
			return err
		}

		log.Infof("try promote %+v", slave)
		err = callHttp(nil, genUrl(*apiServer, "/api/server_group/", slave.GroupId, "/promote"), "POST", slave)
		if err != nil {
			log.Errorf("do promote %v failed %v", slave, errors.ErrorStack(err))
			return err
		}
		refreshSlave(s) // refresh the remaining slaves
	case models.SERVER_TYPE_SLAVE:
		log.Errorf("slave is down: %+v", s)
	case models.SERVER_TYPE_OFFLINE:
		// no need to handle it
	default:
		log.Fatalf("unknown type %+v", s)
	}

	return nil
}
// slotsReloadLoop waits on the slot reload chan and reloads the cluster
// topology at most once every slotReloadInterval. It also reloads the
// topology at a longer periodic interval.
func (d *Dispatcher) slotsReloadLoop() {
	periodicReloadInterval := 60 * time.Second
	for {
		select {
		case <-time.After(d.slotReloadInterval):
			select {
			case _, ok := <-d.slotReloadChan:
				if !ok {
					log.Infof("exit reload slot table loop")
					return
				}
				log.Infof("request reload triggered")
				if slotInfos, err := d.reloadTopology(); err != nil {
					log.Errorf("reload slot table failed: %v", err)
				} else {
					d.slotInfoChan <- slotInfos
				}
			case <-time.After(periodicReloadInterval):
				log.Infof("periodic reload triggered")
				if slotInfos, err := d.reloadTopology(); err != nil {
					log.Errorf("reload slot table failed: %v", err)
				} else {
					d.slotInfoChan <- slotInfos
				}
			}
		}
	}
}
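// Editor's sketch: how a caller might request a reload, given the loop above.
// This helper is not in the original source and assumes slotReloadChan is a
// buffered chan struct{} owned by the Dispatcher. The outer time.After gate
// already coalesces requests to at most one reload per slotReloadInterval,
// so a non-blocking send that drops duplicates is enough.
func (d *Dispatcher) requestReload() {
	select {
	case d.slotReloadChan <- struct{}{}:
		// reload request queued
	default:
		// a reload request is already pending; drop the duplicate
	}
}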
func (s *MemcacheStats) publishMainStats() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.main = make(map[string]string)
	for key, isstr := range mainStringMetrics {
		key := key // capture a per-iteration copy for the closures below
		if isstr {
			s.main[key] = ""
			stats.Publish(s.cachePool.name+"Memcache"+formatKey(key), stats.StringFunc(func() string {
				s.mu.Lock()
				defer s.mu.Unlock()
				return s.main[key]
			}))
		} else {
			s.main[key] = "0"
			stats.Publish(s.cachePool.name+"Memcache"+formatKey(key), stats.IntFunc(func() int64 {
				s.mu.Lock()
				defer s.mu.Unlock()
				ival, err := strconv.ParseInt(s.main[key], 10, 64)
				if err != nil {
					log.Errorf("value '%v' for key %v is not an int", s.main[key], key)
					internalErrors.Add("MemcacheStats", 1)
					return -1
				}
				return ival
			}))
		}
	}
}
func WaitForReceiverWithTimeout(zkConn zkhelper.Conn, productName string, actionZkPath string, proxies []ProxyInfo, timeoutInMs int) error {
	if len(proxies) == 0 {
		return nil
	}

	times := 0
	proxyIds := make(map[string]struct{})
	var offlineProxyIds []string
	for _, p := range proxies {
		proxyIds[p.Id] = struct{}{}
	}
	checkTimes := timeoutInMs / 500
	// check every 500ms
	for times < checkTimes {
		if times >= 6 && (times*500)%1000 == 0 {
			log.Warning("abnormal waiting time for receivers", actionZkPath, offlineProxyIds)
		}
		// get confirm ids
		nodes, _, err := zkConn.Children(actionZkPath)
		if err != nil {
			return errors.Trace(err)
		}
		confirmIds := make(map[string]struct{})
		for _, node := range nodes {
			id := path.Base(node)
			confirmIds[id] = struct{}{}
		}
		if len(confirmIds) != 0 {
			match := true
			// check whether all proxies have responded
			var notMatchList []string
			for id := range proxyIds {
				// if a proxy id is not in confirmIds, that proxy has not responded yet
				if _, ok := confirmIds[id]; !ok {
					match = false
					notMatchList = append(notMatchList, id)
				}
			}
			if match {
				return nil
			}
			offlineProxyIds = notMatchList
		}
		times++
		time.Sleep(500 * time.Millisecond)
	}
	if len(offlineProxyIds) > 0 {
		log.Error("proxies didn't respond: ", offlineProxyIds)
	}
	// mark unresponsive proxies offline
	for _, id := range offlineProxyIds {
		log.Errorf("mark proxy %s to PROXY_STATE_MARK_OFFLINE", id)
		if err := SetProxyStatus(zkConn, productName, id, PROXY_STATE_MARK_OFFLINE); err != nil {
			return errors.Trace(err)
		}
	}
	return errors.Trace(ErrReceiverTimeout)
}
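// Timing sketch for WaitForReceiverWithTimeout (editor's note, values
// hypothetical): with timeoutInMs = 30000 the loop polls zk children every
// 500ms for checkTimes = 60 rounds, i.e. the full timeout. The "abnormal
// waiting time" warning first fires once times reaches 6 (3 seconds in) and
// repeats on every even round thereafter because of the %1000 check.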
func (self *ResMan) handleMesosError(t *cmdMesosError) {
	defer func() {
		t.wait <- struct{}{}
	}()
	log.Errorf("%s\n", t.err)
}
func Raw(t byte, val Value, isUnsigned bool) []byte {
	if val == nil {
		return nil
	}

	var ret []byte
	switch t {
	case MYSQL_TYPE_TINY, MYSQL_TYPE_SHORT, MYSQL_TYPE_INT24,
		MYSQL_TYPE_LONG, MYSQL_TYPE_LONGLONG, MYSQL_TYPE_YEAR:
		if isUnsigned {
			ret = []byte(strconv.FormatUint(val.(uint64), 10))
		} else {
			ret = []byte(strconv.FormatInt(val.(int64), 10))
		}
	case MYSQL_TYPE_FLOAT, MYSQL_TYPE_DOUBLE:
		ret = []byte(strconv.FormatFloat(val.(float64), 'f', 16, 64))
	case MYSQL_TYPE_VARCHAR:
		str, ok := val.(string)
		if ok {
			ret = hack.Slice(str)
			break
		}
		fallthrough
	default:
		var ok bool
		ret, ok = val.([]byte)
		if !ok {
			log.Errorf("%v, %+v, %T", t, val, val)
		}
	}
	return ret
}
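// Editor's sketch of typical Raw conversions. The type assertions above
// suggest Value is an interface type, so plain values can be passed directly;
// MYSQL_TYPE_BLOB is assumed here only to exercise the default branch.
func exampleRaw() {
	a := Raw(MYSQL_TYPE_LONG, int64(-42), false)    // []byte("-42")
	b := Raw(MYSQL_TYPE_LONGLONG, uint64(7), true)  // []byte("7")
	c := Raw(MYSQL_TYPE_BLOB, []byte("abc"), false) // []byte("abc") via the default branch
	_, _, _ = a, b, c
}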
func (s *MemcacheStats) updateItemsStats() {
	if s.items == nil {
		return
	}
	s.readStats("items", func(sKey, sValue string) {
		ival, err := strconv.ParseInt(sValue, 10, 64)
		if err != nil {
			log.Error(err)
			internalErrors.Add("MemcacheStats", 1)
			return
		}
		subkey, slabid, err := parseItemKey(sKey)
		if err != nil {
			log.Error(err)
			internalErrors.Add("MemcacheStats", 1)
			return
		}
		m, ok := s.items[subkey]
		if !ok {
			log.Errorf("Unknown memcache items stats %v %v: %v", subkey, slabid, ival)
			internalErrors.Add("MemcacheStats", 1)
			return
		}
		m[slabid] = ival
	})
}
func refreshSlave(master *models.Server) {
	var group models.ServerGroup
	err := callHttp(&group, genUrl(*apiServer, "/api/server_group/", master.GroupId), "GET", nil)
	if err != nil {
		log.Errorf("slave refresh failed: %v", errors.Trace(err))
		return
	}
	for _, s := range group.Servers {
		if s.Type == models.SERVER_TYPE_SLAVE {
			err := callHttp(nil, genUrl(*apiServer, "/api/server_group/", master.GroupId, "/addServer"), "PUT", s)
			if err != nil {
				log.Errorf("slave refresh failed: %v error: %v", s, errors.ErrorStack(err))
			}
		}
	}
}
func (s *Server) OnSlotRangeChange(param *models.SlotMultiSetParam) {
	log.Warningf("slotRangeChange %+v", param)
	if !validSlot(param.From) || !validSlot(param.To) {
		log.Errorf("invalid slot number, %+v", param)
		return
	}

	for i := param.From; i <= param.To; i++ {
		switch param.Status {
		case models.SLOT_STATUS_OFFLINE:
			s.clearSlot(i)
		case models.SLOT_STATUS_ONLINE:
			s.fillSlot(i, true)
		default:
			log.Errorf("can not handle status %v", param.Status)
		}
	}
}
func (s *Server) Run() error {
	for {
		conn, err := s.listener.Accept()
		if err != nil {
			log.Errorf("accept error %s", err.Error())
			return err
		}

		go s.onConn(conn)
	}
}
func (s *MemcacheStats) readStats(k string, proc func(key, value string)) {
	defer func() {
		if x := recover(); x != nil {
			log.Errorf("Could not read memcache stats: %v", x)
			internalErrors.Add("MemcacheStats", 1)
		}
	}()

	conn := s.cachePool.Get(0)
	// This is not the same as defer s.cachePool.Put(conn): conn may be set
	// to nil below, and the deferred closure sees the updated value.
	defer func() { s.cachePool.Put(conn) }()

	stats, err := conn.Stats(k)
	if err != nil {
		conn.Close()
		conn = nil
		log.Errorf("Cannot export memcache %v stats: %v", k, err)
		internalErrors.Add("MemcacheStats", 1)
		return
	}

	s.mu.Lock()
	defer s.mu.Unlock()
	st := string(stats)
	lines := strings.Split(st, "\n")
	for _, line := range lines {
		if line == "" {
			continue
		}
		items := strings.Split(line, " ")
		if len(items) < 3 {
			// the apt-get build may report extra tokens, e.g. "STAT version 1.4.14 (Ubuntu)",
			// so only lines with fewer than 3 tokens are rejected
			log.Errorf("Unexpected stats: %v", line)
			internalErrors.Add("MemcacheStats", 1)
			continue
		}
		proc(items[1], items[2])
	}
}
func WaitForReceiver(zkConn zkhelper.Conn, productName string, actionZkPath string, proxies []ProxyInfo) error {
	if len(proxies) == 0 {
		return nil
	}

	times := 0
	var proxyIds []string
	var offlineProxyIds []string
	for _, p := range proxies {
		proxyIds = append(proxyIds, p.Id)
	}
	sort.Strings(proxyIds)
	// check every 500ms, up to 30s in total
	for times < 60 {
		if times >= 6 && (times*500)%1000 == 0 {
			log.Warning("abnormal waiting time for receivers", actionZkPath)
		}
		nodes, _, err := zkConn.Children(actionZkPath)
		if err != nil {
			return errors.Trace(err)
		}
		var confirmIds []string
		for _, node := range nodes {
			id := path.Base(node)
			confirmIds = append(confirmIds, id)
		}
		if len(confirmIds) != 0 {
			sort.Strings(confirmIds)
			if utils.Strings(proxyIds).Eq(confirmIds) {
				return nil
			}
			offlineProxyIds = proxyIds[len(confirmIds)-1:]
		}
		times++
		time.Sleep(500 * time.Millisecond)
	}
	if len(offlineProxyIds) > 0 {
		log.Error("proxies didn't respond: ", offlineProxyIds)
	}
	// mark unresponsive proxies offline
	for _, id := range offlineProxyIds {
		log.Errorf("mark proxy %s to PROXY_STATE_MARK_OFFLINE", id)
		if err := SetProxyStatus(zkConn, productName, id, PROXY_STATE_MARK_OFFLINE); err != nil {
			return err
		}
	}
	return ErrReceiverTimeout
}
func (s *MemcacheStats) updateSlabsStats() {
	if s.slabs == nil {
		return
	}
	s.readStats("slabs", func(sKey, sValue string) {
		ival, err := strconv.ParseInt(sValue, 10, 64)
		if err != nil {
			log.Error(err)
			internalErrors.Add("MemcacheStats", 1)
			return
		}
		if slabsSingleMetrics[sKey] {
			m, ok := s.slabs[sKey]
			if !ok {
				log.Errorf("Unknown memcache slabs stats %v: %v", sKey, ival)
				internalErrors.Add("MemcacheStats", 1)
				return
			}
			m[""] = ival
			return
		}
		subkey, slabid, err := parseSlabKey(sKey)
		if err != nil {
			log.Error(err)
			internalErrors.Add("MemcacheStats", 1)
			return
		}
		m, ok := s.slabs[subkey]
		if !ok {
			log.Errorf("Unknown memcache slabs stats %v %v: %v", subkey, slabid, ival)
			internalErrors.Add("MemcacheStats", 1)
			return
		}
		m[slabid] = ival
	})
}
func genUrl(args ...interface{}) string {
	url := "http://"
	for _, v := range args {
		switch v := v.(type) {
		case string:
			url += v
		case int:
			url += strconv.Itoa(v)
		default:
			log.Errorf("unsupported type %T", v)
		}
	}
	return url
}
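// Editor's sketch of genUrl in use, mirroring the calls made elsewhere in
// this file; the host and group id are hypothetical.
func exampleGenUrl() {
	url := genUrl("10.0.0.1:18087", "/api/server_group/", 1, "/promote")
	// url == "http://10.0.0.1:18087/api/server_group/1/promote"
	_ = url
}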
func (self *ShellExecutor) OnKillTask(driver *mesos.ExecutorDriver, tid mesos.TaskID) {
	taskId := tid.GetValue()
	log.Warningf("OnKillTask %s", taskId)

	self.lock.Lock()
	defer self.lock.Unlock()

	if cmd, ok := self.process[taskId]; ok {
		err := cmd.Process.Kill()
		if err != nil {
			log.Errorf("kill taskId %s failed, err:%v", taskId, err)
		}
	}

	log.Error("send kill state")
	self.sendStatusUpdate(tid.GetValue(), mesos.TaskState_TASK_KILLED, "task killed by framework!")
}
func (si *SchemaInfo) override() {
	for _, override := range si.overrides {
		table, ok := si.tables[override.Name]
		if !ok {
			log.Warningf("Table not found for override: %v, %v", override, si.tables)
			continue
		}
		if override.PKColumns != nil {
			log.Infof("SetPK Table name %s, pk %v", override.Name, override.PKColumns)
			if err := table.SetPK(override.PKColumns); err != nil {
				log.Errorf("%s: %v", errors.ErrorStack(err), override)
				continue
			}
		}
		if si.cachePool.IsClosed() || override.Cache == nil {
			log.Infof("%+v", override)
			continue
		}
		switch override.Cache.Type {
		case "RW":
			table.CacheType = schema.CACHE_RW
			table.Cache = NewRowCache(table, si.cachePool)
		case "W":
			table.CacheType = schema.CACHE_W
			if len(override.Cache.Table) == 0 {
				log.Warningf("Incomplete cache specs: %v", override)
				continue
			}
			totable, ok := si.tables[override.Cache.Table]
			if !ok {
				log.Warningf("Table not found: %v", override)
				continue
			}
			if totable.Cache == nil {
				log.Warningf("Table has no cache: %v", override)
				continue
			}
			table.Cache = totable.Cache
		default:
			log.Warningf("Ignoring cache override: %+v", override)
		}
	}
}
func main() {
	logging.SetRotateByHour()
	// logging.SetRotateByDay()

	err := logging.SetOutputByName("example.log")
	checkError(err)

	ticker := time.NewTicker(10 * time.Second)
	for range ticker.C {
		logging.Debug(time.Now().String())
		logging.Info(time.Now().String())
		logging.Warningf("%s", time.Now().String())
		logging.Errorf("%s", time.Now().String())
	}
}
func (top *Topology) GetSlotByIndex(i int) (*models.Slot, *models.ServerGroup, error) {
	slot, err := models.GetSlot(top.zkConn, top.ProductName, i)
	if err != nil {
		return nil, nil, errors.Trace(err)
	}

	log.Debugf("get slot %d: %+v", i, slot)
	if slot.State.Status != models.SLOT_STATUS_ONLINE && slot.State.Status != models.SLOT_STATUS_MIGRATE {
		log.Errorf("slot not online, %+v", slot)
	}

	groupServer, err := models.GetGroup(top.zkConn, top.ProductName, slot.GroupId)
	if err != nil {
		return nil, nil, errors.Trace(err)
	}

	return slot, groupServer, nil
}
func (mc *MultiKeyCmd) CoalesceRsp() *PipelineResponse {
	plRsp := &PipelineResponse{}
	var rsp *resp.Data
	switch mc.CmdType() {
	case MGET:
		rsp = &resp.Data{T: resp.T_Array, Array: make([]*resp.Data, mc.numSubCmds)}
	case MSET:
		rsp = OK_DATA
	case DEL:
		rsp = &resp.Data{T: resp.T_Integer}
	default:
		panic("invalid multi key cmd name")
	}
	for i, subCmdRsp := range mc.subCmdRsps {
		if subCmdRsp.err != nil {
			rsp = &resp.Data{T: resp.T_Error, String: []byte(subCmdRsp.err.Error())}
			break
		}
		reader := bufio.NewReader(bytes.NewReader(subCmdRsp.rsp.Raw()))
		data, err := resp.ReadData(reader)
		if err != nil {
			log.Errorf("re-parse response err=%s", err)
			rsp = &resp.Data{T: resp.T_Error, String: []byte(err.Error())}
			break
		}
		if data.T == resp.T_Error {
			rsp = data
			break
		}
		switch mc.CmdType() {
		case MGET:
			rsp.Array[i] = data
		case MSET:
			// any single OK stands for all of them
		case DEL:
			rsp.Integer += data.Integer
		default:
			panic("invalid multi key cmd name")
		}
	}
	plRsp.rsp = resp.NewObjectFromData(rsp)
	return plRsp
}
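// Editor's worked sketch of the coalescing above, with hypothetical traffic:
// a client "DEL a b c" fans out as three single-key DELs; if the sub-replies
// carry Integer values 1, 0 and 1, the coalesced reply is :2. For MGET each
// sub-reply lands at its original index in rsp.Array, and for MSET the shared
// OK_DATA stands in for every sub-command. The first sub-command error, if
// any, becomes the whole reply.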
func (self *ShellExecutor) OnKillTask(driver *mesos.ExecutorDriver, tid mesos.TaskID) {
	taskId := tid.GetValue()
	log.Warningf("OnKillTask %s", taskId)

	self.lock.Lock()
	defer self.lock.Unlock()

	if contex, ok := self.process[taskId]; ok {
		ret, _ := exec.Command("pgrep", "-P", strconv.Itoa(contex.cmd.Process.Pid)).Output()
		log.Debug("children process", string(ret))
		log.Debug("pid", contex.cmd.Process.Pid)

		ret, err := exec.Command("pkill", "-P", strconv.Itoa(contex.cmd.Process.Pid)).Output()
		if err != nil {
			log.Errorf("kill taskId %s failed, err:%v", taskId, err)
		}
		log.Debugf("kill taskId %s result %v", taskId, ret)

		contex.statusFile.Stop()
	}

	//log.Error("send kill state")
	//self.sendStatusUpdate(tid.GetValue(), mesos.TaskState_TASK_KILLED, "")
}
func (s *Server) onConn(c net.Conn) {
	conn := s.newConn(c)
	if err := conn.Handshake(); err != nil {
		log.Errorf("handshake error %s", errors.ErrorStack(err))
		c.Close()
		return
	}

	const key = "connections"
	s.IncCounter(key)
	defer func() {
		s.DecCounter(key)
		log.Infof("close %s", conn)
	}()

	s.rwlock.Lock()
	s.clients[conn.connectionId] = conn
	s.rwlock.Unlock()

	conn.Run()
}
func decodeArgs(cmd uint32, buf []byte) ([][]byte, bool) {
	argc := common.ArgCount(cmd)
	//log.Debug("cmd:", common.CmdDescription(cmd), "details:", buf)
	if argc == 0 {
		return nil, true
	}

	args := make([][]byte, 0, argc)
	if argc == 1 {
		args = append(args, buf)
		return args, true
	}

	endPos := 0
	cnt := 0
	// the first argc-1 arguments are NUL-terminated; the remainder of the
	// buffer is the final argument
	for ; cnt < argc-1 && endPos < len(buf); cnt++ {
		startPos := endPos
		pos := bytes.IndexByte(buf[startPos:], 0x0)
		if pos == -1 {
			log.Warning("invalid protocol")
			return nil, false
		}
		endPos = startPos + pos
		args = append(args, buf[startPos:endPos])
		endPos++ // skip the NUL separator
	}

	args = append(args, buf[endPos:]) // opaque data of the last argument
	cnt++

	if cnt != argc {
		log.Errorf("argc not match %d-%d", argc, len(args))
		return nil, false
	}

	return args, true
}
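// Editor's sketch of the wire format decodeArgs expects: for a command whose
// argc is 3, the first two arguments are NUL-terminated and the remainder of
// the buffer is the final (possibly binary) argument. someThreeArgCmd is a
// hypothetical stand-in for a real command constant.
func exampleDecodeArgs() {
	buf := []byte("handle\x00func\x00opaque-data")
	args, ok := decodeArgs(someThreeArgCmd, buf)
	// ok == true; args == [][]byte{"handle", "func", "opaque-data"}
	_, _ = args, ok
}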
func handleAddServer(s *models.Server) {
	s.Type = models.SERVER_TYPE_SLAVE
	log.Infof("try reusing slave %+v", s)
	err := callHttp(nil, genUrl(*apiServer, "/api/server_group/", s.GroupId, "/addServer"), "PUT", s)
	if err != nil {
		log.Errorf("do reusing slave %v failed %v", s, errors.ErrorStack(err))
	}
}
func (self *session) handleConnection(s *Server, conn net.Conn) {
	sessionId := s.allocSessionId()
	inbox := make(chan []byte, 200)
	out := make(chan []byte, 200)
	defer func() {
		if self.w != nil || self.c != nil {
			e := &event{tp: ctrlCloseSession, fromSessionId: sessionId, result: createResCh()}
			s.protoEvtCh <- e
			<-e.result
			close(inbox) // notify writer to quit
		}
	}()

	log.Debug("new sessionId", sessionId, "address:", conn.RemoteAddr())

	go queueingWriter(inbox, out)
	go writer(conn, out)

	r := bufio.NewReaderSize(conn, 256*1024)
	//todo: 1. reuse each event's result channel, create less garbage.
	//2. this heavily relies on goroutine switching; sending the reply in the
	//event loop would be faster, but the logic would not be as clean, so it
	//stays as-is for now.
	for {
		tp, buf, err := ReadMessage(r)
		if err != nil {
			log.Debug(err, "sessionId", sessionId)
			return
		}
		args, ok := decodeArgs(tp, buf)
		if !ok {
			log.Debug("tp:", CmdDescription(tp), "argc not match", "details:", string(buf))
			return
		}

		log.Debug("sessionId", sessionId, "tp:", CmdDescription(tp), "len(args):", len(args), "details:", string(buf))

		switch tp {
		case CAN_DO, CAN_DO_TIMEOUT: //todo: CAN_DO_TIMEOUT timeout support
			self.w = self.getWorker(sessionId, inbox, conn)
			s.protoEvtCh <- &event{tp: tp, args: &Tuple{
				t0: self.w, t1: string(args[0])}}
		case CANT_DO:
			s.protoEvtCh <- &event{tp: tp, fromSessionId: sessionId,
				args: &Tuple{t0: string(args[0])}}
		case ECHO_REQ:
			sendReply(inbox, ECHO_RES, [][]byte{buf})
		case PRE_SLEEP:
			self.w = self.getWorker(sessionId, inbox, conn)
			s.protoEvtCh <- &event{tp: tp, args: &Tuple{t0: self.w}, fromSessionId: sessionId}
		case SET_CLIENT_ID:
			self.w = self.getWorker(sessionId, inbox, conn)
			s.protoEvtCh <- &event{tp: tp, args: &Tuple{t0: self.w, t1: string(args[0])}}
		case GRAB_JOB_UNIQ:
			if self.w == nil {
				log.Errorf("can't perform %s, need to send CAN_DO first", CmdDescription(tp))
				return
			}
			e := &event{tp: tp, fromSessionId: sessionId, result: createResCh()}
			s.protoEvtCh <- e
			job := (<-e.result).(*Job)
			if job == nil {
				log.Debug("sessionId", sessionId, "no job")
				sendReplyResult(inbox, nojobReply)
				break
			}

			//log.Debugf("%+v", job)
			sendReply(inbox, JOB_ASSIGN_UNIQ, [][]byte{
				[]byte(job.Handle), []byte(job.FuncName), []byte(job.Id), job.Data})
		case SUBMIT_JOB, SUBMIT_JOB_LOW_BG, SUBMIT_JOB_LOW:
			if self.c == nil {
				self.c = &Client{Session: Session{SessionId: sessionId, in: inbox, ConnectAt: time.Now()}}
			}
			e := &event{tp: tp,
				args:   &Tuple{t0: self.c, t1: args[0], t2: args[1], t3: args[2]},
				result: createResCh(),
			}
			s.protoEvtCh <- e
			handle := <-e.result
			sendReply(inbox, JOB_CREATED, [][]byte{[]byte(handle.(string))})
		case GET_STATUS:
			e := &event{tp: tp, args: &Tuple{t0: args[0]}, result: createResCh()}
			s.protoEvtCh <- e

			resp := (<-e.result).(*Tuple)
			sendReply(inbox, STATUS_RES, [][]byte{resp.t0.([]byte),
				bool2bytes(resp.t1), bool2bytes(resp.t2),
				int2bytes(resp.t3), int2bytes(resp.t4)})
		case WORK_DATA, WORK_WARNING, WORK_STATUS, WORK_COMPLETE,
			WORK_FAIL, WORK_EXCEPTION:
			if self.w == nil {
				log.Errorf("can't perform %s, need to send CAN_DO first", CmdDescription(tp))
				return
			}
			s.protoEvtCh <- &event{tp: tp, args: &Tuple{t0: args},
				fromSessionId: sessionId}
		default:
			log.Warningf("unsupported command %s", CmdDescription(tp))
		}
	}
}
func (t *MigrateTask) migrateSingleSlot(slotId int, to int) error {
	// set slot status
	s, err := models.GetSlot(t.zkConn, t.productName, slotId)
	if err != nil {
		log.Error(err)
		return err
	}
	if s.State.Status != models.SLOT_STATUS_ONLINE && s.State.Status != models.SLOT_STATUS_MIGRATE {
		log.Warning("status is neither online nor migrate", s)
		return nil
	}

	from := s.GroupId
	if s.State.Status == models.SLOT_STATUS_MIGRATE {
		from = s.State.MigrateStatus.From
	}

	// make sure the source group & target group exist
	exists, err := models.GroupExists(t.zkConn, t.productName, from)
	if err != nil {
		return errors.Trace(err)
	}
	if !exists {
		log.Errorf("src group %d not exist when migrate from %d to %d", from, from, to)
		return errors.NotFoundf("group %d", from)
	}

	exists, err = models.GroupExists(t.zkConn, t.productName, to)
	if err != nil {
		return errors.Trace(err)
	}
	if !exists {
		return errors.NotFoundf("group %d", to)
	}

	// cannot migrate to itself, just ignore
	if from == to {
		log.Warning("from == to, ignore", s)
		return nil
	}

	// modify slot status
	if err := s.SetMigrateStatus(t.zkConn, from, to); err != nil {
		log.Error(err)
		return err
	}

	err = t.slotMigrator.Migrate(s, from, to, t, func(p SlotMigrateProgress) {
		// on migrate slot progress
		if p.Remain%500 == 0 {
			log.Info(p)
		}
	})
	if err != nil {
		log.Error(err)
		return err
	}

	// migrate done, change slot status back
	s.State.Status = models.SLOT_STATUS_ONLINE
	s.State.MigrateStatus.From = models.INVALID_ID
	s.State.MigrateStatus.To = models.INVALID_ID
	if err := s.Update(t.zkConn); err != nil {
		log.Error(err)
		return err
	}
	return nil
}
// doReload fetches the cluster slots info and uses the cluster nodes info
// to filter out failed slaves.
func (d *Dispatcher) doReload(server string) (slotInfos []*SlotInfo, err error) {
	var conn net.Conn
	conn, err = d.connPool.GetConn(server)
	if err != nil {
		log.Error(server, err)
		return
	}
	log.Infof("query cluster slots from %s", server)
	defer func() {
		if err != nil {
			conn.(*pool.PoolConn).MarkUnusable()
		}
		conn.Close()
	}()

	_, err = conn.Write(REDIS_CMD_CLUSTER_SLOTS)
	if err != nil {
		log.Errorf("write cluster slots error: %s %v", server, err)
		return
	}

	r := bufio.NewReader(conn)
	var data *resp.Data
	data, err = resp.ReadData(r)
	if err != nil {
		log.Error(server, err)
		return
	}
	slotInfos = make([]*SlotInfo, 0, len(data.Array))
	for _, info := range data.Array {
		slotInfos = append(slotInfos, NewSlotInfo(info))
	}

	// filter slot info with cluster nodes information
	_, err = conn.Write(REDIS_CMD_CLUSTER_NODES)
	if err != nil {
		log.Errorf("write cluster nodes error: %s %v", server, err)
		return
	}
	r = bufio.NewReader(conn)
	data, err = resp.ReadData(r)
	if err != nil {
		log.Error(server, err)
		return
	}
	aliveNodes := make(map[string]bool)
	lines := strings.Split(strings.TrimSpace(string(data.String)), "\n")
	for _, line := range lines {
		// 305fa52a4ed213df3ca97a4399d9e2a6e44371d2 10.4.17.164:7704 master - 0 1440042315188 2 connected 5461-10922
		log.Debug(line)
		elements := strings.SplitN(line, " ", CLUSTER_NODES_FIELD_SPLIT_NUM)
		if !strings.Contains(elements[CLUSTER_NODES_FIELD_NUM_FLAGS], "fail") {
			aliveNodes[elements[CLUSTER_NODES_FIELD_NUM_IP_PORT]] = true
		} else {
			log.Warningf("node fails: %s", elements[1])
		}
	}
	for _, si := range slotInfos {
		if d.readPrefer == READ_PREFER_MASTER {
			si.read = []string{si.write}
		} else if d.readPrefer == READ_PREFER_SLAVE || d.readPrefer == READ_PREFER_SLAVE_IDC {
			localIPPrefix := LocalIP()
			if len(localIPPrefix) > 0 {
				segments := strings.SplitN(localIPPrefix, ".", 3)
				localIPPrefix = strings.Join(segments[:2], ".")
				localIPPrefix += "."
			}
			var readNodes []string
			for _, node := range si.read {
				if !aliveNodes[node] {
					log.Infof("filter %s since it's not alive", node)
					continue
				}
				if d.readPrefer == READ_PREFER_SLAVE_IDC {
					// ips are regarded as being in the same idc if they share
					// the same first two segments, e.g. 10.4.x.x
					if !strings.HasPrefix(node, localIPPrefix) {
						log.Infof("filter %s by read prefer slave idc", node)
						continue
					}
				}
				readNodes = append(readNodes, node)
			}
			if len(readNodes) == 0 {
				readNodes = []string{si.write}
			}
			si.read = readNodes
		}
	}
	return
}
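// Editor's sketch of the same-idc prefix rule used above: two ips count as
// being in the same idc when their first two dotted segments match. The
// addresses below are hypothetical.
func exampleIdcPrefix() {
	segments := strings.SplitN("10.4.17.164", ".", 3)
	prefix := strings.Join(segments[:2], ".") + "."        // "10.4."
	sameIdc := strings.HasPrefix("10.4.99.1:7704", prefix) // true: same idc
	_ = sameIdc
}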
// RunMigrateTask migrates multiple slots.
func RunMigrateTask(task *MigrateTask) error {
	conn := CreateZkConn()
	defer conn.Close()
	lock := utils.GetZkLock(conn, productName)

	to := task.NewGroupId
	task.Status = MIGRATE_TASK_MIGRATING
	for slotId := task.FromSlot; slotId <= task.ToSlot; slotId++ {
		err := func() error {
			log.Info("start migrate slot:", slotId)

			lock.Lock(fmt.Sprintf("migrate %d", slotId))
			defer func() {
				err := lock.Unlock()
				if err != nil {
					log.Info(err)
				}
			}()

			// set slot status
			s, err := models.GetSlot(conn, productName, slotId)
			if err != nil {
				log.Error(err)
				return err
			}
			if s.State.Status != models.SLOT_STATUS_ONLINE && s.State.Status != models.SLOT_STATUS_MIGRATE {
				log.Warning("status is neither online nor migrate", s)
				return nil
			}

			from := s.GroupId
			if s.State.Status == models.SLOT_STATUS_MIGRATE {
				from = s.State.MigrateStatus.From
			}

			// make sure the source group & target group exist
			exists, err := models.GroupExists(conn, productName, from)
			if err != nil {
				return errors.Trace(err)
			}
			if !exists {
				log.Errorf("src group %d not exist when migrate from %d to %d", from, from, to)
				return errors.NotFoundf("group %d", from)
			}

			exists, err = models.GroupExists(conn, productName, to)
			if err != nil {
				return errors.Trace(err)
			}
			if !exists {
				return errors.NotFoundf("group %d", to)
			}

			// cannot migrate to itself
			if from == to {
				log.Warning("from == to, ignore", s)
				return nil
			}

			// modify slot status
			if err := s.SetMigrateStatus(conn, from, to); err != nil {
				log.Error(err)
				return err
			}

			// do the real migration
			err = MigrateSingleSlot(conn, slotId, from, to, task.Delay, task.stopChan)
			if err != nil {
				log.Error(err)
				return err
			}

			// migrate done, change slot status back
			s.State.Status = models.SLOT_STATUS_ONLINE
			s.State.MigrateStatus.From = models.INVALID_ID
			s.State.MigrateStatus.To = models.INVALID_ID
			if err := s.Update(conn); err != nil {
				log.Error(err)
				return err
			}
			return nil
		}()

		if err == ErrStopMigrateByUser {
			log.Info("stop migration job by user")
			break
		} else if err != nil {
			log.Error(err)
			task.Status = MIGRATE_TASK_ERR
			return err
		}

		task.Percent = (slotId - task.FromSlot + 1) * 100 / (task.ToSlot - task.FromSlot + 1)
		log.Info("total percent:", task.Percent)
	}
	task.Status = MIGRATE_TASK_FINISHED
	log.Info("migration finished")
	return nil
}