func (s *Server) checkAndDoTopoChange(seq int) bool {
    act, err := s.topo.GetActionWithSeq(int64(seq))
    if err != nil { //todo: error is not "not exist"
        log.PanicErrorf(err, "action failed, seq = %d", seq)
    }

    if !needResponse(act.Receivers, s.info) { //no need to respond
        return false
    }

    log.Warnf("action %v receivers %v", seq, act.Receivers)

    switch act.Type {
    case models.ACTION_TYPE_SLOT_MIGRATE, models.ACTION_TYPE_SLOT_CHANGED,
        models.ACTION_TYPE_SLOT_PREMIGRATE:
        slot := &models.Slot{}
        s.getActionObject(seq, slot)
        s.fillSlot(slot.Id)
    case models.ACTION_TYPE_SERVER_GROUP_CHANGED:
        serverGroup := &models.ServerGroup{}
        s.getActionObject(seq, serverGroup)
        s.onGroupChange(serverGroup.Id)
    case models.ACTION_TYPE_SERVER_GROUP_REMOVE:
        //do not care
    case models.ACTION_TYPE_MULTI_SLOT_CHANGED:
        param := &models.SlotMultiSetParam{}
        s.getActionObject(seq, param)
        s.onSlotRangeChange(param)
    default:
        log.Panicf("unknown action %+v", act)
    }
    return true
}
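`needResponse` is not shown here; its only job is to decide whether this proxy is among the action's intended receivers. A minimal sketch of that check, assuming `act.Receivers` is a plain `[]string` of proxy ids (the real implementation may match on more fields than the id):

// Sketch (assumption): respond only if our own proxy id is listed as a receiver.
func needResponse(receivers []string, self models.ProxyInfo) bool {
    for _, id := range receivers {
        if id == self.Id {
            return true
        }
    }
    return false
}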
func WaitForReceiverWithTimeout(zkConn zkhelper.Conn, productName string, actionZkPath string, proxies []ProxyInfo, timeoutInMs int) error {
    if len(proxies) == 0 {
        return nil
    }

    times := 0
    proxyIds := make(map[string]struct{})
    var offlineProxyIds []string
    for _, p := range proxies {
        proxyIds[p.Id] = struct{}{}
    }

    checkTimes := timeoutInMs / 500 // check every 500ms
    for times < checkTimes {
        if times >= 6 && (times*500)%1000 == 0 {
            log.Warnf("abnormal waiting time for receivers: %s %v", actionZkPath, offlineProxyIds)
        }
        // get confirm ids
        nodes, _, err := zkConn.Children(actionZkPath)
        if err != nil {
            return errors.Trace(err)
        }

        confirmIds := make(map[string]struct{})
        for _, node := range nodes {
            id := path.Base(node)
            confirmIds[id] = struct{}{}
        }

        if len(confirmIds) != 0 {
            match := true
            // check whether all proxies have responded
            var notMatchList []string
            for id := range proxyIds {
                // a proxy id missing from the confirm ids means that proxy has not responded yet
                if _, ok := confirmIds[id]; !ok {
                    match = false
                    notMatchList = append(notMatchList, id)
                }
            }
            if match {
                return nil
            }
            offlineProxyIds = notMatchList
        }
        times++
        time.Sleep(500 * time.Millisecond)
    }

    if len(offlineProxyIds) > 0 {
        log.Errorf("proxies didn't respond: %v", offlineProxyIds)
    }
    // set offline proxies
    for _, id := range offlineProxyIds {
        log.Errorf("mark proxy %s to PROXY_STATE_MARK_OFFLINE", id)
        if err := SetProxyStatus(zkConn, productName, id, PROXY_STATE_MARK_OFFLINE); err != nil {
            return errors.Trace(err)
        }
    }
    return errors.Trace(ErrReceiverTimeout)
}
func (s *Server) waitOnline() {
    for {
        info, err := s.topo.GetProxyInfo(s.info.Id)
        if err != nil {
            log.PanicErrorf(err, "get proxy info failed")
        }
        switch info.State {
        case models.PROXY_STATE_MARK_OFFLINE:
            s.handleMarkOffline()
        case models.PROXY_STATE_ONLINE:
            s.info.State = info.State
            log.Infof("we are online: %s", s.info.Id)
            _, err := s.topo.WatchNode(path.Join(models.GetProxyPath(s.topo.ProductName), s.info.Id), s.evtbus)
            if err != nil {
                log.PanicErrorf(err, "watch node failed")
            }
            return
        }
        select {
        case e := <-s.evtbus:
            switch e.(type) {
            case *killEvent:
                s.handleMarkOffline()
            }
        default: //otherwise ignore it
        }
        log.Warnf("wait to be online: %s", s.info.Id)
        time.Sleep(3 * time.Second)
    }
}
func (s *Server) OnGroupChange(groupId int) {
    log.Warnf("group changed %d", groupId)
    for i, slot := range s.slots {
        if slot.Info.GroupId == groupId {
            s.fillSlot(i, true)
        }
    }
}
func (top *Topology) doWatch(evtch <-chan topo.Event, evtbus chan interface{}) {
    e := <-evtch
    if e.State == topo.StateExpired || e.Type == topo.EventNotWatching {
        log.Panicf("session expired: %+v", e)
    }

    log.Warnf("topo event %+v", e)

    switch e.Type {
    //case topo.EventNodeCreated:
    //case topo.EventNodeDataChanged:
    case topo.EventNodeChildrenChanged: //we only care about children-changed events
        //todo: get changed node and decode event
    default:
        log.Warnf("%+v", e)
    }

    evtbus <- e
}
func (s *Server) reRegisterAndFillSlots(state string, invokeFromRestart bool) {
    s.startLock.Lock()
    if s.isStarting() && !invokeFromRestart {
        log.Warnf("server is restarting")
        s.startLock.Unlock()
        return
    }
    log.Warnf("server will restart, status = %d", s.status)
    s.setServerStatus(SERVER_STATUS_STARTING)
    s.startLock.Unlock()

    s.info.State = state
    s.cleanup()
    //s.topo.InitZkConn()
    s.register()

    // resume normal flag
    s.topo.watchSuspend.Set(false)
    s.setServerStatus(SERVER_STATUS_STARTED)

    s.rewatchProxy(true)
    s.rewatchNodes()
    s.fillSlots()
    log.Warnf("server restarted")
}
func (top *Topology) doWatch(evtch <-chan topo.Event, evtbus chan interface{}) {
    e := <-evtch
    log.Warnf("topo event %+v", e)

    switch e.Type {
    //case topo.EventNodeCreated:
    //case topo.EventNodeDataChanged:
    case topo.EventNodeChildrenChanged: //we only care about children-changed events
        //todo: get changed node and decode event
    default:
        //log.Warnf("%+v", e)
    }

    if !top.watchSuspend.Get() {
        evtbus <- e
    }
}
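`watchSuspend` is only ever used through `Get` and `Set`. If no such helper is available, a minimal atomic flag along the following lines would satisfy that interface (a sketch, assuming `sync/atomic` is imported; the type name `AtomicBool` is illustrative):

// Sketch: a minimal atomic boolean with the Get/Set interface used by
// doWatch and reRegisterAndFillSlots to pause event delivery during a restart.
type AtomicBool struct {
    v int32
}

func (b *AtomicBool) Set(val bool) {
    var n int32
    if val {
        n = 1
    }
    atomic.StoreInt32(&b.v, n)
}

func (b *AtomicBool) Get() bool {
    return atomic.LoadInt32(&b.v) == 1
}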
func (s *Server) reRegister(state string, invokeFromRestart bool) {
    s.startLock.Lock()
    if s.isStarting() && !invokeFromRestart {
        log.Warnf("server is restarting")
        s.startLock.Unlock()
        return
    }
    s.setServerStatus(SERVER_STATUS_STARTING)
    s.startLock.Unlock()

    s.info.State = state
    //s.topo.Close(s.info.Id)
    //s.topo.InitZkConn()
    s.register()
    s.setServerStatus(SERVER_STATUS_STARTED)
}
func (s *Slot) prepare(r *Request, key []byte) (*SharedBackendConn, error) {
    if s.backend.bc == nil {
        log.Infof("slot-%04d is not ready: key = %s", s.Id, key)
        return nil, ErrSlotIsNotReady
    }
    if err := s.slotsmgrt(r, key); err != nil {
        log.Warnf("slot-%04d migrate from = %s to %s failed: key = %s, error = %s",
            s.Id, s.migrate.from, s.backend.addr, key, err)
        return nil, err
    }
    r.slot = &s.wait
    r.slot.Add(1)
    return s.backend.bc, nil
}
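Note that `prepare` pins the slot by handing the request a pointer to the slot's `sync.WaitGroup` (`s.wait`) and incrementing it; whoever finishes the request must call `Done` so a later refill can block until in-flight requests drain. A hypothetical release path, purely for illustration (the name `releaseSlot` is not from the source):

// Hypothetical counterpart to prepare(): drop the slot reference once the
// request has been answered, unblocking any refill waiting on s.wait.
func releaseSlot(r *Request) {
    if r.slot != nil {
        r.slot.Done()
        r.slot = nil
    }
}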
func (s *Server) OnSlotRangeChange(param *models.SlotMultiSetParam) {
    log.Warnf("slotRangeChange %+v", param)
    if !s.isValidSlot(param.From) || !s.isValidSlot(param.To) {
        log.Errorf("invalid slot number, %+v", param)
        return
    }

    for i := param.From; i <= param.To; i++ {
        switch param.Status {
        case models.SLOT_STATUS_OFFLINE:
            s.clearSlot(i)
        case models.SLOT_STATUS_ONLINE:
            s.fillSlot(i, true)
        default:
            log.Errorf("cannot handle status %v", param.Status)
        }
    }
}
func (s *Server) fillSlots() {
    for {
        refill := false
        for i := 0; i < router.MaxSlotNum; i++ {
            if err := s.fillSlot(i); err != nil {
                //s.reRegisterAndReWatchPrxoy(models.PROXY_STATE_ONLINE, true)
                //s.rewatchNodes(true)
                refill = true
                break
            }
        }
        if !refill {
            break
        }
    }
    log.Warnf("fillSlots end")
}
func getAllProxyOps() int64 {
    proxies, err := models.ProxyList(unsafeZkConn, globalEnv.ProductName(), nil)
    if err != nil {
        log.ErrorErrorf(err, "get proxy list failed")
        return -1
    }

    var total int64
    for _, p := range proxies {
        i, err := p.Ops()
        if err != nil {
            log.Warnf("get proxy ops failed %+v", err)
        }
        total += i
    }
    return total
}
func (s *Server) checkAndDoTopoChange(seq int) bool {
    act, err := s.topo.GetActionWithSeq(int64(seq))
    if err != nil {
        return false
    }

    if !needResponse(act.Receivers, s.info) { //no need to respond
        return false
    }

    log.Warnf("action %v receivers %v", seq, act.Receivers)

    switch act.Type {
    case models.ACTION_TYPE_SLOT_MIGRATE, models.ACTION_TYPE_SLOT_CHANGED,
        models.ACTION_TYPE_SLOT_PREMIGRATE:
        slot := &models.Slot{}
        if err := s.getActionObject(seq, slot); err != nil {
            return false
        }
        if err := s.fillSlot(slot.Id); err != nil {
            //s.reRegisterAndFillSlots(models.PROXY_STATE_ONLINE)
            return false
        }
    case models.ACTION_TYPE_SERVER_GROUP_CHANGED:
        serverGroup := &models.ServerGroup{}
        if err := s.getActionObject(seq, serverGroup); err != nil {
            return false
        }
        if err := s.onGroupChange(serverGroup.Id); err != nil {
            return false
        }
    case models.ACTION_TYPE_SERVER_GROUP_REMOVE:
        //do not care
    case models.ACTION_TYPE_MULTI_SLOT_CHANGED:
        param := &models.SlotMultiSetParam{}
        if err := s.getActionObject(seq, param); err != nil {
            return false
        }
        if err := s.onSlotRangeChange(param); err != nil {
            return false
        }
    default:
        log.Errorf("unknown action %+v", act)
    }
    return true
}
func WaitForReceiverWithTimeout(zkConn zkhelper.Conn, productName string, actionZkPath string, proxies []ProxyInfo, timeoutInMs int) error {
    if len(proxies) == 0 {
        return nil
    }

    times := 0
    proxyIds := make(map[string]bool)
    for _, p := range proxies {
        proxyIds[p.Id] = true
    }

    // check every 500ms
    for times < timeoutInMs/500 {
        if times >= 6 && (times*500)%1000 == 0 {
            log.Warnf("abnormal waiting time for receivers: %s %v", actionZkPath, proxyIds)
        }
        // get confirm ids
        nodes, _, err := zkConn.Children(actionZkPath)
        if err != nil {
            return errors.Trace(err)
        }
        for _, node := range nodes {
            id := path.Base(node)
            delete(proxyIds, id)
        }
        if len(proxyIds) == 0 {
            return nil
        }
        times++
        time.Sleep(500 * time.Millisecond)
    }

    log.Warn("proxies didn't respond: ", proxyIds)
    // set offline proxies
    /*
        for id := range proxyIds {
            log.Errorf("mark proxy %s to PROXY_STATE_MARK_OFFLINE", id)
            if err := SetProxyStatus(zkConn, productName, id, PROXY_STATE_MARK_OFFLINE); err != nil {
                return errors.Trace(err)
            }
        }
    */
    return ErrReceiverTimeout
}
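A typical call site might look like the following (hypothetical: `actionPath` and the 10-second budget are illustrative). Note that this version returns `ErrReceiverTimeout` directly instead of marking the stragglers offline:

// Hypothetical call site: publish an action, then block until every
// proxy has created its response node, or give up after 10 seconds.
if err := WaitForReceiverWithTimeout(zkConn, productName, actionPath, proxies, 10000); err != nil {
    if err == ErrReceiverTimeout {
        log.Warnf("some proxies never confirmed action %s", actionPath)
    }
    return errors.Trace(err)
}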
func runDashboard(addr string, httpLogFile string) {
    log.Infof("dashboard listening on addr: %s", addr)
    m := martini.Classic()
    f, err := os.OpenFile(httpLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
    if err != nil {
        log.PanicErrorf(err, "open http log file failed")
    }
    defer f.Close()

    m.Map(stdlog.New(f, "[martini]", stdlog.LstdFlags))
    binRoot, err := filepath.Abs(filepath.Dir(os.Args[0]))
    if err != nil {
        log.PanicErrorf(err, "get binroot path failed")
    }

    m.Use(martini.Static(filepath.Join(binRoot, "assets/statics")))
    m.Use(render.Renderer(render.Options{
        Directory:  filepath.Join(binRoot, "assets/template"),
        Extensions: []string{".tmpl", ".html"},
        Charset:    "UTF-8",
        IndentJSON: true,
    }))

    m.Use(cors.Allow(&cors.Options{
        AllowOrigins:     []string{"*"},
        AllowMethods:     []string{"POST", "GET", "DELETE", "PUT"},
        AllowHeaders:     []string{"Origin", "x-requested-with", "Content-Type", "Content-Range", "Content-Disposition", "Content-Description"},
        ExposeHeaders:    []string{"Content-Length"},
        AllowCredentials: false,
    }))

    m.Get("/api/server_groups", apiGetServerGroupList)
    m.Get("/api/overview", apiOverview)
    m.Get("/api/redis/:addr/stat", apiRedisStat)
    m.Get("/api/redis/:addr/:id/slotinfo", apiGetRedisSlotInfo)
    m.Get("/api/redis/group/:group_id/:slot_id/slotinfo", apiGetRedisSlotInfoFromGroupId)

    m.Put("/api/server_groups", binding.Json(models.ServerGroup{}), apiAddServerGroup)
    m.Put("/api/server_group/(?P<id>[0-9]+)/addServer", binding.Json(models.Server{}), apiAddServerToGroup)
    m.Delete("/api/server_group/(?P<id>[0-9]+)", apiRemoveServerGroup)
    m.Put("/api/server_group/(?P<id>[0-9]+)/removeServer", binding.Json(models.Server{}), apiRemoveServerFromGroup)
    m.Get("/api/server_group/(?P<id>[0-9]+)", apiGetServerGroup)
    m.Post("/api/server_group/(?P<id>[0-9]+)/promote", binding.Json(models.Server{}), apiPromoteServer)

    m.Get("/api/migrate/status", apiMigrateStatus)
    m.Get("/api/migrate/tasks", apiGetMigrateTasks)
    m.Post("/api/migrate", binding.Json(migrateTaskForm{}), apiDoMigrate)

    m.Post("/api/rebalance", apiRebalance)

    m.Get("/api/slot/list", apiGetSlots)
    m.Get("/api/slot/:id", apiGetSingleSlot)
    m.Post("/api/slots/init", apiInitSlots)
    m.Get("/api/slots", apiGetSlots)
    m.Post("/api/slot", binding.Json(RangeSetTask{}), apiSlotRangeSet)

    m.Get("/api/proxy/list", apiGetProxyList)
    m.Get("/api/proxy/debug/vars", apiGetProxyDebugVars)
    m.Post("/api/proxy", binding.Json(models.ProxyInfo{}), apiSetProxyStatus)

    m.Get("/api/action/gc", apiActionGC)
    m.Get("/api/force_remove_locks", apiForceRemoveLocks)
    m.Get("/api/remove_fence", apiRemoveFence)
    //m.Get("/api/action/gc", apiActionGC)

    m.Get("/slots", pageSlots)
    m.Get("/", func(r render.Render) {
        r.Redirect("/admin")
    })

    //check key slot correspondence
    m.Get("/api/keyslot/(?P<key>.+)", apiKeySlot)
    m.Get("/api/remove_migration", apiRemoveMigration)
    m.Get("/api/remove_migration_fail", apiRemoveMigrationForFail)
    m.Get("/api/proxy/slowop", apiGetProxySlowop)

    zkBuilder := utils.NewConnBuilder(globalEnv.NewZkConn)
    safeZkConn = zkBuilder.GetSafeConn()
    unsafeZkConn = zkBuilder.GetUnsafeConn()

    // create temp node in ZK
    if err := createDashboardNode(); err != nil {
        log.WarnErrorf(err, "create zk node failed")
        // do not release dashboard node here
    }

    // create long-lived migrate manager
    globalMigrateManager = NewMigrateManager(safeZkConn, globalEnv.ProductName())

    go func() {
        tick := time.Tick(time.Second)
        var lastCnt, qps int64
        for range tick {
            cnt := getAllProxyOps()
            if cnt > 0 {
                qps = cnt - lastCnt
                lastCnt = cnt
            } else {
                qps = 0
            }
            atomic.StoreInt64(&proxiesSpeed, qps)
        }
    }()

    go func() {
        for {
            err := models.ActionGC(safeZkConn, globalEnv.ProductName(), models.GC_TYPE_SEC, 60*60*24)
            if err != nil {
                log.Warnf("clean actions failed %+v", err)
            }
            time.Sleep(60 * 60 * 24 * time.Second)
        }
    }()

    m.RunOnAddr(addr)
}
func apiRemoveMigrationForFail() (int, string) {
    if err := globalMigrateManager.RemoveMigrationsForFail(); err != nil {
        log.Warnf("RemoveMigration error: %+v", err)
    }
    return 200, "ok"
}
func (c *unsafeConn) Close() {
    log.Warnf("do not close zk connection by yourself")
}
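A plausible shape for the wrapper, given the `GetSafeConn`/`GetUnsafeConn` builder above (assumption: `unsafeConn` simply embeds the shared `zkhelper.Conn`, so every method except `Close` passes through to the real connection):

// Sketch (assumption): embedding forwards all zkhelper.Conn methods;
// only Close is overridden above to protect the long-lived connection.
type unsafeConn struct {
    zkhelper.Conn
}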
func (s *Server) processAction(e interface{}) error {
    if s.topo.IsSessionExpiredEvent(e) {
        return topo.ErrSessionExpired
    }

    if strings.Index(getEventPath(e), models.GetProxyPath(s.topo.ProductName)) == 0 {
        info, err := s.topo.GetProxyInfo(s.info.Id)
        if err != nil {
            log.ErrorErrorf(err, "get proxy info failed: %s", s.info.Id)
            return err
        }
        switch info.State {
        case models.PROXY_STATE_MARK_OFFLINE:
            log.Warnf("mark offline, proxy got offline event: %s", s.info.Id)
            s.markOffline()
        case models.PROXY_STATE_ONLINE:
            s.rewatchProxy(false)
        default:
            log.Errorf("unknown proxy state %+v", info)
        }
        return nil
    }

    //re-watch
    nodes := s.rewatchNodes()

    seqs, err := models.ExtraSeqList(nodes)
    if err != nil {
        log.ErrorErrorf(err, "get seq list failed")
        //s.reRegisterAndFillSlots(models.PROXY_STATE_ONLINE)
        return err
    }

    if len(seqs) == 0 || !s.topo.IsChildrenChangedEvent(e) {
        return nil
    }

    //get last pos
    index := -1
    for i, seq := range seqs {
        if s.lastActionSeq < seq {
            index = i
            break
        }
    }

    if index < 0 {
        return nil
    }

    actions := seqs[index:]
    for _, seq := range actions {
        exist, err := s.topo.Exist(path.Join(s.topo.GetActionResponsePath(seq), s.info.Id))
        if err != nil {
            log.ErrorErrorf(err, "get action failed")
            //s.reRegisterAndFillSlots(models.PROXY_STATE_ONLINE)
            return err
        }
        if exist {
            continue
        }
        if s.checkAndDoTopoChange(seq) {
            s.responseAction(int64(seq))
        }
    }

    s.lastActionSeq = seqs[len(seqs)-1]
    return nil
}
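`models.ExtraSeqList` turns the watched action children into an ordered list of sequence numbers so the loop above can skip everything at or below `lastActionSeq`. A minimal sketch of that contract, assuming each action node is named by its numeric sequence and `strconv`/`sort` are imported:

// Sketch: parse each action node name as an integer sequence number and
// return the numbers sorted ascending.
func ExtraSeqList(nodes []string) ([]int, error) {
    var seqs []int
    for _, node := range nodes {
        seq, err := strconv.Atoi(path.Base(node))
        if err != nil {
            return nil, errors.Trace(err)
        }
        seqs = append(seqs, seq)
    }
    sort.Ints(seqs)
    return seqs, nil
}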
func (t *MigrateTask) migrateSingleSlot(slotId int, to int) error {
    // set slot status
    s, err := models.GetSlot(t.zkConn, t.productName, slotId)
    if err != nil {
        log.ErrorErrorf(err, "get slot info failed")
        return err
    }

    if s.State.Status == models.SLOT_STATUS_OFFLINE {
        log.Warnf("status is offline: %+v", s)
        return nil
    }

    from := s.GroupId
    if s.State.Status == models.SLOT_STATUS_MIGRATE {
        from = s.State.MigrateStatus.From
    }

    // make sure the source group & target group exist
    exists, err := models.GroupExists(t.zkConn, t.productName, from)
    if err != nil {
        return errors.Trace(err)
    }
    if !exists {
        log.Errorf("src group %d does not exist when migrating from %d to %d", from, from, to)
        return errors.Errorf("group %d not found", from)
    }

    exists, err = models.GroupExists(t.zkConn, t.productName, to)
    if err != nil {
        return errors.Trace(err)
    }
    if !exists {
        return errors.Errorf("group %d not found", to)
    }

    // cannot migrate to itself, just ignore
    if from == to {
        log.Warnf("from == to, ignore: %+v", s)
        return nil
    }

    // modify slot status
    if err := s.SetMigrateStatus(t.zkConn, from, to); err != nil {
        log.ErrorErrorf(err, "set migrate status failed")
        return err
    }

    err = t.Migrate(s, from, to, func(p SlotMigrateProgress) {
        // on migrate slot progress
        if p.Remain%5000 == 0 {
            log.Infof("%+v", p)
        }
    })
    if err != nil {
        log.ErrorErrorf(err, "migrate slot failed")
        return err
    }

    // migrate done, change slot status back
    s.State.Status = models.SLOT_STATUS_ONLINE
    s.State.MigrateStatus.From = models.INVALID_ID
    s.State.MigrateStatus.To = models.INVALID_ID
    if err := s.Update(t.zkConn); err != nil {
        log.ErrorErrorf(err, "update zk status failed, should be: %+v", s)
        return err
    }

    return nil
}