Exemple #1
0
func (s *Server) checkAndDoTopoChange(seq int) bool {
	act, err := s.topo.GetActionWithSeq(int64(seq))
	if err != nil { //todo: error is not "not exist"
		log.PanicErrorf(err, "action failed, seq = %d", seq)
	}

	if !needResponse(act.Receivers, s.info) { //no need to response
		return false
	}

	log.Warnf("action %v receivers %v", seq, act.Receivers)

	switch act.Type {
	case models.ACTION_TYPE_SLOT_MIGRATE, models.ACTION_TYPE_SLOT_CHANGED,
		models.ACTION_TYPE_SLOT_PREMIGRATE:
		slot := &models.Slot{}
		s.getActionObject(seq, slot)
		s.fillSlot(slot.Id)
	case models.ACTION_TYPE_SERVER_GROUP_CHANGED:
		serverGroup := &models.ServerGroup{}
		s.getActionObject(seq, serverGroup)
		s.onGroupChange(serverGroup.Id)
	case models.ACTION_TYPE_SERVER_GROUP_REMOVE:
	//do not care
	case models.ACTION_TYPE_MULTI_SLOT_CHANGED:
		param := &models.SlotMultiSetParam{}
		s.getActionObject(seq, param)
		s.onSlotRangeChange(param)
	default:
		log.Panicf("unknown action %+v", act)
	}
	return true
}
Exemple #2
0
func WaitForReceiverWithTimeout(zkConn zkhelper.Conn, productName string, actionZkPath string, proxies []ProxyInfo, timeoutInMs int) error {
	if len(proxies) == 0 {
		return nil
	}

	times := 0
	proxyIds := make(map[string]struct{})
	var offlineProxyIds []string
	for _, p := range proxies {
		proxyIds[p.Id] = struct{}{}
	}

	checkTimes := timeoutInMs / 500
	// check every 500ms
	for times < checkTimes {
		if times >= 6 && (times*500)%1000 == 0 {
			log.Warnf("abnormal waiting time for receivers: %s %v", actionZkPath, offlineProxyIds)
		}
		// get confirm ids
		nodes, _, err := zkConn.Children(actionZkPath)
		if err != nil {
			return errors.Trace(err)
		}
		confirmIds := make(map[string]struct{})
		for _, node := range nodes {
			id := path.Base(node)
			confirmIds[id] = struct{}{}
		}
		if len(confirmIds) != 0 {
			match := true
			// check if all proxy have responsed
			var notMatchList []string
			for id, _ := range proxyIds {
				// if proxy id not in confirm ids, means someone didn't response
				if _, ok := confirmIds[id]; !ok {
					match = false
					notMatchList = append(notMatchList, id)
				}
			}
			if match {
				return nil
			}
			offlineProxyIds = notMatchList
		}
		times += 1
		time.Sleep(500 * time.Millisecond)
	}
	if len(offlineProxyIds) > 0 {
		log.Errorf("proxies didn't responed: %v", offlineProxyIds)
	}

	// set offline proxies
	for _, id := range offlineProxyIds {
		log.Errorf("mark proxy %s to PROXY_STATE_MARK_OFFLINE", id)
		if err := SetProxyStatus(zkConn, productName, id, PROXY_STATE_MARK_OFFLINE); err != nil {
			return errors.Trace(err)
		}
	}
	return errors.Trace(ErrReceiverTimeout)
}
Exemple #3
0
func (s *Server) waitOnline() {
	for {
		info, err := s.topo.GetProxyInfo(s.info.Id)
		if err != nil {
			log.PanicErrorf(err, "get proxy info failed")
		}
		switch info.State {
		case models.PROXY_STATE_MARK_OFFLINE:
			s.handleMarkOffline()
		case models.PROXY_STATE_ONLINE:
			s.info.State = info.State
			log.Infof("we are online: %s", s.info.Id)
			_, err := s.topo.WatchNode(path.Join(models.GetProxyPath(s.topo.ProductName), s.info.Id), s.evtbus)
			if err != nil {
				log.PanicErrorf(err, "watch node failed")
			}
			return
		}
		select {
		case e := <-s.evtbus:
			switch e.(type) {
			case *killEvent:
				s.handleMarkOffline()
			}
		default: //otherwise ignore it
		}
		log.Warnf("wait to be online: %s", s.info.Id)
		time.Sleep(3 * time.Second)
	}
}
Exemple #4
0
func (s *Server) OnGroupChange(groupId int) {
	log.Warnf("group changed %d", groupId)
	for i, slot := range s.slots {
		if slot.Info.GroupId == groupId {
			s.fillSlot(i, true)
		}
	}
}
Exemple #5
0
func (top *Topology) doWatch(evtch <-chan topo.Event, evtbus chan interface{}) {
	e := <-evtch
	if e.State == topo.StateExpired || e.Type == topo.EventNotWatching {
		log.Panicf("session expired: %+v", e)
	}

	log.Warnf("topo event %+v", e)

	switch e.Type {
	//case topo.EventNodeCreated:
	//case topo.EventNodeDataChanged:
	case topo.EventNodeChildrenChanged: //only care children changed
		//todo:get changed node and decode event
	default:
		log.Warnf("%+v", e)
	}

	evtbus <- e
}
Exemple #6
0
func (s *Server) reRegisterAndFillSlots(state string, invokeFromRestart bool) {
	s.startLock.Lock()
	if s.isStarting() && invokeFromRestart == false {
		log.Warnf("server is restarting")
		s.startLock.Unlock()
		return
	}
	log.Warnf("server will restart status ,%d", s.status)
	s.setServerStatus(SERVER_STATUS_STARTING)
	s.startLock.Unlock()
	s.info.State = state
	s.cleanup()
	//s.topo.InitZkConn()
	s.register()
	// resume normal flag
	s.topo.watchSuspend.Set(false)
	s.setServerStatus(SERVER_STATUS_STARTED)
	s.rewatchProxy(true)
	s.rewatchNodes()
	s.fillSlots()
	log.Warnf("server restarted")
}
Exemple #7
0
func (top *Topology) doWatch(evtch <-chan topo.Event, evtbus chan interface{}) {
	e := <-evtch
	log.Warnf("topo event %+v", e)
	switch e.Type {
	//case topo.EventNodeCreated:
	//case topo.EventNodeDataChanged:
	case topo.EventNodeChildrenChanged: //only care children changed
		//todo:get changed node and decode event
	default:
		//log.Warnf("%+v", e)
	}
	if top.watchSuspend.Get() == false {
		evtbus <- e
	}
}
Exemple #8
0
func (s *Server) reRegister(state string, invokeFromRestart bool) {
	s.startLock.Lock()
	if s.isStarting() && invokeFromRestart == false {
		log.Warnf("server is restarting")
		s.startLock.Unlock()
		return
	}
	s.setServerStatus(SERVER_STATUS_STARTING)
	s.startLock.Unlock()
	s.info.State = state
	//s.topo.Close(s.info.Id)
	//s.topo.InitZkConn()
	s.register()
	s.setServerStatus(SERVER_STATUS_STARTED)
}
Exemple #9
0
func (s *Slot) prepare(r *Request, key []byte) (*SharedBackendConn, error) {
	if s.backend.bc == nil {
		log.Infof("slot-%04d is not ready: key = %s", s.Id, key)
		return nil, ErrSlotIsNotReady
	}
	if err := s.slotsmgrt(r, key); err != nil {
		log.Warnf("slot-%04d migrate from = %s to %s failed: key = %s, error = %s",
			s.Id, s.migrate.from, s.backend.addr, key, err)
		return nil, err
	} else {
		r.slot = &s.wait
		r.slot.Add(1)
		return s.backend.bc, nil
	}
}
Exemple #10
0
func (s *Server) OnSlotRangeChange(param *models.SlotMultiSetParam) {
	log.Warnf("slotRangeChange %+v", param)
	if !s.isValidSlot(param.From) || !s.isValidSlot(param.To) {
		log.Errorf("invalid slot number, %+v", param)
		return
	}
	for i := param.From; i <= param.To; i++ {
		switch param.Status {
		case models.SLOT_STATUS_OFFLINE:
			s.clearSlot(i)
		case models.SLOT_STATUS_ONLINE:
			s.fillSlot(i, true)
		default:
			log.Errorf("can not handle status %v", param.Status)
		}
	}
}
Exemple #11
0
func (s *Server) fillSlots() {
	for {
		refill := false
		for i := 0; i < router.MaxSlotNum; i++ {
			if err := s.fillSlot(i); err != nil {
				//s.reRegisterAndReWatchPrxoy(models.PROXY_STATE_ONLINE, true)
				//s.rewatchNodes(true)
				refill = true
				break
			}
		}
		if refill == false {
			break
		}
	}
	log.Warnf("fillSlots end")
}
Exemple #12
0
func getAllProxyOps() int64 {
	proxies, err := models.ProxyList(unsafeZkConn, globalEnv.ProductName(), nil)
	if err != nil {
		log.ErrorErrorf(err, "get proxy list failed")
		return -1
	}

	var total int64
	for _, p := range proxies {
		i, err := p.Ops()
		if err != nil {
			log.Warnf("get proxy ops failed %+v", err)
		}
		total += i
	}
	return total
}
Exemple #13
0
func (s *Server) checkAndDoTopoChange(seq int) bool {
	act, err := s.topo.GetActionWithSeq(int64(seq))
	if err != nil {
		return false
	}
	if !needResponse(act.Receivers, s.info) { //no need to response
		return false
	}

	log.Warnf("action %v receivers %v", seq, act.Receivers)

	switch act.Type {
	case models.ACTION_TYPE_SLOT_MIGRATE, models.ACTION_TYPE_SLOT_CHANGED,
		models.ACTION_TYPE_SLOT_PREMIGRATE:
		slot := &models.Slot{}
		if err := s.getActionObject(seq, slot); err != nil {
			return false
		}
		if err := s.fillSlot(slot.Id); err != nil {
			//s.reRegisterAndFillSlots(models.PROXY_STATE_ONLINE)
			return false
		}
	case models.ACTION_TYPE_SERVER_GROUP_CHANGED:
		serverGroup := &models.ServerGroup{}
		if err := s.getActionObject(seq, serverGroup); err != nil {
			return false
		}
		if err := s.onGroupChange(serverGroup.Id); err != nil {
			return false
		}
	case models.ACTION_TYPE_SERVER_GROUP_REMOVE:
	//do not care
	case models.ACTION_TYPE_MULTI_SLOT_CHANGED:
		param := &models.SlotMultiSetParam{}
		if err := s.getActionObject(seq, param); err != nil {
			return false
		}
		if err := s.onSlotRangeChange(param); err != nil {
			return false
		}
	default:
		log.Errorf("unknown action %+v", act)
	}
	return true
}
Exemple #14
0
func WaitForReceiverWithTimeout(zkConn zkhelper.Conn, productName string, actionZkPath string, proxies []ProxyInfo, timeoutInMs int) error {
	if len(proxies) == 0 {
		return nil
	}

	times := 0
	proxyIds := make(map[string]bool)
	for _, p := range proxies {
		proxyIds[p.Id] = true
	}
	// check every 500ms
	for times < timeoutInMs/500 {
		if times >= 6 && (times*500)%1000 == 0 {
			log.Warnf("abnormal waiting time for receivers: %s %v", actionZkPath, proxyIds)
		}
		// get confirm ids
		nodes, _, err := zkConn.Children(actionZkPath)
		if err != nil {
			return errors.Trace(err)
		}
		for _, node := range nodes {
			id := path.Base(node)
			delete(proxyIds, id)
		}
		if len(proxyIds) == 0 {
			return nil
		}
		times++
		time.Sleep(500 * time.Millisecond)
	}
	log.Warn("proxies didn't responed: ", proxyIds)
	// set offline proxies
	/*
		for id, _ := range proxyIds {
			log.Errorf("mark proxy %s to PROXY_STATE_MARK_OFFLINE", id)
			if err := SetProxyStatus(zkConn, productName, id, PROXY_STATE_MARK_OFFLINE); err != nil {
				return errors.Trace(err)
			}
		}
	*/
	return ErrReceiverTimeout
}
Exemple #15
0
func runDashboard(addr string, httpLogFile string) {
	log.Infof("dashboard listening on addr: %s", addr)
	m := martini.Classic()
	f, err := os.OpenFile(httpLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.PanicErrorf(err, "open http log file failed")
	}
	defer f.Close()

	m.Map(stdlog.New(f, "[martini]", stdlog.LstdFlags))
	binRoot, err := filepath.Abs(filepath.Dir(os.Args[0]))
	if err != nil {
		log.PanicErrorf(err, "get binroot path failed")
	}

	m.Use(martini.Static(filepath.Join(binRoot, "assets/statics")))
	m.Use(render.Renderer(render.Options{
		Directory:  filepath.Join(binRoot, "assets/template"),
		Extensions: []string{".tmpl", ".html"},
		Charset:    "UTF-8",
		IndentJSON: true,
	}))

	m.Use(cors.Allow(&cors.Options{
		AllowOrigins:     []string{"*"},
		AllowMethods:     []string{"POST", "GET", "DELETE", "PUT"},
		AllowHeaders:     []string{"Origin", "x-requested-with", "Content-Type", "Content-Range", "Content-Disposition", "Content-Description"},
		ExposeHeaders:    []string{"Content-Length"},
		AllowCredentials: false,
	}))

	m.Get("/api/server_groups", apiGetServerGroupList)
	m.Get("/api/overview", apiOverview)

	m.Get("/api/redis/:addr/stat", apiRedisStat)
	m.Get("/api/redis/:addr/:id/slotinfo", apiGetRedisSlotInfo)
	m.Get("/api/redis/group/:group_id/:slot_id/slotinfo", apiGetRedisSlotInfoFromGroupId)

	m.Put("/api/server_groups", binding.Json(models.ServerGroup{}), apiAddServerGroup)
	m.Put("/api/server_group/(?P<id>[0-9]+)/addServer", binding.Json(models.Server{}), apiAddServerToGroup)
	m.Delete("/api/server_group/(?P<id>[0-9]+)", apiRemoveServerGroup)

	m.Put("/api/server_group/(?P<id>[0-9]+)/removeServer", binding.Json(models.Server{}), apiRemoveServerFromGroup)
	m.Get("/api/server_group/(?P<id>[0-9]+)", apiGetServerGroup)
	m.Post("/api/server_group/(?P<id>[0-9]+)/promote", binding.Json(models.Server{}), apiPromoteServer)

	m.Get("/api/migrate/status", apiMigrateStatus)
	m.Get("/api/migrate/tasks", apiGetMigrateTasks)
	m.Post("/api/migrate", binding.Json(migrateTaskForm{}), apiDoMigrate)

	m.Post("/api/rebalance", apiRebalance)

	m.Get("/api/slot/list", apiGetSlots)
	m.Get("/api/slot/:id", apiGetSingleSlot)
	m.Post("/api/slots/init", apiInitSlots)
	m.Get("/api/slots", apiGetSlots)
	m.Post("/api/slot", binding.Json(RangeSetTask{}), apiSlotRangeSet)
	m.Get("/api/proxy/list", apiGetProxyList)
	m.Get("/api/proxy/debug/vars", apiGetProxyDebugVars)
	m.Post("/api/proxy", binding.Json(models.ProxyInfo{}), apiSetProxyStatus)

	m.Get("/api/action/gc", apiActionGC)
	m.Get("/api/force_remove_locks", apiForceRemoveLocks)
	m.Get("/api/remove_fence", apiRemoveFence)
	//m.Get("/api/action/gc", apiActionGC)
	m.Get("/slots", pageSlots)
	m.Get("/", func(r render.Render) {
		r.Redirect("/admin")
	})
	//check key slot correspondence
	m.Get("/api/keyslot/(?P<key>.+)", apiKeySlot)
	m.Get("/api/remove_migration", apiRemoveMigration)
	m.Get("/api/remove_migration_fail", apiRemoveMigrationForFail)
	m.Get("/api/proxy/slowop", apiGetProxySlowop)
	zkBuilder := utils.NewConnBuilder(globalEnv.NewZkConn)
	safeZkConn = zkBuilder.GetSafeConn()
	unsafeZkConn = zkBuilder.GetUnsafeConn()

	// create temp node in ZK
	if err := createDashboardNode(); err != nil {
		log.WarnErrorf(err, "create zk node failed") // do not release dashborad node here
	}

	// create long live migrate manager
	globalMigrateManager = NewMigrateManager(safeZkConn, globalEnv.ProductName())

	go func() {
		tick := time.Tick(time.Second)
		var lastCnt, qps int64
		for _ = range tick {
			cnt := getAllProxyOps()
			if cnt > 0 {
				qps = cnt - lastCnt
				lastCnt = cnt
			} else {
				qps = 0
			}
			atomic.StoreInt64(&proxiesSpeed, qps)
		}
	}()

	go func() {
		for {
			err := models.ActionGC(safeZkConn, globalEnv.ProductName(), models.GC_TYPE_SEC, 60*60*24)
			if err != nil {
				log.Warnf("clean actions failed %+v", err)
			}
			time.Sleep(60 * 60 * 24 * time.Second)
		}
	}()

	m.RunOnAddr(addr)
}
Exemple #16
0
func apiRemoveMigrationForFail() (int, string) {
	if err := globalMigrateManager.RemoveMigrationsForFail(); err != nil {
		log.Warnf("RemoveMigration error: %+v", err)
	}
	return 200, "ok"
}
Exemple #17
0
func (c *unsafeConn) Close() {
	log.Warnf("do not close zk connection by yourself")
}
Exemple #18
0
func (s *Server) processAction(e interface{}) error {
	if s.topo.IsSessionExpiredEvent(e) {
		return topo.ErrSessionExpired
	}
	if strings.Index(getEventPath(e), models.GetProxyPath(s.topo.ProductName)) == 0 {
		info, err := s.topo.GetProxyInfo(s.info.Id)
		if err != nil {
			log.ErrorErrorf(err, "get proxy info failed: %s", s.info.Id)
			return err
		}
		switch info.State {
		case models.PROXY_STATE_MARK_OFFLINE:
			log.Warnf("mark offline, proxy got offline event: %s", s.info.Id)
			s.markOffline()
		case models.PROXY_STATE_ONLINE:
			s.rewatchProxy(false)
		default:
			log.Errorf("unknown proxy state %+v", info)
		}
		return nil
	}

	//re-watch
	nodes := s.rewatchNodes()

	seqs, err := models.ExtraSeqList(nodes)
	if err != nil {
		log.ErrorErrorf(err, "get seq list failed")
		//s.reRegisterAndFillSlots(models.PROXY_STATE_ONLINE)
		return err
	}

	if len(seqs) == 0 || !s.topo.IsChildrenChangedEvent(e) {
		return nil
	}

	//get last pos
	index := -1
	for i, seq := range seqs {
		if s.lastActionSeq < seq {
			index = i
			break
		}
	}

	if index < 0 {
		return nil
	}

	actions := seqs[index:]
	for _, seq := range actions {
		exist, err := s.topo.Exist(path.Join(s.topo.GetActionResponsePath(seq), s.info.Id))
		if err != nil {
			log.ErrorErrorf(err, "get action failed")
			//s.reRegisterAndFillSlots(models.PROXY_STATE_ONLINE)
			return err
		}
		if exist {
			continue
		}
		if s.checkAndDoTopoChange(seq) {
			s.responseAction(int64(seq))
		}
	}

	s.lastActionSeq = seqs[len(seqs)-1]
	return nil
}
Exemple #19
0
func (t *MigrateTask) migrateSingleSlot(slotId int, to int) error {
	// set slot status
	s, err := models.GetSlot(t.zkConn, t.productName, slotId)
	if err != nil {
		log.ErrorErrorf(err, "get slot info failed")
		return err
	}
	if s.State.Status == models.SLOT_STATUS_OFFLINE {
		log.Warnf("status is offline: %+v", s)
		return nil
	}

	from := s.GroupId
	if s.State.Status == models.SLOT_STATUS_MIGRATE {
		from = s.State.MigrateStatus.From
	}

	// make sure from group & target group exists
	exists, err := models.GroupExists(t.zkConn, t.productName, from)
	if err != nil {
		return errors.Trace(err)
	}
	if !exists {
		log.Errorf("src group %d not exist when migrate from %d to %d", from, from, to)
		return errors.Errorf("group %d not found", from)
	}

	exists, err = models.GroupExists(t.zkConn, t.productName, to)
	if err != nil {
		return errors.Trace(err)
	}
	if !exists {
		return errors.Errorf("group %d not found", to)
	}

	// cannot migrate to itself, just ignore
	if from == to {
		log.Warnf("from == to, ignore: %+v", s)
		return nil
	}

	// modify slot status
	if err := s.SetMigrateStatus(t.zkConn, from, to); err != nil {
		log.ErrorErrorf(err, "set migrate status failed")
		return err
	}

	err = t.Migrate(s, from, to, func(p SlotMigrateProgress) {
		// on migrate slot progress
		if p.Remain%5000 == 0 {
			log.Infof("%+v", p)
		}
	})
	if err != nil {
		log.ErrorErrorf(err, "migrate slot failed")
		return err
	}

	// migrate done, change slot status back
	s.State.Status = models.SLOT_STATUS_ONLINE
	s.State.MigrateStatus.From = models.INVALID_ID
	s.State.MigrateStatus.To = models.INVALID_ID
	if err := s.Update(t.zkConn); err != nil {
		log.ErrorErrorf(err, "update zk status failed, should be: %+v", s)
		return err
	}
	return nil
}