// An A record is generated only when there is a single ZknsAddr that
// has no port component and provides an IPv4 address.
func (rz *zknsResolver) getA(qname string) ([]*pdnsReply, error) {
	if !strings.HasSuffix(qname, rz.zknsDomain) {
		return nil, fmt.Errorf("invalid domain for query: %v", qname)
	}
	if qname[0] == '_' {
		// Since PDNS probes for all types, use some heuristics to limit error noise.
		relog.Debug("skipping A query: %v", qname)
		return nil, nil
	}
	zkname := qname[:len(qname)-len(rz.zknsDomain)]
	nameParts := reverse(strings.Split(zkname, "."))
	zkPath := path.Join(rz.zkRoot, path.Join(nameParts...))
	addrs, err := zkns.ReadAddrs(rz.zconn, zkPath)
	if err != nil {
		return nil, err
	}
	if len(addrs.Entries) != 1 || addrs.Entries[0].IPv4 == "" {
		// Since PDNS probes for all types, this isn't really an error worth mentioning.
		// return nil, fmt.Errorf("invalid response for A query: %v", qname)
		return nil, nil
	}
	return []*pdnsReply{&pdnsReply{qname, "IN", "A", defaultTTL, defaultId, addrs.Entries[0].IPv4}}, nil
}
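// The reverse helper used by getA and getSRV is not shown in this excerpt.
// A minimal sketch, assuming it simply reverses the name components so that
// "a.b.c" maps to the zookeeper path suffix "c/b/a":
func reverse(parts []string) []string {
	for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
		parts[i], parts[j] = parts[j], parts[i]
	}
	return parts
}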
func main() {
	dbConfigsFile, dbCredentialsFile := dbconfigs.RegisterCommonFlags()
	flag.Parse()

	relog.Info("started vtaction %v", os.Args)

	rpc.HandleHTTP()
	jsonrpc.ServeHTTP()
	jsonrpc.ServeRPC()
	bsonrpc.ServeHTTP()
	bsonrpc.ServeRPC()

	logFile, err := os.OpenFile(*logFilename, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0666)
	if err != nil {
		relog.Fatal("Can't open log file: %v", err)
	}
	relog.SetOutput(logFile)
	relog.SetPrefix(fmt.Sprintf("vtaction [%v] ", os.Getpid()))
	if err := relog.SetLevelByName(*logLevel); err != nil {
		relog.Fatal("%v", err)
	}
	relog.HijackLog(nil)
	relog.HijackStdio(logFile, logFile)

	mycnf, mycnfErr := mysqlctl.ReadMycnf(*mycnfFile)
	if mycnfErr != nil {
		relog.Fatal("mycnf read failed: %v", mycnfErr)
	}

	relog.Debug("mycnf: %v", jscfg.ToJson(mycnf))

	dbcfgs, cfErr := dbconfigs.Init(mycnf.SocketFile, *dbConfigsFile, *dbCredentialsFile)
	if cfErr != nil {
		relog.Fatal("%s", cfErr)
	}
	mysqld := mysqlctl.NewMysqld(mycnf, dbcfgs.Dba, dbcfgs.Repl)

	topoServer := topo.GetServer()
	defer topo.CloseServers()

	actor := tabletmanager.NewTabletActor(mysqld, topoServer)

	// We delegate our startup to the micromanagement server so these actions
	// will occur after we have obtained our socket.
	bindAddr := fmt.Sprintf(":%v", *port)
	httpServer := &http.Server{Addr: bindAddr}
	go func() {
		if err := httpServer.ListenAndServe(); err != nil {
			relog.Error("httpServer.ListenAndServe err: %v", err)
		}
	}()

	actionErr := actor.HandleAction(*actionNode, *action, *actionGuid, *force)
	if actionErr != nil {
		relog.Fatal("action error: %v", actionErr)
	}

	relog.Info("finished vtaction %v", os.Args)
}
// Shut off all replication.
func (wr *Wrangler) stopSlaves(tabletMap map[topo.TabletAlias]*topo.TabletInfo) error {
	errs := make(chan error, len(tabletMap))
	f := func(ti *topo.TabletInfo) {
		actionPath, err := wr.ai.StopSlave(ti.Alias())
		if err == nil {
			err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
		}
		if err != nil {
			relog.Debug("StopSlave failed: %v", err)
		}
		errs <- err
	}

	for _, tablet := range tabletMap {
		// Pass loop variable explicitly so we don't have a concurrency issue.
		go f(tablet)
	}

	// Wait for responses.
	for i := 0; i < len(tabletMap); i++ {
		if err := <-errs; err != nil {
			return err
		}
	}

	return nil
}
func (rz *zknsResolver) getSRV(qname string) ([]*pdnsReply, error) {
	if !strings.HasSuffix(qname, rz.zknsDomain) {
		return nil, fmt.Errorf("invalid domain for query: %v", qname)
	}
	zkname := qname[:len(qname)-len(rz.zknsDomain)]
	nameParts := strings.Split(zkname, ".")
	portName := nameParts[0]
	if portName[0] != '_' {
		// Since PDNS probes for all types, this isn't really an error worth mentioning.
		// fmt.Errorf("invalid port name for query: %v", portName)
		relog.Debug("skipping SRV query: %v", qname)
		return nil, nil
	}
	nameParts = reverse(nameParts[1:])
	zkPath := path.Join(rz.zkRoot, path.Join(nameParts...))
	addrs, err := zkns.ReadAddrs(rz.zconn, zkPath)
	if err != nil {
		return nil, err
	}
	replies := make([]*pdnsReply, 0, 16)
	for _, addr := range addrs.Entries {
		content := fmt.Sprintf("%v\t%v %v %v", defaultPriority, defaultWeight, addr.NamedPortMap[portName], addr.Host)
		replies = append(replies, &pdnsReply{qname, "IN", "SRV", defaultTTL, defaultId, content})
	}
	return replies, nil
}
// This piece runs on the presumably empty machine acting as the target in the
// create replica action.
//
// validate target (self)
// shutdown_mysql()
// create temp data directory /vt/target/vt_<keyspace>
// copy compressed data files via HTTP
// verify hash of compressed files
// uncompress into /vt/vt_<target-uid>/data/vt_<keyspace>
// start_mysql()
// clean up compressed files
func (mysqld *Mysqld) RestoreFromSnapshot(snapshotManifest *SnapshotManifest, fetchConcurrency, fetchRetryCount int, dontWaitForSlaveStart bool, hookExtraEnv map[string]string) error {
	if snapshotManifest == nil {
		return errors.New("RestoreFromSnapshot: nil snapshotManifest")
	}

	relog.Debug("ValidateCloneTarget")
	if err := mysqld.ValidateCloneTarget(hookExtraEnv); err != nil {
		return err
	}

	relog.Debug("Shutdown mysqld")
	if err := Shutdown(mysqld, true, MysqlWaitTime); err != nil {
		return err
	}

	relog.Debug("Fetch snapshot")
	if err := mysqld.fetchSnapshot(snapshotManifest, fetchConcurrency, fetchRetryCount); err != nil {
		return err
	}

	relog.Debug("Restart mysqld")
	if err := Start(mysqld, MysqlWaitTime); err != nil {
		return err
	}

	cmdList, err := StartReplicationCommands(mysqld, snapshotManifest.ReplicationState)
	if err != nil {
		return err
	}
	if err := mysqld.executeSuperQueryList(cmdList); err != nil {
		return err
	}

	if !dontWaitForSlaveStart {
		if err := mysqld.WaitForSlaveStart(SlaveStartDeadline); err != nil {
			return err
		}
	}

	h := hook.NewSimpleHook("postflight_restore")
	h.ExtraEnv = hookExtraEnv
	if err := h.ExecuteOptional(); err != nil {
		return err
	}

	return nil
}
// Return a list of corresponding replication positions.
// Handles masters and slaves, but it's up to the caller to guarantee
// all tablets are in the same shard.
func (wr *Wrangler) tabletReplicationPositions(tablets []*topo.TabletInfo) ([]*mysqlctl.ReplicationPosition, error) {
	relog.Debug("tabletReplicationPositions %v", tablets)
	calls := make([]*rpcContext, len(tablets))
	wg := sync.WaitGroup{}

	f := func(idx int) {
		defer wg.Done()
		ti := tablets[idx]
		ctx := &rpcContext{tablet: ti}
		calls[idx] = ctx
		var actionPath string
		if ti.Type == topo.TYPE_MASTER {
			actionPath, ctx.err = wr.ai.MasterPosition(ti.Alias())
		} else if ti.IsSlaveType() {
			actionPath, ctx.err = wr.ai.SlavePosition(ti.Alias())
		}
		if ctx.err != nil {
			return
		}
		var result interface{}
		if result, ctx.err = wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout()); ctx.err != nil {
			return
		}
		ctx.position = result.(*mysqlctl.ReplicationPosition)
	}

	for i, tablet := range tablets {
		// Don't scan tablets that won't return something useful. Otherwise, you'll
		// end up waiting for a timeout.
		if tablet.Type == topo.TYPE_MASTER || tablet.IsSlaveType() {
			wg.Add(1)
			go f(i)
		} else {
			relog.Info("tabletReplicationPositions: skipping tablet %v type %v", tablet.Alias(), tablet.Type)
		}
	}
	wg.Wait()

	someErrors := false
	positions := make([]*mysqlctl.ReplicationPosition, len(tablets))
	for i, ctx := range calls {
		if ctx == nil {
			continue
		}
		if ctx.err != nil {
			relog.Warning("could not get replication position for tablet %v %v", ctx.tablet.Alias(), ctx.err)
			someErrors = true
		} else {
			positions[i] = ctx.position
		}
	}
	if someErrors {
		return positions, fmt.Errorf("partial position map, some errors")
	}
	return positions, nil
}
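// rpcContext is not defined in this excerpt. A minimal sketch of its shape,
// inferred from how it is used in tabletReplicationPositions and
// checkSlaveConsistency below (field names taken from those call sites):
type rpcContext struct {
	tablet   *topo.TabletInfo
	position *mysqlctl.ReplicationPosition
	err      error
}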
func (ta *TabletActor) slavePosition(actionNode *ActionNode) error {
	position, err := ta.mysqld.SlaveStatus()
	if err != nil {
		return err
	}
	relog.Debug("SlavePosition %#v", *position)
	actionNode.reply = position
	return nil
}
func (ta *TabletActor) waitSlavePosition(actionNode *ActionNode) error {
	slavePos := actionNode.args.(*SlavePositionReq)
	relog.Debug("WaitSlavePosition %#v", *slavePos)
	if err := ta.mysqld.WaitMasterPos(&slavePos.ReplicationPosition, slavePos.WaitTimeout); err != nil {
		return err
	}
	return ta.slavePosition(actionNode)
}
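// SlavePositionReq is not defined in this excerpt. A minimal sketch of its
// shape, inferred from the positional literal in checkSlaveConsistency
// (tm.SlavePositionReq{*masterPosition, int(wr.actionTimeout().Seconds())})
// and the field accesses above; the exact field types are assumptions:
type SlavePositionReq struct {
	ReplicationPosition mysqlctl.ReplicationPosition
	WaitTimeout         int // in seconds; how long waitSlavePosition may block
}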
// Returns 'our' Server:
// - If only one is registered, that's the one to use.
// - Otherwise, the 'VT_TOPOLOGY_SERVER' environment variable selects one,
//   defaulting to 'zookeeper' if it is unset.
// - If no implementation is registered under that name, panic.
func GetServer() Server {
	if len(serverImpls) == 1 {
		for name, ts := range serverImpls {
			relog.Debug("Using only topo.Server: %v", name)
			return ts
		}
	}

	name := os.Getenv("VT_TOPOLOGY_SERVER")
	if name == "" {
		name = "zookeeper"
	}
	result := serverImpls[name]
	if result == nil {
		panic(fmt.Errorf("No topo.Server named %v", name))
	}
	relog.Debug("Using topo.Server: %v", name)
	return result
}
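// Implementations make themselves available by adding an entry to serverImpls,
// typically from an init() in their own package. A minimal sketch, assuming a
// RegisterServer helper with this signature (the real registration hook may
// differ):
func RegisterServer(name string, ts Server) {
	if serverImpls[name] != nil {
		panic(fmt.Errorf("duplicate topo.Server registration for %v", name))
	}
	serverImpls[name] = ts
}

// A plugin package would then register itself along these lines (illustrative
// only; the constructor name is assumed):
//
//	func init() {
//		topo.RegisterServer("zookeeper", zktopo.NewServer())
//	}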
func CreateTabletReplicationPaths(ts Server, tablet *Tablet) error {
	relog.Debug("CreateTabletReplicationPaths %v", tablet.Alias())
	trrp := tabletReplicationPath(tablet)
	err := ts.CreateReplicationPath(tablet.Keyspace, tablet.Shard, trrp)
	if err != nil && err != ErrNodeExists {
		return err
	}
	return nil
}
func (zkd *Zkd) Teardown() error {
	relog.Info("zkctl.Teardown")
	if err := zkd.Shutdown(); err != nil {
		relog.Warning("failed zookeeper shutdown: %v", err.Error())
	}
	var removalErr error
	for _, dir := range zkd.config.DirectoryList() {
		relog.Debug("remove data dir %v", dir)
		if err := os.RemoveAll(dir); err != nil {
			relog.Error("failed removing %v: %v", dir, err.Error())
			removalErr = err
		}
	}
	return removalErr
}
func (ta *TabletActor) reparentPosition(actionNode *ActionNode) error {
	slavePos := actionNode.args.(*mysqlctl.ReplicationPosition)

	replicationState, waitPosition, timePromoted, err := ta.mysqld.ReparentPosition(slavePos)
	if err != nil {
		return err
	}

	rsd := new(RestartSlaveData)
	rsd.ReplicationState = replicationState
	rsd.TimePromoted = timePromoted
	rsd.WaitPosition = waitPosition
	rsd.Parent = ta.tabletAlias
	relog.Debug("reparentPosition %v", rsd.String())
	actionNode.reply = rsd
	return nil
}
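// RestartSlaveData is not defined in this excerpt. A sketch of its shape,
// inferred from reparentPosition above and restartSlave below; the concrete
// types of ReplicationState, WaitPosition and TimePromoted are assumptions
// based on how ReparentPosition's return values are passed to RestartSlave,
// and the real type also provides a String() method for the debug log above:
type RestartSlaveData struct {
	ReplicationState *mysqlctl.ReplicationState
	WaitPosition     *mysqlctl.ReplicationPosition
	TimePromoted     int64
	Parent           topo.TabletAlias
	Force            bool
}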
func commandPruneActionLogs(wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) (string, error) {
	keepCount := subFlags.Int("keep-count", 10, "count to keep")
	subFlags.Parse(args)

	if subFlags.NArg() == 0 {
		relog.Fatal("action PruneActionLogs requires <zk action log path> ...")
	}

	paths, err := resolveWildcards(wr, subFlags.Args())
	if err != nil {
		return "", err
	}

	zkts, ok := wr.TopoServer().(*zktopo.Server)
	if !ok {
		return "", fmt.Errorf("PruneActionLogs requires a zktopo.Server")
	}

	var errCount sync2.AtomicInt32
	wg := sync.WaitGroup{}
	for _, zkActionLogPath := range paths {
		wg.Add(1)
		go func(zkActionLogPath string) {
			defer wg.Done()
			purgedCount, err := zkts.PruneActionLogs(zkActionLogPath, *keepCount)
			if err == nil {
				relog.Debug("%v pruned %v", zkActionLogPath, purgedCount)
			} else {
				relog.Error("%v pruning failed: %v", zkActionLogPath, err)
				errCount.Add(1)
			}
		}(zkActionLogPath)
	}
	wg.Wait()
	if errCount.Get() > 0 {
		return "", fmt.Errorf("some errors occurred, check the log")
	}
	return "", nil
}
func (ta *TabletActor) restartSlave(actionNode *ActionNode) error {
	rsd := actionNode.args.(*RestartSlaveData)

	tablet, err := ta.ts.GetTablet(ta.tabletAlias)
	if err != nil {
		return err
	}

	// If this check fails, we appear to have been reparented already. The only
	// part that could have failed is the insert in the replication graph.
	// Do NOT try to reparent again. That will either wedge replication or
	// corrupt data.
	if tablet.Parent != rsd.Parent {
		relog.Debug("restart with new parent")
		// Remove tablet from the replication graph.
		err = ta.ts.DeleteReplicationPath(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath())
		if err != nil && err != topo.ErrNoNode {
			return err
		}

		// Move a lag slave into the orphan lag type so we can safely ignore
		// this reparenting until replication catches up.
		if tablet.Type == topo.TYPE_LAG {
			tablet.Type = topo.TYPE_LAG_ORPHAN
		} else {
			err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted)
			if err != nil {
				return err
			}
		}
		// Once this action completes, update the authoritative tablet node first.
		tablet.Parent = rsd.Parent
		err = topo.UpdateTablet(ta.ts, tablet)
		if err != nil {
			return err
		}
	} else if rsd.Force {
		err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted)
		if err != nil {
			return err
		}
		// Complete the special orphan accounting.
		if tablet.Type == topo.TYPE_LAG_ORPHAN {
			tablet.Type = topo.TYPE_LAG
			err = topo.UpdateTablet(ta.ts, tablet)
			if err != nil {
				return err
			}
		}
	} else {
		// There is nothing to safely reparent, so check replication. If
		// either replication thread is not running, report an error.
		replicationPos, err := ta.mysqld.SlaveStatus()
		if err != nil {
			return fmt.Errorf("cannot verify replication for slave: %v", err)
		}
		if replicationPos.SecondsBehindMaster == mysqlctl.InvalidLagSeconds {
			return fmt.Errorf("replication not running for slave")
		}
	}

	// Insert the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = ta.ts.CreateReplicationPath(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath())
	if err != nil && err != topo.ErrNodeExists {
		return err
	}

	return nil
}
// This is a quick and dirty tool to resurrect the TopologyServer data from the
// canonical data stored in the tablet nodes.
//
// cells: local vt cells to scan for all tablets
// keyspaces: list of keyspaces to rebuild
func (wr *Wrangler) RebuildReplicationGraph(cells []string, keyspaces []string) error {
	if len(cells) == 0 {
		return fmt.Errorf("must specify cells to rebuild replication graph")
	}
	if len(keyspaces) == 0 {
		return fmt.Errorf("must specify keyspaces to rebuild replication graph")
	}

	allTablets := make([]*topo.TabletInfo, 0, 1024)
	for _, cell := range cells {
		tablets, err := GetAllTablets(wr.ts, cell)
		if err != nil {
			return err
		}
		allTablets = append(allTablets, tablets...)
	}

	for _, keyspace := range keyspaces {
		relog.Debug("delete keyspace shards: %v", keyspace)
		if err := wr.ts.DeleteKeyspaceShards(keyspace); err != nil {
			return err
		}
	}

	keyspacesToRebuild := make(map[string]bool)
	hasErr := false
	mu := sync.Mutex{}
	wg := sync.WaitGroup{}
	for _, ti := range allTablets {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			defer wg.Done()
			if !ti.IsInReplicationGraph() {
				return
			}
			if !strInList(keyspaces, ti.Keyspace) {
				return
			}
			mu.Lock()
			keyspacesToRebuild[ti.Keyspace] = true
			mu.Unlock()
			err := topo.CreateTabletReplicationPaths(wr.ts, ti.Tablet)
			if err != nil {
				mu.Lock()
				hasErr = true
				mu.Unlock()
				relog.Warning("failed creating replication path: %v", err)
			}
		}(ti)
	}
	wg.Wait()

	for keyspace := range keyspacesToRebuild {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			if err := wr.RebuildKeyspaceGraph(keyspace, nil); err != nil {
				mu.Lock()
				hasErr = true
				mu.Unlock()
				relog.Warning("RebuildKeyspaceGraph(%v) failed: %v", keyspace, err)
				return
			}
		}(keyspace)
	}
	wg.Wait()

	if hasErr {
		return fmt.Errorf("some errors occurred rebuilding replication graph, consult log")
	}
	return nil
}
// Check all the tablets replication positions to find if some
// will have a problem, and suggest a fix for them.
func (wr *Wrangler) checkSlaveReplication(tabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTabletUid uint32) error {
	relog.Info("Checking all replication positions will allow the transition:")
	masterIsDead := masterTabletUid == topo.NO_TABLET

	// Check everybody has the right master. If there is no master
	// (crash) just check that everyone has the same parent.
	for _, tablet := range tabletMap {
		if masterTabletUid == topo.NO_TABLET {
			masterTabletUid = tablet.Parent.Uid
		}
		if tablet.Parent.Uid != masterTabletUid {
			return fmt.Errorf("tablet %v not slaved correctly, expected %v, found %v", tablet.Alias(), masterTabletUid, tablet.Parent.Uid)
		}
	}

	// Now check all the replication positions will allow us to proceed.
	if masterIsDead {
		relog.Debug(" master is dead, not checking Seconds Behind Master value")
	}

	var lastError error
	mutex := sync.Mutex{}
	wg := sync.WaitGroup{}
	for _, tablet := range tabletMap {
		wg.Add(1)
		go func(tablet *topo.TabletInfo) {
			defer wg.Done()

			if tablet.Type == topo.TYPE_LAG {
				relog.Info(" skipping slave position check for %v tablet %v", tablet.Type, tablet.Alias())
				return
			}

			actionPath, err := wr.ai.SlavePosition(tablet.Alias())
			if err != nil {
				mutex.Lock()
				lastError = err
				mutex.Unlock()
				relog.Error(" error asking tablet %v for slave position: %v", tablet.Alias(), err)
				return
			}
			result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
			if err != nil {
				mutex.Lock()
				lastError = err
				mutex.Unlock()
				if tablet.Type == topo.TYPE_BACKUP {
					relog.Warning(" failed to get slave position from backup tablet %v, either wait for backup to finish or scrap tablet (%v)", tablet.Alias(), err)
				} else {
					relog.Warning(" failed to get slave position from %v: %v", tablet.Alias(), err)
				}
				return
			}

			if !masterIsDead {
				replPos := result.(*mysqlctl.ReplicationPosition)
				var dur time.Duration = time.Duration(uint(time.Second) * replPos.SecondsBehindMaster)
				if dur > wr.actionTimeout() {
					err = fmt.Errorf("slave is too far behind to complete reparent in time (%v>%v), either increase timeout using 'vtctl -wait-time XXX ReparentShard ...' or scrap tablet %v", dur, wr.actionTimeout(), tablet.Alias())
					relog.Error(" %v", err)
					mutex.Lock()
					lastError = err
					mutex.Unlock()
					return
				}

				relog.Debug(" slave is %v behind master (<%v), reparent should work for %v", dur, wr.actionTimeout(), tablet.Alias())
			}
		}(tablet)
	}
	wg.Wait()
	return lastError
}
// Close all registered Servers.
func CloseServers() {
	for name, ts := range serverImpls {
		relog.Debug("Closing topo.Server: %v", name)
		ts.Close()
	}
}
// Check all the tablets to see if we can proceed with reparenting.
// masterPosition is supplied from the demoted master if we are doing
// this gracefully.
func (wr *Wrangler) checkSlaveConsistency(tabletMap map[uint32]*topo.TabletInfo, masterPosition *mysqlctl.ReplicationPosition) error {
	relog.Debug("checkSlaveConsistency %v %#v", mapKeys(tabletMap), masterPosition)

	// FIXME(msolomon) Something still feels clumsy here and I can't put my finger on it.
	calls := make(chan *rpcContext, len(tabletMap))
	f := func(ti *topo.TabletInfo) {
		ctx := &rpcContext{tablet: ti}
		defer func() {
			calls <- ctx
		}()

		var args *tm.SlavePositionReq
		if masterPosition != nil {
			// If the master position is known, do our best to wait for replication to catch up.
			args = &tm.SlavePositionReq{*masterPosition, int(wr.actionTimeout().Seconds())}
		} else {
			// In the case where a master is down, look for the last bit of data copied and wait
			// for that to apply. That gives us a chance to wait for all data.
			actionPath, err := wr.ai.SlavePosition(ti.Alias())
			if err != nil {
				ctx.err = err
				return
			}
			result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
			if err != nil {
				ctx.err = err
				return
			}
			replPos := result.(*mysqlctl.ReplicationPosition)
			lastDataPos := mysqlctl.ReplicationPosition{MasterLogFile: replPos.MasterLogFileIo, MasterLogPositionIo: replPos.MasterLogPositionIo}
			args = &tm.SlavePositionReq{lastDataPos, int(wr.actionTimeout().Seconds())}
		}

		// This option waits for the SQL thread to apply all changes to this instance.
		actionPath, err := wr.ai.WaitSlavePosition(ti.Alias(), args)
		if err != nil {
			ctx.err = err
			return
		}
		result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
		if err != nil {
			ctx.err = err
			return
		}
		ctx.position = result.(*mysqlctl.ReplicationPosition)
	}

	for _, tablet := range tabletMap {
		// Pass loop variable explicitly so we don't have a concurrency issue.
		go f(tablet)
	}

	// Map positions to tablets.
	positionMap := make(map[string][]uint32)
	for i := 0; i < len(tabletMap); i++ {
		ctx := <-calls
		mapKey := "unavailable-tablet-error"
		if ctx.err == nil {
			mapKey = ctx.position.MapKey()
		}
		if _, ok := positionMap[mapKey]; !ok {
			positionMap[mapKey] = make([]uint32, 0, 32)
		}
		positionMap[mapKey] = append(positionMap[mapKey], ctx.tablet.Uid)
	}

	if len(positionMap) == 1 {
		// Great, everyone agrees.
		// demotedMasterReplicationState is nil if demotion failed.
		if masterPosition != nil {
			demotedMapKey := masterPosition.MapKey()
			if _, ok := positionMap[demotedMapKey]; !ok {
				for slaveMapKey := range positionMap {
					return fmt.Errorf("slave position doesn't match demoted master: %v != %v", demotedMapKey, slaveMapKey)
				}
			}
		}
	} else {
		// FIXME(msolomon) in the event of a crash, do you pick the replica that is
		// furthest along or do you promote the majority? data loss vs availability:
		// sounds like you pick the latest group and reclone.
		items := make([]string, 0, 32)
		for slaveMapKey, uids := range positionMap {
			tabletPaths := make([]string, len(uids))
			for i, uid := range uids {
				tabletPaths[i] = tabletMap[uid].Alias().String()
			}
			items = append(items, fmt.Sprintf(" %v\n %v", slaveMapKey, strings.Join(tabletPaths, "\n ")))
		}
		sort.Strings(items)
		return fmt.Errorf("inconsistent slaves, mark some offline with vtctl ScrapTablet\n%v", strings.Join(items, "\n"))
	}
	return nil
}
// Make this external, since it needs to be forced from time to time.
func Scrap(ts topo.Server, tabletAlias topo.TabletAlias, force bool) error {
	tablet, err := ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	// If the tablet is already scrapped, skip deleting the path. It won't
	// be correct since the Parent will be cleared already.
	wasAssigned := tablet.IsAssigned()
	replicationPath := ""
	if wasAssigned {
		replicationPath = tablet.ReplicationPath()
	}

	tablet.Type = topo.TYPE_SCRAP
	tablet.Parent = topo.TabletAlias{}
	// Update the tablet first, since that is canonical.
	err = topo.UpdateTablet(ts, tablet)
	if err != nil {
		return err
	}

	// Remove any pending actions. Presumably forcing a scrap means you don't
	// want the agent doing anything and the machine requires manual attention.
	if force {
		err := ts.PurgeTabletActions(tabletAlias, ActionNodeCanBePurged)
		if err != nil {
			relog.Warning("purge actions failed: %v", err)
		}
	}

	if wasAssigned {
		err = ts.DeleteReplicationPath(tablet.Keyspace, tablet.Shard, replicationPath)
		if err != nil {
			switch err {
			case topo.ErrNoNode:
				relog.Debug("no replication path: %v", replicationPath)
				err = nil
			case topo.ErrNotEmpty:
				// If you are forcing the scrapping of a master, you can't update the
				// replication graph yet, since other nodes are still under the impression
				// they are slaved to this tablet.
				// If the node was not empty, we can't do anything about it - the replication
				// graph needs to be fixed by reparenting. If the action was forced, assume
				// the user knows best and squelch the error.
				if tablet.Parent.Uid == topo.NO_TABLET && force {
					err = nil
				}
			}
			if err != nil {
				relog.Warning("remove replication path failed: %v %v", replicationPath, err)
			}
		}
	}

	// Run a hook for final cleanup, only in non-force mode.
	// (Force mode executes on the vtctl side, not on the vttablet side.)
	if !force {
		hk := hook.NewSimpleHook("postflight_scrap")
		configureTabletHook(hk, tablet.Alias())
		if hookErr := hk.ExecuteOptional(); hookErr != nil {
			// We don't want to return an error; the server
			// is probably already in bad shape.
			relog.Warning("Scrap: postflight_scrap failed: %v", hookErr)
		}
	}

	return nil
}
// Export addresses from the VT serving graph to a legacy zkns server.
func (wr *Wrangler) ExportZknsForKeyspace(keyspace string) error {
	zkTopo, ok := wr.ts.(*zktopo.Server)
	if !ok {
		return fmt.Errorf("ExportZknsForKeyspace only works with zktopo")
	}
	zconn := zkTopo.GetZConn()

	shardNames, err := wr.ts.GetShardNames(keyspace)
	if err != nil {
		return err
	}

	// Scan the first shard to discover which cells need local serving data.
	aliases, err := topo.FindAllTabletAliasesInShard(wr.ts, keyspace, shardNames[0])
	if err != nil {
		return err
	}
	cellMap := make(map[string]bool)
	for _, alias := range aliases {
		cellMap[alias.Cell] = true
	}

	for cell := range cellMap {
		vtnsRootPath := fmt.Sprintf("/zk/%v/vt/ns/%v", cell, keyspace)
		zknsRootPath := fmt.Sprintf("/zk/%v/zkns/vt/%v", cell, keyspace)

		// Get the existing list of zkns children. If they don't get rewritten,
		// delete them as stale entries.
		zknsChildren, err := zk.ChildrenRecursive(zconn, zknsRootPath)
		if err != nil {
			if zookeeper.IsError(err, zookeeper.ZNONODE) {
				zknsChildren = make([]string, 0)
			} else {
				return err
			}
		}
		staleZknsPaths := make(map[string]bool)
		for _, child := range zknsChildren {
			staleZknsPaths[path.Join(zknsRootPath, child)] = true
		}

		vtnsChildren, err := zk.ChildrenRecursive(zconn, vtnsRootPath)
		if err != nil {
			if zookeeper.IsError(err, zookeeper.ZNONODE) {
				vtnsChildren = make([]string, 0)
			} else {
				return err
			}
		}
		for _, child := range vtnsChildren {
			vtnsAddrPath := path.Join(vtnsRootPath, child)
			zknsAddrPath := path.Join(zknsRootPath, child)

			_, stat, err := zconn.Get(vtnsAddrPath)
			if err != nil {
				return err
			}
			// Leaf nodes correspond to zkns vdns files in the old setup.
			if stat.NumChildren() > 0 {
				continue
			}

			zknsPathsWritten, err := wr.exportVtnsToZkns(zconn, vtnsAddrPath, zknsAddrPath)
			if err != nil {
				return err
			}
			relog.Debug("zknsPathsWritten: %v", zknsPathsWritten)
			for _, zkPath := range zknsPathsWritten {
				delete(staleZknsPaths, zkPath)
			}
		}
		relog.Debug("staleZknsPaths: %v", staleZknsPaths)
		prunePaths := make([]string, 0, len(staleZknsPaths))
		for prunePath := range staleZknsPaths {
			prunePaths = append(prunePaths, prunePath)
		}
		sort.Strings(prunePaths)
		// Prune paths in reverse order so we remove children first.
		for i := len(prunePaths) - 1; i >= 0; i-- {
			relog.Info("prune stale zkns path %v", prunePaths[i])
			if err := zconn.Delete(prunePaths[i], -1); err != nil && !zookeeper.IsError(err, zookeeper.ZNOTEMPTY) {
				return err
			}
		}
	}
	return nil
}