// Copyright 2012, Google Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package wrangler

/*
Assume a graph of mysql nodes.

Replace node N with X.

Connect to N and record file/position from "show master status"

On N: (Demote Master)
  SET GLOBAL READ_ONLY = 1;
  FLUSH TABLES WITH READ LOCK;
  UNLOCK TABLES;

While this is read-only, all the replicas should sync to the same point.

For all slaves of N:
  show slave status
    relay_master_log_file
    exec_master_log_pos

Map file:pos to list of slaves that are in sync

There should be only one group (ideally). If not, manually resolve, or pick
the largest group.

Select X from N - X is the new root node. Might not be a "master" in terms of
voltron, but it will be the data source for the rest of the nodes.

On X: (Promote Slave)
  STOP SLAVE;
  RESET MASTER;
  RESET SLAVE;
  SHOW MASTER STATUS;
    replication file,position
  INSERT INTO _vt.replication_log (time_created_ns, 'reparent check') VALUES (<time>);
  INSERT INTO _vt.reparent_log (time_created_ns, 'last pos', 'new pos') VALUES ... ;
  SHOW MASTER STATUS;
    wait file,position
  SET GLOBAL READ_ONLY=0;

Disabling READ_ONLY mode here is a matter of opinion.
Realistically it is probably safer to do this later on and minimize
the potential of replaying rows. It expands the write unavailable window
slightly - probably by about 1 second.

For all slaves in majority N:
  if slave != X (Restart Slave)
    STOP SLAVE;
    RESET SLAVE;
    CHANGE MASTER TO X;
    START SLAVE;
    SELECT MASTER_POS_WAIT(file, pos, deadline)
    SELECT time_created FROM _vt.replication_log WHERE time_created_ns = <time>;

if no connection to N is available, ???

On X: (promoted slave)
  SET GLOBAL READ_ONLY=0;
*/
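
// A minimal sketch (not part of this file) of the "Map file:pos to list of
// slaves that are in sync" step described above; the statuses map and its
// field names are hypothetical illustrations, not real vitess types:
//
//	groups := make(map[string][]topo.TabletAlias) // "file:pos" -> slaves at that point
//	for alias, status := range statuses {
//		key := fmt.Sprintf("%v:%v", status.RelayMasterLogFile, status.ExecMasterLogPos)
//		groups[key] = append(groups[key], alias)
//	}
//	// Ideally there is exactly one group; otherwise resolve manually or
//	// pick the largest group, as described above.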

import (
	"fmt"

	log "github.com/golang/glog"
	myproto "github.com/youtube/vitess/go/vt/mysqlctl/proto"
	"github.com/youtube/vitess/go/vt/tabletmanager/actionnode"
	"github.com/youtube/vitess/go/vt/topo"
	"github.com/youtube/vitess/go/vt/wrangler/events"
)

const (
	SLAVE_STATUS_DEADLINE = 10e9
)

// ReparentShard creates the reparenting action and launches a goroutine
// to coordinate the procedure.
//
// leaveMasterReadOnly: leave the master in read-only mode, even
//   though all the other necessary updates have been made.
// forceReparentToCurrentMaster: mostly for test setups, this can
//   cause data loss.
func (wr *Wrangler) ReparentShard(keyspace, shard string, masterElectTabletAlias topo.TabletAlias, leaveMasterReadOnly, forceReparentToCurrentMaster bool) error {
	// lock the shard
	actionNode := actionnode.ReparentShard(masterElectTabletAlias)
	lockPath, err := wr.lockShard(keyspace, shard, actionNode)
	if err != nil {
		return err
	}

	// do the work
	err = wr.reparentShardLocked(keyspace, shard, masterElectTabletAlias, leaveMasterReadOnly, forceReparentToCurrentMaster)

	// and unlock
	return wr.unlockShard(keyspace, shard, actionNode, lockPath, err)
}
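
// A hypothetical caller (not part of this file), sketching how a command
// layer might drive ReparentShard; the keyspace, shard and alias values are
// made up for illustration:
//
//	masterElect := topo.TabletAlias{Cell: "nyc", Uid: 42}
//	if err := wr.ReparentShard("test_keyspace", "0", masterElect, false /* leaveMasterReadOnly */, false /* forceReparentToCurrentMaster */); err != nil {
//		log.Errorf("reparent of test_keyspace/0 failed: %v", err)
//	}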

func (wr *Wrangler) reparentShardLocked(keyspace, shard string, masterElectTabletAlias topo.TabletAlias, leaveMasterReadOnly, forceReparentToCurrentMaster bool) error {
	// critical read, we want up to date info (and the shard is locked).
	shardInfo, err := wr.ts.GetShardCritical(keyspace, shard)
	if err != nil {
		return err
	}

	tabletMap, err := topo.GetTabletMapForShard(wr.ts, keyspace, shard)
	if err != nil {
		return err
	}

	slaveTabletMap, masterTabletMap := sortedTabletMap(tabletMap)

	if shardInfo.MasterAlias == masterElectTabletAlias && !forceReparentToCurrentMaster {
		return fmt.Errorf("master-elect tablet %v is already master - specify -force to override", masterElectTabletAlias)
	}

	masterElectTablet, ok := tabletMap[masterElectTabletAlias]
	if !ok {
		return fmt.Errorf("master-elect tablet %v not found in replication graph %v/%v %v", masterElectTabletAlias, keyspace, shard, mapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *masterElectTablet.Tablet,
	}

	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}

	if !shardInfo.MasterAlias.IsZero() && !forceReparentToCurrentMaster {
		err = wr.reparentShardGraceful(ev, shardInfo, slaveTabletMap, masterTabletMap, masterElectTablet, leaveMasterReadOnly)
	} else {
		err = wr.reparentShardBrutal(ev, shardInfo, slaveTabletMap, masterTabletMap, masterElectTablet, leaveMasterReadOnly, forceReparentToCurrentMaster)
	}

	if err == nil {
		// only log if it works, if it fails we'll show the error
		log.Infof("reparentShard finished")
	}
	return err
}

// ShardReplicationPositions returns the ReplicationPositions for all
// the tablets in a shard.
func (wr *Wrangler) ShardReplicationPositions(keyspace, shard string) ([]*topo.TabletInfo, []*myproto.ReplicationPosition, error) {
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		return nil, nil, err
	}

	// lock the shard
	actionNode := actionnode.CheckShard()
	lockPath, err := wr.lockShard(keyspace, shard, actionNode)
	if err != nil {
		return nil, nil, err
	}

	tabletMap, posMap, err := wr.shardReplicationPositions(shardInfo)
	return tabletMap, posMap, wr.unlockShard(keyspace, shard, actionNode, lockPath, err)
}

func (wr *Wrangler) shardReplicationPositions(shardInfo *topo.ShardInfo) ([]*topo.TabletInfo, []*myproto.ReplicationPosition, error) {
	// FIXME(msolomon) this assumes no hierarchical replication, which is currently the case.
	tabletMap, err := topo.GetTabletMapForShard(wr.ts, shardInfo.Keyspace(), shardInfo.ShardName())
	if err != nil {
		return nil, nil, err
	}
	tablets := CopyMapValues(tabletMap, []*topo.TabletInfo{}).([]*topo.TabletInfo)
	positions, err := wr.tabletReplicationPositions(tablets)
	return tablets, positions, err
}
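
// A hypothetical usage sketch (not part of this file), assuming the two
// returned slices are parallel and every position entry is non-nil:
//
//	tablets, positions, err := wr.ShardReplicationPositions("test_keyspace", "0")
//	if err != nil {
//		return err
//	}
//	for i, ti := range tablets {
//		log.Infof("%v at %v", ti.Alias, positions[i].MapKey())
//	}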

// ReparentTablet attempts to reparent this tablet to the current
// master, based on the current replication position. If there is no
// match, it will fail.
func (wr *Wrangler) ReparentTablet(tabletAlias topo.TabletAlias) error {
	// Get specified tablet.
	// Get current shard master tablet.
	// Sanity check they are in the same keyspace/shard.
	// Get slave position for specified tablet.
	// Get reparent position from master for the given slave position.
	// Issue a restart slave on the specified tablet.

	ti, err := wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	shardInfo, err := wr.ts.GetShard(ti.Keyspace, ti.Shard)
	if err != nil {
		return err
	}
	if shardInfo.MasterAlias.IsZero() {
		return fmt.Errorf("no master tablet for shard %v/%v", ti.Keyspace, ti.Shard)
	}

	masterTi, err := wr.ts.GetTablet(shardInfo.MasterAlias)
	if err != nil {
		return err
	}

	// Basic sanity checking.
	if masterTi.Type != topo.TYPE_MASTER {
		return fmt.Errorf("TopologyServer has inconsistent state for shard master %v", shardInfo.MasterAlias)
	}
	if masterTi.Keyspace != ti.Keyspace || masterTi.Shard != ti.Shard {
		return fmt.Errorf("master %v and potential slave not in same keyspace/shard", shardInfo.MasterAlias)
	}

	pos, err := wr.ai.SlavePosition(ti, wr.actionTimeout())
	if err != nil {
		return err
	}
	log.Infof("slave tablet position: %v %v %v", tabletAlias, ti.MysqlAddr(), pos.MapKey())

	actionPath, err := wr.ai.ReparentPosition(masterTi.Alias, pos)
	if err != nil {
		return err
	}
	result, err := wr.WaitForCompletionReply(actionPath)
	if err != nil {
		return err
	}
	rsd := result.(*actionnode.RestartSlaveData)
	log.Infof("master tablet position: %v %v %v", shardInfo.MasterAlias, masterTi.MysqlAddr(), rsd.ReplicationState.ReplicationPosition.MapKey())

	// An orphan is already in the replication graph but it is
	// disconnected, hence we have to force this action.
	rsd.Force = ti.Type == topo.TYPE_LAG_ORPHAN
	actionPath, err = wr.ai.RestartSlave(ti.Alias, rsd)
	if err != nil {
		return err
	}
	return wr.WaitForCompletion(actionPath)
}
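
// A hypothetical usage sketch (not part of this file): reattaching a single
// tablet to the current shard master, e.g. after it was restored or fell
// behind; the alias is made up for illustration:
//
//	alias := topo.TabletAlias{Cell: "nyc", Uid: 101}
//	if err := wr.ReparentTablet(alias); err != nil {
//		log.Errorf("ReparentTablet(%v) failed: %v", alias, err)
//	}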