// compact compacts etcd store and returns current rev.
// It will return the current compact time and global revision if no error occurred.
// Note that CAS fail will not incur any error.
func compact(ctx context.Context, client *clientv3.Client, t, rev int64) (int64, int64, error) {
	// CAS on the logical clock: if the key's Version still equals our local
	// compact time t, we won the race and may perform the compaction.
	resp, err := client.KV.Txn(ctx).If(
		clientv3.Compare(clientv3.Version(compactRevKey), "=", t),
	).Then(
		clientv3.OpPut(compactRevKey, strconv.FormatInt(rev, 10)), // Expect side effect: increment Version
	).Else(
		clientv3.OpGet(compactRevKey),
	).Commit()
	if err != nil {
		return t, rev, err
	}

	curRev := resp.Header.Revision

	if !resp.Succeeded {
		// CAS failed: another compactor advanced the clock first. Adopt its
		// compact time (the key's Version) and report no error.
		curTime := resp.Responses[0].GetResponseRange().Kvs[0].Version
		return curTime, curRev, nil
	}
	// CAS succeeded: the Put bumped the key's Version, so our logical time advances.
	curTime := t + 1

	if rev == 0 {
		// We don't compact on bootstrap.
		return curTime, curRev, nil
	}
	if _, err = client.Compact(ctx, rev); err != nil {
		return curTime, curRev, err
	}
	glog.Infof("etcd: compacted rev (%d), endpoints (%v)", rev, client.Endpoints())
	return curTime, curRev, nil
}
// WaitEvents waits on a key until it observes the given events and returns the final one. func WaitEvents(c *clientv3.Client, key string, rev int64, evs []storagepb.Event_EventType) (*storagepb.Event, error) { wc := c.Watch(context.Background(), key, clientv3.WithRev(rev)) if wc == nil { return nil, ErrNoWatcher } return waitEvents(wc, evs), nil }
/* Sync localdir to etcd server state. WARNING: ALL CONTENT OF localdir WILL BE LOST Return revision of synced state */ func firstSyncEtcDir_v3(prefix string, c *clientv3.Client, localdir string) int64 { cleanDir(localdir) key, option := prefixToKeyOption(prefix) // Get all values resp, err := c.Get(context.Background(), key, option, clientv3.WithSort(clientv3.SortByKey, clientv3.SortDescend)) if err != nil { panic(err) } for _, kv := range resp.Kvs { targetPath := keyToLocalPath(strings.TrimPrefix(string(kv.Key), prefix), localdir) if targetPath == "" { continue } targetDir := filepath.Dir(targetPath) os.MkdirAll(targetDir, DEFAULT_DIRMODE) err = ioutil.WriteFile(targetPath, kv.Value, DEFAULT_FILEMODE) if err != nil { log.Printf("firstSyncEtcDir_v3 error write file '%v': %v\n", targetPath, err) } } return resp.Header.Revision }
func WaitPrefixEvents(c *clientv3.Client, prefix string, rev int64, evs []mvccpb.Event_EventType) (*clientv3.Event, error) { wc := c.Watch(context.Background(), prefix, clientv3.WithPrefix(), clientv3.WithRev(rev)) if wc == nil { return nil, ErrNoWatcher } return waitEvents(wc, evs), nil }
// waitDeletes efficiently waits until all keys matched by Get(key, opts...) are deleted
func waitDeletes(ctx context.Context, client *v3.Client, key string, opts ...v3.OpOption) error {
	// Fetch matching keys ordered by creation revision (oldest first).
	getOpts := []v3.OpOption{v3.WithSort(v3.SortByCreatedRev, v3.SortAscend)}
	getOpts = append(getOpts, opts...)
	resp, err := client.Get(ctx, key, getOpts...)
	// maxRev tracks the creation revision of the key we last waited on;
	// initially unbounded so nothing is trimmed on the first pass.
	maxRev := int64(math.MaxInt64)
	getOpts = append(getOpts, v3.WithRev(0))
	for err == nil {
		// Trim trailing keys created after maxRev — they appeared after the
		// key we already waited on and must not be waited on in this round.
		for len(resp.Kvs) > 0 {
			i := len(resp.Kvs) - 1
			if resp.Kvs[i].CreateRevision <= maxRev {
				break
			}
			resp.Kvs = resp.Kvs[:i]
		}
		if len(resp.Kvs) == 0 {
			break
		}
		// Wait for the newest remaining key to be deleted, then re-fetch.
		lastKV := resp.Kvs[len(resp.Kvs)-1]
		maxRev = lastKV.CreateRevision
		err = waitDelete(ctx, client, string(lastKV.Key), maxRev)
		if err != nil || len(resp.Kvs) == 1 {
			break
		}
		// NOTE(review): WithLimit is appended on every iteration, so getOpts
		// accumulates limit options; presumably the most recent one wins —
		// confirm against clientv3 option-merging semantics.
		getOpts = append(getOpts, v3.WithLimit(int64(len(resp.Kvs)-1)))
		resp, err = client.Get(ctx, key, getOpts...)
	}
	return err
}
// NewWatchProxy creates a WatchServer that proxies watch requests through the
// given client and signals leader loss to its watch ranges via leaderc.
func NewWatchProxy(c *clientv3.Client) pb.WatchServer {
	wp := &watchProxy{
		cw:           c.Watcher,
		ctx:          clientv3.WithRequireLeader(c.Ctx()),
		retryLimiter: rate.NewLimiter(rate.Limit(retryPerSecond), retryPerSecond),
		leaderc:      make(chan struct{}),
	}
	wp.ranges = newWatchRanges(wp)
	go func() {
		// a new streams without opening any watchers won't catch
		// a lost leader event, so have a special watch to monitor it
		// Watch from a revision near MaxInt64 so no real events are ever
		// delivered; the channel only closes on leader loss or cancellation.
		rev := int64((uint64(1) << 63) - 2)
		for wp.ctx.Err() == nil {
			wch := wp.cw.Watch(wp.ctx, lostLeaderKey, clientv3.WithRev(rev))
			for range wch {
				// Drain until the watch channel closes.
			}
			// Broadcast leader loss by closing leaderc, then re-arm it for
			// the next round of listeners.
			wp.mu.Lock()
			close(wp.leaderc)
			wp.leaderc = make(chan struct{})
			wp.mu.Unlock()
			// Rate-limit reconnect attempts.
			wp.retryLimiter.Wait(wp.ctx)
		}
		// Shut down: wait for context cancellation and all in-flight work
		// before stopping the watch ranges.
		wp.mu.Lock()
		<-wp.ctx.Done()
		wp.mu.Unlock()
		wp.wg.Wait()
		wp.ranges.stop()
	}()
	return wp
}
func getKey(ctx context.Context, client *clientv3.Client, key string) (*clientv3.GetResponse, error) { for ctx.Err() == nil { if gr, err := client.Get(ctx, key); err == nil { return gr, nil } } return nil, ctx.Err() }
func deletePrefix(ctx context.Context, client *clientv3.Client, key string) error { for ctx.Err() == nil { if _, err := client.Delete(ctx, key, clientv3.WithPrefix()); err == nil { return nil } } return ctx.Err() }
// compactor periodically compacts historical versions of keys in etcd.
// It will compact keys with versions older than given interval.
// In other words, after compaction, it will only contain keys set during last interval.
// Any API call for the older versions of keys will return error.
// Interval is the time interval between each compaction. The first compaction happens after "interval".
func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) {
	// Technical definitions:
	// We have a special key in etcd defined as *compactRevKey*.
	// compactRevKey's value will be set to the string of last compacted revision.
	// compactRevKey's version will be used as logical time for comparison. The version is referred as compact time.
	// Initially, because the key doesn't exist, the compact time (version) is 0.
	//
	// Algorithm:
	// - Compare to see if (local compact_time) = (remote compact_time).
	// - If yes, increment both local and remote compact_time, and do a compaction.
	// - If not, set local to remote compact_time.
	//
	// Technical details/insights:
	//
	// The protocol here is lease based. If one compactor CAS successfully, the others would know it when they fail in
	// CAS later and would try again in 10 minutes. If an APIServer crashed, another one would "take over" the lease.
	//
	// For example, in the following diagram, we have a compactor C1 doing compaction in t1, t2. Another compactor C2
	// at t1' (t1 < t1' < t2) would CAS fail, set its known oldRev to rev at t1, and try again in t2' (t2' > t2).
	// If C1 crashed and wouldn't compact at t2, C2 would CAS successfully at t2'.
	//
	//                 oldRev(t2)     curRev(t2)
	//                                     +
	//         oldRev            curRev    |
	//           +                  +      |
	//           |                  |      |
	//           |       t1'        |      |   t2'
	// +---------v------^-----------v------v---^---->
	//           t0     t1                 t2
	//
	// We have the guarantees:
	// - in normal cases, the interval is 10 minutes.
	// - in failover, the interval is >10m and <20m
	//
	// FAQ:
	// - What if time is not accurate? We don't care as long as someone did the compaction. Atomicity is ensured using
	//   etcd API.
	// - What happened under heavy load scenarios? Initially, each apiserver will do only one compaction
	//   every 10 minutes. This is very unlikely affecting or affected w.r.t. server load.

	var compactTime int64
	var rev int64
	var err error
	for {
		// Wait one interval between compactions; exit when ctx is cancelled.
		select {
		case <-time.After(interval):
		case <-ctx.Done():
			return
		}

		compactTime, rev, err = compact(ctx, client, compactTime, rev)
		if err != nil {
			glog.Errorf("etcd: endpoint (%v) compact failed: %v", client.Endpoints(), err)
			continue
		}
	}
}
func waitUpdate(ctx context.Context, client *v3.Client, key string, opts ...v3.OpOption) error { cctx, cancel := context.WithCancel(ctx) defer cancel() wresp, ok := <-client.Watch(cctx, key, opts...) if !ok { return ctx.Err() } return wresp.Err() }
// snapshotToStdout streams a snapshot over stdout
func snapshotToStdout(c *clientv3.Client) {
	// must explicitly fetch first revision since no retry on stdout
	wr := <-c.Watch(context.TODO(), "", clientv3.WithPrefix(), clientv3.WithRev(1))
	if wr.Err() == nil {
		// No compaction error: revision 1 still exists, so snapshot from the
		// start. NOTE(review): the response field is overwritten here as a
		// convenience so the CompactRevision+1 below yields 2; confirm
		// snapshot expects the first revision after the compaction point.
		wr.CompactRevision = 1
	}
	// A nonzero return means the snapshot was cut short by a compaction.
	if rev := snapshot(os.Stdout, c, wr.CompactRevision+1); rev != 0 {
		err := fmt.Errorf("snapshot interrupted by compaction %v", rev)
		ExitWithError(ExitInterrupted, err)
	}
	os.Stdout.Sync()
}
func loadEtcdV3Config(client etcdv3.Client, config *server.Config) error { configPath := "/" + msg.PathPrefix + "/config" resp, err := client.Get(ctx, configPath) if err != nil { log.Printf("skydns: falling back to default configuration, could not read from etcd: %s", err) return nil } for _, ev := range resp.Kvs { if err := json.Unmarshal([]byte(ev.Value), config); err != nil { return fmt.Errorf("failed to unmarshal config: %s", err.Error()) } } return nil }
// NewSession gets the leased session for a client. func NewSession(client *v3.Client, opts ...SessionOption) (*Session, error) { ops := &sessionOptions{ttl: defaultSessionTTL} for _, opt := range opts { opt(ops) } resp, err := client.Grant(client.Ctx(), int64(ops.ttl)) if err != nil { return nil, err } id := v3.LeaseID(resp.ID) ctx, cancel := context.WithCancel(client.Ctx()) keepAlive, err := client.KeepAlive(ctx, id) if err != nil || keepAlive == nil { return nil, err } donec := make(chan struct{}) s := &Session{client: client, id: id, cancel: cancel, donec: donec} // keep the lease alive until client error or cancelled context go func() { defer close(donec) for range keepAlive { // eat messages until keep alive channel closes } }() return s, nil }
// compact compacts etcd store and returns current rev. // If it couldn't get current revision, the old rev will be returned. func compact(ctx context.Context, client *clientv3.Client, oldRev int64) (int64, error) { resp, err := client.Get(ctx, "/") if err != nil { return oldRev, err } curRev := resp.Header.Revision if oldRev == 0 { return curRev, nil } err = client.Compact(ctx, oldRev) if err != nil { return curRev, err } return curRev, nil }
func waitDelete(ctx context.Context, client *v3.Client, key string, rev int64) error { cctx, cancel := context.WithCancel(ctx) defer cancel() wch := client.Watch(cctx, key, v3.WithRev(rev)) for wr := range wch { for _, ev := range wr.Events { if ev.Type == storagepb.DELETE { return nil } } } if err := ctx.Err(); err != nil { return err } return fmt.Errorf("lost watcher waiting for delete") }
func doSerializedGet(ctx context.Context, client *v3.Client, results chan result) { for { st := time.Now() _, err := client.Get(ctx, "abc", v3.WithSerializable()) if ctx.Err() != nil { break } var errStr string if err != nil { errStr = err.Error() } res := result{errStr: errStr, duration: time.Since(st), happened: time.Now()} results <- res } close(results) }
// toGRPC exposes the client's active connection as the raw etcd gRPC service
// clients (cluster, KV, lease, watch, maintenance — in grpcAPI field order).
func toGRPC(c *clientv3.Client) grpcAPI {
	return grpcAPI{
		pb.NewClusterClient(c.ActiveConnection()),
		pb.NewKVClient(c.ActiveConnection()),
		pb.NewLeaseClient(c.ActiveConnection()),
		pb.NewWatchClient(c.ActiveConnection()),
		pb.NewMaintenanceClient(c.ActiveConnection()),
	}
}
func NewWatchProxy(c *clientv3.Client) pb.WatchServer { wp := &watchProxy{ cw: c.Watcher, wgs: watchergroups{ cw: c.Watcher, groups: make(map[watchRange]*watcherGroup), idToGroup: make(map[receiverID]*watcherGroup), proxyCtx: c.Ctx(), }, ctx: c.Ctx(), } go func() { <-wp.ctx.Done() wp.wgs.stop() }() return wp }
func lockUntilSignal(c *clientv3.Client, lockname string) error { s, err := concurrency.NewSession(c) if err != nil { return err } m := concurrency.NewMutex(s, lockname) ctx, cancel := context.WithCancel(context.TODO()) // unlock in case of ordinary shutdown donec := make(chan struct{}) sigc := make(chan os.Signal, 1) signal.Notify(sigc, os.Interrupt, os.Kill) go func() { <-sigc cancel() close(donec) }() s, serr := concurrency.NewSession(c) if serr != nil { return serr } if err := m.Lock(ctx); err != nil { return err } k, kerr := c.Get(ctx, m.Key()) if kerr != nil { return kerr } if len(k.Kvs) == 0 { return errors.New("lock lost on init") } display.Get(*k) select { case <-donec: return m.Unlock(context.TODO()) case <-s.Done(): } return errors.New("session expired") }
func getLeader(etcdClient *clientv3.Client, path string) (string, int64, error) { kv := clientv3.NewKV(etcdClient) ctx, cancel := context.WithTimeout(etcdClient.Ctx(), requestTimeout) resp, err := kv.Get(ctx, path) cancel() if err != nil { return "", 0, errors.Trace(err) } if len(resp.Kvs) != 1 { return "", 0, errors.Errorf("invalid getLeader resp: %v", resp) } var leader pdpb.Leader if err = leader.Unmarshal(resp.Kvs[0].Value); err != nil { return "", 0, errors.Trace(err) } return leader.GetAddr(), resp.Header.Revision, nil }
// StartCompactor starts a compactor in the background in order to compact keys // older than fixed time. // We need to compact keys because we can't let on disk data grow forever. // We save the most recent 10 minutes data. It should be enough for slow watchers and to tolerate burst. // TODO: We might keep a longer history (12h) in the future once storage API can take // advantage of multi-version key. func StartCompactor(ctx context.Context, client *clientv3.Client) { endpointsMapMu.Lock() defer endpointsMapMu.Unlock() // We can't have multiple compaction jobs for the same cluster. // Currently we rely on endpoints to differentiate clusters. var emptyStruct struct{} for _, ep := range client.Endpoints() { if _, ok := endpointsMap[ep]; ok { glog.V(4).Infof("compactor already exists for endpoints %v") return } } for _, ep := range client.Endpoints() { endpointsMap[ep] = emptyStruct } go compactor(ctx, client, compactInterval) }
func prepareObjs(ctx context.Context, e *event, client *clientv3.Client, codec runtime.Codec, versioner storage.Versioner) (curObj runtime.Object, oldObj runtime.Object, err error) { if !e.isDeleted { curObj, err = decodeObj(codec, versioner, e.value, e.rev) if err != nil { return nil, nil, err } } if e.isDeleted || !e.isCreated { getResp, err := client.Get(ctx, e.key, clientv3.WithRev(e.rev-1)) if err != nil { return nil, nil, err } oldObj, err = decodeObj(codec, versioner, getResp.Kvs[0].Value, getResp.Kvs[0].ModRevision) if err != nil { return nil, nil, err } } return curObj, oldObj, nil }
func getWatchChan(c *clientv3.Client, args []string) (clientv3.WatchChan, error) { if len(args) < 1 || len(args) > 2 { return nil, fmt.Errorf("bad number of arguments") } key := args[0] opts := []clientv3.OpOption{clientv3.WithRev(watchRev)} if len(args) == 2 { if watchPrefix { return nil, fmt.Errorf("`range_end` and `--prefix` are mutually exclusive") } opts = append(opts, clientv3.WithRange(args[1])) } if watchPrefix { opts = append(opts, clientv3.WithPrefix()) } if watchPrevKey { opts = append(opts, clientv3.WithPrevKV()) } return c.Watch(context.TODO(), key, opts...), nil }
func campaign(c *clientv3.Client, election string, prop string) error { s, err := concurrency.NewSession(c) if err != nil { return err } e := concurrency.NewElection(s, election) ctx, cancel := context.WithCancel(context.TODO()) donec := make(chan struct{}) sigc := make(chan os.Signal, 1) signal.Notify(sigc, os.Interrupt, os.Kill) go func() { <-sigc cancel() close(donec) }() s, serr := concurrency.NewSession(c) if serr != nil { return serr } if err = e.Campaign(ctx, prop); err != nil { return err } // print key since elected resp, err := c.Get(ctx, e.Key()) if err != nil { return err } display.Get(*resp) select { case <-donec: case <-s.Done(): return errors.New("elect: session expired") } return e.Resign(context.TODO()) }
func putKeyAtMostOnce(ctx context.Context, client *clientv3.Client, key string) error { gr, err := getKey(ctx, client, key) if err != nil { return err } var modrev int64 if len(gr.Kvs) > 0 { modrev = gr.Kvs[0].ModRevision } for ctx.Err() == nil { _, err := client.Txn(ctx).If(clientv3.Compare(clientv3.ModRevision(key), "=", modrev)).Then(clientv3.OpPut(key, key)).Commit() if err == nil { return nil } } return ctx.Err() }
func etcdMon_v3(prefix string, c3 *clientv3.Client, bus chan fileChangeEvent, startRevision int64) { key, option := prefixToKeyOption(prefix) ch := c3.Watch(context.Background(), key, option, clientv3.WithRev(startRevision)) for chEvent := range ch { for _, event := range chEvent.Events { fileEvent := fileChangeEvent{ Path: string(event.Kv.Key), Content: event.Kv.Value, } switch event.Type { case mvccpb.PUT: bus <- fileEvent case mvccpb.DELETE: fileEvent.IsRemoved = true bus <- fileEvent default: log.Println("etcdMon_v3 undefined event type: ", event.Type) } } } close(bus) }
// NewSession gets the leased session for a client. func NewSession(client *v3.Client) (*Session, error) { clientSessions.mu.Lock() defer clientSessions.mu.Unlock() if s, ok := clientSessions.sessions[client]; ok { return s, nil } resp, err := client.Create(context.TODO(), sessionTTL) if err != nil { return nil, err } id := lease.LeaseID(resp.ID) ctx, cancel := context.WithCancel(context.Background()) keepAlive, err := client.KeepAlive(ctx, id) if err != nil || keepAlive == nil { return nil, err } donec := make(chan struct{}) s := &Session{client: client, id: id, cancel: cancel, donec: donec} clientSessions.sessions[client] = s // keep the lease alive until client error or cancelled context go func() { defer func() { clientSessions.mu.Lock() delete(clientSessions.sessions, client) clientSessions.mu.Unlock() close(donec) }() for range keepAlive { // eat messages until keep alive channel closes } }() return s, nil }
func prepareObjs(ctx context.Context, e *event, client *clientv3.Client, codec runtime.Codec, versioner storage.Versioner) (curObj runtime.Object, oldObj runtime.Object, err error) { if !e.isDeleted { curObj, err = decodeObj(codec, versioner, e.value, e.rev) if err != nil { return nil, nil, err } } if e.isDeleted || !e.isCreated { getResp, err := client.Get(ctx, e.key, clientv3.WithRev(e.rev-1)) if err != nil { return nil, nil, err } // Note that this sends the *old* object with the etcd revision for the time at // which it gets deleted. // We assume old object is returned only in Deleted event. Users (e.g. cacher) need // to have larger than previous rev to tell the ordering. oldObj, err = decodeObj(codec, versioner, getResp.Kvs[0].Value, e.rev) if err != nil { return nil, nil, err } } return curObj, oldObj, nil }
// toGRPC returns a grpcAPI whose KV and Watch services are routed through
// proxy implementations, while the remaining services use the client's
// active connection directly.
func toGRPC(c *clientv3.Client) grpcAPI {
	// Reuse a previously built proxy API for this client if one exists.
	if v, ok := proxies[c]; ok {
		return v
	}
	return grpcAPI{
		pb.NewClusterClient(c.ActiveConnection()),
		grpcproxy.KvServerToKvClient(grpcproxy.NewKvProxy(c)),
		pb.NewLeaseClient(c.ActiveConnection()),
		grpcproxy.WatchServerToWatchClient(grpcproxy.NewWatchProxy(c)),
		pb.NewMaintenanceClient(c.ActiveConnection()),
	}
}
func syncProcess_v3FSEvent(localDir string, serverPrefix string, c3 *clientv3.Client, event fileChangeEvent) { etcdPath, err := filepath.Rel(localDir, event.Path) if err != nil { log.Printf("syncProcess_v3 error get relpath '%v': %v\n", event.Path, err) return } etcdPath = serverPrefix + etcdPath etcdPath = strings.Replace(etcdPath, string(os.PathSeparator), "/", -1) switch { case event.IsRemoved: _, err := c3.Delete(context.Background(), etcdPath) if err != nil { log.Printf("syncProcess_v3 error while delete etcdkey '%v': %v\n", etcdPath, err) } case event.IsDir: files, _ := ioutil.ReadDir(event.Path) for _, file := range files { path := filepath.Join(event.Path, file.Name()) content := []byte(nil) if !file.IsDir() { content, err = ioutil.ReadFile(path) if err != nil { log.Println(err) } } syncProcess_v3FSEvent(localDir, serverPrefix, c3, fileChangeEvent{ Path: path, IsDir: file.IsDir(), IsRemoved: false, Content: content, }) } case !event.IsDir: resp, err := c3.Get(context.Background(), etcdPath) if err != nil { log.Printf("syncProcess_v3 Can't read key '%v': %v\n", etcdPath, err) } if len(resp.Kvs) > 0 { if bytes.Equal(resp.Kvs[0].Value, event.Content) { return } } _, err = c3.Put(context.Background(), etcdPath, string(event.Content)) if err != nil { log.Printf("syncProcess_v3 error while put etcdkey '%v': %v\n", etcdPath, err) } } }