// these parts of Create may require a retry
func (s *consulStore) innerCreate(manifest pods.Manifest, nodeSelector klabels.Selector, podLabels klabels.Set) (fields.RC, error) {
	id := fields.ID(uuid.New())
	rcp := kp.RCPath(id.String())
	rc := fields.RC{
		ID:              id,
		Manifest:        manifest,
		NodeSelector:    nodeSelector,
		PodLabels:       podLabels,
		ReplicasDesired: 0,
		Disabled:        false,
	}

	jsonRC, err := json.Marshal(rc)
	if err != nil {
		return fields.RC{}, err
	}
	success, _, err := s.kv.CAS(&api.KVPair{
		Key:   rcp,
		Value: jsonRC,
		// the chance of the UUID already existing is vanishingly small, but
		// technically not impossible, so we should use the CAS index to guard
		// against duplicate UUIDs
		ModifyIndex: 0,
	}, nil)
	if err != nil {
		return fields.RC{}, consulutil.NewKVError("cas", rcp, err)
	}
	if !success {
		return fields.RC{}, CASError(rcp)
	}
	return rc, nil
}
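// The comment above hints that the public Create wraps innerCreate with a
// retry. A minimal sketch of what that wrapper could look like, retrying only
// when the CAS is rejected (i.e. the freshly generated UUID happened to
// collide with an existing key); the method name and the retry bound of 3 are
// assumptions for illustration, not the store's actual API.
func (s *consulStore) createWithRetrySketch(manifest pods.Manifest, nodeSelector klabels.Selector, podLabels klabels.Set) (fields.RC, error) {
	for attempt := 0; attempt < 3; attempt++ {
		rc, err := s.innerCreate(manifest, nodeSelector, podLabels)
		if _, isCAS := err.(CASError); isCAS {
			// the UUID collided; innerCreate will generate a fresh one on retry
			continue
		}
		// any other error (or success) is final
		return rc, err
	}
	return fields.RC{}, fmt.Errorf("could not create replication controller: too many CAS conflicts")
}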
// performs a safe (ie check-and-set) mutation of the rc with the given id,
// using the given function
// if the mutator returns an error, it will be propagated out
// if the returned RC has id="", then it will be deleted
func (s *consulStore) mutateRc(id fields.ID, mutator func(fields.RC) (fields.RC, error)) error {
	rcp := kp.RCPath(id.String())
	kvp, meta, err := s.kv.Get(rcp, nil)
	if err != nil {
		return err
	}
	if kvp == nil {
		return fmt.Errorf("replication controller %s does not exist", id)
	}

	rc, err := s.kvpToRC(kvp)
	if err != nil {
		return err
	}
	newKVP := &api.KVPair{
		Key:         rcp,
		ModifyIndex: meta.LastIndex,
	}

	var success bool
	newRC, err := mutator(rc)
	if err != nil {
		return err
	}
	if newRC.ID.String() == "" {
		// TODO: If this fails, then we have some dangling labels.
		// Perhaps they can be cleaned up later.
		// note that if the CAS fails afterwards, we will have still deleted
		// the labels, and then we will retry, which will involve deleting them
		// again
		// really the only way to solve this is a transaction
		err = s.applicator.RemoveAllLabels(labels.RC, id.String())
		if err != nil {
			return err
		}
		success, _, err = s.kv.DeleteCAS(newKVP, nil)
	} else {
		// assign to the outer err here (rather than shadowing it with :=) so
		// that a CAS failure below is reported as the real error instead of
		// being masked and surfacing only as a generic CASError
		var b []byte
		b, err = json.Marshal(newRC)
		if err != nil {
			return err
		}
		newKVP.Value = b
		success, _, err = s.kv.CAS(newKVP, nil)
	}

	if err != nil {
		return err
	}
	if !success {
		return CASError(rcp)
	}
	return nil
}
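// A sketch of how higher-level mutations can be layered on mutateRc: each
// operation is just a mutator closure, and the check-and-set plumbing above
// is reused. These method names are illustrative, not the store's real API;
// the int type for ReplicasDesired is inferred from the struct literal in
// innerCreate.
func (s *consulStore) setDesiredReplicasSketch(id fields.ID, n int) error {
	return s.mutateRc(id, func(rc fields.RC) (fields.RC, error) {
		rc.ReplicasDesired = n
		return rc, nil
	})
}

func (s *consulStore) disableSketch(id fields.ID) error {
	return s.mutateRc(id, func(rc fields.RC) (fields.RC, error) {
		rc.Disabled = true
		return rc, nil
	})
}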
func (s *consulStore) Get(id fields.ID) (fields.RC, error) {
	kvp, _, err := s.kv.Get(kp.RCPath(id.String()), nil)
	if err != nil {
		return fields.RC{}, err
	}
	if kvp == nil {
		// ID didn't exist
		return fields.RC{}, nil
	}
	return s.kvpToRC(kvp)
}
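// Because Get returns the zero-value fields.RC with a nil error when the key
// is absent, callers must inspect the ID to distinguish "not found" from a
// live controller. A sketch (existsSketch is illustrative only):
func (s *consulStore) existsSketch(id fields.ID) (bool, error) {
	rc, err := s.Get(id)
	if err != nil {
		return false, err
	}
	return rc.ID != "", nil
}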
// close one child
func (rcf *Farm) releaseChild(id fields.ID) {
	rcf.logger.WithField("rc", id).Infoln("Releasing replication controller")
	close(rcf.children[id].quit)
	delete(rcf.children, id)

	// if our lock is active, attempt to gracefully release it on this rc
	if rcf.lock != nil {
		err := rcf.lock.Unlock(kp.LockPath(kp.RCPath(id.String())))
		if err != nil {
			rcf.logger.WithField("rc", id).Warnln("Could not release replication controller lock")
		}
	}
}
func (s *consulStore) Watch(rc *fields.RC, quit <-chan struct{}) (<-chan struct{}, <-chan error) {
	updated := make(chan struct{})
	errors := make(chan error)
	input := make(chan *api.KVPair)
	go consulutil.WatchSingle(kp.RCPath(rc.ID.String()), s.kv, input, quit, errors)

	go func() {
		defer close(updated)
		defer close(errors)

		for kvp := range input {
			if kvp == nil {
				// seems this RC got deleted from under us. quitting would be
				// unexpected, so we'll just wait for it to reappear in consul
				continue
			}
			newRC, err := s.kvpToRC(kvp)
			if err != nil {
				select {
				case errors <- err:
				case <-quit:
				}
			} else {
				*rc = newRC
				select {
				case updated <- struct{}{}:
				case <-quit:
				}
			}
		}
	}()
	return updated, errors
}
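// A sketch of consuming Watch: the store mutates the caller's *fields.RC in
// place, so the updated channel is only a signal to re-read it. The function
// name and the error handling here are illustrative.
func watchUntilQuitSketch(store *consulStore, rc *fields.RC, quit <-chan struct{}) {
	updated, errs := store.Watch(rc, quit)
	for {
		select {
		case _, ok := <-updated:
			if !ok {
				return // the watch has shut down
			}
			// rc now reflects the latest state stored in consul
		case err, ok := <-errs:
			if !ok {
				return
			}
			_ = err // log or surface the error as appropriate
		}
	}
}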
func (u update) lockPath(id rcf.ID) string {
	// RUs want to lock the RCs they're mutating, but this lock is separate
	// from the RC lock (which is held by the rc.WatchDesires goroutine), so
	// the key being locked is different
	return kp.LockPath(kp.RCPath(id.String(), "update"))
}
// Start is a blocking function that monitors Consul for replication controllers.
// The Farm will attempt to claim replication controllers as they appear and,
// if successful, will start goroutines for those replication controllers to do
// their job. Closing the quit channel will cause this function to return,
// releasing all locks it holds.
//
// Start is not safe for concurrent execution. Do not execute multiple
// concurrent instances of Start.
func (rcf *Farm) Start(quit <-chan struct{}) {
	subQuit := make(chan struct{})
	defer close(subQuit)
	rcWatch, rcErr := rcf.rcStore.WatchNew(subQuit)

START_LOOP:
	for {
		select {
		case <-quit:
			rcf.logger.NoFields().Infoln("Halt requested, releasing replication controllers")
			rcf.releaseChildren()
			return
		case session := <-rcf.sessions:
			if session == "" {
				// our session has expired, we must assume our locked children
				// have all been released and that someone else may have
				// claimed them by now
				rcf.logger.NoFields().Errorln("Session expired, releasing replication controllers")
				rcf.lock = nil
				rcf.releaseChildren()
			} else {
				// a new session has been acquired - only happens after an
				// expiration message, so len(children)==0
				rcf.logger.WithField("session", session).Infoln("Acquired new session")
				lock := rcf.kpStore.NewUnmanagedLock(session, "")
				rcf.lock = &lock
				// TODO: restart the watch so that you get updates right away?
			}
		case err := <-rcErr:
			rcf.logger.WithError(err).Errorln("Could not read consul replication controllers")
		case rcFields := <-rcWatch:
			rcf.logger.WithField("n", len(rcFields)).Debugln("Received replication controller update")
			if rcf.lock == nil {
				// we can't claim new nodes because our session is invalidated.
				// raise an error and ignore this update
				rcf.logger.NoFields().Warnln("Received replication controller update, but do not have session to acquire locks")
				continue
			}

			// track which children were found in the returned set
			foundChildren := make(map[fields.ID]struct{})
			for _, rcField := range rcFields {
				rcLogger := rcf.logger.SubLogger(logrus.Fields{
					"rc":  rcField.ID,
					"pod": rcField.Manifest.ID(),
				})
				if _, ok := rcf.children[rcField.ID]; ok {
					// this one is already ours, skip
					rcLogger.NoFields().Debugln("Got replication controller already owned by self")
					foundChildren[rcField.ID] = struct{}{}
					continue
				}

				err := rcf.lock.Lock(kp.LockPath(kp.RCPath(rcField.ID.String())))
				if _, ok := err.(kp.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rcLogger.NoFields().Debugln("Lock on replication controller was denied")
					continue
				} else if err != nil {
					rcLogger.NoFields().Errorln("Got error while locking replication controller - session may be expired")
					// stop processing this update and go back to the select.
					// chances are this error is a network problem or session
					// expiry, and all the others in this update would also fail
					continue START_LOOP
				}

				// at this point the rc is ours, time to spin it up
				rcLogger.NoFields().Infoln("Acquired lock on new replication controller, spawning")

				newChild := New(
					rcField,
					rcf.kpStore,
					rcf.rcStore,
					rcf.scheduler,
					rcf.labeler,
					rcLogger,
				)
				childQuit := make(chan struct{})
				rcf.children[rcField.ID] = childRC{rc: newChild, quit: childQuit}
				foundChildren[rcField.ID] = struct{}{}

				go func() {
					// disabled-ness is handled in watchdesires
					for err := range newChild.WatchDesires(childQuit) {
						rcLogger.WithError(err).Errorln("Got error in replication controller loop")
					}
				}()
			}

			// now remove any children that were not found in the result set
			rcf.logger.NoFields().Debugln("Pruning replication controllers that have disappeared")
			for id := range rcf.children {
				if _, ok := foundChildren[id]; !ok {
					rcf.releaseChild(id)
				}
			}
		}
	}
}
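// A sketch of driving the farm: Start blocks until quit is closed, so it is
// typically run in its own goroutine. Construction of the Farm (its store,
// session, and logger wiring) is elided because it depends on code outside
// this excerpt; runFarmSketch is illustrative only.
func runFarmSketch(farm *Farm) (stop func()) {
	quit := make(chan struct{})
	go farm.Start(quit)
	return func() { close(quit) }
}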