// UpdateService updates a Service referenced by ServiceID with the given ServiceSpec. // - Returns `NotFound` if the Service is not found. // - Returns `InvalidArgument` if the ServiceSpec is malformed. // - Returns `Unimplemented` if the ServiceSpec references unimplemented features. // - Returns an error if the update fails. func (s *Server) UpdateService(ctx context.Context, request *api.UpdateServiceRequest) (*api.UpdateServiceResponse, error) { if request.ServiceID == "" || request.ServiceVersion == nil { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } if err := validateServiceSpec(request.Spec); err != nil { return nil, err } var service *api.Service s.store.View(func(tx store.ReadTx) { service = store.GetService(tx, request.ServiceID) }) if service == nil { return nil, grpc.Errorf(codes.NotFound, "service %s not found", request.ServiceID) } if request.Spec.Endpoint != nil && !reflect.DeepEqual(request.Spec.Endpoint, service.Spec.Endpoint) { if err := s.checkPortConflicts(request.Spec, request.ServiceID); err != nil { return nil, err } } err := s.store.Update(func(tx store.Tx) error { service = store.GetService(tx, request.ServiceID) if service == nil { return nil } // temporarily disable network updates if request.Spec != nil && !reflect.DeepEqual(request.Spec.Networks, service.Spec.Networks) { return errNetworkUpdateNotSupported } // The orchestrator is designed to be stateless, so it should not deal // with service mode changes (comparing the current config with the previous config). // The proper way to change a service's mode is to delete it and re-add it. if request.Spec != nil && reflect.TypeOf(service.Spec.Mode) != reflect.TypeOf(request.Spec.Mode) { return errModeChangeNotAllowed } service.Meta.Version = *request.ServiceVersion service.Spec = *request.Spec.Copy() // Reset update status service.UpdateStatus = nil return store.UpdateService(tx, service) }) if err != nil { return nil, err } if service == nil { return nil, grpc.Errorf(codes.NotFound, "service %s not found", request.ServiceID) } return &api.UpdateServiceResponse{ Service: service, }, nil }
func (u *Updater) completeUpdate(ctx context.Context, serviceID string) { log.G(ctx).Debugf("update of service %s complete", serviceID) err := u.store.Update(func(tx store.Tx) error { service := store.GetService(tx, serviceID) if service == nil { return nil } if service.UpdateStatus == nil { // The service was changed since we started this update return nil } if service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED { service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_COMPLETED service.UpdateStatus.Message = "rollback completed" } else { service.UpdateStatus.State = api.UpdateStatus_COMPLETED service.UpdateStatus.Message = "update completed" } service.UpdateStatus.CompletedAt = ptypes.MustTimestampProto(time.Now()) return store.UpdateService(tx, service) }) if err != nil { log.G(ctx).WithError(err).Errorf("failed to mark update of service %s complete", serviceID) } }
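// The helper below is not part of the updater; it is a hypothetical, test-style
// sketch showing how a caller can observe the terminal states written by
// completeUpdate above. It assumes the watch/event plumbing already used by the
// tests in this package (state.Watch, state.EventUpdateService); the name
// waitUpdateDone is illustrative only.
package updaterexample

import (
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/state"
	"github.com/docker/swarmkit/manager/state/store"
)

// waitUpdateDone blocks until the given service's UpdateStatus reaches
// COMPLETED or ROLLBACK_COMPLETED and reports whether a rollback happened.
func waitUpdateDone(s *store.MemoryStore, serviceID string) (rolledBack bool) {
	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateService{})
	defer cancel()

	for e := range watch {
		service := e.(state.EventUpdateService).Service
		if service.ID != serviceID || service.UpdateStatus == nil {
			continue
		}
		switch service.UpdateStatus.State {
		case api.UpdateStatus_COMPLETED:
			return false
		case api.UpdateStatus_ROLLBACK_COMPLETED:
			return true
		}
	}
	return false
}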
// UpdateService updates a Service referenced by ServiceID with the given ServiceSpec. // - Returns `NotFound` if the Service is not found. // - Returns `InvalidArgument` if the ServiceSpec is malformed. // - Returns `Unimplemented` if the ServiceSpec references unimplemented features. // - Returns an error if the update fails. func (s *Server) UpdateService(ctx context.Context, request *api.UpdateServiceRequest) (*api.UpdateServiceResponse, error) { if request.ServiceID == "" || request.ServiceVersion == nil { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } if err := validateServiceSpec(request.Spec); err != nil { return nil, err } var service *api.Service err := s.store.Update(func(tx store.Tx) error { service = store.GetService(tx, request.ServiceID) if service == nil { return nil } service.Meta.Version = *request.ServiceVersion service.Spec = *request.Spec.Copy() return store.UpdateService(tx, service) }) if err != nil { return nil, err } if service == nil { return nil, grpc.Errorf(codes.NotFound, "service %s not found", request.ServiceID) } return &api.UpdateServiceResponse{ Service: service, }, nil }
func (r *Orchestrator) handleTaskChange(ctx context.Context, t *api.Task) { // If we already set the desired state past TaskStateRunning, there is no // further action necessary. if t.DesiredState > api.TaskStateRunning { return } var ( n *api.Node service *api.Service ) r.store.View(func(tx store.ReadTx) { if t.NodeID != "" { n = store.GetNode(tx, t.NodeID) } if t.ServiceID != "" { service = store.GetService(tx, t.ServiceID) } }) if !orchestrator.IsReplicatedService(service) { return } if t.Status.State > api.TaskStateRunning || (t.NodeID != "" && invalidNode(n)) { r.restartTasks[t.ID] = struct{}{} } }
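// invalidNode is used above but not defined in this excerpt. The sketch below
// captures the check the orchestrator is assumed to rely on: a node cannot host
// tasks if it is missing, reported down, or drained. This is an assumption for
// illustration, not necessarily the exact implementation.
package orchestratorexample

import "github.com/docker/swarmkit/api"

// invalidNode reports whether tasks on this node need to be restarted elsewhere.
func invalidNode(n *api.Node) bool {
	return n == nil ||
		n.Status.State == api.NodeStatus_DOWN ||
		n.Spec.Availability == api.NodeAvailabilityDrain
}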
func (r *Supervisor) waitRestart(ctx context.Context, oldDelay *delayedStart, cluster *api.Cluster, taskID string) { // Wait for the last restart delay to elapse. select { case <-oldDelay.doneCh: case <-ctx.Done(): return } // Start the next restart err := r.store.Update(func(tx store.Tx) error { t := store.GetTask(tx, taskID) if t == nil { return nil } if t.DesiredState > api.TaskStateRunning { return nil } service := store.GetService(tx, t.ServiceID) if service == nil { return nil } return r.Restart(ctx, tx, cluster, service, *t) }) if err != nil { log.G(ctx).WithError(err).Errorf("failed to restart task after waiting for previous restart") } }
func (u *Updater) pauseUpdate(ctx context.Context, serviceID, message string) { log.G(ctx).Debugf("pausing update of service %s", serviceID) err := u.store.Update(func(tx store.Tx) error { service := store.GetService(tx, serviceID) if service == nil { return nil } if service.UpdateStatus == nil { // The service was updated since we started this update return nil } if service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED { service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_PAUSED } else { service.UpdateStatus.State = api.UpdateStatus_PAUSED } service.UpdateStatus.Message = message return store.UpdateService(tx, service) }) if err != nil { log.G(ctx).WithError(err).Errorf("failed to pause update of service %s", serviceID) } }
// UpdateService updates a Service referenced by ServiceID with the given ServiceSpec. // - Returns `NotFound` if the Service is not found. // - Returns `InvalidArgument` if the ServiceSpec is malformed. // - Returns `Unimplemented` if the ServiceSpec references unimplemented features. // - Returns an error if the update fails. func (s *Server) UpdateService(ctx context.Context, request *api.UpdateServiceRequest) (*api.UpdateServiceResponse, error) { if request.ServiceID == "" || request.ServiceVersion == nil { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } if err := validateServiceSpec(request.Spec); err != nil { return nil, err } var service *api.Service err := s.store.Update(func(tx store.Tx) error { service = store.GetService(tx, request.ServiceID) if service == nil { return nil } // temporarily disable network updates if request.Spec != nil && !reflect.DeepEqual(request.Spec.Networks, service.Spec.Networks) { return errNetworkUpdateNotSupported } service.Meta.Version = *request.ServiceVersion service.Spec = *request.Spec.Copy() return store.UpdateService(tx, service) }) if err != nil { return nil, err } if service == nil { return nil, grpc.Errorf(codes.NotFound, "service %s not found", request.ServiceID) } return &api.UpdateServiceResponse{ Service: service, }, nil }
func (a *Allocator) allocateService(ctx context.Context, nc *networkContext, s *api.Service) error { if s.Spec.Endpoint != nil { if s.Endpoint == nil { s.Endpoint = &api.Endpoint{ Spec: s.Spec.Endpoint.Copy(), } } // The service is trying to expose ports to the external // world. Automatically attach the service to the ingress // network only if it is not already done. if len(s.Spec.Endpoint.Ports) != 0 { var found bool for _, vip := range s.Endpoint.VirtualIPs { if vip.NetworkID == ingressNetwork.ID { found = true break } } if !found { s.Endpoint.VirtualIPs = append(s.Endpoint.VirtualIPs, &api.Endpoint_VirtualIP{NetworkID: ingressNetwork.ID}) } } } if err := nc.nwkAllocator.ServiceAllocate(s); err != nil { nc.unallocatedServices[s.ID] = s return err } if err := a.store.Update(func(tx store.Tx) error { for { err := store.UpdateService(tx, s) if err != nil && err != store.ErrSequenceConflict { return fmt.Errorf("failed updating state in store transaction for service %s: %v", s.ID, err) } if err == store.ErrSequenceConflict { storeService := store.GetService(tx, s.ID) storeService.Endpoint = s.Endpoint s = storeService continue } break } return nil }); err != nil { if err := nc.nwkAllocator.ServiceDeallocate(s); err != nil { log.G(ctx).WithError(err).Errorf("failed rolling back allocation of service %s: %v", s.ID, err) } return err } return nil }
func (u *Updater) rollbackUpdate(ctx context.Context, serviceID, message string) { log.G(ctx).Debugf("starting rollback of service %s", serviceID) var service *api.Service err := u.store.Update(func(tx store.Tx) error { service = store.GetService(tx, serviceID) if service == nil { return nil } if service.UpdateStatus == nil { // The service was updated since we started this update return nil } service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_STARTED service.UpdateStatus.Message = message if service.PreviousSpec == nil { return errors.New("cannot roll back service because no previous spec is available") } service.Spec = *service.PreviousSpec service.PreviousSpec = nil return store.UpdateService(tx, service) }) if err != nil { log.G(ctx).WithError(err).Errorf("failed to start rollback of service %s", serviceID) return } }
func (r *ReplicatedOrchestrator) resolveService(ctx context.Context, task *api.Task) *api.Service { if task.ServiceID == "" { return nil } var service *api.Service r.store.View(func(tx store.ReadTx) { service = store.GetService(tx, task.ServiceID) }) return service }
func (g *Orchestrator) addTask(ctx context.Context, batch *store.Batch, service *api.Service, nodeID string) { task := orchestrator.NewTask(g.cluster, service, 0, nodeID) err := batch.Update(func(tx store.Tx) error { if store.GetService(tx, service.ID) == nil { return nil } return store.CreateTask(tx, task) }) if err != nil { log.G(ctx).WithError(err).Errorf("global orchestrator: failed to create task") } }
// restartTask calls the restart supervisor's Restart function, which // sets a task's desired state to dead and restarts it if the restart // policy calls for it to be restarted. func (g *GlobalOrchestrator) restartTask(ctx context.Context, taskID string, serviceID string) { err := g.store.Update(func(tx store.Tx) error { t := store.GetTask(tx, taskID) if t == nil || t.DesiredState > api.TaskStateRunning { return nil } service := store.GetService(tx, serviceID) if service == nil { return nil } return g.restarts.Restart(ctx, tx, service, *t) }) if err != nil { log.G(ctx).WithError(err).Errorf("global orchestrator: restartTask transaction failed") } }
func isValidEndpoint(t assert.TestingT, s *store.MemoryStore, task *api.Task) bool { if task.ServiceID != "" { var service *api.Service s.View(func(tx store.ReadTx) { service = store.GetService(tx, task.ServiceID) }) if service == nil { return true } return assert.Equal(t, service.Endpoint, task.Endpoint) } return true }
// GetService returns a Service given a ServiceID. // - Returns `InvalidArgument` if ServiceID is not provided. // - Returns `NotFound` if the Service is not found. func (s *Server) GetService(ctx context.Context, request *api.GetServiceRequest) (*api.GetServiceResponse, error) { if request.ServiceID == "" { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } var service *api.Service s.store.View(func(tx store.ReadTx) { service = store.GetService(tx, request.ServiceID) }) if service == nil { return nil, grpc.Errorf(codes.NotFound, "service %s not found", request.ServiceID) } return &api.GetServiceResponse{ Service: service, }, nil }
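// Hypothetical client-side usage of the handler above, shown for illustration
// only. It assumes the generated Control API client (api.ControlClient) and the
// same grpc error helpers used elsewhere in this code base; resolveServiceOrNil
// is an illustrative name.
package controlclientexample

import (
	"context"

	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"

	"github.com/docker/swarmkit/api"
)

// resolveServiceOrNil returns the service if it exists, nil (with no error)
// if the server answered NotFound, and the error for anything else.
func resolveServiceOrNil(ctx context.Context, c api.ControlClient, serviceID string) (*api.Service, error) {
	resp, err := c.GetService(ctx, &api.GetServiceRequest{ServiceID: serviceID})
	if err != nil {
		if grpc.Code(err) == codes.NotFound {
			return nil, nil
		}
		return nil, err
	}
	return resp.Service, nil
}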
func (a *Allocator) commitAllocatedService(ctx context.Context, batch *store.Batch, s *api.Service) error { if err := batch.Update(func(tx store.Tx) error { err := store.UpdateService(tx, s) if err == store.ErrSequenceConflict { storeService := store.GetService(tx, s.ID) storeService.Endpoint = s.Endpoint err = store.UpdateService(tx, storeService) } return errors.Wrapf(err, "failed updating state in store transaction for service %s", s.ID) }); err != nil { if err := a.netCtx.nwkAllocator.ServiceDeallocate(s); err != nil { log.G(ctx).WithError(err).Errorf("failed rolling back allocation of service %s", s.ID) } return err } return nil }
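// A standalone, hypothetical sketch of the sequence-conflict handling used in
// commitAllocatedService above: if UpdateService fails because the service
// changed after it was read, re-read the stored copy, carry over only the
// allocator-owned Endpoint, and retry the write once. The name
// commitEndpointOnly is illustrative; errors.Wrapf comes from
// github.com/pkg/errors, as in the surrounding code.
package allocexample

import (
	"github.com/pkg/errors"

	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/state/store"
)

func commitEndpointOnly(s *store.MemoryStore, service *api.Service) error {
	return s.Update(func(tx store.Tx) error {
		err := store.UpdateService(tx, service)
		if err == store.ErrSequenceConflict {
			// Another writer won the race; keep its spec and only overwrite
			// the endpoint state owned by the allocator.
			current := store.GetService(tx, service.ID)
			if current == nil {
				return nil
			}
			current.Endpoint = service.Endpoint
			err = store.UpdateService(tx, current)
		}
		// Wrapf returns nil when err is nil.
		return errors.Wrapf(err, "failed updating state in store for service %s", service.ID)
	})
}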
func (r *Orchestrator) tickTasks(ctx context.Context) { if len(r.restartTasks) > 0 { _, err := r.store.Batch(func(batch *store.Batch) error { for taskID := range r.restartTasks { err := batch.Update(func(tx store.Tx) error { // TODO(aaronl): optimistic update? t := store.GetTask(tx, taskID) if t != nil { if t.DesiredState > api.TaskStateRunning { return nil } service := store.GetService(tx, t.ServiceID) if !orchestrator.IsReplicatedService(service) { return nil } // Restart task if applicable if err := r.restarts.Restart(ctx, tx, r.cluster, service, *t); err != nil { return err } } return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("orchestrator task restart transaction failed") } } return nil }) if err != nil { log.G(ctx).WithError(err).Errorf("orchestrator task restart batch failed") } r.restartTasks = make(map[string]struct{}) } }
func (u *Updater) startUpdate(ctx context.Context, serviceID string) { err := u.store.Update(func(tx store.Tx) error { service := store.GetService(tx, serviceID) if service == nil { return nil } if service.UpdateStatus != nil { return nil } service.UpdateStatus = &api.UpdateStatus{ State: api.UpdateStatus_UPDATING, Message: "update in progress", StartedAt: ptypes.MustTimestampProto(time.Now()), } return store.UpdateService(tx, service) }) if err != nil { log.G(ctx).WithError(err).Errorf("failed to mark update of service %s in progress", serviceID) } }
func (r *Orchestrator) restartTasksByNodeID(ctx context.Context, nodeID string) { var err error r.store.View(func(tx store.ReadTx) { var tasks []*api.Task tasks, err = store.FindTasks(tx, store.ByNodeID(nodeID)) if err != nil { return } for _, t := range tasks { if t.DesiredState > api.TaskStateRunning { continue } service := store.GetService(tx, t.ServiceID) if orchestrator.IsReplicatedService(service) { r.restartTasks[t.ID] = struct{}{} } } }) if err != nil { log.G(ctx).WithError(err).Errorf("failed to list tasks to restart") } }
func TestUpdaterFailureAction(t *testing.T) { ctx := context.Background() s := store.NewMemoryStore(nil) assert.NotNil(t, s) defer s.Close() // Fail new tasks the updater tries to run watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{}) defer cancel() go func() { for { select { case e := <-watch: task := e.(state.EventUpdateTask).Task if task.DesiredState == api.TaskStateRunning && task.Status.State != api.TaskStateFailed { err := s.Update(func(tx store.Tx) error { task = store.GetTask(tx, task.ID) task.Status.State = api.TaskStateFailed return store.UpdateTask(tx, task) }) assert.NoError(t, err) } else if task.DesiredState > api.TaskStateRunning { err := s.Update(func(tx store.Tx) error { task = store.GetTask(tx, task.ID) task.Status.State = task.DesiredState return store.UpdateTask(tx, task) }) assert.NoError(t, err) } } } }() instances := 3 cluster := &api.Cluster{ Spec: api.ClusterSpec{ Annotations: api.Annotations{ Name: "default", }, }, } service := &api.Service{ ID: "id1", Spec: api.ServiceSpec{ Annotations: api.Annotations{ Name: "name1", }, Mode: &api.ServiceSpec_Replicated{ Replicated: &api.ReplicatedService{ Replicas: uint64(instances), }, }, Task: api.TaskSpec{ Runtime: &api.TaskSpec_Container{ Container: &api.ContainerSpec{ Image: "v:1", // This won't apply in this test because we set the old tasks to DEAD. StopGracePeriod: ptypes.DurationProto(time.Hour), }, }, }, Update: &api.UpdateConfig{ FailureAction: api.UpdateConfig_PAUSE, Parallelism: 1, Delay: *ptypes.DurationProto(500 * time.Millisecond), }, }, } err := s.Update(func(tx store.Tx) error { assert.NoError(t, store.CreateCluster(tx, cluster)) assert.NoError(t, store.CreateService(tx, service)) for i := 0; i < instances; i++ { assert.NoError(t, store.CreateTask(tx, newTask(cluster, service, uint64(i)))) } return nil }) assert.NoError(t, err) originalTasks := getRunnableSlotSlice(t, s, service) for _, slot := range originalTasks { for _, task := range slot { assert.Equal(t, "v:1", task.Spec.GetContainer().Image) } } service.Spec.Task.GetContainer().Image = "v:2" updater := NewUpdater(s, NewRestartSupervisor(s), cluster, service) updater.Run(ctx, getRunnableSlotSlice(t, s, service)) updatedTasks := getRunnableSlotSlice(t, s, service) v1Counter := 0 v2Counter := 0 for _, slot := range updatedTasks { for _, task := range slot { if task.Spec.GetContainer().Image == "v:1" { v1Counter++ } else if task.Spec.GetContainer().Image == "v:2" { v2Counter++ } } } assert.Equal(t, instances-1, v1Counter) assert.Equal(t, 1, v2Counter) s.View(func(tx store.ReadTx) { service = store.GetService(tx, service.ID) }) assert.Equal(t, api.UpdateStatus_PAUSED, service.UpdateStatus.State) // Updating again should do nothing while the update is PAUSED updater = NewUpdater(s, NewRestartSupervisor(s), cluster, service) updater.Run(ctx, getRunnableSlotSlice(t, s, service)) updatedTasks = getRunnableSlotSlice(t, s, service) v1Counter = 0 v2Counter = 0 for _, slot := range updatedTasks { for _, task := range slot { if task.Spec.GetContainer().Image == "v:1" { v1Counter++ } else if task.Spec.GetContainer().Image == "v:2" { v2Counter++ } } } assert.Equal(t, instances-1, v1Counter) assert.Equal(t, 1, v2Counter) // Switch to a service with FailureAction: CONTINUE err = s.Update(func(tx store.Tx) error { service = store.GetService(tx, service.ID) service.Spec.Update.FailureAction = api.UpdateConfig_CONTINUE service.UpdateStatus = nil assert.NoError(t, store.UpdateService(tx, service)) return nil }) assert.NoError(t, err) 
service.Spec.Task.GetContainer().Image = "v:3" updater = NewUpdater(s, NewRestartSupervisor(s), cluster, service) updater.Run(ctx, getRunnableSlotSlice(t, s, service)) updatedTasks = getRunnableSlotSlice(t, s, service) v2Counter = 0 v3Counter := 0 for _, slot := range updatedTasks { for _, task := range slot { if task.Spec.GetContainer().Image == "v:2" { v2Counter++ } else if task.Spec.GetContainer().Image == "v:3" { v3Counter++ } } } assert.Equal(t, 0, v2Counter) assert.Equal(t, instances, v3Counter) }
func (u *Updater) updateTask(ctx context.Context, slot orchestrator.Slot, updated *api.Task) error { // Kick off the watch before even creating the updated task. This is in order to avoid missing any event. taskUpdates, cancel := state.Watch(u.watchQueue, state.EventUpdateTask{ Task: &api.Task{ID: updated.ID}, Checks: []state.TaskCheckFunc{state.TaskCheckID}, }) defer cancel() // Create an empty entry for this task, so the updater knows a failure // should count towards the failure count. The timestamp is added // if/when the task reaches RUNNING. u.updatedTasksMu.Lock() u.updatedTasks[updated.ID] = time.Time{} u.updatedTasksMu.Unlock() var delayStartCh <-chan struct{} // Atomically create the updated task and bring down the old one. _, err := u.store.Batch(func(batch *store.Batch) error { oldTask, err := u.removeOldTasks(ctx, batch, slot) if err != nil { return err } err = batch.Update(func(tx store.Tx) error { if store.GetService(tx, updated.ServiceID) == nil { return errors.New("service was deleted") } if err := store.CreateTask(tx, updated); err != nil { return err } return nil }) if err != nil { return err } delayStartCh = u.restarts.DelayStart(ctx, nil, oldTask, updated.ID, 0, true) return nil }) if err != nil { return err } if delayStartCh != nil { select { case <-delayStartCh: case <-u.stopChan: return nil } } // Wait for the new task to come up. // TODO(aluzzardi): Consider adding a timeout here. for { select { case e := <-taskUpdates: updated = e.(state.EventUpdateTask).Task if updated.Status.State >= api.TaskStateRunning { u.updatedTasksMu.Lock() u.updatedTasks[updated.ID] = time.Now() u.updatedTasksMu.Unlock() return nil } case <-u.stopChan: return nil } } }
func (a *Allocator) allocateTask(ctx context.Context, t *api.Task) (err error) { taskUpdated := false nc := a.netCtx // We might be here even if a task allocation has already // happened but wasn't successfully committed to store. In such // cases skip allocation and go straight ahead to updating the // store. if !nc.nwkAllocator.IsTaskAllocated(t) { a.store.View(func(tx store.ReadTx) { if t.ServiceID != "" { s := store.GetService(tx, t.ServiceID) if s == nil { err = fmt.Errorf("could not find service %s", t.ServiceID) return } if !nc.nwkAllocator.IsServiceAllocated(s) { err = fmt.Errorf("service %s to which this task %s belongs has pending allocations", s.ID, t.ID) return } if s.Endpoint != nil { taskUpdateEndpoint(t, s.Endpoint) taskUpdated = true } } for _, na := range t.Networks { n := store.GetNetwork(tx, na.Network.ID) if n == nil { err = fmt.Errorf("failed to retrieve network %s while allocating task %s", na.Network.ID, t.ID) return } if !nc.nwkAllocator.IsAllocated(n) { err = fmt.Errorf("network %s attached to task %s not allocated yet", n.ID, t.ID) return } na.Network = n } if err = nc.nwkAllocator.AllocateTask(t); err != nil { err = errors.Wrapf(err, "failed during networktask allocation for task %s", t.ID) return } if nc.nwkAllocator.IsTaskAllocated(t) { taskUpdated = true } }) if err != nil { return err } } // Update the network allocations and moving to // PENDING state on top of the latest store state. if a.taskAllocateVote(networkVoter, t.ID) { if t.Status.State < api.TaskStatePending { updateTaskStatus(t, api.TaskStatePending, allocatedStatusMessage) taskUpdated = true } } if !taskUpdated { return errNoChanges } return nil }
func (a *Allocator) doNetworkInit(ctx context.Context) (err error) { na, err := networkallocator.New() if err != nil { return err } nc := &networkContext{ nwkAllocator: na, unallocatedTasks: make(map[string]*api.Task), unallocatedServices: make(map[string]*api.Service), unallocatedNetworks: make(map[string]*api.Network), ingressNetwork: newIngressNetwork(), } a.netCtx = nc defer func() { // Clear a.netCtx if initialization was unsuccessful. if err != nil { a.netCtx = nil } }() // Check if we have the ingress network. If not found create // it before reading all network objects for allocation. var networks []*api.Network a.store.View(func(tx store.ReadTx) { networks, err = store.FindNetworks(tx, store.ByName(ingressNetworkName)) if len(networks) > 0 { nc.ingressNetwork = networks[0] } }) if err != nil { return errors.Wrap(err, "failed to find ingress network during init") } // If ingress network is not found, create one right away // using the predefined template. if len(networks) == 0 { if err := a.store.Update(func(tx store.Tx) error { nc.ingressNetwork.ID = identity.NewID() if err := store.CreateNetwork(tx, nc.ingressNetwork); err != nil { return err } return nil }); err != nil { return errors.Wrap(err, "failed to create ingress network") } a.store.View(func(tx store.ReadTx) { networks, err = store.FindNetworks(tx, store.ByName(ingressNetworkName)) if len(networks) > 0 { nc.ingressNetwork = networks[0] } }) if err != nil { return errors.Wrap(err, "failed to find ingress network after creating it") } } // Try to complete ingress network allocation before anything else so // that the we can get the preferred subnet for ingress // network. if !na.IsAllocated(nc.ingressNetwork) { if err := a.allocateNetwork(ctx, nc.ingressNetwork); err != nil { log.G(ctx).WithError(err).Error("failed allocating ingress network during init") } else if _, err := a.store.Batch(func(batch *store.Batch) error { if err := a.commitAllocatedNetwork(ctx, batch, nc.ingressNetwork); err != nil { log.G(ctx).WithError(err).Error("failed committing allocation of ingress network during init") } return nil }); err != nil { log.G(ctx).WithError(err).Error("failed committing allocation of ingress network during init") } } // Allocate networks in the store so far before we started // watching. a.store.View(func(tx store.ReadTx) { networks, err = store.FindNetworks(tx, store.All) }) if err != nil { return errors.Wrap(err, "error listing all networks in store while trying to allocate during init") } var allocatedNetworks []*api.Network for _, n := range networks { if na.IsAllocated(n) { continue } if err := a.allocateNetwork(ctx, n); err != nil { log.G(ctx).WithError(err).Errorf("failed allocating network %s during init", n.ID) continue } allocatedNetworks = append(allocatedNetworks, n) } if _, err := a.store.Batch(func(batch *store.Batch) error { for _, n := range allocatedNetworks { if err := a.commitAllocatedNetwork(ctx, batch, n); err != nil { log.G(ctx).WithError(err).Errorf("failed committing allocation of network %s during init", n.ID) } } return nil }); err != nil { log.G(ctx).WithError(err).Error("failed committing allocation of networks during init") } // Allocate nodes in the store so far before we process watched events. 
var nodes []*api.Node a.store.View(func(tx store.ReadTx) { nodes, err = store.FindNodes(tx, store.All) }) if err != nil { return errors.Wrap(err, "error listing all nodes in store while trying to allocate during init") } var allocatedNodes []*api.Node for _, node := range nodes { if na.IsNodeAllocated(node) { continue } if node.Attachment == nil { node.Attachment = &api.NetworkAttachment{} } node.Attachment.Network = nc.ingressNetwork.Copy() if err := a.allocateNode(ctx, node); err != nil { log.G(ctx).WithError(err).Errorf("Failed to allocate network resources for node %s during init", node.ID) continue } allocatedNodes = append(allocatedNodes, node) } if _, err := a.store.Batch(func(batch *store.Batch) error { for _, node := range allocatedNodes { if err := a.commitAllocatedNode(ctx, batch, node); err != nil { log.G(ctx).WithError(err).Errorf("Failed to commit allocation of network resources for node %s during init", node.ID) } } return nil }); err != nil { log.G(ctx).WithError(err).Error("Failed to commit allocation of network resources for nodes during init") } // Allocate services in the store so far before we process watched events. var services []*api.Service a.store.View(func(tx store.ReadTx) { services, err = store.FindServices(tx, store.All) }) if err != nil { return errors.Wrap(err, "error listing all services in store while trying to allocate during init") } var allocatedServices []*api.Service for _, s := range services { if nc.nwkAllocator.IsServiceAllocated(s) { continue } if err := a.allocateService(ctx, s); err != nil { log.G(ctx).WithError(err).Errorf("failed allocating service %s during init", s.ID) continue } allocatedServices = append(allocatedServices, s) } if _, err := a.store.Batch(func(batch *store.Batch) error { for _, s := range allocatedServices { if err := a.commitAllocatedService(ctx, batch, s); err != nil { log.G(ctx).WithError(err).Errorf("failed committing allocation of service %s during init", s.ID) } } return nil }); err != nil { log.G(ctx).WithError(err).Error("failed committing allocation of services during init") } // Allocate tasks in the store so far before we started watching. var ( tasks []*api.Task allocatedTasks []*api.Task ) a.store.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.All) }) if err != nil { return errors.Wrap(err, "error listing all tasks in store while trying to allocate during init") } for _, t := range tasks { if taskDead(t) { continue } var s *api.Service if t.ServiceID != "" { a.store.View(func(tx store.ReadTx) { s = store.GetService(tx, t.ServiceID) }) } // Populate network attachments in the task // based on service spec. a.taskCreateNetworkAttachments(t, s) if taskReadyForNetworkVote(t, s, nc) { if t.Status.State >= api.TaskStatePending { continue } if a.taskAllocateVote(networkVoter, t.ID) { // If the task is not attached to any network, network // allocators job is done. Immediately cast a vote so // that the task can be moved to ALLOCATED state as // soon as possible. 
allocatedTasks = append(allocatedTasks, t) } continue } err := a.allocateTask(ctx, t) if err == nil { allocatedTasks = append(allocatedTasks, t) } else if err != errNoChanges { log.G(ctx).WithError(err).Errorf("failed allocating task %s during init", t.ID) nc.unallocatedTasks[t.ID] = t } } if _, err := a.store.Batch(func(batch *store.Batch) error { for _, t := range allocatedTasks { if err := a.commitAllocatedTask(ctx, batch, t); err != nil { log.G(ctx).WithError(err).Errorf("failed committing allocation of task %s during init", t.ID) } } return nil }); err != nil { log.G(ctx).WithError(err).Error("failed committing allocation of tasks during init") } return nil }
func (a *Allocator) doTaskAlloc(ctx context.Context, ev events.Event) { var ( isDelete bool t *api.Task ) switch v := ev.(type) { case state.EventCreateTask: t = v.Task.Copy() case state.EventUpdateTask: t = v.Task.Copy() case state.EventDeleteTask: isDelete = true t = v.Task.Copy() } nc := a.netCtx // If the task has stopped running or it's being deleted then // we should free the network resources associated with the // task right away. if taskDead(t) || isDelete { if nc.nwkAllocator.IsTaskAllocated(t) { if err := nc.nwkAllocator.DeallocateTask(t); err != nil { log.G(ctx).WithError(err).Errorf("Failed freeing network resources for task %s", t.ID) } } // Cleanup any task references that might exist in unallocatedTasks delete(nc.unallocatedTasks, t.ID) return } // If we are already in allocated state, there is // absolutely nothing else to do. if t.Status.State >= api.TaskStatePending { delete(nc.unallocatedTasks, t.ID) return } var s *api.Service if t.ServiceID != "" { a.store.View(func(tx store.ReadTx) { s = store.GetService(tx, t.ServiceID) }) if s == nil { // If the task is running it is not normal to // not be able to find the associated // service. If the task is not running (task // is either dead or the desired state is set // to dead) then the service may not be // available in store. But we still need to // cleanup network resources associated with // the task. if taskRunning(t) && !isDelete { log.G(ctx).Errorf("Event %T: Failed to get service %s for task %s state %s: could not find service %s", ev, t.ServiceID, t.ID, t.Status.State, t.ServiceID) return } } } // Populate network attachments in the task // based on service spec. a.taskCreateNetworkAttachments(t, s) nc.unallocatedTasks[t.ID] = t }
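// taskDead and taskRunning are referenced above but not included in this
// excerpt. The sketches below capture the semantics the allocator appears to
// rely on and are assumptions for illustration, not necessarily the exact
// implementations: a task is "dead" only when both the desired and the
// observed state are past RUNNING, and "running" when neither has moved past
// RUNNING.
package allocstateexample

import "github.com/docker/swarmkit/api"

func taskDead(t *api.Task) bool {
	return t.DesiredState > api.TaskStateRunning && t.Status.State > api.TaskStateRunning
}

func taskRunning(t *api.Task) bool {
	return t.DesiredState <= api.TaskStateRunning && t.Status.State <= api.TaskStateRunning
}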
func (r *Orchestrator) initTasks(ctx context.Context, readTx store.ReadTx) error { tasks, err := store.FindTasks(readTx, store.All) if err != nil { return err } for _, t := range tasks { if t.NodeID != "" { n := store.GetNode(readTx, t.NodeID) if invalidNode(n) && t.Status.State <= api.TaskStateRunning && t.DesiredState <= api.TaskStateRunning { r.restartTasks[t.ID] = struct{}{} } } } _, err = r.store.Batch(func(batch *store.Batch) error { for _, t := range tasks { if t.ServiceID == "" { continue } // TODO(aluzzardi): We should NOT retrieve the service here. service := store.GetService(readTx, t.ServiceID) if service == nil { // Service was deleted err := batch.Update(func(tx store.Tx) error { return store.DeleteTask(tx, t.ID) }) if err != nil { log.G(ctx).WithError(err).Error("failed to set task desired state to dead") } continue } // TODO(aluzzardi): This is shady. We should have a more generic condition. if t.DesiredState != api.TaskStateReady || !orchestrator.IsReplicatedService(service) { continue } restartDelay := orchestrator.DefaultRestartDelay if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil { var err error restartDelay, err = gogotypes.DurationFromProto(t.Spec.Restart.Delay) if err != nil { log.G(ctx).WithError(err).Error("invalid restart delay") restartDelay = orchestrator.DefaultRestartDelay } } if restartDelay != 0 { timestamp, err := gogotypes.TimestampFromProto(t.Status.Timestamp) if err == nil { restartTime := timestamp.Add(restartDelay) calculatedRestartDelay := restartTime.Sub(time.Now()) if calculatedRestartDelay < restartDelay { restartDelay = calculatedRestartDelay } if restartDelay > 0 { _ = batch.Update(func(tx store.Tx) error { t := store.GetTask(tx, t.ID) // TODO(aluzzardi): This is shady as well. We should have a more generic condition. if t == nil || t.DesiredState != api.TaskStateReady { return nil } r.restarts.DelayStart(ctx, tx, nil, t.ID, restartDelay, true) return nil }) continue } } else { log.G(ctx).WithError(err).Error("invalid status timestamp") } } // Start now err := batch.Update(func(tx store.Tx) error { return r.restarts.StartNow(tx, t.ID) }) if err != nil { log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("moving task out of delayed state failed") } } return nil }) return err }
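// Self-contained sketch of the delay arithmetic in initTasks above: given the
// timestamp of the task's last status change and the configured restart delay,
// compute how much of that delay is still outstanding after a manager restart.
// remainingRestartDelay is an illustrative helper, not part of the
// orchestrator.
package restartexample

import "time"

func remainingRestartDelay(lastStatus time.Time, configured time.Duration) time.Duration {
	// The task may restart at lastStatus + configured; a negative result
	// means the delay has already elapsed.
	remaining := lastStatus.Add(configured).Sub(time.Now())
	if remaining <= 0 {
		return 0
	}
	if remaining > configured {
		// Mirror initTasks: never wait longer than the configured delay,
		// even if clock skew makes the computed value larger.
		return configured
	}
	return remaining
}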
func (a *Allocator) doNetworkInit(ctx context.Context) error { na, err := networkallocator.New() if err != nil { return err } nc := &networkContext{ nwkAllocator: na, unallocatedTasks: make(map[string]*api.Task), unallocatedServices: make(map[string]*api.Service), unallocatedNetworks: make(map[string]*api.Network), } // Check if we have the ingress network. If not found create // it before reading all network objects for allocation. var networks []*api.Network a.store.View(func(tx store.ReadTx) { networks, err = store.FindNetworks(tx, store.ByName(ingressNetworkName)) if len(networks) > 0 { ingressNetwork = networks[0] } }) if err != nil { return fmt.Errorf("failed to find ingress network during init: %v", err) } // If ingress network is not found, create one right away // using the predefined template. if len(networks) == 0 { if err := a.store.Update(func(tx store.Tx) error { ingressNetwork.ID = identity.NewID() if err := store.CreateNetwork(tx, ingressNetwork); err != nil { return err } return nil }); err != nil { return fmt.Errorf("failed to create ingress network: %v", err) } a.store.View(func(tx store.ReadTx) { networks, err = store.FindNetworks(tx, store.ByName(ingressNetworkName)) if len(networks) > 0 { ingressNetwork = networks[0] } }) if err != nil { return fmt.Errorf("failed to find ingress network after creating it: %v", err) } } // Try to complete ingress network allocation before anything else so // that the we can get the preferred subnet for ingress // network. if !na.IsAllocated(ingressNetwork) { if err := a.allocateNetwork(ctx, nc, ingressNetwork); err != nil { log.G(ctx).Errorf("failed allocating ingress network during init: %v", err) } // Update store after allocation if err := a.store.Update(func(tx store.Tx) error { if err := store.UpdateNetwork(tx, ingressNetwork); err != nil { return err } return nil }); err != nil { return fmt.Errorf("failed to create ingress network: %v", err) } } // Allocate networks in the store so far before we started // watching. a.store.View(func(tx store.ReadTx) { networks, err = store.FindNetworks(tx, store.All) }) if err != nil { return fmt.Errorf("error listing all networks in store while trying to allocate during init: %v", err) } for _, n := range networks { if na.IsAllocated(n) { continue } if err := a.allocateNetwork(ctx, nc, n); err != nil { log.G(ctx).Errorf("failed allocating network %s during init: %v", n.ID, err) } } // Allocate nodes in the store so far before we process watched events. var nodes []*api.Node a.store.View(func(tx store.ReadTx) { nodes, err = store.FindNodes(tx, store.All) }) if err != nil { return fmt.Errorf("error listing all services in store while trying to allocate during init: %v", err) } for _, node := range nodes { if na.IsNodeAllocated(node) { continue } if node.Attachment == nil { node.Attachment = &api.NetworkAttachment{} } node.Attachment.Network = ingressNetwork.Copy() if err := a.allocateNode(ctx, nc, node); err != nil { log.G(ctx).Errorf("Failed to allocate network resources for node %s during init: %v", node.ID, err) } } // Allocate services in the store so far before we process watched events. 
var services []*api.Service a.store.View(func(tx store.ReadTx) { services, err = store.FindServices(tx, store.All) }) if err != nil { return fmt.Errorf("error listing all services in store while trying to allocate during init: %v", err) } for _, s := range services { if s.Spec.Endpoint == nil { continue } if na.IsServiceAllocated(s) { continue } if err := a.allocateService(ctx, nc, s); err != nil { log.G(ctx).Errorf("failed allocating service %s during init: %v", s.ID, err) } } // Allocate tasks in the store so far before we started watching. var tasks []*api.Task a.store.View(func(tx store.ReadTx) { tasks, err = store.FindTasks(tx, store.All) }) if err != nil { return fmt.Errorf("error listing all tasks in store while trying to allocate during init: %v", err) } if _, err := a.store.Batch(func(batch *store.Batch) error { for _, t := range tasks { if taskDead(t) { continue } var s *api.Service if t.ServiceID != "" { a.store.View(func(tx store.ReadTx) { s = store.GetService(tx, t.ServiceID) }) } // Populate network attachments in the task // based on service spec. a.taskCreateNetworkAttachments(t, s) if taskReadyForNetworkVote(t, s, nc) { if t.Status.State >= api.TaskStateAllocated { continue } if a.taskAllocateVote(networkVoter, t.ID) { // If the task is not attached to any network, network // allocators job is done. Immediately cast a vote so // that the task can be moved to ALLOCATED state as // soon as possible. if err := batch.Update(func(tx store.Tx) error { storeT := store.GetTask(tx, t.ID) if storeT == nil { return fmt.Errorf("task %s not found while trying to update state", t.ID) } updateTaskStatus(storeT, api.TaskStateAllocated, "allocated") if err := store.UpdateTask(tx, storeT); err != nil { return fmt.Errorf("failed updating state in store transaction for task %s: %v", storeT.ID, err) } return nil }); err != nil { log.G(ctx).WithError(err).Error("error updating task network") } } continue } err := batch.Update(func(tx store.Tx) error { _, err := a.allocateTask(ctx, nc, tx, t) return err }) if err != nil { log.G(ctx).Errorf("failed allocating task %s during init: %v", t.ID, err) nc.unallocatedTasks[t.ID] = t } } return nil }); err != nil { return err } a.netCtx = nc return nil }
func TestUpdaterRollback(t *testing.T) { ctx := context.Background() s := store.NewMemoryStore(nil) assert.NotNil(t, s) defer s.Close() orchestrator := NewReplicatedOrchestrator(s) defer orchestrator.Stop() var ( failImage1 uint32 failImage2 uint32 ) watchCreate, cancelCreate := state.Watch(s.WatchQueue(), state.EventCreateTask{}) defer cancelCreate() watchServiceUpdate, cancelServiceUpdate := state.Watch(s.WatchQueue(), state.EventUpdateService{}) defer cancelServiceUpdate() // Fail new tasks the updater tries to run watchUpdate, cancelUpdate := state.Watch(s.WatchQueue(), state.EventUpdateTask{}) defer cancelUpdate() go func() { failedLast := false for { select { case e := <-watchUpdate: task := e.(state.EventUpdateTask).Task if task.DesiredState == task.Status.State { continue } if task.DesiredState == api.TaskStateRunning && task.Status.State != api.TaskStateFailed && task.Status.State != api.TaskStateRunning { err := s.Update(func(tx store.Tx) error { task = store.GetTask(tx, task.ID) // Never fail two image2 tasks in a row, so there's a mix of // failed and successful tasks for the rollback. if task.Spec.GetContainer().Image == "image1" && atomic.LoadUint32(&failImage1) == 1 { task.Status.State = api.TaskStateFailed failedLast = true } else if task.Spec.GetContainer().Image == "image2" && atomic.LoadUint32(&failImage2) == 1 && !failedLast { task.Status.State = api.TaskStateFailed failedLast = true } else { task.Status.State = task.DesiredState failedLast = false } return store.UpdateTask(tx, task) }) assert.NoError(t, err) } else if task.DesiredState > api.TaskStateRunning { err := s.Update(func(tx store.Tx) error { task = store.GetTask(tx, task.ID) task.Status.State = task.DesiredState return store.UpdateTask(tx, task) }) assert.NoError(t, err) } } } }() // Create a service with four replicas specified before the orchestrator // is started. This should result in four tasks when the orchestrator // starts up. err := s.Update(func(tx store.Tx) error { s1 := &api.Service{ ID: "id1", Spec: api.ServiceSpec{ Annotations: api.Annotations{ Name: "name1", }, Task: api.TaskSpec{ Runtime: &api.TaskSpec_Container{ Container: &api.ContainerSpec{ Image: "image1", }, }, Restart: &api.RestartPolicy{ Condition: api.RestartOnNone, }, }, Mode: &api.ServiceSpec_Replicated{ Replicated: &api.ReplicatedService{ Replicas: 4, }, }, Update: &api.UpdateConfig{ FailureAction: api.UpdateConfig_ROLLBACK, Parallelism: 1, Delay: *ptypes.DurationProto(10 * time.Millisecond), Monitor: ptypes.DurationProto(500 * time.Millisecond), MaxFailureRatio: 0.4, }, }, } assert.NoError(t, store.CreateService(tx, s1)) return nil }) assert.NoError(t, err) // Start the orchestrator.
go func() { assert.NoError(t, orchestrator.Run(ctx)) }() observedTask := testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") atomic.StoreUint32(&failImage2, 1) // Start a rolling update err = s.Update(func(tx store.Tx) error { s1 := store.GetService(tx, "id1") require.NotNil(t, s1) s1.PreviousSpec = s1.Spec.Copy() s1.UpdateStatus = nil s1.Spec.Task.GetContainer().Image = "image2" assert.NoError(t, store.UpdateService(tx, s1)) return nil }) assert.NoError(t, err) // Should see three tasks started, then a rollback observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2") // Should get to the ROLLBACK_STARTED state for { e := <-watchServiceUpdate if e.(state.EventUpdateService).Service.UpdateStatus == nil { continue } if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED { break } } observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") // Should end up in ROLLBACK_COMPLETED state for { e := <-watchServiceUpdate if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_COMPLETED { break } } atomic.StoreUint32(&failImage1, 1) // Repeat the rolling update but this time fail the tasks that the // rollback creates. It should end up in ROLLBACK_PAUSED. 
err = s.Update(func(tx store.Tx) error { s1 := store.GetService(tx, "id1") require.NotNil(t, s1) s1.PreviousSpec = s1.Spec.Copy() s1.UpdateStatus = nil s1.Spec.Task.GetContainer().Image = "image2" assert.NoError(t, store.UpdateService(tx, s1)) return nil }) assert.NoError(t, err) // Should see three tasks started, then a rollback observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2") // Should get to the ROLLBACK_STARTED state for { e := <-watchServiceUpdate if e.(state.EventUpdateService).Service.UpdateStatus == nil { continue } if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED { break } } observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") observedTask = testutils.WatchTaskCreate(t, watchCreate) assert.Equal(t, observedTask.Status.State, api.TaskStateNew) assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1") // Should end up in ROLLBACK_PAUSED state for { e := <-watchServiceUpdate if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_PAUSED { break } } }
func (tr *TaskReaper) tick() { if len(tr.dirty) == 0 { return } defer func() { tr.dirty = make(map[instanceTuple]struct{}) }() var deleteTasks []string tr.store.View(func(tx store.ReadTx) { for dirty := range tr.dirty { service := store.GetService(tx, dirty.serviceID) if service == nil { continue } taskHistory := tr.taskHistory if taskHistory < 0 { continue } var historicTasks []*api.Task switch service.Spec.GetMode().(type) { case *api.ServiceSpec_Replicated: var err error historicTasks, err = store.FindTasks(tx, store.BySlot(dirty.serviceID, dirty.instance)) if err != nil { continue } case *api.ServiceSpec_Global: tasksByNode, err := store.FindTasks(tx, store.ByNodeID(dirty.nodeID)) if err != nil { continue } for _, t := range tasksByNode { if t.ServiceID == dirty.serviceID { historicTasks = append(historicTasks, t) } } } if int64(len(historicTasks)) <= taskHistory { continue } // TODO(aaronl): This could filter for non-running tasks and use quickselect // instead of sorting the whole slice. sort.Sort(tasksByTimestamp(historicTasks)) for _, t := range historicTasks { if t.DesiredState <= api.TaskStateRunning { // Don't delete running tasks continue } deleteTasks = append(deleteTasks, t.ID) taskHistory++ if int64(len(historicTasks)) <= taskHistory { break } } } }) if len(deleteTasks) > 0 { tr.store.Batch(func(batch *store.Batch) error { for _, taskID := range deleteTasks { batch.Update(func(tx store.Tx) error { return store.DeleteTask(tx, taskID) }) } return nil }) } }
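// tasksByTimestamp is used above to order candidate tasks for deletion but is
// not shown in this excerpt. A plausible implementation, assumed here for
// illustration, is a sort.Interface that orders tasks by their status
// timestamp so the oldest historic tasks are deleted first, with
// untimestamped tasks sorting first.
package reaperexample

import "github.com/docker/swarmkit/api"

type tasksByTimestamp []*api.Task

func (t tasksByTimestamp) Len() int      { return len(t) }
func (t tasksByTimestamp) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
func (t tasksByTimestamp) Less(i, j int) bool {
	if t[i].Status.Timestamp == nil {
		return true
	}
	if t[j].Status.Timestamp == nil {
		return false
	}
	if t[i].Status.Timestamp.Seconds != t[j].Status.Timestamp.Seconds {
		return t[i].Status.Timestamp.Seconds < t[j].Status.Timestamp.Seconds
	}
	return t[i].Status.Timestamp.Nanos < t[j].Status.Timestamp.Nanos
}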
func (a *Allocator) allocateService(ctx context.Context, s *api.Service) error { nc := a.netCtx if s.Spec.Endpoint != nil { // service has user-defined endpoint if s.Endpoint == nil { // service currently has no allocated endpoint, need allocated. s.Endpoint = &api.Endpoint{ Spec: s.Spec.Endpoint.Copy(), } } // The service is trying to expose ports to the external // world. Automatically attach the service to the ingress // network only if it is not already done. if len(s.Spec.Endpoint.Ports) != 0 { var found bool for _, vip := range s.Endpoint.VirtualIPs { if vip.NetworkID == nc.ingressNetwork.ID { found = true break } } if !found { s.Endpoint.VirtualIPs = append(s.Endpoint.VirtualIPs, &api.Endpoint_VirtualIP{NetworkID: nc.ingressNetwork.ID}) } } } else if s.Endpoint != nil { // service has no user-defined endpoints while has already allocated network resources, // need deallocated. if err := nc.nwkAllocator.ServiceDeallocate(s); err != nil { return err } } if err := nc.nwkAllocator.ServiceAllocate(s); err != nil { nc.unallocatedServices[s.ID] = s return err } // If the service doesn't expose ports any more and if we have // any lingering virtual IP references for ingress network // clean them up here. if s.Spec.Endpoint == nil || len(s.Spec.Endpoint.Ports) == 0 { if s.Endpoint != nil { for i, vip := range s.Endpoint.VirtualIPs { if vip.NetworkID == nc.ingressNetwork.ID { n := len(s.Endpoint.VirtualIPs) s.Endpoint.VirtualIPs[i], s.Endpoint.VirtualIPs[n-1] = s.Endpoint.VirtualIPs[n-1], nil s.Endpoint.VirtualIPs = s.Endpoint.VirtualIPs[:n-1] break } } } } if err := a.store.Update(func(tx store.Tx) error { for { err := store.UpdateService(tx, s) if err != nil && err != store.ErrSequenceConflict { return fmt.Errorf("failed updating state in store transaction for service %s: %v", s.ID, err) } if err == store.ErrSequenceConflict { storeService := store.GetService(tx, s.ID) storeService.Endpoint = s.Endpoint s = storeService continue } break } return nil }); err != nil { if err := nc.nwkAllocator.ServiceDeallocate(s); err != nil { log.G(ctx).WithError(err).Errorf("failed rolling back allocation of service %s: %v", s.ID, err) } return err } return nil }
func (a *Allocator) allocateTask(ctx context.Context, nc *networkContext, tx store.Tx, t *api.Task) (*api.Task, error) { taskUpdated := false // Get the latest task state from the store before updating. storeT := store.GetTask(tx, t.ID) if storeT == nil { return nil, fmt.Errorf("could not find task %s while trying to update network allocation", t.ID) } // We might be here even if a task allocation has already // happened but wasn't successfully committed to store. In such // cases skip allocation and go straight ahead to updating the // store. if !nc.nwkAllocator.IsTaskAllocated(t) { if t.ServiceID != "" { s := store.GetService(tx, t.ServiceID) if s == nil { return nil, fmt.Errorf("could not find service %s", t.ServiceID) } if !nc.nwkAllocator.IsServiceAllocated(s) { return nil, fmt.Errorf("service %s to which this task %s belongs has pending allocations", s.ID, t.ID) } taskUpdateEndpoint(t, s.Endpoint) } for _, na := range t.Networks { n := store.GetNetwork(tx, na.Network.ID) if n == nil { return nil, fmt.Errorf("failed to retrieve network %s while allocating task %s", na.Network.ID, t.ID) } if !nc.nwkAllocator.IsAllocated(n) { return nil, fmt.Errorf("network %s attached to task %s not allocated yet", n.ID, t.ID) } na.Network = n } if err := nc.nwkAllocator.AllocateTask(t); err != nil { return nil, fmt.Errorf("failed during networktask allocation for task %s: %v", t.ID, err) } if nc.nwkAllocator.IsTaskAllocated(t) { taskUpdateNetworks(storeT, t.Networks) taskUpdateEndpoint(storeT, t.Endpoint) taskUpdated = true } } // Update the network allocations and moving to // ALLOCATED state on top of the latest store state. if a.taskAllocateVote(networkVoter, t.ID) { if storeT.Status.State < api.TaskStateAllocated { updateTaskStatus(storeT, api.TaskStateAllocated, "allocated") taskUpdated = true } } if taskUpdated { if err := store.UpdateTask(tx, storeT); err != nil { return nil, fmt.Errorf("failed updating state in store transaction for task %s: %v", storeT.ID, err) } } return storeT, nil }
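// taskUpdateNetworks and taskUpdateEndpoint are called above but not included
// in this excerpt. The sketches below show the copy semantics the allocator
// depends on (the task must not alias service- or store-owned objects); they
// are assumptions for illustration, not necessarily the exact implementations.
package alloccopyexample

import "github.com/docker/swarmkit/api"

func taskUpdateNetworks(t *api.Task, networks []*api.NetworkAttachment) {
	networksCopy := make([]*api.NetworkAttachment, 0, len(networks))
	for _, n := range networks {
		networksCopy = append(networksCopy, n.Copy())
	}
	t.Networks = networksCopy
}

func taskUpdateEndpoint(t *api.Task, endpoint *api.Endpoint) {
	t.Endpoint = endpoint.Copy()
}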
// UpdateService updates a Service referenced by ServiceID with the given ServiceSpec. // - Returns `NotFound` if the Service is not found. // - Returns `InvalidArgument` if the ServiceSpec is malformed. // - Returns `Unimplemented` if the ServiceSpec references unimplemented features. // - Returns an error if the update fails. func (s *Server) UpdateService(ctx context.Context, request *api.UpdateServiceRequest) (*api.UpdateServiceResponse, error) { if request.ServiceID == "" || request.ServiceVersion == nil { return nil, grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error()) } if err := validateServiceSpec(request.Spec); err != nil { return nil, err } var service *api.Service s.store.View(func(tx store.ReadTx) { service = store.GetService(tx, request.ServiceID) }) if service == nil { return nil, grpc.Errorf(codes.NotFound, "service %s not found", request.ServiceID) } if request.Spec.Endpoint != nil && !reflect.DeepEqual(request.Spec.Endpoint, service.Spec.Endpoint) { if err := s.checkPortConflicts(request.Spec, request.ServiceID); err != nil { return nil, err } } err := s.store.Update(func(tx store.Tx) error { service = store.GetService(tx, request.ServiceID) if service == nil { return grpc.Errorf(codes.NotFound, "service %s not found", request.ServiceID) } // temporary disable network update requestSpecNetworks := request.Spec.Task.Networks if len(requestSpecNetworks) == 0 { requestSpecNetworks = request.Spec.Networks } specNetworks := service.Spec.Task.Networks if len(specNetworks) == 0 { specNetworks = service.Spec.Networks } if !reflect.DeepEqual(requestSpecNetworks, specNetworks) { return grpc.Errorf(codes.Unimplemented, errNetworkUpdateNotSupported.Error()) } // Check to see if all the secrets being added exist as objects // in our datastore err := s.checkSecretExistence(tx, request.Spec) if err != nil { return err } // orchestrator is designed to be stateless, so it should not deal // with service mode change (comparing current config with previous config). // proper way to change service mode is to delete and re-add. if reflect.TypeOf(service.Spec.Mode) != reflect.TypeOf(request.Spec.Mode) { return grpc.Errorf(codes.Unimplemented, errModeChangeNotAllowed.Error()) } if service.Spec.Annotations.Name != request.Spec.Annotations.Name { return grpc.Errorf(codes.Unimplemented, errRenameNotSupported.Error()) } service.Meta.Version = *request.ServiceVersion service.PreviousSpec = service.Spec.Copy() service.Spec = *request.Spec.Copy() // Reset update status service.UpdateStatus = nil return store.UpdateService(tx, service) }) if err != nil { return nil, err } return &api.UpdateServiceResponse{ Service: service, }, nil }
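// Hypothetical client-side sketch, not part of this package, showing the
// read-modify-write cycle the handler above expects: ServiceVersion must be
// the Meta.Version read back from the store (here via GetService), otherwise
// the update is rejected as stale. It assumes the generated Control API client
// (api.ControlClient); updateServiceImage is an illustrative name.
package controlexample

import (
	"context"
	"errors"

	"github.com/docker/swarmkit/api"
)

// updateServiceImage reads the current service, changes the container image
// on a copy of its spec, and submits the update with the version it read.
func updateServiceImage(ctx context.Context, c api.ControlClient, serviceID, image string) (*api.Service, error) {
	getResp, err := c.GetService(ctx, &api.GetServiceRequest{ServiceID: serviceID})
	if err != nil {
		return nil, err
	}
	service := getResp.Service

	spec := service.Spec.Copy()
	container := spec.Task.GetContainer()
	if container == nil {
		return nil, errors.New("service does not run a container task")
	}
	container.Image = image

	updResp, err := c.UpdateService(ctx, &api.UpdateServiceRequest{
		ServiceID:      service.ID,
		ServiceVersion: &service.Meta.Version,
		Spec:           spec,
	})
	if err != nil {
		return nil, err
	}
	return updResp.Service, nil
}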