Example #1
func restartPolicyFromGRPC(p *swarmapi.RestartPolicy) *types.RestartPolicy {
	var rp *types.RestartPolicy
	if p != nil {
		rp = &types.RestartPolicy{}

		switch p.Condition {
		case swarmapi.RestartOnNone:
			rp.Condition = types.RestartPolicyConditionNone
		case swarmapi.RestartOnFailure:
			rp.Condition = types.RestartPolicyConditionOnFailure
		case swarmapi.RestartOnAny:
			rp.Condition = types.RestartPolicyConditionAny
		default:
			rp.Condition = types.RestartPolicyConditionAny
		}

		if p.Delay != nil {
			delay, _ := ptypes.Duration(p.Delay)
			rp.Delay = &delay
		}
		if p.Window != nil {
			window, _ := ptypes.Duration(p.Window)
			rp.Window = &window
		}

		rp.MaxAttempts = &p.MaxAttempts
	}
	return rp
}
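Every snippet in this collection hinges on the same helper: ptypes.Duration, which turns a protobuf Duration message into a native time.Duration. For reference, here is a minimal, self-contained sketch of that round trip written against the standard github.com/golang/protobuf/ptypes package; the vendored ptypes packages used in the examples expose the same Duration/DurationProto shape, though their import paths differ.

package main

import (
	"fmt"
	"time"

	"github.com/golang/protobuf/ptypes"
)

func main() {
	// time.Duration -> protobuf Duration message.
	pb := ptypes.DurationProto(90 * time.Second)

	// protobuf Duration message -> time.Duration. The error reports a nil,
	// out-of-range, or overflowing value.
	d, err := ptypes.Duration(pb)
	if err != nil {
		fmt.Println("invalid duration:", err)
		return
	}
	fmt.Println(d) // 1m30s
}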
Example #2
func validateRestartPolicy(rp *api.RestartPolicy) error {
	if rp == nil {
		return nil
	}

	if rp.Delay != nil {
		delay, err := ptypes.Duration(rp.Delay)
		if err != nil {
			return err
		}
		if delay < 0 {
			return grpc.Errorf(codes.InvalidArgument, "TaskSpec: restart-delay cannot be negative")
		}
	}

	if rp.Window != nil {
		win, err := ptypes.Duration(rp.Window)
		if err != nil {
			return err
		}
		if win < 0 {
			return grpc.Errorf(codes.InvalidArgument, "TaskSpec: restart-window cannot be negative")
		}
	}

	return nil
}
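A quick way to exercise a validator like this is to hand it a policy with a negative delay. The test sketch below is hypothetical: it assumes it lives in the same package as validateRestartPolicy (with testing and time imported), that api.RestartPolicy.Delay is the protobuf Duration type used above, and that the vendored ptypes package also provides a DurationProto helper for building one from a time.Duration.

func TestValidateRestartPolicyNegativeDelay(t *testing.T) {
	// Assumption: DurationProto mirrors the standard ptypes helper of the
	// same name and produces a *Duration message from a time.Duration.
	rp := &api.RestartPolicy{
		Delay: ptypes.DurationProto(-1 * time.Second),
	}
	if err := validateRestartPolicy(rp); err == nil {
		t.Fatal("expected an InvalidArgument error for a negative restart delay")
	}
}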
Example #3
func printClusterSummary(cluster *api.Cluster) {
	w := tabwriter.NewWriter(os.Stdout, 8, 8, 8, ' ', 0)
	defer w.Flush()

	common.FprintfIfNotEmpty(w, "ID\t: %s\n", cluster.ID)
	common.FprintfIfNotEmpty(w, "Name\t: %s\n", cluster.Spec.Annotations.Name)
	fmt.Fprintf(w, "Orchestration settings:\n")
	fmt.Fprintf(w, "  Task history entries: %d\n", cluster.Spec.Orchestration.TaskHistoryRetentionLimit)

	heartbeatPeriod, err := ptypes.Duration(cluster.Spec.Dispatcher.HeartbeatPeriod)
	if err == nil {
		fmt.Fprintf(w, "Dispatcher settings:\n")
		fmt.Fprintf(w, "  Dispatcher heartbeat period: %s\n", heartbeatPeriod.String())
	}

	fmt.Fprintf(w, "Certificate Authority settings:\n")
	if cluster.Spec.CAConfig.NodeCertExpiry != nil {
		clusterDuration, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
		if err != nil {
			fmt.Fprintf(w, "  Certificate Validity Duration: [ERROR PARSING DURATION]\n")
		} else {
			fmt.Fprintf(w, "  Certificate Validity Duration: %s\n", clusterDuration.String())
		}
	}
	if len(cluster.Spec.CAConfig.ExternalCAs) > 0 {
		fmt.Fprintf(w, "  External CAs:\n")
		for _, ca := range cluster.Spec.CAConfig.ExternalCAs {
			fmt.Fprintf(w, "    %s: %s\n", ca.Protocol, ca.URL)
		}
	}

	fmt.Fprintln(w, "  Join Tokens:")
	fmt.Fprintln(w, "    Worker:", cluster.RootCA.JoinTokens.Worker)
	fmt.Fprintln(w, "    Manager:", cluster.RootCA.JoinTokens.Manager)

	if cluster.Spec.TaskDefaults.LogDriver != nil {
		fmt.Fprintf(w, "Default Log Driver\t: %s\n", cluster.Spec.TaskDefaults.LogDriver.Name)
		var keys []string

		if len(cluster.Spec.TaskDefaults.LogDriver.Options) != 0 {
			for k := range cluster.Spec.TaskDefaults.LogDriver.Options {
				keys = append(keys, k)
			}
			sort.Strings(keys)

			for _, k := range keys {
				v := cluster.Spec.TaskDefaults.LogDriver.Options[k]
				if v != "" {
					fmt.Fprintf(w, "  %s\t: %s\n", k, v)
				} else {
					fmt.Fprintf(w, "  %s\t\n", k)

				}
			}
		}
	}
}
Example #4
func healthConfigFromGRPC(h *swarmapi.HealthConfig) *container.HealthConfig {
	interval, _ := ptypes.Duration(h.Interval)
	timeout, _ := ptypes.Duration(h.Timeout)
	return &container.HealthConfig{
		Test:     h.Test,
		Interval: interval,
		Timeout:  timeout,
		Retries:  int(h.Retries),
	}
}
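Note that this converter dereferences h unconditionally, so the nil check has to happen at the call site; Example #17 below does exactly that with an "if c.Healthcheck != nil" guard before calling it. If the guard is preferred inside the converter instead, a thin wrapper along these lines would do (the wrapper name is made up for illustration):

// healthConfigFromGRPCSafe is a hypothetical nil-tolerant wrapper around the
// converter above; it short-circuits when no health check is configured.
func healthConfigFromGRPCSafe(h *swarmapi.HealthConfig) *container.HealthConfig {
	if h == nil {
		return nil
	}
	return healthConfigFromGRPC(h)
}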
Example #5
// SwarmFromGRPC converts a grpc Cluster to a Swarm.
func SwarmFromGRPC(c swarmapi.Cluster) types.Swarm {
	swarm := types.Swarm{
		ID: c.ID,
		Spec: types.Spec{
			Orchestration: types.OrchestrationConfig{
				TaskHistoryRetentionLimit: c.Spec.Orchestration.TaskHistoryRetentionLimit,
			},
			Raft: types.RaftConfig{
				SnapshotInterval:           c.Spec.Raft.SnapshotInterval,
				KeepOldSnapshots:           c.Spec.Raft.KeepOldSnapshots,
				LogEntriesForSlowFollowers: c.Spec.Raft.LogEntriesForSlowFollowers,
				HeartbeatTick:              c.Spec.Raft.HeartbeatTick,
				ElectionTick:               c.Spec.Raft.ElectionTick,
			},
		},
	}

	heartbeatPeriod, _ := ptypes.Duration(c.Spec.Dispatcher.HeartbeatPeriod)
	swarm.Spec.Dispatcher.HeartbeatPeriod = uint64(heartbeatPeriod)

	swarm.Spec.CAConfig.NodeCertExpiry, _ = ptypes.Duration(c.Spec.CAConfig.NodeCertExpiry)

	for _, ca := range c.Spec.CAConfig.ExternalCAs {
		swarm.Spec.CAConfig.ExternalCAs = append(swarm.Spec.CAConfig.ExternalCAs, &types.ExternalCA{
			Protocol: types.ExternalCAProtocol(strings.ToLower(ca.Protocol.String())),
			URL:      ca.URL,
			Options:  ca.Options,
		})
	}

	// Meta
	swarm.Version.Index = c.Meta.Version.Index
	swarm.CreatedAt, _ = ptypes.Timestamp(c.Meta.CreatedAt)
	swarm.UpdatedAt, _ = ptypes.Timestamp(c.Meta.UpdatedAt)

	// Annotations
	swarm.Spec.Name = c.Spec.Annotations.Name
	swarm.Spec.Labels = c.Spec.Annotations.Labels

	for _, policy := range c.Spec.AcceptancePolicy.Policies {
		p := types.Policy{
			Role:       types.NodeRole(strings.ToLower(policy.Role.String())),
			Autoaccept: policy.Autoaccept,
		}
		if policy.Secret != nil {
			secret := string(policy.Secret.Data)
			p.Secret = &secret
		}
		swarm.Spec.AcceptancePolicy.Policies = append(swarm.Spec.AcceptancePolicy.Policies, p)
	}

	return swarm
}
Example #6
func (c *containerConfig) healthcheck() *enginecontainer.HealthConfig {
	hcSpec := c.spec().Healthcheck
	if hcSpec == nil {
		return nil
	}
	interval, _ := ptypes.Duration(hcSpec.Interval)
	timeout, _ := ptypes.Duration(hcSpec.Timeout)
	return &enginecontainer.HealthConfig{
		Test:     hcSpec.Test,
		Interval: interval,
		Timeout:  timeout,
		Retries:  int(hcSpec.Retries),
	}
}
Example #7
// SwarmFromGRPC converts a grpc Cluster to a Swarm.
func SwarmFromGRPC(c swarmapi.Cluster) types.Swarm {
	swarm := types.Swarm{
		ClusterInfo: types.ClusterInfo{
			ID: c.ID,
			Spec: types.Spec{
				Orchestration: types.OrchestrationConfig{
					TaskHistoryRetentionLimit: &c.Spec.Orchestration.TaskHistoryRetentionLimit,
				},
				Raft: types.RaftConfig{
					SnapshotInterval:           c.Spec.Raft.SnapshotInterval,
					KeepOldSnapshots:           &c.Spec.Raft.KeepOldSnapshots,
					LogEntriesForSlowFollowers: c.Spec.Raft.LogEntriesForSlowFollowers,
					HeartbeatTick:              int(c.Spec.Raft.HeartbeatTick),
					ElectionTick:               int(c.Spec.Raft.ElectionTick),
				},
				EncryptionConfig: types.EncryptionConfig{
					AutoLockManagers: c.Spec.EncryptionConfig.AutoLockManagers,
				},
			},
		},
		JoinTokens: types.JoinTokens{
			Worker:  c.RootCA.JoinTokens.Worker,
			Manager: c.RootCA.JoinTokens.Manager,
		},
	}

	heartbeatPeriod, _ := ptypes.Duration(c.Spec.Dispatcher.HeartbeatPeriod)
	swarm.Spec.Dispatcher.HeartbeatPeriod = heartbeatPeriod

	swarm.Spec.CAConfig.NodeCertExpiry, _ = ptypes.Duration(c.Spec.CAConfig.NodeCertExpiry)

	for _, ca := range c.Spec.CAConfig.ExternalCAs {
		swarm.Spec.CAConfig.ExternalCAs = append(swarm.Spec.CAConfig.ExternalCAs, &types.ExternalCA{
			Protocol: types.ExternalCAProtocol(strings.ToLower(ca.Protocol.String())),
			URL:      ca.URL,
			Options:  ca.Options,
		})
	}

	// Meta
	swarm.Version.Index = c.Meta.Version.Index
	swarm.CreatedAt, _ = ptypes.Timestamp(c.Meta.CreatedAt)
	swarm.UpdatedAt, _ = ptypes.Timestamp(c.Meta.UpdatedAt)

	// Annotations
	swarm.Spec.Name = c.Spec.Annotations.Name
	swarm.Spec.Labels = c.Spec.Annotations.Labels

	return swarm
}
Example #8
func validateClusterSpec(spec *api.ClusterSpec) error {
	if spec == nil {
		return grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error())
	}

	// Validate that the duration being provided is valid and over our minimum
	if spec.CAConfig.NodeCertExpiry != nil {
		expiry, err := ptypes.Duration(spec.CAConfig.NodeCertExpiry)
		if err != nil {
			return grpc.Errorf(codes.InvalidArgument, errInvalidArgument.Error())
		}
		if expiry < ca.MinNodeCertExpiration {
			return grpc.Errorf(codes.InvalidArgument, "minimum certificate expiry time is: %s", ca.MinNodeCertExpiration)
		}
	}

	// Validate that AcceptancePolicies only include Secrets that are bcrypted
	// TODO(diogo): Add a global list of acceptance algorithms. We only support bcrypt for now.
	if len(spec.AcceptancePolicy.Policies) > 0 {
		for _, policy := range spec.AcceptancePolicy.Policies {
			if policy.Secret != nil && strings.ToLower(policy.Secret.Alg) != "bcrypt" {
				return grpc.Errorf(codes.InvalidArgument, "hashing algorithm is not supported: %s", policy.Secret.Alg)
			}
		}
	}

	return nil
}
Example #9
func printClusterSummary(cluster *api.Cluster) {
	w := tabwriter.NewWriter(os.Stdout, 8, 8, 8, ' ', 0)
	defer w.Flush()

	common.FprintfIfNotEmpty(w, "ID\t: %s\n", cluster.ID)
	common.FprintfIfNotEmpty(w, "Name\t: %s\n", cluster.Spec.Annotations.Name)
	if len(cluster.Spec.AcceptancePolicy.Policies) > 0 {
		fmt.Fprintf(w, "Acceptance Policies:\n")
		for _, policy := range cluster.Spec.AcceptancePolicy.Policies {
			fmt.Fprintf(w, "  Role\t: %v\n", policy.Role)
			fmt.Fprintf(w, "    Autoaccept\t: %v\n", policy.Autoaccept)
			if policy.Secret != nil {
				fmt.Fprintf(w, "    Secret\t: %v\n", string(policy.Secret.Data))
			}
		}
	}
	fmt.Fprintf(w, "Orchestration settings:\n")
	fmt.Fprintf(w, "  Task history entries: %d\n", cluster.Spec.Orchestration.TaskHistoryRetentionLimit)
	fmt.Fprintf(w, "Dispatcher settings:\n")
	fmt.Fprintf(w, "  Dispatcher heartbeat period: %d\n", cluster.Spec.Dispatcher.HeartbeatPeriod)
	if cluster.Spec.CAConfig.NodeCertExpiry != nil {
		fmt.Fprintf(w, "Certificate Authority settings:\n")
		clusterDuration, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
		if err != nil {
			fmt.Fprintf(w, "  Certificate Validity Duration: [ERROR PARSING DURATION]\n")
		} else {
			fmt.Fprintf(w, "  Certificate Validity Duration: %s\n", clusterDuration.String())
		}
	}
}
Example #10
// updateCluster is called when there are cluster changes, and it ensures that the local RootCA is
// always aware of changes in clusterExpiry and the Root CA key material
func (s *Server) updateCluster(ctx context.Context, cluster *api.Cluster) {
	s.mu.Lock()
	s.joinTokens = cluster.RootCA.JoinTokens.Copy()
	s.mu.Unlock()
	var err error

	// If the cluster has a RootCA, let's try to update our SecurityConfig to reflect the latest values
	rCA := cluster.RootCA
	if len(rCA.CACert) != 0 && len(rCA.CAKey) != 0 {
		expiry := DefaultNodeCertExpiration
		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
			// NodeCertExpiry exists, let's try to parse the duration out of it
			clusterExpiry, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
			if err != nil {
				log.G(ctx).WithFields(logrus.Fields{
					"cluster.id": cluster.ID,
					"method":     "(*Server).updateCluster",
				}).WithError(err).Warn("failed to parse certificate expiration, using default")
			} else {
				// We were able to successfully parse the expiration out of the cluster.
				expiry = clusterExpiry
			}
		} else {
			// NodeCertExpiry seems to be nil
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).WithError(err).Warn("failed to parse certificate expiration, using default")

		}
		// Attempt to update our local RootCA with the new parameters
		err = s.securityConfig.UpdateRootCA(rCA.CACert, rCA.CAKey, expiry)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).WithError(err).Error("updating Root CA failed")
		} else {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Debugf("Root CA updated successfully")
		}
	}

	// Update our security config with the list of External CA URLs
	// from the new cluster state.

	// TODO(aaronl): In the future, this will be abstracted with an
	// ExternalCA interface that has different implementations for
	// different CA types. At the moment, only CFSSL is supported.
	var cfsslURLs []string
	for _, ca := range cluster.Spec.CAConfig.ExternalCAs {
		if ca.Protocol == api.ExternalCA_CAProtocolCFSSL {
			cfsslURLs = append(cfsslURLs, ca.URL)
		}
	}

	s.securityConfig.externalCA.UpdateURLs(cfsslURLs...)
}
Example #11
func (s *session) heartbeat(ctx context.Context) error {
	log.G(ctx).Debugf("(*session).heartbeat")
	client := api.NewDispatcherClient(s.conn)
	heartbeat := time.NewTimer(1) // send out a heartbeat right away
	defer heartbeat.Stop()

	for {
		select {
		case <-heartbeat.C:
			heartbeatCtx, cancel := context.WithTimeout(ctx, dispatcherRPCTimeout)
			resp, err := client.Heartbeat(heartbeatCtx, &api.HeartbeatRequest{
				SessionID: s.sessionID,
			})
			cancel()
			if err != nil {
				if grpc.Code(err) == codes.NotFound {
					err = errNodeNotRegistered
				}

				return err
			}

			period, err := ptypes.Duration(&resp.Period)
			if err != nil {
				return err
			}

			heartbeat.Reset(period)
		case <-s.closed:
			return errSessionClosed
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
Example #12
func (u *Updater) worker(ctx context.Context, queue <-chan *api.Task) {
	for t := range queue {
		updated := newTask(u.cluster, u.newService, t.Slot)
		updated.DesiredState = api.TaskStateReady
		if isGlobalService(u.newService) {
			updated.NodeID = t.NodeID
		}

		if err := u.updateTask(ctx, t, updated); err != nil {
			log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("update failed")
		}

		if u.newService.Spec.Update != nil && (u.newService.Spec.Update.Delay.Seconds != 0 || u.newService.Spec.Update.Delay.Nanos != 0) {
			delay, err := ptypes.Duration(&u.newService.Spec.Update.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid update delay")
				continue
			}
			select {
			case <-time.After(delay):
			case <-u.stopChan:
				return
			}
		}
	}
}
Example #13
func restartPolicyFromGRPC(p *swarmapi.RestartPolicy) *types.RestartPolicy {
	var rp *types.RestartPolicy
	if p != nil {
		rp = &types.RestartPolicy{}
		rp.Condition = types.RestartPolicyCondition(strings.ToLower(p.Condition.String()))
		if p.Delay != nil {
			delay, _ := ptypes.Duration(p.Delay)
			rp.Delay = &delay
		}
		if p.Window != nil {
			window, _ := ptypes.Duration(p.Window)
			rp.Window = &window
		}

		rp.MaxAttempts = &p.MaxAttempts
	}
	return rp
}
Example #14
func (c *containerAdapter) shutdown(ctx context.Context) error {
	// Default stop grace period to 10s.
	stopgrace := 10 * time.Second
	spec := c.container.spec()
	if spec.StopGracePeriod != nil {
		stopgrace, _ = ptypes.Duration(spec.StopGracePeriod)
	}
	return c.client.ContainerStop(ctx, c.container.name(), &stopgrace)
}
Example #15
// ServiceFromGRPC converts a grpc Service to a Service.
func ServiceFromGRPC(s swarmapi.Service) types.Service {
	spec := s.Spec
	containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container

	networks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
	for _, n := range spec.Networks {
		networks = append(networks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}
	service := types.Service{
		ID: s.ID,

		Spec: types.ServiceSpec{
			TaskTemplate: types.TaskSpec{
				ContainerSpec: containerSpecFromGRPC(containerConfig),
				Resources:     resourcesFromGRPC(s.Spec.Task.Resources),
				RestartPolicy: restartPolicyFromGRPC(s.Spec.Task.Restart),
				Placement:     placementFromGRPC(s.Spec.Task.Placement),
				LogDriver:     driverFromGRPC(s.Spec.Task.LogDriver),
			},

			Networks:     networks,
			EndpointSpec: endpointSpecFromGRPC(s.Spec.Endpoint),
		},
		Endpoint: endpointFromGRPC(s.Endpoint),
	}

	// Meta
	service.Version.Index = s.Meta.Version.Index
	service.CreatedAt, _ = ptypes.Timestamp(s.Meta.CreatedAt)
	service.UpdatedAt, _ = ptypes.Timestamp(s.Meta.UpdatedAt)

	// Annotations
	service.Spec.Name = s.Spec.Annotations.Name
	service.Spec.Labels = s.Spec.Annotations.Labels

	// UpdateConfig
	if s.Spec.Update != nil {
		service.Spec.UpdateConfig = &types.UpdateConfig{
			Parallelism: s.Spec.Update.Parallelism,
		}

		service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)
	}

	// Mode
	switch t := s.Spec.GetMode().(type) {
	case *swarmapi.ServiceSpec_Global:
		service.Spec.Mode.Global = &types.GlobalService{}
	case *swarmapi.ServiceSpec_Replicated:
		service.Spec.Mode.Replicated = &types.ReplicatedService{
			Replicas: &t.Replicated.Replicas,
		}
	}

	return service
}
Example #16
func (u *Updater) worker(ctx context.Context, queue <-chan slot) {
	for slot := range queue {
		// Do we have a task with the new spec in desired state = RUNNING?
		// If so, all we have to do to complete the update is remove the
		// other tasks. Or if we have a task with the new spec that has
		// desired state < RUNNING, advance it to running and remove the
		// other tasks.
		var (
			runningTask *api.Task
			cleanTask   *api.Task
		)
		for _, t := range slot {
			if !u.isTaskDirty(t) {
				if t.DesiredState == api.TaskStateRunning {
					runningTask = t
					break
				}
				if t.DesiredState < api.TaskStateRunning {
					cleanTask = t
				}
			}
		}
		if runningTask != nil {
			if err := u.useExistingTask(ctx, slot, runningTask); err != nil {
				log.G(ctx).WithError(err).Error("update failed")
			}
		} else if cleanTask != nil {
			if err := u.useExistingTask(ctx, slot, cleanTask); err != nil {
				log.G(ctx).WithError(err).Error("update failed")
			}
		} else {
			updated := newTask(u.cluster, u.newService, slot[0].Slot)
			updated.DesiredState = api.TaskStateReady
			if isGlobalService(u.newService) {
				updated.NodeID = slot[0].NodeID
			}

			if err := u.updateTask(ctx, slot, updated); err != nil {
				log.G(ctx).WithError(err).WithField("task.id", updated.ID).Error("update failed")
			}
		}

		if u.newService.Spec.Update != nil && (u.newService.Spec.Update.Delay.Seconds != 0 || u.newService.Spec.Update.Delay.Nanos != 0) {
			delay, err := ptypes.Duration(&u.newService.Spec.Update.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid update delay")
				continue
			}
			select {
			case <-time.After(delay):
			case <-u.stopChan:
				return
			}
		}
	}
}
Example #17
func containerSpecFromGRPC(c *swarmapi.ContainerSpec) types.ContainerSpec {
	containerSpec := types.ContainerSpec{
		Image:    c.Image,
		Labels:   c.Labels,
		Command:  c.Command,
		Args:     c.Args,
		Hostname: c.Hostname,
		Env:      c.Env,
		Dir:      c.Dir,
		User:     c.User,
		Groups:   c.Groups,
		TTY:      c.TTY,
	}

	// Mounts
	for _, m := range c.Mounts {
		mount := mounttypes.Mount{
			Target:   m.Target,
			Source:   m.Source,
			Type:     mounttypes.Type(strings.ToLower(swarmapi.Mount_MountType_name[int32(m.Type)])),
			ReadOnly: m.ReadOnly,
		}

		if m.BindOptions != nil {
			mount.BindOptions = &mounttypes.BindOptions{
				Propagation: mounttypes.Propagation(strings.ToLower(swarmapi.Mount_BindOptions_MountPropagation_name[int32(m.BindOptions.Propagation)])),
			}
		}

		if m.VolumeOptions != nil {
			mount.VolumeOptions = &mounttypes.VolumeOptions{
				NoCopy: m.VolumeOptions.NoCopy,
				Labels: m.VolumeOptions.Labels,
			}
			if m.VolumeOptions.DriverConfig != nil {
				mount.VolumeOptions.DriverConfig = &mounttypes.Driver{
					Name:    m.VolumeOptions.DriverConfig.Name,
					Options: m.VolumeOptions.DriverConfig.Options,
				}
			}
		}
		containerSpec.Mounts = append(containerSpec.Mounts, mount)
	}

	if c.StopGracePeriod != nil {
		grace, _ := ptypes.Duration(c.StopGracePeriod)
		containerSpec.StopGracePeriod = &grace
	}

	if c.Healthcheck != nil {
		containerSpec.Healthcheck = healthConfigFromGRPC(c.Healthcheck)
	}

	return containerSpec
}
Example #18
func validateUpdate(uc *api.UpdateConfig) error {
	if uc == nil {
		return nil
	}

	delay, err := ptypes.Duration(&uc.Delay)
	if err != nil {
		return err
	}

	if delay < 0 {
		return grpc.Errorf(codes.InvalidArgument, "TaskSpec: update-delay cannot be negative")
	}

	return nil
}
Example #19
func containerSpecFromGRPC(c *swarmapi.ContainerSpec) types.ContainerSpec {
	containerSpec := types.ContainerSpec{
		Image:   c.Image,
		Labels:  c.Labels,
		Command: c.Command,
		Args:    c.Args,
		Env:     c.Env,
		Dir:     c.Dir,
		User:    c.User,
	}

	// Mounts
	for _, m := range c.Mounts {
		mount := types.Mount{
			Target:   m.Target,
			Source:   m.Source,
			Type:     types.MountType(strings.ToLower(swarmapi.Mount_MountType_name[int32(m.Type)])),
			Writable: m.Writable,
		}

		if m.BindOptions != nil {
			mount.BindOptions = &types.BindOptions{
				Propagation: types.MountPropagation(strings.ToLower(swarmapi.Mount_BindOptions_MountPropagation_name[int32(m.BindOptions.Propagation)])),
			}
		}

		if m.VolumeOptions != nil {
			mount.VolumeOptions = &types.VolumeOptions{
				Populate: m.VolumeOptions.Populate,
				Labels:   m.VolumeOptions.Labels,
			}
			if m.VolumeOptions.DriverConfig != nil {
				mount.VolumeOptions.DriverConfig = &types.Driver{
					Name:    m.VolumeOptions.DriverConfig.Name,
					Options: m.VolumeOptions.DriverConfig.Options,
				}
			}
		}
		containerSpec.Mounts = append(containerSpec.Mounts, mount)
	}

	if c.StopGracePeriod != nil {
		grace, _ := ptypes.Duration(c.StopGracePeriod)
		containerSpec.StopGracePeriod = &grace
	}
	return containerSpec
}
Example #20
// updateCluster is called when there are cluster changes, and it ensures that the local RootCA is
// always aware of changes in clusterExpiry and the Root CA key material
func (s *Server) updateCluster(ctx context.Context, cluster *api.Cluster) {
	s.mu.Lock()
	s.acceptancePolicy = cluster.Spec.AcceptancePolicy.Copy()
	s.mu.Unlock()
	var err error

	// If the cluster has a RootCA, let's try to update our SecurityConfig to reflect the latest values
	rCA := cluster.RootCA
	if len(rCA.CACert) != 0 && len(rCA.CAKey) != 0 {
		expiry := DefaultNodeCertExpiration
		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
			// NodeCertExpiry exists, let's try to parse the duration out of it
			clusterExpiry, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
			if err != nil {
				log.G(ctx).WithFields(logrus.Fields{
					"cluster.id": cluster.ID,
					"method":     "(*Server).updateCluster",
				}).WithError(err).Warn("failed to parse certificate expiration, using default")
			} else {
				// We were able to successfully parse the expiration out of the cluster.
				expiry = clusterExpiry
			}
		} else {
			// NodeCertExpiry seems to be nil
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).WithError(err).Warn("failed to parse certificate expiration, using default")

		}
		// Attempt to update our local RootCA with the new parameters
		err = s.securityConfig.UpdateRootCA(rCA.CACert, rCA.CAKey, expiry)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).WithError(err).Error("updating Root CA failed")
		} else {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Debugf("Root CA updated successfully")
		}
	}
}
Example #21
func serviceSpecFromGRPC(spec *swarmapi.ServiceSpec) *types.ServiceSpec {
	if spec == nil {
		return nil
	}

	serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
	for _, n := range spec.Networks {
		serviceNetworks = append(serviceNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Task.Networks))
	for _, n := range spec.Task.Networks {
		taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container
	convertedSpec := &types.ServiceSpec{
		Annotations: types.Annotations{
			Name:   spec.Annotations.Name,
			Labels: spec.Annotations.Labels,
		},

		TaskTemplate: types.TaskSpec{
			ContainerSpec: containerSpecFromGRPC(containerConfig),
			Resources:     resourcesFromGRPC(spec.Task.Resources),
			RestartPolicy: restartPolicyFromGRPC(spec.Task.Restart),
			Placement:     placementFromGRPC(spec.Task.Placement),
			LogDriver:     driverFromGRPC(spec.Task.LogDriver),
			Networks:      taskNetworks,
			ForceUpdate:   spec.Task.ForceUpdate,
		},

		Networks:     serviceNetworks,
		EndpointSpec: endpointSpecFromGRPC(spec.Endpoint),
	}

	// UpdateConfig
	if spec.Update != nil {
		convertedSpec.UpdateConfig = &types.UpdateConfig{
			Parallelism:     spec.Update.Parallelism,
			MaxFailureRatio: spec.Update.MaxFailureRatio,
		}

		convertedSpec.UpdateConfig.Delay, _ = ptypes.Duration(&spec.Update.Delay)
		if spec.Update.Monitor != nil {
			convertedSpec.UpdateConfig.Monitor, _ = ptypes.Duration(spec.Update.Monitor)
		}

		switch spec.Update.FailureAction {
		case swarmapi.UpdateConfig_PAUSE:
			convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
		case swarmapi.UpdateConfig_CONTINUE:
			convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
		}
	}

	// Mode
	switch t := spec.GetMode().(type) {
	case *swarmapi.ServiceSpec_Global:
		convertedSpec.Mode.Global = &types.GlobalService{}
	case *swarmapi.ServiceSpec_Replicated:
		convertedSpec.Mode.Replicated = &types.ReplicatedService{
			Replicas: &t.Replicated.Replicas,
		}
	}

	return convertedSpec
}
Example #22
// ServiceFromGRPC converts a grpc Service to a Service.
func ServiceFromGRPC(s swarmapi.Service) types.Service {
	spec := s.Spec
	containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container

	serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
	for _, n := range spec.Networks {
		serviceNetworks = append(serviceNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Task.Networks))
	for _, n := range spec.Task.Networks {
		taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
	}

	service := types.Service{
		ID: s.ID,

		Spec: types.ServiceSpec{
			TaskTemplate: types.TaskSpec{
				ContainerSpec: containerSpecFromGRPC(containerConfig),
				Resources:     resourcesFromGRPC(s.Spec.Task.Resources),
				RestartPolicy: restartPolicyFromGRPC(s.Spec.Task.Restart),
				Placement:     placementFromGRPC(s.Spec.Task.Placement),
				LogDriver:     driverFromGRPC(s.Spec.Task.LogDriver),
				Networks:      taskNetworks,
			},

			Networks:     serviceNetworks,
			EndpointSpec: endpointSpecFromGRPC(s.Spec.Endpoint),
		},
		Endpoint: endpointFromGRPC(s.Endpoint),
	}

	// Meta
	service.Version.Index = s.Meta.Version.Index
	service.CreatedAt, _ = ptypes.Timestamp(s.Meta.CreatedAt)
	service.UpdatedAt, _ = ptypes.Timestamp(s.Meta.UpdatedAt)

	// Annotations
	service.Spec.Name = s.Spec.Annotations.Name
	service.Spec.Labels = s.Spec.Annotations.Labels

	// UpdateConfig
	if s.Spec.Update != nil {
		service.Spec.UpdateConfig = &types.UpdateConfig{
			Parallelism: s.Spec.Update.Parallelism,
		}

		service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)

		switch s.Spec.Update.FailureAction {
		case swarmapi.UpdateConfig_PAUSE:
			service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
		case swarmapi.UpdateConfig_CONTINUE:
			service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
		}
	}

	// Mode
	switch t := s.Spec.GetMode().(type) {
	case *swarmapi.ServiceSpec_Global:
		service.Spec.Mode.Global = &types.GlobalService{}
	case *swarmapi.ServiceSpec_Replicated:
		service.Spec.Mode.Replicated = &types.ReplicatedService{
			Replicas: &t.Replicated.Replicas,
		}
	}

	// UpdateStatus
	service.UpdateStatus = types.UpdateStatus{}
	if s.UpdateStatus != nil {
		switch s.UpdateStatus.State {
		case swarmapi.UpdateStatus_UPDATING:
			service.UpdateStatus.State = types.UpdateStateUpdating
		case swarmapi.UpdateStatus_PAUSED:
			service.UpdateStatus.State = types.UpdateStatePaused
		case swarmapi.UpdateStatus_COMPLETED:
			service.UpdateStatus.State = types.UpdateStateCompleted
		}

		service.UpdateStatus.StartedAt, _ = ptypes.Timestamp(s.UpdateStatus.StartedAt)
		service.UpdateStatus.CompletedAt, _ = ptypes.Timestamp(s.UpdateStatus.CompletedAt)
		service.UpdateStatus.Message = s.UpdateStatus.Message
	}

	return service
}
Example #23
// Run starts the update and returns only once it's complete or cancelled.
func (u *Updater) Run(ctx context.Context, slots []slot) {
	defer close(u.doneChan)

	service := u.newService

	// If the update is in a PAUSED state, we should not do anything.
	if service.UpdateStatus != nil &&
		(service.UpdateStatus.State == api.UpdateStatus_PAUSED ||
			service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_PAUSED) {
		return
	}

	var dirtySlots []slot
	for _, slot := range slots {
		if u.isSlotDirty(slot) {
			dirtySlots = append(dirtySlots, slot)
		}
	}
	// Abort immediately if all tasks are clean.
	if len(dirtySlots) == 0 {
		if service.UpdateStatus != nil &&
			(service.UpdateStatus.State == api.UpdateStatus_UPDATING ||
				service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED) {
			u.completeUpdate(ctx, service.ID)
		}
		return
	}

	// If there's no update in progress, we are starting one.
	if service.UpdateStatus == nil {
		u.startUpdate(ctx, service.ID)
	}

	parallelism := 0
	if service.Spec.Update != nil {
		parallelism = int(service.Spec.Update.Parallelism)
	}
	if parallelism == 0 {
		// TODO(aluzzardi): We could try to optimize unlimited parallelism by performing updates in a single
		// goroutine using a batch transaction.
		parallelism = len(dirtySlots)
	}

	// Start the workers.
	slotQueue := make(chan slot)
	wg := sync.WaitGroup{}
	wg.Add(parallelism)
	for i := 0; i < parallelism; i++ {
		go func() {
			u.worker(ctx, slotQueue)
			wg.Done()
		}()
	}

	failureAction := api.UpdateConfig_PAUSE
	allowedFailureFraction := float32(0)
	monitoringPeriod := defaultMonitor

	if service.Spec.Update != nil {
		failureAction = service.Spec.Update.FailureAction
		allowedFailureFraction = service.Spec.Update.AllowedFailureFraction

		if service.Spec.Update.Monitor != nil {
			var err error
			monitoringPeriod, err = ptypes.Duration(service.Spec.Update.Monitor)
			if err != nil {
				monitoringPeriod = defaultMonitor
			}
		}
	}

	var failedTaskWatch chan events.Event

	if failureAction != api.UpdateConfig_CONTINUE {
		var cancelWatch func()
		failedTaskWatch, cancelWatch = state.Watch(
			u.store.WatchQueue(),
			state.EventUpdateTask{
				Task:   &api.Task{ServiceID: service.ID, Status: api.TaskStatus{State: api.TaskStateRunning}},
				Checks: []state.TaskCheckFunc{state.TaskCheckServiceID, state.TaskCheckStateGreaterThan},
			},
		)
		defer cancelWatch()
	}

	stopped := false
	failedTasks := make(map[string]struct{})
	totalFailures := 0

	failureTriggersAction := func(failedTask *api.Task) bool {
		// Ignore tasks we have already seen as failures.
		if _, found := failedTasks[failedTask.ID]; found {
			return false
		}

		// If this failed/completed task is one that we
		// created as part of this update, we should
		// follow the failure action.
		u.updatedTasksMu.Lock()
		startedAt, found := u.updatedTasks[failedTask.ID]
		u.updatedTasksMu.Unlock()

		if found && (startedAt.IsZero() || time.Since(startedAt) <= monitoringPeriod) {
			failedTasks[failedTask.ID] = struct{}{}
			totalFailures++
			if float32(totalFailures)/float32(len(dirtySlots)) > allowedFailureFraction {
				switch failureAction {
				case api.UpdateConfig_PAUSE:
					stopped = true
					message := fmt.Sprintf("update paused due to failure or early termination of task %s", failedTask.ID)
					u.pauseUpdate(ctx, service.ID, message)
					return true
				case api.UpdateConfig_ROLLBACK:
					// Never roll back a rollback
					if service.UpdateStatus != nil && service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
						message := fmt.Sprintf("rollback paused due to failure or early termination of task %s", failedTask.ID)
						u.pauseUpdate(ctx, service.ID, message)
						return true
					}
					stopped = true
					message := fmt.Sprintf("update rolled back due to failure or early termination of task %s", failedTask.ID)
					u.rollbackUpdate(ctx, service.ID, message)
					return true
				}
			}
		}

		return false
	}

slotsLoop:
	for _, slot := range dirtySlots {
	retryLoop:
		for {
			// Wait for a worker to pick up the task or abort the update, whichever comes first.
			select {
			case <-u.stopChan:
				stopped = true
				break slotsLoop
			case ev := <-failedTaskWatch:
				if failureTriggersAction(ev.(state.EventUpdateTask).Task) {
					break slotsLoop
				}
			case slotQueue <- slot:
				break retryLoop
			}
		}
	}

	close(slotQueue)
	wg.Wait()

	if !stopped {
		// Keep watching for task failures for one more monitoringPeriod,
		// before declaring the update complete.
		doneMonitoring := time.After(monitoringPeriod)
	monitorLoop:
		for {
			select {
			case <-u.stopChan:
				stopped = true
				break monitorLoop
			case <-doneMonitoring:
				break monitorLoop
			case ev := <-failedTaskWatch:
				if failureTriggersAction(ev.(state.EventUpdateTask).Task) {
					break monitorLoop
				}
			}
		}
	}

	// TODO(aaronl): Potentially roll back the service if not enough tasks
	// have reached RUNNING by this point.

	if !stopped {
		u.completeUpdate(ctx, service.ID)
	}
}
Example #24
// Restart initiates a new task to replace t if appropriate under the service's
// restart policy.
func (r *RestartSupervisor) Restart(ctx context.Context, tx store.Tx, service *api.Service, t api.Task) error {
	// TODO(aluzzardi): This function should not depend on `service`.

	t.DesiredState = api.TaskStateShutdown
	err := store.UpdateTask(tx, &t)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to set task desired state to dead")
		return err
	}

	if !r.shouldRestart(ctx, &t, service) {
		return nil
	}

	var restartTask *api.Task

	if isReplicatedService(service) {
		restartTask = newTask(service, t.Slot)
	} else if isGlobalService(service) {
		restartTask = newTask(service, 0)
		restartTask.NodeID = t.NodeID
	} else {
		log.G(ctx).Error("service not supported by restart supervisor")
		return nil
	}

	n := store.GetNode(tx, t.NodeID)

	restartTask.DesiredState = api.TaskStateAccepted

	var restartDelay time.Duration
	// Restart delay is not applied to drained nodes
	if n == nil || n.Spec.Availability != api.NodeAvailabilityDrain {
		if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
			var err error
			restartDelay, err = ptypes.Duration(t.Spec.Restart.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid restart delay; using default")
				restartDelay = defaultRestartDelay
			}
		} else {
			restartDelay = defaultRestartDelay
		}
	}

	waitStop := true

	// Normally we wait for the old task to stop running, but we skip this
	// if the old task is already dead or the node it's assigned to is down.
	if (n != nil && n.Status.State == api.NodeStatus_DOWN) || t.Status.State > api.TaskStateRunning {
		waitStop = false
	}

	if err := store.CreateTask(tx, restartTask); err != nil {
		log.G(ctx).WithError(err).WithField("task.id", restartTask.ID).Error("task create failed")
		return err
	}

	r.recordRestartHistory(restartTask)

	r.DelayStart(ctx, tx, &t, restartTask.ID, restartDelay, waitStop)
	return nil
}
Example #25
// Run runs dispatcher tasks which should be run on the leader dispatcher.
// The dispatcher can be stopped by cancelling ctx or calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return fmt.Errorf("dispatcher is already running")
	}
	logger := log.G(ctx).WithField("module", "dispatcher")
	ctx = log.WithLogger(ctx, logger)
	if err := d.markNodesUnknown(ctx); err != nil {
		logger.Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := ptypes.Duration(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}
	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	d.mu.Unlock()

	publishManagers := func() {
		mgrs := getWeightedPeers(d.cluster)
		sort.Sort(weightedPeerByNodeID(mgrs))
		d.mu.Lock()
		if reflect.DeepEqual(mgrs, d.lastSeenManagers) {
			d.mu.Unlock()
			return
		}
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	publishManagers()
	publishTicker := time.NewTicker(1 * time.Second)
	defer publishTicker.Stop()

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case <-publishTicker.C:
			publishManagers()
		case <-d.processTaskUpdatesTrigger:
			d.processTaskUpdates()
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processTaskUpdates()
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := ptypes.Duration(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(struct{}{})
		case <-d.ctx.Done():
			return nil
		}
	}
}
Example #26
// Restart initiates a new task to replace t if appropriate under the service's
// restart policy.
func (r *Supervisor) Restart(ctx context.Context, tx store.Tx, cluster *api.Cluster, service *api.Service, t api.Task) error {
	// TODO(aluzzardi): This function should not depend on `service`.

	// Is the old task still in the process of restarting? If so, wait for
	// its restart delay to elapse, to avoid tight restart loops (for
	// example, when the image doesn't exist).
	r.mu.Lock()
	oldDelay, ok := r.delays[t.ID]
	if ok {
		if !oldDelay.waiter {
			oldDelay.waiter = true
			go r.waitRestart(ctx, oldDelay, cluster, t.ID)
		}
		r.mu.Unlock()
		return nil
	}
	r.mu.Unlock()

	// Sanity check: was the task shut down already by a separate call to
	// Restart? If so, we must avoid restarting it, because this will create
	// an extra task. This should never happen unless there is a bug.
	if t.DesiredState > api.TaskStateRunning {
		return errors.New("Restart called on task that was already shut down")
	}

	t.DesiredState = api.TaskStateShutdown
	err := store.UpdateTask(tx, &t)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to set task desired state to dead")
		return err
	}

	if !r.shouldRestart(ctx, &t, service) {
		return nil
	}

	var restartTask *api.Task

	if orchestrator.IsReplicatedService(service) {
		restartTask = orchestrator.NewTask(cluster, service, t.Slot, "")
	} else if orchestrator.IsGlobalService(service) {
		restartTask = orchestrator.NewTask(cluster, service, 0, t.NodeID)
	} else {
		log.G(ctx).Error("service not supported by restart supervisor")
		return nil
	}

	n := store.GetNode(tx, t.NodeID)

	restartTask.DesiredState = api.TaskStateReady

	var restartDelay time.Duration
	// Restart delay is not applied to drained nodes
	if n == nil || n.Spec.Availability != api.NodeAvailabilityDrain {
		if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
			var err error
			restartDelay, err = ptypes.Duration(t.Spec.Restart.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid restart delay; using default")
				restartDelay = orchestrator.DefaultRestartDelay
			}
		} else {
			restartDelay = orchestrator.DefaultRestartDelay
		}
	}

	waitStop := true

	// Normally we wait for the old task to stop running, but we skip this
	// if the old task is already dead or the node it's assigned to is down.
	if (n != nil && n.Status.State == api.NodeStatus_DOWN) || t.Status.State > api.TaskStateRunning {
		waitStop = false
	}

	if err := store.CreateTask(tx, restartTask); err != nil {
		log.G(ctx).WithError(err).WithField("task.id", restartTask.ID).Error("task create failed")
		return err
	}

	r.recordRestartHistory(restartTask)

	r.DelayStart(ctx, tx, &t, restartTask.ID, restartDelay, waitStop)
	return nil
}
Example #27
func (r *Supervisor) shouldRestart(ctx context.Context, t *api.Task, service *api.Service) bool {
	// TODO(aluzzardi): This function should not depend on `service`.

	condition := orchestrator.RestartCondition(t)

	if condition != api.RestartOnAny &&
		(condition != api.RestartOnFailure || t.Status.State == api.TaskStateCompleted) {
		return false
	}

	if t.Spec.Restart == nil || t.Spec.Restart.MaxAttempts == 0 {
		return true
	}

	instanceTuple := instanceTuple{
		instance:  t.Slot,
		serviceID: t.ServiceID,
	}

	// Instance is not meaningful for "global" tasks, so they need to be
	// indexed by NodeID.
	if orchestrator.IsGlobalService(service) {
		instanceTuple.nodeID = t.NodeID
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	restartInfo := r.history[instanceTuple]
	if restartInfo == nil {
		return true
	}

	if t.Spec.Restart.Window == nil || (t.Spec.Restart.Window.Seconds == 0 && t.Spec.Restart.Window.Nanos == 0) {
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}

	if restartInfo.restartedInstances == nil {
		return true
	}

	window, err := ptypes.Duration(t.Spec.Restart.Window)
	if err != nil {
		log.G(ctx).WithError(err).Error("invalid restart lookback window")
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}
	lookback := time.Now().Add(-window)

	var next *list.Element
	for e := restartInfo.restartedInstances.Front(); e != nil; e = next {
		next = e.Next()

		if e.Value.(restartedInstance).timestamp.After(lookback) {
			break
		}
		restartInfo.restartedInstances.Remove(e)
	}

	numRestarts := uint64(restartInfo.restartedInstances.Len())

	if numRestarts == 0 {
		restartInfo.restartedInstances = nil
	}

	return numRestarts < t.Spec.Restart.MaxAttempts
}
Example #28
// Run runs dispatcher tasks which should be run on the leader dispatcher.
// The dispatcher can be stopped by cancelling ctx or calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return fmt.Errorf("dispatcher is already running")
	}
	ctx = log.WithModule(ctx, "dispatcher")
	if err := d.markNodesUnknown(ctx); err != nil {
		log.G(ctx).Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := ptypes.Duration(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}

	peerWatcher, peerCancel := d.cluster.SubscribePeers()
	defer peerCancel()
	d.lastSeenManagers = getWeightedPeers(d.cluster)

	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	d.mu.Unlock()

	publishManagers := func(peers []*api.Peer) {
		var mgrs []*api.WeightedPeer
		for _, p := range peers {
			mgrs = append(mgrs, &api.WeightedPeer{
				Peer:   p,
				Weight: remotes.DefaultObservationWeight,
			})
		}
		d.mu.Lock()
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case ev := <-peerWatcher:
			publishManagers(ev.([]*api.Peer))
		case <-d.processUpdatesTrigger:
			d.processUpdates()
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processUpdates()
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := ptypes.Duration(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(cluster.Cluster.NetworkBootstrapKeys)
		case <-d.ctx.Done():
			return nil
		}
	}
}
Example #29
func (r *Orchestrator) initTasks(ctx context.Context, readTx store.ReadTx) error {
	tasks, err := store.FindTasks(readTx, store.All)
	if err != nil {
		return err
	}
	for _, t := range tasks {
		if t.NodeID != "" {
			n := store.GetNode(readTx, t.NodeID)
			if invalidNode(n) && t.Status.State <= api.TaskStateRunning && t.DesiredState <= api.TaskStateRunning {
				r.restartTasks[t.ID] = struct{}{}
			}
		}
	}

	_, err = r.store.Batch(func(batch *store.Batch) error {
		for _, t := range tasks {
			if t.ServiceID == "" {
				continue
			}

			// TODO(aluzzardi): We should NOT retrieve the service here.
			service := store.GetService(readTx, t.ServiceID)
			if service == nil {
				// Service was deleted
				err := batch.Update(func(tx store.Tx) error {
					return store.DeleteTask(tx, t.ID)
				})
				if err != nil {
					log.G(ctx).WithError(err).Error("failed to set task desired state to dead")
				}
				continue
			}
			// TODO(aluzzardi): This is shady. We should have a more generic condition.
			if t.DesiredState != api.TaskStateReady || !orchestrator.IsReplicatedService(service) {
				continue
			}
			restartDelay := orchestrator.DefaultRestartDelay
			if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
				var err error
				restartDelay, err = ptypes.Duration(t.Spec.Restart.Delay)
				if err != nil {
					log.G(ctx).WithError(err).Error("invalid restart delay")
					restartDelay = orchestrator.DefaultRestartDelay
				}
			}
			if restartDelay != 0 {
				timestamp, err := ptypes.Timestamp(t.Status.Timestamp)
				if err == nil {
					restartTime := timestamp.Add(restartDelay)
					calculatedRestartDelay := restartTime.Sub(time.Now())
					if calculatedRestartDelay < restartDelay {
						restartDelay = calculatedRestartDelay
					}
					if restartDelay > 0 {
						_ = batch.Update(func(tx store.Tx) error {
							t := store.GetTask(tx, t.ID)
							// TODO(aluzzardi): This is shady as well. We should have a more generic condition.
							if t == nil || t.DesiredState != api.TaskStateReady {
								return nil
							}
							r.restarts.DelayStart(ctx, tx, nil, t.ID, restartDelay, true)
							return nil
						})
						continue
					}
				} else {
					log.G(ctx).WithError(err).Error("invalid status timestamp")
				}
			}

			// Start now
			err := batch.Update(func(tx store.Tx) error {
				return r.restarts.StartNow(tx, t.ID)
			})
			if err != nil {
				log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("moving task out of delayed state failed")
			}
		}
		return nil
	})

	return err
}