Пример #1
0
func main() {
	kingpin.CommandLine.Name = "p2-replicate"
	kingpin.CommandLine.Help = `p2-replicate uses the replication package to schedule deployment of a pod across multiple nodes. See the replication package's README and godoc for more information.

	Example invocation: p2-replicate --min-nodes 2 helloworld.yaml aws{1,2,3}.example.com

	This will take the pod whose manifest is located at helloworld.yaml and
	deploy it to the three nodes aws1.example.com, aws2.example.com, and
	aws3.example.com

	Because of --min-nodes 2, the replicator will ensure that at least two healthy
	nodes remain up at all times, according to p2's health checks.
`

	kingpin.Version(version.VERSION)
	_, opts := flags.ParseWithConsulOptions()
	client := kp.NewConsulClient(opts)
	store := kp.NewConsulStore(client)
	healthChecker := checker.NewConsulHealthChecker(client)

	manifest, err := pods.ManifestFromURI(*manifestUri)
	if err != nil {
		log.Fatalf("%s", err)
	}

	logger := logging.NewLogger(logrus.Fields{
		"pod": manifest.ID(),
	})
	logger.Logger.Formatter = &logrus.TextFormatter{
		DisableTimestamp: false,
		FullTimestamp:    true,
		TimestampFormat:  "15:04:05.000",
	}

	// create a lock with a meaningful name and set up a renewal loop for it
	thisHost, err := os.Hostname()
	if err != nil {
		log.Fatalf("Could not retrieve hostname: %s", err)
	}
	thisUser, err := user.Current()
	if err != nil {
		log.Fatalf("Could not retrieve user: %s", err)
	}
	lockMessage := fmt.Sprintf("%q from %q at %q", thisUser.Username, thisHost, time.Now())
	repl, err := replication.NewReplicator(
		manifest,
		logger,
		*hosts,
		len(*hosts)-*minNodes,
		store,
		healthChecker,
		health.HealthState(*threshold),
		lockMessage,
	)
	if err != nil {
		log.Fatalf("Could not initialize replicator: %s", err)
	}

	replication, errCh, err := repl.InitializeReplication(*overrideLock)
	if err != nil {
		log.Fatalf("Unable to initialize replication: %s", err)
	}

	// auto-drain this channel
	go func() {
		for range errCh {
		}
	}()

	go func() {
		// clear lock immediately on ctrl-C
		signals := make(chan os.Signal, 1)
		signal.Notify(signals, os.Interrupt)
		<-signals
		replication.Cancel()
		os.Exit(1)
	}()

	replication.Enact()
}
Пример #2
0
func main() {
	replicate.Version(version.VERSION)
	replicate.Parse(os.Args[1:])

	opts := kp.Options{
		Address: *consulUrl,
		Token:   *consulToken,
		Client:  net.NewHeaderClient(*headers, http.DefaultTransport),
		HTTPS:   *https,
	}
	store := kp.NewConsulStore(opts)
	healthChecker := health.NewConsulHealthChecker(opts)

	// Fetch manifest (could be URI) into temp file
	localMan, err := ioutil.TempFile("", "tempmanifest")
	defer os.Remove(localMan.Name())
	if err != nil {
		log.Fatalln("Couldn't create tempfile")
	}
	if err := uri.URICopy(*manifestUri, localMan.Name()); err != nil {
		log.Fatalf("Could not fetch manifest: %s", err)
	}

	manifest, err := pods.ManifestFromPath(localMan.Name())
	if err != nil {
		log.Fatalf("Invalid manifest: %s", err)
	}

	healthResults, err := healthChecker.Service(manifest.ID())
	if err != nil {
		log.Fatalf("Could not get initial health results: %s", err)
	}
	order := health.SortOrder{
		Nodes:  *hosts,
		Health: healthResults,
	}
	sort.Sort(order)

	repl := replication.Replicator{
		Manifest: *manifest,
		Store:    store,
		Health:   healthChecker,
		Nodes:    *hosts, // sorted by the health.SortOrder
		Active:   len(*hosts) - *minNodes,
		Logger: logging.NewLogger(logrus.Fields{
			"pod": manifest.ID(),
		}),
		Threshold: health.HealthState(*threshold),
	}
	repl.Logger.Logger.Formatter = &logrus.TextFormatter{
		DisableTimestamp: false,
		FullTimestamp:    true,
		TimestampFormat:  "15:04:05.000",
	}

	if err := repl.CheckPreparers(); err != nil {
		log.Fatalf("Preparer check failed: %s", err)
	}

	// create a lock with a meaningful name and set up a renewal loop for it
	thisHost, err := os.Hostname()
	if err != nil {
		log.Fatalf("Could not retrieve hostname: %s", err)
	}
	thisUser, err := user.Current()
	if err != nil {
		log.Fatalf("Could not retrieve user: %s", err)
	}
	lock, err := store.NewLock(fmt.Sprintf("%q from %q at %q", thisUser.Username, thisHost, time.Now()))
	if err != nil {
		log.Fatalf("Could not generate lock: %s", err)
	}
	// deferring on main is not particularly useful, since os.Exit will skip
	// the defer, so we have to manually destroy the lock at the right exit
	// paths
	go func() {
		for range time.Tick(10 * time.Second) {
			if err := lock.Renew(); err != nil {
				// if the renewal failed, then either the lock is already dead
				// or the consul agent cannot be reached
				log.Fatalf("Lock could not be renewed: %s", err)
			}
		}
	}()
	if err := repl.LockHosts(lock, *overrideLock); err != nil {
		lock.Destroy()
		log.Fatalf("Could not lock all hosts: %s", err)
	}

	// auto-drain this channel
	errs := make(chan error)
	go func() {
		for range errs {
		}
	}()

	quitch := make(chan struct{})
	go func() {
		// clear lock immediately on ctrl-C
		signals := make(chan os.Signal, 1)
		signal.Notify(signals, os.Interrupt)
		<-signals
		close(quitch)
		lock.Destroy()
		os.Exit(1)
	}()

	repl.Enact(errs, quitch)
	lock.Destroy()
}
Пример #3
0
func TestPublishLatestHealth(t *testing.T) {
	// This channel imitates the channel that consulutil.WatchPrefix would return
	healthListChan := make(chan api.KVPairs)
	quitCh := make(chan struct{})
	outCh := make(chan []*health.Result, 1)
	defer close(outCh)
	defer close(quitCh)

	errCh := publishLatestHealth(healthListChan, quitCh, outCh)

	go func() {
		err, open := <-errCh
		if err != nil {
			t.Fatal(err)
		}
		if !open {
			return
		}
	}()

	oldStatus := health.HealthState("passing")
	newStatus := health.HealthState("critical")
	hrOld := &health.Result{
		Status: oldStatus,
	}
	hrOldJSON, err := json.Marshal(hrOld)
	if err != nil {
		t.Fatalf("json marshal err: %v", err)
	}
	oldKV := &api.KVPair{Key: "health/service/node1.example.com", Value: hrOldJSON}

	hrNew := &health.Result{
		Status: newStatus,
	}
	hrNewJSON, err := json.Marshal(hrNew)
	if err != nil {
		t.Fatalf("json marshal err: %v", err)
	}
	newKV := &api.KVPair{Key: "health/service/node1.example.com", Value: hrNewJSON}

	// Basic test that publishLatestHealth drains the channels correctly
	// We write three times to ensure that at least one of the newKV values has flushed through the channel
	select {
	case healthListChan <- api.KVPairs{oldKV}:
	case <-time.After(1 * time.Second):
		t.Fatal("Failed to write to chan. Deadlock?")
	}

	select {
	case healthListChan <- api.KVPairs{newKV}:
	case <-time.After(1 * time.Second):
		t.Fatal("Failed to write to chan. Deadlock?")
	}

	select {
	case healthListChan <- api.KVPairs{newKV}:
	case <-time.After(1 * time.Second):
		t.Fatal("Failed to write to chan. Deadlock?")
	}

	select {
	case result := <-outCh:
		if len(result) < 1 {
			t.Fatalf("Got wrong number of results. Expected 1, got %d", len(result))
		}
		if result[0].Status != newStatus {
			t.Fatalf("expected status to match %s, was %s", newStatus, result[0].Status)
		}
		return
	case <-time.After(1 * time.Second):
		t.Fatal("oh no, timeout")
	}
}
Пример #4
0
func (ds *daemonSet) PublishToReplication() error {
	// We must cancel the replication because if we try to call
	// InitializeReplicationWithCheck, we will get an error
	ds.cancelReplication()

	podLocations, err := ds.CurrentPods()
	if err != nil {
		return util.Errorf("Error retrieving pod locations from daemon set: %v", err)
	}
	nodes := podLocations.Nodes()

	ds.logger.Infof("Preparing to publish the following nodes: %v", nodes)

	thisHost, err := os.Hostname()
	if err != nil {
		ds.logger.Errorf("Could not retrieve hostname: %s", err)
		thisHost = ""
	}
	thisUser, err := user.Current()
	if err != nil {
		ds.logger.Errorf("Could not retrieve user: %s", err)
		thisUser = &user.User{}
	}
	lockMessage := fmt.Sprintf("%q from %q at %q", thisUser.Username, thisHost, time.Now())
	repl, err := replication.NewReplicator(
		ds.DaemonSet.Manifest,
		ds.logger,
		nodes,
		len(nodes)-ds.DaemonSet.MinHealth,
		ds.kpStore,
		ds.applicator,
		*ds.healthChecker,
		health.HealthState(health.Passing),
		lockMessage,
		ds.Timeout,
	)
	if err != nil {
		ds.logger.Errorf("Could not initialize replicator: %s", err)
		return err
	}

	ds.logger.Info("New replicator was made")

	// Replication locks are designed to make sure that two replications to
	// the same nodes cannot occur at the same time. The granularity is
	// pod-wide as an optimization for consul performance (only need to
	// lock a single key) with limited downside when human operators are
	// executing deploys, because the likelihood of a lock collision is
	// low. With daemon sets, locking is not necessary because the node
	// sets should not overlap when they are managed properly. Even when
	// there is a node overlap between two daemon sets, a simple mutual
	// exclusion lock around replication will not prevent the pod manifest
	// on an overlapped node from thrashing. Therefore, it makes sense for
	// daemon sets to ignore this locking mechanism and always try to
	// converge nodes to the specified manifest
	replication, errCh, err := repl.InitializeDaemonSetReplication(
		replication.DefaultConcurrentReality,
		ds.rateLimitInterval,
	)
	if err != nil {
		ds.logger.Errorf("Unable to initialize replication: %s", err)
		return err
	}

	ds.logger.Info("Replication initialized")

	// auto-drain this channel
	go func() {
		for err := range errCh {
			ds.logger.Errorf("Error occurred in replication: '%v'", err)
		}
	}()

	// Set a new replication
	ds.currentReplication = replication

	go replication.Enact()

	ds.logger.Info("Replication enacted")

	return nil
}