func main() {
	kingpin.CommandLine.Name = "p2-replicate"
	kingpin.CommandLine.Help = `p2-replicate uses the replication package to schedule deployment of a pod across multiple nodes. See the replication package's README and godoc for more information.

Example invocation: p2-replicate --min-nodes 2 helloworld.yaml aws{1,2,3}.example.com

This will take the pod whose manifest is located at helloworld.yaml and deploy it to the three nodes aws1.example.com, aws2.example.com, and aws3.example.com.

Because of --min-nodes 2, the replicator will ensure that at least two healthy nodes remain up at all times, according to p2's health checks.
`
	kingpin.Version(version.VERSION)
	_, opts := flags.ParseWithConsulOptions()
	client := kp.NewConsulClient(opts)
	store := kp.NewConsulStore(client)
	healthChecker := checker.NewConsulHealthChecker(client)

	manifest, err := pods.ManifestFromURI(*manifestUri)
	if err != nil {
		log.Fatalf("%s", err)
	}

	logger := logging.NewLogger(logrus.Fields{
		"pod": manifest.ID(),
	})
	logger.Logger.Formatter = &logrus.TextFormatter{
		DisableTimestamp: false,
		FullTimestamp:    true,
		TimestampFormat:  "15:04:05.000",
	}

	// create a lock with a meaningful name and set up a renewal loop for it
	thisHost, err := os.Hostname()
	if err != nil {
		log.Fatalf("Could not retrieve hostname: %s", err)
	}
	thisUser, err := user.Current()
	if err != nil {
		log.Fatalf("Could not retrieve user: %s", err)
	}
	lockMessage := fmt.Sprintf("%q from %q at %q", thisUser.Username, thisHost, time.Now())
	repl, err := replication.NewReplicator(
		manifest,
		logger,
		*hosts,
		len(*hosts)-*minNodes, // number of nodes that may be down at once
		store,
		healthChecker,
		health.HealthState(*threshold),
		lockMessage,
	)
	if err != nil {
		log.Fatalf("Could not initialize replicator: %s", err)
	}

	replication, errCh, err := repl.InitializeReplication(*overrideLock)
	if err != nil {
		log.Fatalf("Unable to initialize replication: %s", err)
	}

	// auto-drain this channel
	go func() {
		for range errCh {
		}
	}()

	go func() {
		// clear lock immediately on ctrl-C
		signals := make(chan os.Signal, 1)
		signal.Notify(signals, os.Interrupt)
		<-signals
		replication.Cancel()
		os.Exit(1)
	}()

	replication.Enact()
}
func main() {
	replicate.Version(version.VERSION)
	replicate.Parse(os.Args[1:])

	opts := kp.Options{
		Address: *consulUrl,
		Token:   *consulToken,
		Client:  net.NewHeaderClient(*headers, http.DefaultTransport),
		HTTPS:   *https,
	}
	store := kp.NewConsulStore(opts)
	healthChecker := health.NewConsulHealthChecker(opts)

	// Fetch the manifest (which may be a URI) into a temp file
	localMan, err := ioutil.TempFile("", "tempmanifest")
	if err != nil {
		log.Fatalf("Couldn't create tempfile: %s", err)
	}
	// the error check must come before this defer: if TempFile failed,
	// localMan is nil and calling Name() on it would panic
	defer os.Remove(localMan.Name())
	if err := uri.URICopy(*manifestUri, localMan.Name()); err != nil {
		log.Fatalf("Could not fetch manifest: %s", err)
	}

	manifest, err := pods.ManifestFromPath(localMan.Name())
	if err != nil {
		log.Fatalf("Invalid manifest: %s", err)
	}

	healthResults, err := healthChecker.Service(manifest.ID())
	if err != nil {
		log.Fatalf("Could not get initial health results: %s", err)
	}
	order := health.SortOrder{
		Nodes:  *hosts,
		Health: healthResults,
	}
	sort.Sort(order)

	repl := replication.Replicator{
		Manifest: *manifest,
		Store:    store,
		Health:   healthChecker,
		Nodes:    *hosts,                  // sorted by the health.SortOrder
		Active:   len(*hosts) - *minNodes, // number of nodes that may be down at once
		Logger: logging.NewLogger(logrus.Fields{
			"pod": manifest.ID(),
		}),
		Threshold: health.HealthState(*threshold),
	}
	repl.Logger.Logger.Formatter = &logrus.TextFormatter{
		DisableTimestamp: false,
		FullTimestamp:    true,
		TimestampFormat:  "15:04:05.000",
	}

	if err := repl.CheckPreparers(); err != nil {
		log.Fatalf("Preparer check failed: %s", err)
	}

	// create a lock with a meaningful name and set up a renewal loop for it
	thisHost, err := os.Hostname()
	if err != nil {
		log.Fatalf("Could not retrieve hostname: %s", err)
	}
	thisUser, err := user.Current()
	if err != nil {
		log.Fatalf("Could not retrieve user: %s", err)
	}
	lock, err := store.NewLock(fmt.Sprintf("%q from %q at %q", thisUser.Username, thisHost, time.Now()))
	if err != nil {
		log.Fatalf("Could not generate lock: %s", err)
	}
	// deferring on main is not particularly useful, since os.Exit will skip
	// the defer, so we have to manually destroy the lock at the right exit
	// paths
	go func() {
		for range time.Tick(10 * time.Second) {
			if err := lock.Renew(); err != nil {
				// if the renewal failed, then either the lock is already dead
				// or the consul agent cannot be reached
				log.Fatalf("Lock could not be renewed: %s", err)
			}
		}
	}()
	if err := repl.LockHosts(lock, *overrideLock); err != nil {
		lock.Destroy()
		log.Fatalf("Could not lock all hosts: %s", err)
	}

	// auto-drain this channel
	errs := make(chan error)
	go func() {
		for range errs {
		}
	}()

	quitch := make(chan struct{})
	go func() {
		// clear lock immediately on ctrl-C
		signals := make(chan os.Signal, 1)
		signal.Notify(signals, os.Interrupt)
		<-signals
		close(quitch)
		lock.Destroy()
		os.Exit(1)
	}()

	repl.Enact(errs, quitch)
	lock.Destroy()
}
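// The comment above about deferring on main reflects a general Go rule that
// motivates the manual lock.Destroy() calls at each exit path. A minimal,
// self-contained illustration (a hypothetical helper, not part of this
// codebase):
func exitSkipsDefers() {
	// os.Exit terminates the process immediately, so this deferred call
	// never runs; any cleanup deferred here would be silently skipped
	defer fmt.Println("never printed")
	os.Exit(1)
}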
func TestPublishLatestHealth(t *testing.T) {
	// This channel imitates the channel that consulutil.WatchPrefix would return
	healthListChan := make(chan api.KVPairs)
	quitCh := make(chan struct{})
	outCh := make(chan []*health.Result, 1)
	defer close(outCh)
	defer close(quitCh)

	errCh := publishLatestHealth(healthListChan, quitCh, outCh)
	go func() {
		// t.Fatal must only be called from the test's own goroutine, so
		// report unexpected errors with t.Error instead
		err, open := <-errCh
		if err != nil {
			t.Error(err)
		}
		if !open {
			return
		}
	}()

	oldStatus := health.HealthState("passing")
	newStatus := health.HealthState("critical")
	hrOld := &health.Result{
		Status: oldStatus,
	}
	hrOldJSON, err := json.Marshal(hrOld)
	if err != nil {
		t.Fatalf("json marshal err: %v", err)
	}
	oldKV := &api.KVPair{Key: "health/service/node1.example.com", Value: hrOldJSON}

	hrNew := &health.Result{
		Status: newStatus,
	}
	hrNewJSON, err := json.Marshal(hrNew)
	if err != nil {
		t.Fatalf("json marshal err: %v", err)
	}
	newKV := &api.KVPair{Key: "health/service/node1.example.com", Value: hrNewJSON}

	// Basic test that publishLatestHealth drains the channels correctly.
	// We write three times to ensure that at least one of the newKV values
	// has flushed through the channel.
	select {
	case healthListChan <- api.KVPairs{oldKV}:
	case <-time.After(1 * time.Second):
		t.Fatal("Failed to write to chan. Deadlock?")
	}
	select {
	case healthListChan <- api.KVPairs{newKV}:
	case <-time.After(1 * time.Second):
		t.Fatal("Failed to write to chan. Deadlock?")
	}
	select {
	case healthListChan <- api.KVPairs{newKV}:
	case <-time.After(1 * time.Second):
		t.Fatal("Failed to write to chan. Deadlock?")
	}

	select {
	case result := <-outCh:
		if len(result) < 1 {
			t.Fatalf("Got wrong number of results. Expected at least 1, got %d", len(result))
		}
		if result[0].Status != newStatus {
			t.Fatalf("expected status to match %s, was %s", newStatus, result[0].Status)
		}
		return
	case <-time.After(1 * time.Second):
		t.Fatal("timed out waiting for a health result")
	}
}
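// For context, a minimal sketch of the coalescing behavior the test above
// exercises. This is an assumption about publishLatestHealth's shape, not the
// package's actual implementation (the hypothetical name below avoids
// clashing with it). The key idea: drain the watch channel, parse each KV
// list, and replace whatever sits unread in the buffered output channel, so a
// slow consumer always observes the most recent health list rather than a
// stale backlog.
func publishLatestHealthSketch(
	healthListChan <-chan api.KVPairs,
	quitCh <-chan struct{},
	outCh chan []*health.Result,
) <-chan error {
	errCh := make(chan error)
	go func() {
		defer close(errCh)
		for {
			select {
			case <-quitCh:
				return
			case kvs, ok := <-healthListChan:
				if !ok {
					return
				}
				results := make([]*health.Result, 0, len(kvs))
				for _, kv := range kvs {
					res := &health.Result{}
					if err := json.Unmarshal(kv.Value, res); err != nil {
						select {
						case errCh <- err:
						case <-quitCh:
							return
						}
						continue // skip entries that fail to parse
					}
					results = append(results, res)
				}
				// discard any unconsumed value in the buffered channel so the
				// consumer's next read sees the newest health list
				select {
				case <-outCh:
				default:
				}
				select {
				case outCh <- results:
				default: // consumer raced us; a newer list will follow
				}
			}
		}
	}()
	return errCh
}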
func (ds *daemonSet) PublishToReplication() error {
	// We must cancel the replication because if we try to call
	// InitializeReplicationWithCheck, we will get an error
	ds.cancelReplication()

	podLocations, err := ds.CurrentPods()
	if err != nil {
		return util.Errorf("Error retrieving pod locations from daemon set: %v", err)
	}
	nodes := podLocations.Nodes()

	ds.logger.Infof("Preparing to publish the following nodes: %v", nodes)

	thisHost, err := os.Hostname()
	if err != nil {
		ds.logger.Errorf("Could not retrieve hostname: %s", err)
		thisHost = ""
	}
	thisUser, err := user.Current()
	if err != nil {
		ds.logger.Errorf("Could not retrieve user: %s", err)
		thisUser = &user.User{}
	}
	lockMessage := fmt.Sprintf("%q from %q at %q", thisUser.Username, thisHost, time.Now())
	repl, err := replication.NewReplicator(
		ds.DaemonSet.Manifest,
		ds.logger,
		nodes,
		len(nodes)-ds.DaemonSet.MinHealth,
		ds.kpStore,
		ds.applicator,
		*ds.healthChecker,
		health.HealthState(health.Passing),
		lockMessage,
		ds.Timeout,
	)
	if err != nil {
		ds.logger.Errorf("Could not initialize replicator: %s", err)
		return err
	}

	ds.logger.Info("New replicator was made")

	// Replication locks are designed to make sure that two replications to
	// the same nodes cannot occur at the same time. The granularity is
	// pod-wide as an optimization for consul performance (only need to
	// lock a single key) with limited downside when human operators are
	// executing deploys, because the likelihood of a lock collision is
	// low. With daemon sets, locking is not necessary because the node
	// sets should not overlap when they are managed properly. Even when
	// there is a node overlap between two daemon sets, a simple mutual
	// exclusion lock around replication will not prevent the pod manifest
	// on an overlapped node from thrashing. Therefore, it makes sense for
	// daemon sets to ignore this locking mechanism and always try to
	// converge nodes to the specified manifest
	replication, errCh, err := repl.InitializeDaemonSetReplication(
		replication.DefaultConcurrentReality,
		ds.rateLimitInterval,
	)
	if err != nil {
		ds.logger.Errorf("Unable to initialize replication: %s", err)
		return err
	}

	ds.logger.Info("Replication initialized")

	// auto-drain this channel
	go func() {
		for err := range errCh {
			ds.logger.Errorf("Error occurred in replication: '%v'", err)
		}
	}()

	// Set a new replication
	ds.currentReplication = replication

	go replication.Enact()

	ds.logger.Info("Replication enacted")

	return nil
}
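// A plausible shape for the ds.cancelReplication call at the top of
// PublishToReplication (an assumption for illustration; the actual method
// lives elsewhere in this package, so the name below is hypothetical): cancel
// and clear the current replication so a new one can be initialized without
// tripping the in-flight-replication error mentioned above.
func (ds *daemonSet) cancelReplicationSketch() {
	if ds.currentReplication != nil {
		// Cancel mirrors the replication.Cancel() used in the ctrl-C handler
		// of p2-replicate's main above
		ds.currentReplication.Cancel()
		ds.currentReplication = nil
	}
}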