// setKeys sets n random keys and values across each machine in a
// cluster and returns these values to later be checked with checkKeys.
// If some values fail to be set because a machine is down, an error is
// NOT returned. An error is returned only if no keys could be set.
func setKeys(cluster platform.Cluster, n int) (map[string]string, error) {
	var written = map[string]string{}
	for _, m := range cluster.Machines() {
		for i := 0; i < n; i++ {
			// random key and value, may overwrite previous sets on
			// collision, which is fine
			key := strconv.Itoa(rand.Int())[0:3]
			value := strconv.Itoa(rand.Int())[0:3]

			cmd := cluster.NewCommand("curl", "-w", "%{http_code}", "-s",
				fmt.Sprintf("http://%v:2379/v2/keys/%v", m.IP(), key),
				"-XPUT", "-d", "value="+value)
			b, err := cmd.Output()
			if err != nil {
				continue
			}

			// check for a 200 or 201 response code
			if !bytes.HasSuffix(b, []byte("200")) && !bytes.HasSuffix(b, []byte("201")) {
				continue
			}

			written[key] = value
		}
	}
	if len(written) == 0 {
		return nil, fmt.Errorf("failed to write any keys")
	}

	plog.Infof("wrote %v keys", len(written))
	return written, nil
}
// SetKeys sets n random keys and values across each machine in a
// cluster and returns these values to later be checked with CheckKeys.
// If some values fail to be set because a machine is down, an error is
// NOT returned. An error is returned only if no keys could be set.
func SetKeys(cluster platform.Cluster, n int) (map[string]string, error) {
	var written = map[string]string{}
	for _, m := range cluster.Machines() {
		for i := 0; i < n; i++ {
			// random key and value, may overwrite previous sets on
			// collision, which is fine
			key := strconv.Itoa(rand.Int())[0:3]
			value := strconv.Itoa(rand.Int())[0:3]

			b, err := m.SSH(fmt.Sprintf("curl -s -w %%{http_code} http://127.0.0.1:2379/v2/keys/%v -XPUT -d value=%v", key, value))
			if err != nil {
				return nil, err
			}

			// check for a 200 or 201 response code
			if !bytes.HasSuffix(b, []byte("200")) && !bytes.HasSuffix(b, []byte("201")) {
				continue
			}

			written[key] = value
		}
	}
	if len(written) == 0 {
		return nil, fmt.Errorf("failed to write any keys")
	}

	plog.Infof("wrote %v keys", len(written))
	return written, nil
}
func checkEtcdVersion(cluster platform.Cluster, m platform.Machine, expected string) error {
	const (
		retries   = 5
		retryWait = 3 * time.Second
	)

	var err error
	var b []byte
	for i := 0; i < retries; i++ {
		cmd := cluster.NewCommand("curl", "-L", fmt.Sprintf("http://%v:2379/version", m.IP()))
		b, err = cmd.Output()
		if err != nil {
			plog.Infof("retrying version check, hit failure %v", err)
			time.Sleep(retryWait)
			continue
		}
		break
	}
	if err != nil {
		return fmt.Errorf("curling version: %v", err)
	}

	plog.Infof("got version: %s", b)

	if string(b) != expected {
		return fmt.Errorf("expected %v, got %s", expected, b)
	}

	return nil
}
func checkEtcdVersion(cluster platform.Cluster, m platform.Machine, expected string) error {
	var b []byte
	var err error

	checker := func() error {
		cmd := cluster.NewCommand("curl", "-L", fmt.Sprintf("http://%v:2379/version", m.IP()))
		b, err = cmd.Output()
		if err != nil {
			return fmt.Errorf("curl failed: %v", err)
		}
		return nil
	}

	if err := util.Retry(15, 10*time.Second, checker); err != nil {
		return err
	}

	plog.Infof("got version: %s", b)

	if string(b) != expected {
		return fmt.Errorf("expected %v, got %s", expected, b)
	}

	return nil
}
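// The two checkEtcdVersion variants above differ only in how they retry: the
// first hand-rolls the loop, the second delegates to util.Retry. util.Retry's
// implementation is not shown in this section; the sketch below is an assumed,
// minimal reconstruction based solely on the call sites, which imply the
// signature Retry(attempts int, delay time.Duration, f func() error) error.
func Retry(attempts int, delay time.Duration, f func() error) error {
	var err error
	for i := 0; i < attempts; i++ {
		// stop on the first successful attempt
		if err = f(); err == nil {
			return nil
		}
		time.Sleep(delay)
	}
	// all attempts exhausted; surface the last error
	return err
}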
// CheckKeys tests that each node in the cluster has the full provided
// key set in keyMap. If quorum is true, a quorum get is used for each read.
func CheckKeys(cluster platform.Cluster, keyMap map[string]string, quorum bool) error {
	for i, m := range cluster.Machines() {
		for k, v := range keyMap {
			var cmd string
			if quorum {
				cmd = fmt.Sprintf("curl http://127.0.0.1:2379/v2/keys/%v?quorum=true", k)
			} else {
				cmd = fmt.Sprintf("curl http://127.0.0.1:2379/v2/keys/%v", k)
			}

			s, err := m.SSHSession()
			if err != nil {
				return err
			}
			b, err := s.Output(cmd)
			// close each session eagerly rather than deferring inside the
			// loop, which would pile up open sessions until the function returns
			s.Close()
			if err != nil {
				return fmt.Errorf("error curling key: %v", err)
			}

			var jsonMap map[string]interface{}
			err = json.Unmarshal(b, &jsonMap)
			if err != nil {
				return err
			}

			// error code?
			errorCode, ok := jsonMap["errorCode"]
			if ok {
				msg := jsonMap["message"]
				return fmt.Errorf("machine %v errorCode %v: %v: %s", i, errorCode, msg, b)
			}

			node, ok := jsonMap["node"]
			if !ok {
				return fmt.Errorf("retrieving key in CheckKeys, no node in resp")
			}
			n := node.(map[string]interface{})
			value, ok := n["value"]
			if !ok {
				return fmt.Errorf("retrieving key in CheckKeys, no value in resp")
			}

			if value != v {
				return fmt.Errorf("CheckKeys got incorrect value! expected: %v, got: %v", v, value)
			}
		}
	}
	plog.Infof("checked %v keys", len(keyMap))
	return nil
}
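// For reference when reading CheckKeys: it parses the etcd v2 keys API. A
// successful GET on /v2/keys/<key> returns JSON shaped roughly like
//
//	{"action":"get","node":{"key":"/123","value":"456","modifiedIndex":7,"createdIndex":7}}
//
// (values here are illustrative), while a failed lookup carries "errorCode"
// and "message" in place of "node", which is what the errorCode branch detects:
//
//	{"errorCode":100,"message":"Key not found","cause":"/123","index":8}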
func runBootchart(cmd *cobra.Command, args []string) {
	if len(args) != 0 {
		fmt.Fprintf(os.Stderr, "No args accepted\n")
		os.Exit(2)
	}

	var (
		cluster platform.Cluster
		err     error
	)
	switch kolaPlatform {
	case "qemu":
		cluster, err = platform.NewQemuCluster(kola.QEMUOptions)
	case "gce":
		cluster, err = platform.NewGCECluster(kola.GCEOptions)
	case "aws":
		cluster, err = platform.NewAWSCluster(kola.AWSOptions)
	default:
		// route unknown platforms through err so we exit below instead of
		// proceeding with a nil cluster
		err = fmt.Errorf("invalid platform %q", kolaPlatform)
	}
	if err != nil {
		fmt.Fprintf(os.Stderr, "Cluster failed: %v\n", err)
		os.Exit(1)
	}
	defer cluster.Destroy()

	m, err := cluster.NewMachine("")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Machine failed: %v\n", err)
		os.Exit(1)
	}
	defer m.Destroy()

	ssh, err := m.SSHSession()
	if err != nil {
		fmt.Fprintf(os.Stderr, "SSH failed: %v\n", err)
		os.Exit(1)
	}
	defer ssh.Close()

	ssh.Stdout = os.Stdout
	ssh.Stderr = os.Stderr
	if err = ssh.Run("systemd-analyze plot"); err != nil {
		fmt.Fprintf(os.Stderr, "SSH failed: %v\n", err)
		os.Exit(1)
	}
}
func runSpawn(cmd *cobra.Command, args []string) {
	var userdata []byte
	var err error
	var cluster platform.Cluster

	if spawnUserData != "" {
		userdata, err = ioutil.ReadFile(spawnUserData)
		if err != nil {
			die("Reading userdata failed: %v", err)
		}
	}

	switch kolaPlatform {
	case "qemu":
		cluster, err = platform.NewQemuCluster(kola.QEMUOptions)
	case "gce":
		cluster, err = platform.NewGCECluster(kola.GCEOptions)
	case "aws":
		cluster, err = platform.NewAWSCluster(kola.AWSOptions)
	default:
		err = fmt.Errorf("invalid platform %q", kolaPlatform)
	}
	if err != nil {
		die("Cluster failed: %v", err)
	}

	mach, err := cluster.NewMachine(string(userdata))
	if err != nil {
		die("Spawning instance failed: %v", err)
	}

	if spawnRemove {
		defer mach.Destroy()
	}

	if spawnShell {
		if err := platform.Manhole(mach); err != nil {
			die("Manhole failed: %v", err)
		}
	}
}
// checkKeys tests that each node in the cluster has the full provided
// key set in keyMap. A quorum get is used for each read.
func checkKeys(cluster platform.Cluster, keyMap map[string]string) error {
	for i, m := range cluster.Machines() {
		for k, v := range keyMap {
			cmd := cluster.NewCommand("curl", fmt.Sprintf("http://%v:2379/v2/keys/%v?quorum=true", m.IP(), k))
			b, err := cmd.Output()
			if err != nil {
				return fmt.Errorf("error curling key: %v", err)
			}

			var jsonMap map[string]interface{}
			err = json.Unmarshal(b, &jsonMap)
			if err != nil {
				return err
			}

			// error code?
			errorCode, ok := jsonMap["errorCode"]
			if ok {
				msg := jsonMap["message"]
				return fmt.Errorf("machine %v errorCode %v: %v: %s", i, errorCode, msg, b)
			}

			node, ok := jsonMap["node"]
			if !ok {
				return fmt.Errorf("retrieving key in checkKeys, no node in resp")
			}
			n := node.(map[string]interface{})
			value, ok := n["value"]
			if !ok {
				return fmt.Errorf("retrieving key in checkKeys, no value in resp")
			}

			if value != v {
				return fmt.Errorf("checkKeys got incorrect value! expected: %v, got: %v", v, value)
			}
		}
	}
	plog.Infof("checked %v keys", len(keyMap))
	return nil
}
// create a cluster and run test
func runTest(t *Test, pltfrm string) error {
	var err error
	var cluster platform.Cluster

	switch pltfrm {
	case "qemu":
		cluster, err = platform.NewQemuCluster(*QemuImage)
	case "gce":
		cluster, err = platform.NewGCECluster(GCEOpts())
	default:
		// route unknown platforms through err instead of falling through
		// with a nil cluster
		err = fmt.Errorf("invalid platform %q", pltfrm)
	}
	if err != nil {
		return fmt.Errorf("Cluster failed: %v", err)
	}
	defer func() {
		if err := cluster.Destroy(); err != nil {
			plog.Errorf("cluster.Destroy(): %v", err)
		}
	}()

	url, err := cluster.GetDiscoveryURL(t.ClusterSize)
	if err != nil {
		return fmt.Errorf("Failed to create discovery endpoint: %v", err)
	}

	cfgs := makeConfigs(url, t.CloudConfig, t.ClusterSize)

	for i := 0; i < t.ClusterSize; i++ {
		_, err := cluster.NewMachine(cfgs[i])
		if err != nil {
			return fmt.Errorf("Cluster failed starting machine: %v", err)
		}
		plog.Infof("%v instance up", pltfrm)
	}

	// pass along all registered native functions
	var names []string
	for k := range t.NativeFuncs {
		names = append(names, k)
	}

	// Cluster -> TestCluster
	tcluster := platform.TestCluster{
		Name:        t.Name,
		NativeFuncs: names,
		Cluster:     cluster,
	}

	// drop kolet binary on machines
	if t.NativeFuncs != nil {
		err = scpKolet(tcluster)
		if err != nil {
			return fmt.Errorf("dropping kolet binary: %v", err)
		}
	}

	// run test
	err = t.Run(tcluster)
	return err
}
func discovery(cluster platform.Cluster, version int) error {
	if plog.LevelAt(capnslog.DEBUG) {
		// get journalctl -f from all machines before starting
		for _, m := range cluster.Machines() {
			if err := platform.StreamJournal(m); err != nil {
				return fmt.Errorf("failed to start journal: %v", err)
			}
		}
	}

	// start etcd on each machine asynchronously.
	for _, m := range cluster.Machines() {
		if err := doStart(m, version, false); err != nil {
			return err
		}
	}

	// block until each instance is reported as started.
	for i, m := range cluster.Machines() {
		if err := doStart(m, version, true); err != nil {
			return err
		}
		plog.Infof("etcd instance%d started", i)
	}

	var keyMap map[string]string
	var retryFuncs []func() error

	retryFuncs = append(retryFuncs, func() error {
		var err error
		keyMap, err = SetKeys(cluster, 5)
		if err != nil {
			return err
		}
		return nil
	})
	retryFuncs = append(retryFuncs, func() error {
		var quorumRead bool
		if version == 2 {
			quorumRead = true
		}
		if err := CheckKeys(cluster, keyMap, quorumRead); err != nil {
			return err
		}
		return nil
	})
	for _, retry := range retryFuncs {
		if err := util.Retry(5, 5*time.Second, retry); err != nil {
			return fmt.Errorf("discovery failed health check: %v", err)
		}
		// NOTE(pb): etcd1 seems to fail in an odd way when I try quorum
		// read, instead just sleep between setting and getting.
		time.Sleep(2 * time.Second)
	}

	return nil
}
func discovery(cluster platform.Cluster, version int) error {
	csize := len(cluster.Machines())

	if plog.LevelAt(capnslog.DEBUG) {
		// get journalctl -f from all machines before starting
		for _, m := range cluster.Machines() {
			if err := m.StartJournal(); err != nil {
				return fmt.Errorf("failed to start journal: %v", err)
			}
		}
	}

	// point etcd on each machine to discovery
	for i, m := range cluster.Machines() {
		// start etcd instance
		var etcdStart string
		if version == 1 {
			etcdStart = "sudo systemctl start etcd.service"
		} else if version == 2 {
			etcdStart = "sudo systemctl start etcd2.service"
		} else {
			return fmt.Errorf("unsupported etcd version %d", version)
		}

		_, err := m.SSH(etcdStart)
		if err != nil {
			return fmt.Errorf("SSH cmd to %v failed: %s", m.IP(), err)
		}
		plog.Infof("etcd instance%d started", i)
	}

	err := getClusterHealth(cluster.Machines()[0], csize)
	if err != nil {
		return fmt.Errorf("discovery failed health check: %v", err)
	}

	return nil
}
// create a cluster and run test
func RunTest(t *Test, pltfrm string) error {
	var err error
	var cluster platform.Cluster

	switch pltfrm {
	case "qemu":
		cluster, err = platform.NewQemuCluster(QEMUOptions)
	case "gce":
		cluster, err = platform.NewGCECluster(GCEOptions)
	case "aws":
		cluster, err = platform.NewAWSCluster(AWSOptions)
	default:
		err = fmt.Errorf("invalid platform %q", pltfrm)
	}
	if err != nil {
		return fmt.Errorf("Cluster failed: %v", err)
	}
	defer func() {
		if err := cluster.Destroy(); err != nil {
			plog.Errorf("cluster.Destroy(): %v", err)
		}
	}()

	url, err := cluster.GetDiscoveryURL(t.ClusterSize)
	if err != nil {
		return fmt.Errorf("Failed to create discovery endpoint: %v", err)
	}

	cfgs := makeConfigs(url, t.CloudConfig, t.ClusterSize)

	if t.ClusterSize > 0 {
		_, err := platform.NewMachines(cluster, cfgs)
		if err != nil {
			return fmt.Errorf("Cluster failed starting machines: %v", err)
		}
	}

	// pass along all registered native functions
	var names []string
	for k := range t.NativeFuncs {
		names = append(names, k)
	}

	// copy the shared testOptions map to prevent unsafe access if tests
	// ever become parallel and read it concurrently
	tempTestOptions := make(map[string]string)
	for k, v := range testOptions {
		tempTestOptions[k] = v
	}

	// Cluster -> TestCluster
	tcluster := platform.TestCluster{
		Name:        t.Name,
		NativeFuncs: names,
		Options:     tempTestOptions,
		Cluster:     cluster,
	}

	// drop kolet binary on machines
	if t.NativeFuncs != nil {
		err = scpKolet(tcluster)
		if err != nil {
			return fmt.Errorf("dropping kolet binary: %v", err)
		}
	}

	// run test
	err = t.Run(tcluster)

	// give some time for the remote journal to be flushed so it can be read
	// before we run the deferred machine destruction
	if err != nil {
		time.Sleep(10 * time.Second)
	}

	return err
}
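// A minimal sketch of driving RunTest directly, assuming a Test struct with
// the fields referenced above (Name, ClusterSize, CloudConfig, NativeFuncs)
// plus a Run field of type func(platform.TestCluster) error, and assuming
// platform.TestCluster embeds platform.Cluster so its methods are promoted.
// The test name, cloud-config, and body are hypothetical.
func exampleRunTest() error {
	t := &Test{
		Name:        "example.ssh.echo", // hypothetical test name
		ClusterSize: 1,
		CloudConfig: "#cloud-config",
		Run: func(tc platform.TestCluster) error {
			// hypothetical check: every machine answers a trivial SSH command
			for _, m := range tc.Machines() {
				if _, err := m.SSH("true"); err != nil {
					return err
				}
			}
			return nil
		},
	}
	return RunTest(t, "qemu")
}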