// setKeys sets n random keys and values across each machine in a // cluster and returns these values to later be checked with checkKeys. // If all the values don't get set due to a machine that is down and // error is NOT returned. An error is returned if no keys are able to be // set. func SetKeys(cluster platform.Cluster, n int) (map[string]string, error) { var written = map[string]string{} for _, m := range cluster.Machines() { for i := 0; i < n; i++ { // random key and value, may overwrwite previous sets if // collision which is fine key := strconv.Itoa(rand.Int())[0:3] value := strconv.Itoa(rand.Int())[0:3] b, err := m.SSH(fmt.Sprintf("curl -s -w %%{http_code} -s http://127.0.0.1:2379/v2/keys/%v -XPUT -d value=%v", key, value)) if err != nil { return nil, err } // check for 201 or 200 resp header if !bytes.HasSuffix(b, []byte("200")) && !bytes.HasSuffix(b, []byte("201")) { continue } written[key] = value } } if len(written) == 0 { return nil, fmt.Errorf("failed to write any keys") } plog.Infof("wrote %v keys", len(written)) return written, nil }
// setKeys sets n random keys and values across each machine in a // cluster and returns these values to later be checked with checkKeys. // If all the values don't get set due to a machine that is down and // error is NOT returned. An error is returned if no keys are able to be // set. func setKeys(cluster platform.Cluster, n int) (map[string]string, error) { var written = map[string]string{} for _, m := range cluster.Machines() { for i := 0; i < n; i++ { // random key and value, may overwrwite previous sets if // collision which is fine key := strconv.Itoa(rand.Int())[0:3] value := strconv.Itoa(rand.Int())[0:3] cmd := cluster.NewCommand("curl", "-w", "%{http_code}", "-s", fmt.Sprintf("http://%v:2379/v2/keys/%v", m.IP(), key), "-XPUT", "-d", "value="+value) b, err := cmd.Output() if err != nil { continue } // check for 201 or 200 resp header if !bytes.HasSuffix(b, []byte("200")) && !bytes.HasSuffix(b, []byte("201")) { continue } written[key] = value } } if len(written) == 0 { return nil, fmt.Errorf("failed to write any keys") } plog.Infof("wrote %v keys", len(written)) return written, nil }
// checkKeys tests that each node in the cluster has the full provided // key set in keyMap. Quorum get must be used. func CheckKeys(cluster platform.Cluster, keyMap map[string]string, quorum bool) error { for i, m := range cluster.Machines() { for k, v := range keyMap { var cmd string if quorum { cmd = fmt.Sprintf("curl http://127.0.0.1:2379/v2/keys/%v?quorum=true", k) } else { cmd = fmt.Sprintf("curl http://127.0.0.1:2379/v2/keys/%v", k) } s, err := m.SSHSession() if err != nil { return err } defer s.Close() b, err := s.Output(cmd) if err != nil { return fmt.Errorf("error curling key: %v", err) } var jsonMap map[string]interface{} err = json.Unmarshal(b, &jsonMap) if err != nil { return err } // error code? errorCode, ok := jsonMap["errorCode"] if ok { msg := jsonMap["message"] return fmt.Errorf("machine %v errorCode %v: %v: %s", i, errorCode, msg, b) } node, ok := jsonMap["node"] if !ok { return fmt.Errorf("retrieving key in CheckKeys, no node in resp") } n := node.(map[string]interface{}) value, ok := n["value"] if !ok { return fmt.Errorf("retrieving key in CheckKeys, no value in resp") } if value != v { return fmt.Errorf("checkKeys got incorrect value! expected:%v got: %v", v, value) } } } plog.Infof("checked %v keys", len(keyMap)) return nil }
// checkKeys tests that each node in the cluster has the full provided // key set in keyMap. Quorum get must be used. func checkKeys(cluster platform.Cluster, keyMap map[string]string) error { for i, m := range cluster.Machines() { for k, v := range keyMap { cmd := cluster.NewCommand("curl", fmt.Sprintf("http://%v:2379/v2/keys/%v?quorum=true", m.IP(), k)) b, err := cmd.Output() if err != nil { return fmt.Errorf("error curling key: %v", err) } var jsonMap map[string]interface{} err = json.Unmarshal(b, &jsonMap) if err != nil { return err } // error code? errorCode, ok := jsonMap["errorCode"] if ok { msg := jsonMap["message"] return fmt.Errorf("machine %v errorCode %v: %v: %s", i, errorCode, msg, b) } node, ok := jsonMap["node"] if !ok { return fmt.Errorf("retrieving key in CheckKeys, no node in resp") } n := node.(map[string]interface{}) value, ok := n["value"] if !ok { return fmt.Errorf("retrieving key in CheckKeys, no value in resp") } if value != v { return fmt.Errorf("checkKeys got incorrect value! expected:%v got: %v", v, value) } } } plog.Infof("checked %v keys", len(keyMap)) return nil }
func discovery(cluster platform.Cluster, version int) error { if plog.LevelAt(capnslog.DEBUG) { // get journalctl -f from all machines before starting for _, m := range cluster.Machines() { if err := platform.StreamJournal(m); err != nil { return fmt.Errorf("failed to start journal: %v", err) } } } // start etcd on each machine asynchronously. for _, m := range cluster.Machines() { if err := doStart(m, version, false); err != nil { return err } } // block until each instance is reported as started. for i, m := range cluster.Machines() { if err := doStart(m, version, true); err != nil { return err } plog.Infof("etcd instance%d started", i) } var keyMap map[string]string var retryFuncs []func() error retryFuncs = append(retryFuncs, func() error { var err error keyMap, err = SetKeys(cluster, 5) if err != nil { return err } return nil }) retryFuncs = append(retryFuncs, func() error { var quorumRead bool if version == 2 { quorumRead = true } if err := CheckKeys(cluster, keyMap, quorumRead); err != nil { return err } return nil }) for _, retry := range retryFuncs { if err := util.Retry(5, 5*time.Second, retry); err != nil { return fmt.Errorf("discovery failed health check: %v", err) } // NOTE(pb): etcd1 seems to fail in an odd way when I try quorum // read, instead just sleep between setting and getting. time.Sleep(2 * time.Second) } return nil }
func discovery(cluster platform.Cluster, version int) error { csize := len(cluster.Machines()) if plog.LevelAt(capnslog.DEBUG) { // get journalctl -f from all machines before starting for _, m := range cluster.Machines() { if err := m.StartJournal(); err != nil { return fmt.Errorf("failed to start journal: %v", err) } } } // point etcd on each machine to discovery for i, m := range cluster.Machines() { // start etcd instance var etcdStart string if version == 1 { etcdStart = "sudo systemctl start etcd.service" } else if version == 2 { etcdStart = "sudo systemctl start etcd2.service" } else { return fmt.Errorf("etcd version unspecified") } _, err := m.SSH(etcdStart) if err != nil { return fmt.Errorf("SSH cmd to %v failed: %s", m.IP(), err) } plog.Infof("etcd instance%d started", i) } err := getClusterHealth(cluster.Machines()[0], csize) if err != nil { return fmt.Errorf("discovery failed health check: %v", err) } return nil }