Beispiel #1
0
// getServerTime queries server for its status and returns a timeResult
// alongside the status and error reported by util.GetServerStatus.
//
// The timeResult holds the local time sampled immediately before the request
// (Start), the local time sampled immediately after it returned (End) and the
// CurrentTime field of the returned status (Result). The timeResult is filled
// in regardless of whether err is nil.
func getServerTime(server, networkPassword string) (timeResult, util.ServerStatus, error) {
	begin := time.Now()
	status, err := util.GetServerStatus(server, networkPassword)
	end := time.Now()

	var result timeResult
	result.Start = begin
	result.End = end
	result.Result = status.CurrentTime

	return result, status, err
}
Beispiel #2
0
// main performs a rolling restart of the nodes of the network given via
// -network until every node reports the executable hash of *binaryPath.
//
// Before touching any node the network must be healthy; otherwise the upgrade
// is aborted. Nodes are then visited one at a time, for a bounded number of
// passes over the server list: each node that does not yet run the requested
// binary is told to quit and is polled until it comes back with the expected
// hash and has caught up to the applied index it had before the restart.
//
// The process exits with a non-zero status (via log.Fatalf) if the network
// does not become healthy in time or if, after all passes, some node still
// does not run the requested binary.
func main() {
	defer glog.Flush()
	glog.CopyStandardLogTo("INFO")
	flag.Parse()
	if strings.TrimSpace(*network) == "" {
		log.Fatalf("You need to specify -network")
	}

	binaryHash := fileHash(*binaryPath)
	glog.Infof("binaryHash = %s", binaryHash)

	servers := util.ResolveNetwork(*network)

	// waitForHealthyNetwork polls the network once per second until it is
	// healthy and returns the per-server statuses. It always makes at least
	// one attempt (even with a non-positive timeout) so that callers never
	// proceed on the basis of an empty status map, and it aborts the program
	// once *networkHealthTimeout has elapsed without a healthy result.
	waitForHealthyNetwork := func() map[string]util.ServerStatus {
		started := time.Now()
		for {
			statuses, err := util.EnsureNetworkHealthy(servers, *networkPassword)
			if err == nil {
				log.Printf("Network became healthy.\n")
				return statuses
			}
			log.Printf("Network is not healthy: %v\n", err)
			if time.Since(started) >= *networkHealthTimeout {
				log.Fatalf("Network did not become healthy within %v, aborting. (reason: %v)\n", *networkHealthTimeout, err)
			}
			time.Sleep(1 * time.Second)
		}
	}

	log.Printf("Checking network health\n")
	initial, err := util.EnsureNetworkHealthy(servers, *networkPassword)
	if err != nil {
		log.Fatalf("Aborting upgrade for safety: %v", err)
	}
	if allNodesUpdated(initial, binaryHash) {
		log.Printf("All nodes are already running the requested version.\n")
		return
	}

	log.Printf("Restarting %q nodes until their binary hash is %s\n", *network, binaryHash)

	// maxPasses bounds how often the whole server list is walked, so that a
	// node which keeps coming up with the wrong binary cannot make the tool
	// loop forever.
	const maxPasses = 5

	for pass := 0; pass < maxPasses; pass++ {
		for _, server := range servers {
			// Only ever take down a node while the network is healthy.
			statuses := waitForHealthyNetwork()

			if statuses[server].ExecutableHash == binaryHash {
				if allNodesUpdated(statuses, binaryHash) {
					log.Printf("All done!\n")
					return
				}
				log.Printf("Skipping %q which is already running the requested version\n", server)
				continue
			}

			// Remember how far the node got so that we can wait for it to
			// catch up again after the restart.
			lastApplied := statuses[server].AppliedIndex

			log.Printf("Killing node %q\n", server)
			if err := quit(server); err != nil {
				// Best effort: the polling below (and the next health
				// check) determines whether the node actually restarted.
				log.Printf("%v\n", err)
			}

			for htry := 0; htry < 60; htry++ {
				time.Sleep(1 * time.Second)
				current, err := util.GetServerStatus(server, *networkPassword)
				if err != nil {
					log.Printf("Node unhealthy: %v\n", err)
					continue
				}
				if current.ExecutableHash != binaryHash {
					// Give up on this node for now; a later pass retries it.
					log.Printf("Node %q came up with hash %s instead of %s?!\n",
						server, current.ExecutableHash, binaryHash)
					break
				}
				if current.AppliedIndex < lastApplied {
					log.Printf("Node %q has not yet applied all messages it saw before, waiting (got %d, want ≥ %d)\n",
						server, current.AppliedIndex, lastApplied)
					continue
				}
				log.Printf("Node %q was upgraded and is healthy again\n", server)
				break
			}
		}
	}

	// All passes are used up. The last pass may or may not have upgraded the
	// remaining nodes, so verify the actual state instead of unconditionally
	// claiming success.
	if final := waitForHealthyNetwork(); !allNodesUpdated(final, binaryHash) {
		log.Fatalf("Giving up after %d passes: not all nodes are running binary hash %s", maxPasses, binaryHash)
	}

	log.Printf("All done!\n")
}
Beispiel #3
0
// TestMessageOfDeath starts a three-node local network with the panic command
// enabled, sends a PANIC message through a robustsession and verifies that the
// network recovers.
//
// For every node a goroutine waits until the node either terminates or
// reports (via its metrics endpoint) that it applied a message of death. The
// goroutine then runs the node's restart.sh and waits up to 10s for the node
// to become healthy again. Once all nodes are back, the test checks that the
// whole network is healthy and that a JOIN still produces an output message.
func TestMessageOfDeath(t *testing.T) {
	tempdir, err := ioutil.TempDir("", "robustirc-message-of-death-")
	if err != nil {
		t.Fatalf("Could not create tempdir: %v", err)
	}

	l, err := localnet.NewLocalnet(-1, tempdir)
	if err != nil {
		t.Fatalf("Could not start local RobustIRC network: %v", err)
	}
	defer l.Kill(true)

	l.EnablePanicCommand = "1"

	// For each of the nodes, start a goroutine that verifies that the node crashes, then start it again
	var wg sync.WaitGroup

	for i := 0; i < 3; i++ {
		cmd, tempdir, addr := l.StartIRCServer(i == 0)
		wg.Add(1)
		go func(cmd *exec.Cmd, tempdir string, addr string) {
			defer wg.Done()

			// Both result channels are buffered so that whichever helper
			// goroutine loses the race below can still deliver its value and
			// exit instead of blocking forever.
			terminated := make(chan error, 1)
			skipped := make(chan bool, 1)
			// done tells the metrics poller to stop once a result is in.
			done := make(chan struct{})

			go func() {
				terminated <- cmd.Wait()
			}()

			go func() {
				// Poll messages of death counter.
				parser := &prometheus_text.Parser{}
				messageOfDeath := types.RobustType(types.RobustMessageOfDeath).String()

				// sawMessageOfDeath fetches the node's metrics once and
				// reports whether the applied_messages counter for the
				// message-of-death type is positive. Any failure counts as
				// "not yet". Living in its own function lets the deferred
				// Close release the response body after every poll.
				sawMessageOfDeath := func() bool {
					req, err := http.NewRequest("GET", addr+"metrics", nil)
					if err != nil {
						return false
					}
					resp, err := l.Httpclient.Do(req)
					if err != nil {
						return false
					}
					defer resp.Body.Close()
					if resp.StatusCode != http.StatusOK {
						return false
					}
					metrics, err := parser.TextToMetricFamilies(resp.Body)
					if err != nil {
						return false
					}
					applied, ok := metrics["applied_messages"]
					if !ok {
						return false
					}
					for _, m := range applied.GetMetric() {
						for _, labelpair := range m.GetLabel() {
							if labelpair.GetName() == "type" &&
								labelpair.GetValue() == messageOfDeath &&
								m.GetCounter().GetValue() > 0 {
								return true
							}
						}
					}
					return false
				}

				for {
					select {
					case <-done:
						return
					case <-time.After(50 * time.Millisecond):
					}
					if sawMessageOfDeath() {
						skipped <- true
						return
					}
				}
			}()

			// Wait for the server to either crash or skip a message of death.
			select {
			case <-terminated:
				t.Logf("Node terminated (as expected)")
			case <-skipped:
				t.Logf("Node skipped message of death")
			}
			close(done)

			// Run restart.sh for that node.
			rcmd := exec.Command(filepath.Join(tempdir, "restart.sh"))
			if err := rcmd.Start(); err != nil {
				t.Errorf("Cannot restart node: %v", err)
				return
			}
			// NOTE(review): this records the pid of the original process
			// (cmd), not of the freshly started rcmd — confirm whether
			// rcmd.Process.Pid was intended.
			l.RecordResource("pid", strconv.Itoa(cmd.Process.Pid))

			// Ensure the node comes back up.
			started := time.Now()
			for time.Since(started) < 10*time.Second {
				if _, err := util.GetServerStatus(addr, l.NetworkPassword); err != nil {
					t.Logf("Node %s unhealthy: %v", addr, err)
					time.Sleep(1 * time.Second)
					continue
				}
				t.Logf("Node %s became healthy", addr)
				return
			}
			t.Errorf("Node did not become healthy within 10s")
		}(cmd, tempdir, addr)
	}

	// Connect and send the PANIC message.
	session, err := robustsession.Create(strings.Join(l.Servers(), ","), filepath.Join(tempdir, "cert.pem"))
	if err != nil {
		t.Fatalf("Could not create robustsession: %v", err)
	}
	foundjoin := make(chan bool)
	go func() {
		for msg := range session.Messages {
			if !strings.HasPrefix(msg, ":mod!1@") ||
				!strings.HasSuffix(msg, " JOIN :#mod") {
				continue
			}
			// Non-blocking send: a JOIN that arrives while nobody is
			// waiting for it yet is reported as an error.
			select {
			case foundjoin <- true:
			default:
				t.Errorf("Found JOIN too early (channel write blocks)")
			}
		}
	}()
	go func() {
		for err := range session.Errors {
			t.Errorf("RobustSession error: %v", err)
		}
	}()

	session.PostMessage("NICK mod")
	session.PostMessage("USER 1 2 3 4")
	session.PostMessage("PANIC")

	t.Logf("Message of death sent")

	// Wait until every node has been restarted and checked.
	wg.Wait()

	healthy := false
	for try := 0; try < 5; try++ {
		if l.Healthy() {
			healthy = true
			break
		}
		time.Sleep(1 * time.Second)
	}

	if !healthy {
		t.Fatalf("Expected recovery, but not all nodes are healthy")
	}

	// Verify sending a JOIN now results in an output message.
	session.PostMessage("JOIN #mod")
	select {
	case <-foundjoin:
		t.Logf("JOIN reply received, network progressing")
	case <-time.After(5 * time.Second):
		t.Errorf("Timeout waiting for JOIN message")
	}
}