Example #1
0
// SpofMonkey checks a single node per call for being a single point of
// failure: the node is netsplit away from the rest of the cluster, and the
// call then blocks until the cluster is observed to keep making progress
// without it, after which the split is undone.
//
// The visiting order is a permutation taken from rng, (re)generated whenever
// the stored order's length does not match the node count. It returns false
// without touching the cluster once spofIndex has reached the node count, and
// true after a node has been tested. The intensity argument is not consulted
// by this monkey.
func (d *Director) SpofMonkey(rng *rand.Rand, intensity float64) bool {
	if spofIndex >= state.NodeCount() {
		// No untested nodes remain.
		return false
	}
	if len(spofOrder) != state.NodeCount() {
		// The order has to be built lazily here instead of in an init(),
		// because parsing of the flag arguments is deferred until later.
		spofOrder = rng.Perm(state.NodeCount())
	}

	victim := spofOrder[spofIndex]
	log.Printf("[monkey] Testing if %v is a single point of failure", d.agents[victim])

	perimeter := d.net.FindPerimeter([]uint{uint(victim)})
	spofIndex++

	// Isolate the victim: freeze its agent, then take down every link on
	// the perimeter around it.
	d.agents[victim].Freeze()
	for _, link := range perimeter {
		link.GoodbyeForever()
	}

	// Progress means the cluster can service a request that none of its
	// members has seen before. Record the ID of the most recently generated
	// request and wait for a handled request with a larger ID: anything
	// generated after that point was necessarily generated after the
	// isolation above took effect.
	newest := state.LastGeneratedRequest()
	for {
		if seen := <-state.GotRequest(); seen > newest {
			break
		}
	}

	log.Printf("[monkey] %v is (probably) not a single point of failure!", d.agents[victim])

	// Undo the isolation in the same order as the original teardown:
	// links first, then the agent.
	for _, link := range perimeter {
		link.WhyHelloThere()
	}
	d.agents[victim].Thaw()

	return true
}
// resultHandler receives results from h.result one at a time, checks the
// sequence number parsed out of each response, and hands the accepted
// results to h.processPending. It returns as soon as a result is rejected
// (after reporting the reason through h.lose / h.losef); otherwise it keeps
// receiving.
//
// Rules for sequence numbers:
//
//   - A number below h.nextSequenceNumber has already been processed and is
//     rejected.
//   - A number that is already waiting in the pending set is rejected.
//   - Gaps are temporarily OK, but not in the long run: a number above
//     h.nextSequenceNumber is logged and kept pending.
func (h *Harness) resultHandler() {
	const staleFormat = `[%d] Received an already-processed sequence number from %v in response to %s

Output: %s`
	const duplicateFormat = `[%d] Received a still-pending sequence number from %v in response to %s

Output: %s

This sequence number was originally received in response to %s

Original output: %s`

	// Accepted results, keyed by sequence number.
	pending := make(map[int]*result)
	for {
		res := <-h.result

		seq, body, err := h.parseResponse(res.resp)
		if err != nil {
			h.lose(err.Error())
			return
		}
		res.body = body

		// The cases are ordered by priority: an already-processed number
		// is reported in preference to a duplicate of a pending one.
		earlier, duplicate := pending[seq]
		switch {
		case seq < h.nextSequenceNumber:
			h.losef(staleFormat, seq, res.node, res.query, util.FmtOutput(res.resp))
			return
		case duplicate:
			h.losef(duplicateFormat, seq, res.node, res.query, util.FmtOutput(res.resp), earlier.query, util.FmtOutput(earlier.resp))
			return
		case seq > h.nextSequenceNumber:
			log.Printf("[%d] Result from %v waiting on sequence number %d", seq, res.node, h.nextSequenceNumber)
		}

		// Tell the SPOF monkey that a valid request came through. The send
		// is non-blocking, so the notification is simply dropped when
		// nobody is ready to take it.
		select {
		case state.GotRequest() <- res.reqid:
		default:
		}

		pending[seq] = res
		h.processPending(pending)
	}
}