// SpofMonkey detects single points of failure by netsplitting one node at a // time away from the rest of the cluster and ensuring that the cluster // continues to make progress. func (d *Director) SpofMonkey(rng *rand.Rand, intensity float64) bool { if spofIndex >= state.NodeCount() { return false } else if len(spofOrder) != state.NodeCount() { // Unfortunately we can't do this in an init() because we defer // the parsing of flag arguments until later. spofOrder = rng.Perm(state.NodeCount()) } i := spofOrder[spofIndex] log.Printf("[monkey] Testing if %v is a single point of failure", d.agents[i]) netsplit := d.net.FindPerimeter([]uint{uint(i)}) spofIndex++ d.agents[i].Freeze() for _, target := range netsplit { target.GoodbyeForever() } // We need to make sure that the cluster is capable of servicing // requests that no member of the cluster has ever seen before in order // to ensure that the cluster is making progress. To do this, we // determine the request ID of the last request that has been created, // and make sure that we see a request that was created *after* that // (any requests generated after targetRequestId were necessarily // generated after the actions above). targetRequestId := state.LastGeneratedRequest() for <-state.GotRequest() <= targetRequestId { } log.Printf("[monkey] %v is (probably) not a single point of failure!", d.agents[i]) for _, target := range netsplit { target.WhyHelloThere() } d.agents[i].Thaw() return true }
// Rules for sequence numbers: // // - Gaps are temporarily OK, but not in the long run. func (h *Harness) resultHandler() { results := make(map[int]*result) for { result := <-h.result sequenceNumber, body, err := h.parseResponse(result.resp) if err != nil { h.lose(err.Error()) return } result.body = body if sequenceNumber < h.nextSequenceNumber { h.losef(`[%d] Received an already-processed sequence number from %v in response to %s Output: %s`, sequenceNumber, result.node, result.query, util.FmtOutput(result.resp)) return } if old, ok := results[sequenceNumber]; ok { h.losef(`[%d] Received a still-pending sequence number from %v in response to %s Output: %s This sequence number was originally received in response to %s Original output: %s`, sequenceNumber, result.node, result.query, util.FmtOutput(result.resp), old.query, util.FmtOutput(old.resp)) return } if sequenceNumber > h.nextSequenceNumber { log.Printf("[%d] Result from %v waiting on sequence number %d", sequenceNumber, result.node, h.nextSequenceNumber) } // Notify SPOF monkey that we got a valid request select { case state.GotRequest() <- result.reqid: default: } results[sequenceNumber] = result h.processPending(results) } }