Example #1
0
// CDF returns the cumulative probability of the U distribution d at U.
//
// It returns 0 when U < 0 and 1 when U >= N1*N2. If d.hasTies()
// reports true, the result is the makeUmemo count keyed by int(2*U),
// divided by Choose(N1+N2, N1); CDF panics if that entry is missing
// from the table. Otherwise U is floored to an integer and the values
// returned by d.p are summed over the smaller tail.
func (d UDist) CDF(U float64) float64 {
	maxU := d.N1 * d.N2
	switch {
	case U < 0:
		return 0
	case U >= float64(maxU):
		return 1
	}

	if d.hasTies() {
		// TODO: Minimize U?
		twoU := int(2 * U)
		memo := makeUmemo(twoU, d.N1, d.T)
		count, found := memo[len(d.T)][ukey{d.N1, twoU}]
		if !found {
			panic("makeUmemo did not return expected memoization table")
		}
		return count / mathx.Choose(d.N1+d.N2, d.N1)
	}

	// Without ties the fast algorithm applies; it works on integral
	// U, so take the floor.
	u := int(math.Floor(U))

	// The distribution is symmetric around N1*N2/2, so when u falls
	// in the upper half, accumulate the mirrored lower tail instead
	// and take the complement at the end.
	mirrored := u >= (maxU+1)/2
	if mirrored {
		u = maxU - u - 1
	}

	var tail float64
	for _, mass := range d.p(u)[:u+1] {
		tail += mass
	}
	if mirrored {
		return 1 - tail
	}
	return tail
}
Example #2
0
// PMF returns the probability mass of the U distribution d at U.
//
// It returns 0 when U < 0 or U >= N1*N2 + 0.5. If d.hasTies() reports
// true, the result is the difference between the makeUmemo counts
// keyed by int(2*U) and int(2*U)-1, divided by Choose(N1+N2, N1); PMF
// panics if either entry is missing from its table. Otherwise U is
// floored to an integer and looked up in the values returned by d.p.
func (d UDist) PMF(U float64) float64 {
	if U < 0 {
		return 0
	}
	if limit := 0.5 + float64(d.N1*d.N2); U >= limit {
		return 0
	}

	if !d.hasTies() {
		// Without ties the fast algorithm applies; it works on
		// integral U, so take the floor.
		//
		// TODO: Use symmetry to minimize U
		u := int(math.Floor(U))
		return d.p(u)[u]
	}

	// makeUmemo yields cumulative counts, so the mass at U is the
	// count at 2U minus the count one step below it.
	twoU := int(2 * U)
	ranks := len(d.T)
	below, okBelow := makeUmemo(twoU-1, d.N1, d.T)[ranks][ukey{d.N1, twoU - 1}]
	at, okAt := makeUmemo(twoU, d.N1, d.T)[ranks][ukey{d.N1, twoU}]
	if !okBelow || !okAt {
		panic("makeUmemo did not return expected memoization table")
	}
	return (at - below) / mathx.Choose(d.N1+d.N2, d.N1)
}
Example #3
0
// This computes the cumulative counts of the Mann-Whitney U
// distribution in the presence of ties using the computation from
// Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
// 805-813, with much guidance from appendix L of Klotz, A
// Computational Approach to Statistics.
//
// makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the
// number of ranks (up to len(t)), n1 is the size of the first sample
// (up to the n1 argument), and U is the U statistic (up to the
// argument twoU/2). The value of an entry in the memo table is the
// number of permutations of a sample of size n1 in a ranking with tie
// vector t[:K] having a U statistic <= U.
//
// The returned slice has len(t)+1 elements and memo[len(t)] always
// contains the key ukey{n1, twoU}. Entries for twoU values below the
// smallest attainable 2*U (including negative twoU) have a count of 0.
//
// makeUmemo panics if len(t) < 2.
func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 {
	// Another candidate for a fast implementation is van de Wiel,
	// "The split-up algorithm: a fast symbolic method for
	// computing p-values of distribution-free statistics". This
	// is what's used by R's coin package. It's a comparatively
	// recent publication, so it's presumably faster (or perhaps
	// just more general) than previous techniques, but I can't
	// get my hands on the paper.
	//
	// TODO: ~40% of this function's time is spent in mapassign on
	// the assignment lines in the two loops and another ~20% in
	// map access and iteration. Improving map behavior or
	// replacing the maps altogether with some other constant-time
	// structure could double performance.
	//
	// TODO: The worst case for this function is when there are
	// few ties. Yet the best case overall is when there are *no*
	// ties. Can we get the best of both worlds? Use the fast
	// algorithm for the most part when there are few ties and mix
	// in the general algorithm just where we need it? That's
	// certainly possible for sub-problems where t[:k] has no
	// ties, but that doesn't help if t[0] has a tie but nothing
	// else does. Is it possible to rearrange the ranks without
	// messing up our computation of the U statistic for
	// sub-problems?

	K := len(t)

	// The K==2 base case below is the bottom of the recurrence, so
	// fewer than two ranks cannot be handled. Check this before
	// touching t so that an empty t reports this condition rather
	// than an index-out-of-range panic.
	if K < 2 {
		panic("K < 2")
	}

	// Compute a coefficients. The a slice is indexed by k (a[0]
	// is unused).
	a := make([]int, K+1)
	a[1] = t[0]
	for k := 2; k <= K; k++ {
		a[k] = a[k-1] + t[k-2] + t[k-1]
	}

	// Create the memo table for the counts function, A. The A
	// slice is indexed by k (A[0] is unused).
	//
	// In "The Mann Whitney Distribution Using Linked Lists", they
	// use linked lists (*gasp*) for this, but within each K it's
	// really just a memoization table, so it's faster to use a
	// map. The outer structure is a slice indexed by k because we
	// need to find all memo entries with certain values of k.
	//
	// TODO: The n1 and twoU values in the ukeys follow strict
	// patterns. For each K value, the n1 values are every integer
	// between two bounds. For each (K, n1) value, the twoU values
	// are every integer multiple of a certain base between two
	// bounds. It might be worth turning these into directly
	// indexible slices.
	A := make([]map[ukey]float64, K+1)
	A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0}

	// Compute memo table (k, n1, twoU) triples from high K values
	// to low K values. This drives the recurrence relation
	// downward to figure out all of the needed argument triples.
	//
	// TODO: Is it possible to generate this table bottom-up? If
	// so, this could be a pure dynamic programming algorithm and
	// we could discard the K dimension. We could at least store
	// the inputs in a more compact representation that replaces
	// the twoU dimension with an interval and a step size (as
	// suggested by Cheung, Klotz, not that they make it at all
	// clear *why* they're suggesting this).
	tsum := sumint(t) // always ∑ t[0:k]
	for k := K - 1; k >= 2; k-- {
		tsum -= t[k]
		A[k] = make(map[ukey]float64)

		// Construct A[k] from A[k+1].
		for A_kplus1 := range A[k+1] {
			rkLow := maxint(0, A_kplus1.n1-tsum)
			rkHigh := minint(A_kplus1.n1, t[k])
			for rk := rkLow; rk <= rkHigh; rk++ {
				twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk)
				n1_k := A_kplus1.n1 - rk
				if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) {
					key := ukey{n1: n1_k, twoU: twoU_k}
					A[k][key] = 0
				}
			}
		}
	}

	// Fill counts in memo table from low K values to high K
	// values. This unwinds the recurrence relation.

	// Start with K==2 base case.
	//
	// TODO: Later computations depend on these, but these don't
	// depend on anything (including each other), so if K==2, we
	// can skip the memo table altogether.
	N_2 := t[0] + t[1]
	for A_2i := range A[2] {
		Asum := 0.0
		r2Low := maxint(0, A_2i.n1-t[0])

		// Taking r2 elements from rank 2 gives
		// 2U = n1*(t[0]-n1) + r2*N_2, so r2 is admissible only
		// while r2*N_2 <= slack. Go's integer division truncates
		// toward zero, which would turn a slack in (-N_2, 0) into
		// an upper bound of 0 and wrongly admit r2 == 0. This
		// happens when K == 2, where the requested key is not
		// pre-filtered by twoUmin. A negative slack admits no r2
		// at all.
		slack := A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)
		r2High := -1
		if slack >= 0 {
			r2High = slack / N_2
		}
		for r2 := r2Low; r2 <= r2High; r2++ {
			Asum += mathx.Choose(t[0], A_2i.n1-r2) *
				mathx.Choose(t[1], r2)
		}
		A[2][A_2i] = Asum
	}

	// Derive counts for the rest of the memo table.
	tsum = t[0] // always ∑ t[0:k-1]
	for k := 3; k <= K; k++ {
		tsum += t[k-2]

		// Compute A[k] counts from A[k-1] counts.
		for A_ki := range A[k] {
			Asum := 0.0
			rkLow := maxint(0, A_ki.n1-tsum)
			rkHigh := minint(A_ki.n1, t[k-1])
			for rk := rkLow; rk <= rkHigh; rk++ {
				twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk)
				n1_kminus1 := A_ki.n1 - rk
				x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}]
				// A missing entry was filtered out above for
				// being out of range. Above the maximum, every
				// permutation qualifies; below the minimum,
				// none do and x stays 0.
				if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 {
					x = mathx.Choose(tsum, n1_kminus1)
				}
				Asum += x * mathx.Choose(t[k-1], rk)
			}
			A[k][A_ki] = Asum
		}
	}

	return A
}
Example #4
0
// udistRef returns reference PMF and CDF tables of the U distribution
// for two samples of sizes n1 and sum(t)-n1 under tie vector t. Both
// returned slices are indexed by 2*U.
//
// It follows the "graphical method" of Klotz (1966): every candidate
// u vector is visited, which is very slow (Θ(∏ (t[i]+1)) = Ω(2^|t|))
// but very correct, making it suitable as a baseline for checking
// faster implementations.
func udistRef(n1 int, t []int) (pmf, cdf []float64) {
	// tally[2*U] accumulates the number of permutations of the two
	// samples (sizes n1 and sum(t)-n1, tie vector t) that produce
	// that U statistic.
	tally := make([]int, 1+2*n1*(sumint(t)-n1))

	// u is stepped like a mixed-radix counter whose digit i runs
	// over 0..t[i]. Starting digit 0 at -1 makes the first advance
	// produce the all-zero vector.
	u := make([]int, len(t))
	u[0] = -1

	// advance moves u to the next vector and reports false once
	// the last digit overflows, i.e. all vectors have been seen.
	advance := func() bool {
		u[0]++
		for i := 0; i < len(u) && u[i] > t[i]; i++ {
			if i+1 == len(u) {
				return false
			}
			// Carry into the next digit.
			u[i] = 0
			u[i+1]++
		}
		return true
	}

	for advance() {
		// Only vectors that pick exactly n1 elements are legal.
		// Klotz (1966) can enumerate those directly, but the
		// goal here is correctness rather than speed.
		if sumint(u) != n1 {
			continue
		}

		// In a single pass over the ranks, accumulate 2*U
		// (U += vsum*u_i + u_i*v_i/2 per rank) and the product
		// Π choose(t_i, u_i), the number of ways of permuting
		// the input sample under u.
		twoU, vsum, ways := 0, 0, 1
		for i, ui := range u {
			vi := t[i] - ui
			twoU += ui * (2*vsum + vi)
			vsum += vi
			ways *= int(mathx.Choose(t[i], ui) + 0.5)
		}
		tally[twoU] += ways

		if false {
			// Debug aid: print a row in the style of Klotz's
			// "direct enumeration" example, converting
			// 2U = 2UQV' to the UQt' used there.
			UQt := float64(twoU)/2 + float64(n1*n1)/2
			fmt.Printf("%+v %f %-2d\n", u, UQt, ways)
		}
	}

	// Normalize the tallies into probabilities and build the
	// running sum alongside.
	pmf = make([]float64, len(tally))
	cdf = make([]float64, len(tally))
	denom := float64(int(mathx.Choose(sumint(t), n1) + 0.5))
	running := 0.0
	for i, c := range tally {
		pmf[i] = float64(c) / denom
		running += pmf[i]
		cdf[i] = running
	}
	return pmf, cdf
}