func (d UDist) CDF(U float64) float64 { if U < 0 { return 0 } else if U >= float64(d.N1*d.N2) { return 1 } if d.hasTies() { // TODO: Minimize U? p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] if !ok { panic("makeUmemo did not return expected memoization table") } return p / mathx.Choose(d.N1+d.N2, d.N1) } // There are no ties. Use the fast algorithm. U must be integral. Ui := int(math.Floor(U)) // The distribution is symmetric around U = m * n / 2. Sum up // whichever tail is smaller. flip := Ui >= (d.N1*d.N2+1)/2 if flip { Ui = d.N1*d.N2 - Ui - 1 } pdfs := d.p(Ui) p := 0.0 for _, pdf := range pdfs[:Ui+1] { p += pdf } if flip { p = 1 - p } return p }
func (d UDist) PMF(U float64) float64 { if U < 0 || U >= 0.5+float64(d.N1*d.N2) { return 0 } if d.hasTies() { // makeUmemo computes the CDF directly. Take its // difference to get the PMF. p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}] p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] if !ok1 || !ok2 { panic("makeUmemo did not return expected memoization table") } return (p2 - p1) / mathx.Choose(d.N1+d.N2, d.N1) } // There are no ties. Use the fast algorithm. U must be integral. Ui := int(math.Floor(U)) // TODO: Use symmetry to minimize U return d.p(Ui)[Ui] }
// This computes the cumulative counts of the Mann-Whitney U // distribution in the presence of ties using the computation from // Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney // Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: // 805-813, with much guidance from appendix L of Klotz, A // Computational Approach to Statistics. // // makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the // number of ranks (up to len(t)), n1 is the size of the first sample // (up to the n1 argument), and U is the U statistic (up to the // argument twoU/2). The value of an entry in the memo table is the // number of permutations of a sample of size n1 in a ranking with tie // vector t[:K] having a U statistic <= U. func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 { // Another candidate for a fast implementation is van de Wiel, // "The split-up algorithm: a fast symbolic method for // computing p-values of distribution-free statistics". This // is what's used by R's coin package. It's a comparatively // recent publication, so it's presumably faster (or perhaps // just more general) than previous techniques, but I can't // get my hands on the paper. // // TODO: ~40% of this function's time is spent in mapassign on // the assignment lines in the two loops and another ~20% in // map access and iteration. Improving map behavior or // replacing the maps altogether with some other constant-time // structure could double performance. // // TODO: The worst case for this function is when there are // few ties. Yet the best case overall is when there are *no* // ties. Can we get the best of both worlds? Use the fast // algorithm for the most part when there are few ties and mix // in the general algorithm just where we need it? That's // certainly possible for sub-problems where t[:k] has no // ties, but that doesn't help if t[0] has a tie but nothing // else does. Is it possible to rearrange the ranks without // messing up our computation of the U statistic for // sub-problems? K := len(t) // Compute a coefficients. The a slice is indexed by k (a[0] // is unused). a := make([]int, K+1) a[1] = t[0] for k := 2; k <= K; k++ { a[k] = a[k-1] + t[k-2] + t[k-1] } // Create the memo table for the counts function, A. The A // slice is indexed by k (A[0] is unused). // // In "The Mann Whitney Distribution Using Linked Lists", they // use linked lists (*gasp*) for this, but within each K it's // really just a memoization table, so it's faster to use a // map. The outer structure is a slice indexed by k because we // need to find all memo entries with certain values of k. // // TODO: The n1 and twoU values in the ukeys follow strict // patterns. For each K value, the n1 values are every integer // between two bounds. For each (K, n1) value, the twoU values // are every integer multiple of a certain base between two // bounds. It might be worth turning these into directly // indexible slices. A := make([]map[ukey]float64, K+1) A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0} // Compute memo table (k, n1, twoU) triples from high K values // to low K values. This drives the recurrence relation // downward to figure out all of the needed argument triples. // // TODO: Is it possible to generate this table bottom-up? If // so, this could be a pure dynamic programming algorithm and // we could discard the K dimension. We could at least store // the inputs in a more compact representation that replaces // the twoU dimension with an interval and a step size (as // suggested by Cheung, Klotz, not that they make it at all // clear *why* they're suggesting this). tsum := sumint(t) // always ∑ t[0:k] for k := K - 1; k >= 2; k-- { tsum -= t[k] A[k] = make(map[ukey]float64) // Construct A[k] from A[k+1]. for A_kplus1 := range A[k+1] { rkLow := maxint(0, A_kplus1.n1-tsum) rkHigh := minint(A_kplus1.n1, t[k]) for rk := rkLow; rk <= rkHigh; rk++ { twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk) n1_k := A_kplus1.n1 - rk if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) { key := ukey{n1: n1_k, twoU: twoU_k} A[k][key] = 0 } } } } // Fill counts in memo table from low K values to high K // values. This unwinds the recurrence relation. // Start with K==2 base case. // // TODO: Later computations depend on these, but these don't // depend on anything (including each other), so if K==2, we // can skip the memo table altogether. if K < 2 { panic("K < 2") } N_2 := t[0] + t[1] for A_2i := range A[2] { Asum := 0.0 r2Low := maxint(0, A_2i.n1-t[0]) r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2 for r2 := r2Low; r2 <= r2High; r2++ { Asum += mathx.Choose(t[0], A_2i.n1-r2) * mathx.Choose(t[1], r2) } A[2][A_2i] = Asum } // Derive counts for the rest of the memo table. tsum = t[0] // always ∑ t[0:k-1] for k := 3; k <= K; k++ { tsum += t[k-2] // Compute A[k] counts from A[k-1] counts. for A_ki := range A[k] { Asum := 0.0 rkLow := maxint(0, A_ki.n1-tsum) rkHigh := minint(A_ki.n1, t[k-1]) for rk := rkLow; rk <= rkHigh; rk++ { twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk) n1_kminus1 := A_ki.n1 - rk x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}] if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 { x = mathx.Choose(tsum, n1_kminus1) } Asum += x * mathx.Choose(t[k-1], rk) } A[k][A_ki] = Asum } } return A }
// udistRef computes the PMF and CDF of the U distribution for two // samples of sizes n1 and sum(t)-n1 with tie vector t. The returned // pmf and cdf are indexed by 2*U. // // This uses the "graphical method" of Klotz (1966). It is very slow // (Θ(∏ (t[i]+1)) = Ω(2^|t|)), but very correct, and hence useful as a // reference for testing faster implementations. func udistRef(n1 int, t []int) (pmf, cdf []float64) { // Enumerate all u vectors for which 0 <= u_i <= t_i. Count // the number of permutations of two samples of sizes n1 and // sum(t)-n1 with tie vector t and accumulate these counts by // their U statistics in count[2*U]. counts := make([]int, 1+2*n1*(sumint(t)-n1)) u := make([]int, len(t)) u[0] = -1 // Get enumeration started. enumu: for { // Compute the next u vector. u[0]++ for i := 0; i < len(u) && u[i] > t[i]; i++ { if i == len(u)-1 { // All u vectors have been enumerated. break enumu } // Carry. u[i+1]++ u[i] = 0 } // Is this a legal u vector? if sumint(u) != n1 { // Klotz (1966) has a method for directly // enumerating legal u vectors, but the point // of this is to be correct, not fast. continue } // Compute 2*U statistic for this u vector. twoU, vsum := 0, 0 for i, u_i := range u { v_i := t[i] - u_i // U = U + vsum*u_i + u_i*v_i/2 twoU += 2*vsum*u_i + u_i*v_i vsum += v_i } // Compute Π choose(t_i, u_i). This is the number of // ways of permuting the input sample under u. prod := 1 for i, u_i := range u { prod *= int(mathx.Choose(t[i], u_i) + 0.5) } // Accumulate the permutations on this u path. counts[twoU] += prod if false { // Print a table in the form of Klotz's // "direct enumeration" example. // // Convert 2U = 2UQV' to UQt' used in Klotz // examples. UQt := float64(twoU)/2 + float64(n1*n1)/2 fmt.Printf("%+v %f %-2d\n", u, UQt, prod) } } // Convert counts into probabilities for PMF and CDF. pmf = make([]float64, len(counts)) cdf = make([]float64, len(counts)) total := int(mathx.Choose(sumint(t), n1) + 0.5) for i, count := range counts { pmf[i] = float64(count) / float64(total) if i > 0 { cdf[i] = cdf[i-1] } cdf[i] += pmf[i] } return }