### License: GPL v.3+
from __future__ import division
from numpy import arange, ones, array
from scipy.misc import comb
def _mww_cache(m, n):
# The caching strategy is adapted from R. In order to calculate
# p(m, n, U), all p(m, n, u) with u__ u).
if m > n:
# Invert m and n, since this is what the recursive algorithm does
m, n = n, m
cache = array([[None] * (n+1)] * (m+1))
# (indices 0 are wasted... who cares)
return cache
def mww_u_pdf(m, n, bigu):
"""
Mann-Whitney-Wilcoxon test: probability of obtaining a given U in comparing
samples of n and m elements, respectively, from the same distribution.
The @cache argument must be a dictionary, and can help speed up subsequent
calls of this function (even for different values of m, n, bigu).
"""
return _mww_u_pdf(m, n, bigu) / comb(m+n, n)
def _mww_u_pdf(m, n, bigu, cache=None):
if m > n:
# Invert m and n: same result, and better exploits the cache
m, n = n, m
max_u = m*n
if bigu > max_u / 2:
bigu = max_u - bigu
if bigu < 0:
return 0
if n == 0 or m == 0:
if bigu == 0:
return 1
else:
return 0
if cache == None:
# This is the original (non-recursive) call: initialize the cache.
cache = _mww_cache(m, n)
if cache[m][n] is None:
cache[m][n] = -ones(max_u/2+1)
elif cache[m][n][bigu] != -1:
return cache[m][n][bigu]
# Mann & Whitney (1947), Eq. (1):
p1 = _mww_u_pdf(m-1, n , bigu-n, cache)
p2 = _mww_u_pdf(m , n-1 , bigu , cache)
# (...except that I only sum recursively, and then divide in the wrappers)
res = p1 + p2
cache[m][n][bigu] = res
return res
def mww_u_cdf(m, n, bigu):
"""
Mann-Whitney-Wilcoxon test: probability of obtaining a U not larger than
bigu in comparing samples of n and m elements, respectively, from the same
distribution.
The @cache argument must be a dictionary, and can help speed up subsequent
calls of this function (even for different values of m, n, bigu).
"""
if bigu > m*n / 2:
left_tail = 0
# The distribution is symmetric, calculate the cheapest version:
bigu = m*n - bigu - 1
else:
left_tail = 1
cache = _mww_cache(m, n)
# Calculate backwards in order to initialize the cache as large as needed:
res = sum(_mww_u_pdf(m, n, u, cache)
for u in range(bigu + left_tail)[::-1]) / comb(m+n, n)
if not left_tail:
res = 1 - res
return res
__