"""
06 Aug 2013
"""
from bisect import bisect_left
from itertools import combinations
from warnings import warn
import numpy as np
def mad(arr):
""" Median Absolute Deviation: a "Robust" version of standard deviation.
    Indicates the variability of the sample.
https://en.wikipedia.org/wiki/Median_absolute_deviation
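
    Illustrative doctest (arbitrary sample values):

    >>> float(mad([1, 1, 2, 2, 4, 6, 9]))
    1.0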
"""
if not isinstance(arr, np.ndarray):
arr = np.array(arr)
    arr = np.ma.array(arr).compressed()  # drop masked values (TODO: avoiding masked arrays should be faster)
med = np.median(arr)
return np.median(np.abs(arr - med))
def right_double_mad(arr):
""" Double Median Absolute Deviation: a 'Robust' version of standard deviation.
Indices variability of the sample.
http://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers
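
    Illustrative doctest (arbitrary sample values; only deviations above the
    median contribute):

    >>> float(right_double_mad([1, 1, 2, 2, 4, 6, 9]))
    4.0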
"""
if not isinstance(arr, np.ndarray):
arr = np.array(arr)
    arr = np.ma.array(arr).compressed()  # drop masked values (TODO: avoiding masked arrays should be faster)
med = np.median(arr)
right_mad = np.median(np.abs(arr[arr > med] - med))
return right_mad
def newton_raphson(guess, contour, sq_length, jmax=2000, xacc=1e-12):
"""
Newton-Raphson method as defined in:
http://www.maths.tcd.ie/~ryan/TeachingArchive/161/teaching/newton-raphson.c.html
used to search for the persistence length of a given model.
:param guess: starting value
    :param contour: contour length of the model
:param sq_length: square of the distance between first and last particle
    :param 2000 jmax: maximum number of iterations
:param 1e-12 xacc: precision of the optimization
:returns: persistence length
"""
    for _ in range(jmax):
        contour_guess = contour / guess
        expcx = np.exp(-contour_guess) - 1
        # function whose root we are looking for
        fx = 2 * guess**2 * (contour_guess + expcx) - sq_length
        # its derivative with respect to guess
        df = 2 * contour * expcx + 4 * guess * (expcx + contour_guess)
        dx = -fx / df
        if abs(dx) < xacc:
            return guess
        guess += dx
    raise Exception("ERROR: exceeded maximum number of iterations, no root found\n")
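
# Illustrative usage of newton_raphson (hypothetical numbers): persistence
# length of a model with contour length 100 whose squared start-to-end
# distance is 60**2, starting from an initial guess of 10:
#     newton_raphson(10., 100., 60.**2)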
class Interpolate(object):
"""
Simple linear interpolation, to be used when the one from scipy is not
available.
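
    Illustrative doctest (arbitrary points):

    >>> interp = Interpolate([0., 1., 2.], [0., 10., 20.])
    >>> interp(0.5)
    5.0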
"""
def __init__(self, x_list, y_list):
        # consecutive x values must be strictly increasing; a duplicate at the
        # very end is tolerated and dropped together with its y value
        for i, (x1, x2) in enumerate(zip(x_list, x_list[1:])):
            if x2 - x1 < 0:
                raise ValueError("x must be in strictly ascending order")
            if x2 - x1 == 0 and i >= len(x_list) - 2:
                x_list = x_list[:-1]
                y_list = y_list[:-1]
        if any(x2 - x1 <= 0 for x1, x2 in zip(x_list, x_list[1:])):
            raise ValueError("x must be in strictly ascending order")
x_list = self.x_list = list(map(float, x_list))
y_list = self.y_list = list(map(float, y_list))
intervals = list(zip(x_list, x_list[1:], y_list, y_list[1:]))
self.slopes = [(y2 - y1)/(x2 - x1) for x1, x2, y1, y2 in intervals]
def __call__(self, x):
i = bisect_left(self.x_list, x) - 1
return self.y_list[i] + self.slopes[i] * (x - self.x_list[i])
def transform(val):
    """ Log10 transform; divide-by-zero warnings are silenced, so a null
    value yields -inf. """
    with np.errstate(divide='ignore'):
        return np.log10(val)
def nozero_log(values):
    """ Log10-transform the values of a dict in place; null values get the
    log10 of half the smallest non-null value. """
# Set the virtual minimum of the matrix to half the non-null real minimum
minv = float(min([v for v in list(values.values()) if v])) / 2
# if minv > 1:
# warn('WARNING: probable problem with normalization, check.\n')
# minv /= 2 # TODO: something better
logminv = transform(minv)
    for i in values:
        values[i] = transform(values[i]) if values[i] else logminv
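
# Illustrative use of nozero_log (hypothetical values): null entries are
# mapped to the log10 of half the smallest non-null value instead of -inf:
#     vals = {'a': 0, 'b': 1, 'c': 100}
#     nozero_log(vals)   # vals -> {'a': log10(0.5), 'b': 0.0, 'c': 2.0}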
def nozero_log_list(values):
    """ Return the log10 of each value; null values are replaced by the
    log10 of half the smallest non-null value. """
    # set the virtual minimum of the matrix to half the non-null real minimum
    if np.isfinite(transform(0)):
        minv = 0.
    else:
        try:
            minv = float(min(v for v in values if v)) / 2
        except ValueError:  # all values are null
            minv = 1
# if minv > 1:
# warn('WARNING: probable problem with normalization, check.\n')
# minv /= 2 # TODO: something better
logminv = transform(minv)
return [transform(v) if v else logminv for v in values]
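
# Illustrative use of nozero_log_list (hypothetical values):
#     nozero_log_list([0, 1, 10, 100])   # -> [log10(0.5), 0.0, 1.0, 2.0]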
def nozero_log_matrix(values, transformation):
    """ Return the transformed matrix; null values are replaced by the
    transformation of half the smallest non-null value. """
    # set the virtual minimum of the matrix to half the non-null real minimum
    with np.errstate(divide='ignore'):
        if np.isfinite(transformation(0)):
            minv = 0.
        else:
            try:
                minv = float(min(v for l in values for v in l
                                 if v and not np.isnan(v))) / 2
            except ValueError:  # all values are null
                minv = 1
logminv = transformation(minv)
return [[transformation(v) if v else logminv for v in l] for l in values]
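
# Illustrative use of nozero_log_matrix (hypothetical values):
#     nozero_log_matrix([[0, 10], [100, 0]], np.log10)
#     # -> [[log10(5.), 1.0], [2.0, log10(5.)]]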
def zscore(values):
"""
    Calculates the log10 and Z-score of each value of a given dict, in place.
.. note::

       (sketch: distribution of the log-transformed values along the "score"
       axis)

"""
    # get the log-transformed values
nozero_log(values)
mean_v = np.mean(list(values.values()))
    std_v = np.std(list(values.values()))
# replace values by z-score
for i in values:
values[i] = (values[i] - mean_v) / std_v
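
# Illustrative use of zscore (hypothetical values): the dict is modified in
# place, each value becoming the Z-score of its log10 transform:
#     vals = {('m1', 'm2'): 1., ('m1', 'm3'): 10., ('m2', 'm3'): 100.}
#     zscore(vals)   # each value is now (log10(v) - mean) / std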
def calinski_harabasz(scores, clusters):
"""
    Implementation of the CH score [CalinskiHarabasz1974]_, which has been
    shown to be one of the most accurate ways to compare clustering methods
    [MilliganCooper1985]_ [Tibshirani2001]_.
The CH score is:
.. math::
CH(k) = \\frac{B(k) / (k-1)}{W(k)/(n - k)}
Where :math:`B(k)` and :math:`W(k)` are the between and within cluster sums
of squares, with :math:`k` clusters, and :math:`n` the total number of
elements (models in this case).
:param scores: a dict with, as keys, a tuple with a pair of models; and, as
value, the distance between these models.
:param clusters: a dict with, as key, the cluster number, and as value a
list of models
:returns: the CH score
"""
cluster_list = [c for c in clusters if len(clusters[c]) > 1]
if len(cluster_list) <= 1:
return 0.0
nmodels = sum([len(clusters[c]) for c in cluster_list])
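    # between-cluster term B(k): normalized sum, over cluster pairs, of the
    # mean squared inter-cluster distances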
between_cluster = (sum([sum([sum([scores[(md1, md2)]**2
for md1 in clusters[cl1]])
for md2 in clusters[cl2]])
/ (len(clusters[cl1]) * len(clusters[cl2]))
for cl1, cl2 in combinations(cluster_list, 2)])
/ ((len(cluster_list) - 1.0) / 2))
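    # within-cluster term W(k): sum, over clusters, of the mean squared
    # intra-cluster distances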
within_cluster = (sum([sum([scores[(md1, md2)]**2
for md1, md2 in combinations(clusters[cls], 2)])
/ (len(clusters[cls]) * (len(clusters[cls]) - 1.0) / 2)
for cls in cluster_list]))
return ((between_cluster / (len(cluster_list) - 1))
/
(within_cluster / (nmodels - len(cluster_list))))
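
# Illustrative use of calinski_harabasz (hypothetical distances): two tight,
# well separated clusters of two models each yield a high CH score:
#     scores = {('m1', 'm2'): 1., ('m3', 'm4'): 1., ('m1', 'm3'): 9.,
#               ('m1', 'm4'): 9., ('m2', 'm3'): 9., ('m2', 'm4'): 9.}
#     clusters = {1: ['m1', 'm2'], 2: ['m3', 'm4']}
#     calinski_harabasz(scores, clusters)   # -> 162.0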
def mean_none(values):
"""
    Calculates the mean of a list of values, ignoring None entries.
:param values: list of values
    :returns: the mean value, or None if the list contains no value
"""
    values = [i for i in values if i is not None]
if values:
return float(sum(values)) / len(values)
    return None
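
# Illustrative use of mean_none (hypothetical values):
#     mean_none([1, 2, None, 3])   # -> 2.0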