from abc import ABCMeta, abstractmethod
import numpy as np
from glmnet import LogitNet
from sklearn import linear_model
from scipy import stats
class Distance(metaclass=ABCMeta):
    """This abstract base class defines how the distance between the observed and
    simulated data should be implemented.

    Sub-classes must never compute a distance on the raw data sets directly;
    they first map the data to summary statistics using the statistics
    calculator supplied at construction time.
    """

    @abstractmethod
    def __init__(self, statistics_calc):
        """The constructor of a sub-class must accept a non-optional statistics
        calculator as a parameter. If stored to self.statistics_calc, the
        private helper method _calculate_summary_stat can be used.

        Parameters
        ----------
        statistics_calc : abcpy.statistics.Statistics
            Statistics extractor object that conforms to the Statistics class.
        """
        raise NotImplementedError

    @abstractmethod
    def distance(self, d1, d2):
        """To be overwritten by any sub-class: should calculate the distance between two
        sets of data d1 and d2 using their respective statistics.

        Notes
        -----
        The data sets d1 and d2 are array-like structures that contain n1 and n2 data
        points each. An implementation of the distance function should work along
        the following steps:

        1. Transform both input sets dX = [ dX1, dX2, ..., dXn ] to sX = [sX1, sX2,
        ..., sXn] using the statistics object. See _calculate_summary_stat method.

        2. Calculate the mutual desired distance, here denoted by -, between the
        statistics dist = [s11 - s21, s12 - s22, ..., s1n - s2n].

        Important: any sub-class must not calculate the distance between data sets
        d1 and d2 directly. This is the reason why any sub-class must be
        initialized with a statistics object.

        Parameters
        ----------
        d1: Python list
            Contains n1 data points.
        d2: Python list
            Contains n2 data points.

        Returns
        -------
        numpy.ndarray
            The distance between the two input data sets.
        """
        raise NotImplementedError

    @abstractmethod
    def dist_max(self):
        r"""To be overwritten by sub-class: should return maximum possible value of the
        desired distance function.

        Examples
        --------
        If the desired distance maps to :math:`\mathbb{R}`, this method should return numpy.inf.

        Returns
        -------
        numpy.float
            The maximal possible value of the desired distance function.
        """
        raise NotImplementedError

    def _calculate_summary_stat(self, d1, d2):
        """Helper function that extracts the summary statistics s1 and s2 from d1 and
        d2 using the statistics object stored in self.statistics_calc.

        Parameters
        ----------
        d1 : array-like
            d1 contains n data sets.
        d2 : array-like
            d2 contains n data sets.

        Returns
        -------
        tuple of numpy.ndarray
            The summary statistics extracted from d1 and d2.
        """
        s1 = self.statistics_calc.statistics(d1)
        s2 = self.statistics_calc.statistics(d2)
        return (s1, s2)
class Euclidean(Distance):
    """
    This class implements the Euclidean distance between two vectors.

    The maximum value of the distance is np.inf.
    """

    def __init__(self, statistics):
        self.statistics_calc = statistics
        # Since the observations do always stay the same, we can save the
        # summary statistics of them and not recalculate them each time.
        self.s1 = None
        self.data_set = None
        self.dataSame = False

    def distance(self, d1, d2):
        """Calculates the distance between two datasets.

        Parameters
        ----------
        d1, d2: list
            A list, containing a list describing the data set.

        Returns
        -------
        numpy.float
            Mean of all pairwise Euclidean distances between the summary
            statistics of d1 and d2.

        Raises
        ------
        TypeError
            If d1 or d2 is not a Python list.
        """
        if not isinstance(d1, list):
            raise TypeError('Data is not of allowed types')
        if not isinstance(d2, list):
            raise TypeError('Data is not of allowed types')

        # Check whether d1 is the same as the cached observed data set, so the
        # expensive statistics computation for d1 can be skipped.
        if self.data_set is not None:
            if len(np.array(d1[0]).reshape(-1,)) == 1:
                # BUGFIX: the original `self.data_set == d1` was a bare
                # comparison expression (a no-op); the result must be assigned
                # to self.dataSame, otherwise stale cached statistics are used.
                self.dataSame = self.data_set == d1
            else:
                self.dataSame = all(
                    [(np.array(self.data_set[i]) == np.array(d1[i])).all() for i in range(len(d1))])

        # Extract summary statistics from the dataset, reusing the cache for d1.
        if self.s1 is None or self.dataSame is False:
            self.s1 = self.statistics_calc.statistics(d1)
            self.data_set = d1
        s2 = self.statistics_calc.statistics(d2)

        # Compute all pairwise Euclidean distances between statistics rows via
        # broadcasting (equivalent to the double loop, but vectorized).
        diff = self.s1[:, np.newaxis, :] - s2[np.newaxis, :, :]
        dist = np.sqrt(np.sum(diff ** 2, axis=-1))

        return dist.mean()

    def dist_max(self):
        """Return the maximal possible value of the Euclidean distance."""
        return np.inf
class PenLogReg(Distance):
    """
    This class implements a distance measure based on the classification accuracy.

    The classification accuracy is calculated between two datasets d1 and d2 using
    lasso penalized logistic regression and returned as a distance. The lasso
    penalized logistic regression is done using the glmnet package of Friedman et. al.
    [2]. While computing the distance, the algorithm automatically chooses
    the most relevant summary statistics as explained in Gutmann et. al. [1].
    The maximum value of the distance is 1.0.

    [1] Gutmann, M., Dutta, R., Kaski, S., and Corander, J. (2014). Statistical
    inference of intractable generative models via classification. arXiv:1407.4981.

    [2] Friedman, J., Hastie, T., and Tibshirani, R. (2010). Regularization
    paths for generalized linear models via coordinate descent. Journal of Statistical
    Software, 33(1), 1-22.
    """

    def __init__(self, statistics):
        self.statistics_calc = statistics
        # Since the observations do always stay the same, we can save the
        # summary statistics of them and not recalculate them each time.
        self.s1 = None
        self.data_set = None
        self.dataSame = False

    def distance(self, d1, d2):
        """Calculates the distance between two datasets.

        Parameters
        ----------
        d1, d2: list
            A list, containing a list describing the data set.

        Returns
        -------
        float
            2 * (cross-validated classification accuracy - 0.5), in [0, 1].

        Raises
        ------
        TypeError
            If d1 or d2 is not a Python list.
        """
        if not isinstance(d1, list):
            raise TypeError('Data is not of allowed types')
        if not isinstance(d2, list):
            raise TypeError('Data is not of allowed types')

        # Check whether d1 is the same as the cached observed data set, so the
        # expensive statistics computation for d1 can be skipped.
        if self.data_set is not None:
            if len(np.array(d1[0]).reshape(-1,)) == 1:
                # BUGFIX: the original `self.data_set == d1` was a bare
                # comparison expression (a no-op); the result must be assigned
                # to self.dataSame, otherwise stale cached statistics are used.
                self.dataSame = self.data_set == d1
            else:
                self.dataSame = all(
                    [(np.array(self.data_set[i]) == np.array(d1[i])).all() for i in range(len(d1))])

        # Extract summary statistics from the dataset, reusing the cache for d1.
        if self.s1 is None or self.dataSame is False:
            self.s1 = self.statistics_calc.statistics(d1)
            self.data_set = d1
        s2 = self.statistics_calc.statistics(d2)

        # Compute the distance between the statistics: train a cross-validated
        # lasso-penalized logistic classifier to separate s1 from s2; the better
        # it classifies, the further apart the data sets are.
        training_set_features = np.concatenate((self.s1, s2), axis=0)
        label_s1 = np.zeros(shape=(len(self.s1), 1))
        label_s2 = np.ones(shape=(len(s2), 1))
        training_set_labels = np.concatenate((label_s1, label_s2), axis=0).ravel()

        # alpha=1 selects the pure lasso penalty; 10-fold cross-validation.
        m = LogitNet(alpha=1, n_splits=10)
        m = m.fit(training_set_features, training_set_labels)
        # Rescale the CV accuracy at the best lambda from [0.5, 1] to [0, 1].
        distance = 2.0 * (m.cv_mean_score_[np.where(m.lambda_path_ == m.lambda_max_)[0][0]] - 0.5)

        return distance

    def dist_max(self):
        """Return the maximal possible value of this distance, which is 1.0."""
        return 1.0
class LogReg(Distance):
    """This class implements a distance measure based on the classification
    accuracy [1]. The classification accuracy is calculated between two datasets d1
    and d2 using logistic regression and returned as a distance. The maximum value
    of the distance is 1.0.

    [1] Gutmann, M., Dutta, R., Kaski, S., and Corander, J. (2014). Statistical
    inference of intractable generative models via classification. arXiv:1407.4981.
    """

    def __init__(self, statistics):
        self.statistics_calc = statistics
        # Since the observations do always stay the same, we can save the
        # summary statistics of them and not recalculate them each time.
        self.s1 = None
        self.data_set = None
        self.dataSame = False

    def distance(self, d1, d2):
        """Calculates the distance between two datasets.

        Parameters
        ----------
        d1, d2: list
            A list, containing a list describing the data set.

        Returns
        -------
        float
            2 * (training classification accuracy - 0.5), in [0, 1].

        Raises
        ------
        TypeError
            If d1 or d2 is not a Python list.
        """
        if not isinstance(d1, list):
            raise TypeError('Data is not of allowed types')
        if not isinstance(d2, list):
            raise TypeError('Data is not of allowed types')

        # Check whether d1 is the same as the cached observed data set, so the
        # expensive statistics computation for d1 can be skipped.
        if self.data_set is not None:
            if len(np.array(d1[0]).reshape(-1,)) == 1:
                # BUGFIX: the original `self.data_set == d1` was a bare
                # comparison expression (a no-op); the result must be assigned
                # to self.dataSame, otherwise stale cached statistics are used.
                self.dataSame = self.data_set == d1
            else:
                self.dataSame = all(
                    [(np.array(self.data_set[i]) == np.array(d1[i])).all() for i in range(len(d1))])

        # Extract summary statistics from the dataset, reusing the cache for d1.
        if self.s1 is None or self.dataSame is False:
            self.s1 = self.statistics_calc.statistics(d1)
            self.data_set = d1
        s2 = self.statistics_calc.statistics(d2)

        # Compute the distance between the statistics: train a (nearly
        # unpenalized, since C is large) l1 logistic classifier to separate
        # s1 from s2 and score it on the training set.
        training_set_features = np.concatenate((self.s1, s2), axis=0)
        label_s1 = np.zeros(shape=(len(self.s1), 1))
        label_s2 = np.ones(shape=(len(s2), 1))
        training_set_labels = np.concatenate((label_s1, label_s2), axis=0).ravel()

        reg_inv = 1e5  # large C => effectively no regularization
        log_reg_model = linear_model.LogisticRegression(
            C=reg_inv, penalty='l1', max_iter=1000, solver='liblinear')
        log_reg_model.fit(training_set_features, training_set_labels)
        score = log_reg_model.score(training_set_features, training_set_labels)
        # Rescale the accuracy from [0.5, 1] to [0, 1].
        distance = 2.0 * (score - 0.5)

        return distance

    def dist_max(self):
        """Return the maximal possible value of this distance, which is 1.0."""
        return 1.0