Source code for abcpy.distances

from abc import ABCMeta, abstractmethod

import numpy as np
from glmnet import LogitNet
from sklearn import linear_model

from abcpy.utils import wass_dist


[docs]class Distance(metaclass=ABCMeta): """This abstract base class defines how the distance between the observed and simulated data should be implemented. """
[docs] def __init__(self, statistics_calc): """The constructor of a sub-class must accept a non-optional statistics calculator as a parameter; then, it must call the __init__ method of the parent class. This ensures that the object is initialized correctly so that the _calculate_summary_stat private method can be called when computing the distances. Parameters ---------- statistics_calc : abcpy.statistics.Statistics Statistics extractor object that conforms to the Statistics class. """ self.statistics_calc = statistics_calc # Since the observations do always stay the same, we can save the # summary statistics of them and not recalculate it each time self.s1 = None self.data_set = None self.dataSame = False
[docs] @abstractmethod def distance(self, d1, d2): """To be overwritten by any sub-class: should calculate the distance between two sets of data d1 and d2 using their respective statistics. Usually, calling the _calculate_summary_stat private method to obtain statistics from the datasets is handy; that also keeps track of the first provided dataset (which is the observation in ABCpy inference schemes) and avoids computing the statistics for that multiple times. Notes ----- The data sets d1 and d2 are array-like structures that contain n1 and n2 data points each. An implementation of the distance function should work along the following steps: 1. Transform both input sets dX = [ dX1, dX2, ..., dXn ] to sX = [sX1, sX2, ..., sXn] using the statistics object. See _calculate_summary_stat method. 2. Calculate the mutual desired distance, here denoted by - between the statistics; for instance, dist = [s11 - s21, s12 - s22, ..., s1n - s2n] (in some cases however you may want to compute all pairwise distances between statistics elements. Important: any sub-class must not calculate the distance between data sets d1 and d2 directly. This is the reason why any sub-class must be initialized with a statistics object. Parameters ---------- d1: Python list Contains n1 data points. d2: Python list Contains n2 data points. Returns ------- numpy.float The distance between the two input data sets. """ raise NotImplementedError
[docs] @abstractmethod def dist_max(self): """To be overwritten by sub-class: should return maximum possible value of the desired distance function. Examples -------- If the desired distance maps to :math:`\mathbb{R}`, this method should return numpy.inf. Returns ------- numpy.float The maximal possible value of the desired distance function. """ raise NotImplementedError
def _calculate_summary_stat(self, d1, d2): """Helper function that extracts the summary statistics s1 and s2 from d1 and d2 using the statistics object stored in self.statistics_calc. This stores s1 for the purpose of checking whether that is repeated in next calls to the function, and avoiding computing the statitistics for the same dataset several times. Parameters ---------- d1 : array-like d1 contains n data sets. d2 : array-like d2 contains m data sets. Returns ------- tuple Tuple containing numpy.ndarray's with the summary statistics extracted from d1 and d2. """ if not isinstance(d1, list): raise TypeError('Data is not of allowed types') if not isinstance(d2, list): raise TypeError('Data is not of allowed types') # Check whether d1 is same as self.data_set if self.data_set is not None: # check that the the observations have the same length; if not, they can't be the same: if len(d1) != len(self.data_set): self.dataSame = False elif len(np.array(d1[0]).reshape(-1, )) == 1: self.dataSame = self.data_set == d1 else: self.dataSame = all([(np.array(self.data_set[i]) == np.array(d1[i])).all() for i in range(len(d1))]) # Extract summary statistics from the dataset if self.s1 is None or self.dataSame is False: self.s1 = self.statistics_calc.statistics(d1) self.data_set = d1 s2 = self.statistics_calc.statistics(d2) return self.s1, s2
[docs]class Euclidean(Distance): """ This class implements the Euclidean distance between two vectors. The maximum value of the distance is np.inf. """
[docs] def __init__(self, statistics): """ Parameters ---------- statistics_calc : abcpy.statistics.Statistics Statistics extractor object that conforms to the Statistics class. """ super(Euclidean, self).__init__(statistics)
[docs] def distance(self, d1, d2): """Calculates the distance between two datasets, by computing Euclidean distance between each element of d1 and d2 and taking their average. Parameters ---------- d1: Python list Contains n1 data points. d2: Python list Contains n2 data points. Returns ------- numpy.float The distance between the two input data sets. """ s1, s2 = self._calculate_summary_stat(d1, d2) # compute distance between the statistics dist = np.zeros(shape=(s1.shape[0], s2.shape[0])) for ind1 in range(0, s1.shape[0]): for ind2 in range(0, s2.shape[0]): dist[ind1, ind2] = np.sqrt(np.sum(pow(s1[ind1, :] - s2[ind2, :], 2))) return dist.mean()
[docs] def dist_max(self): """ Returns ------- numpy.float The maximal possible value of the desired distance function. """ return np.inf
[docs]class PenLogReg(Distance): """ This class implements a distance measure based on the classification accuracy. The classification accuracy is calculated between two dataset d1 and d2 using lasso penalized logistics regression and return it as a distance. The lasso penalized logistic regression is done using glmnet package of Friedman et. al. [2]. While computing the distance, the algorithm automatically chooses the most relevant summary statistics as explained in Gutmann et. al. [1]. The maximum value of the distance is 1.0. [1] Gutmann, M. U., Dutta, R., Kaski, S., & Corander, J. (2018). Likelihood-free inference via classification. Statistics and Computing, 28(2), 411-425. [2] Friedman, J., Hastie, T., and Tibshirani, R. (2010). Regularization paths for generalized linear models via coordinate descent. Journal of Statistical Software, 33(1), 1–22. """
[docs] def __init__(self, statistics): """ Parameters ---------- statistics_calc : abcpy.statistics.Statistics Statistics extractor object that conforms to the Statistics class. """ super(PenLogReg, self).__init__(statistics) self.n_folds = 10 # for cross validation in PenLogReg
[docs] def distance(self, d1, d2): """Calculates the distance between two datasets. Parameters ---------- d1: Python list Contains n1 data points. d2: Python list Contains n2 data points. Returns ------- numpy.float The distance between the two input data sets. """ s1, s2 = self._calculate_summary_stat(d1, d2) self.n_simulate = s1.shape[0] if not s2.shape[0] == self.n_simulate: raise RuntimeError("The number of simulations in the two data sets should be the same in order for " "the classification accuracy implemented in PenLogReg to be a proper distance. Please " "check that `n_samples` in the `sample()` method for the sampler is equal to " "the number of datasets in the observations.") # compute distance between the statistics training_set_features = np.concatenate((s1, s2), axis=0) label_s1 = np.zeros(shape=(len(s1), 1)) label_s2 = np.ones(shape=(len(s2), 1)) training_set_labels = np.concatenate((label_s1, label_s2), axis=0).ravel() groups = np.repeat(np.arange(self.n_folds), np.int(np.ceil(self.n_simulate / self.n_folds))) groups = groups[:self.n_simulate].tolist() groups += groups # duplicate it as groups need to be defined for both datasets m = LogitNet(alpha=1, n_splits=self.n_folds) # note we are not using random seed here! m = m.fit(training_set_features, training_set_labels, groups=groups) distance = 2.0 * (m.cv_mean_score_[np.where(m.lambda_path_ == m.lambda_max_)[0][0]] - 0.5) return distance
[docs] def dist_max(self): """ Returns ------- numpy.float The maximal possible value of the desired distance function. """ return 1.0
[docs]class LogReg(Distance): """This class implements a distance measure based on the classification accuracy [1]. The classification accuracy is calculated between two dataset d1 and d2 using logistics regression and return it as a distance. The maximum value of the distance is 1.0. The logistic regression may not converge when using one single sample in each dataset (as for instance by putting n_samples_per_param=1 in an inference routine). [1] Gutmann, M. U., Dutta, R., Kaski, S., & Corander, J. (2018). Likelihood-free inference via classification. Statistics and Computing, 28(2), 411-425. """
[docs] def __init__(self, statistics, seed=None): """ Parameters ---------- statistics_calc : abcpy.statistics.Statistics Statistics extractor object that conforms to the Statistics class. seed : integer, optionl Seed used to initialize the Random Numbers Generator used to determine the (random) cross validation split in the Logistic Regression classifier. """ super(LogReg, self).__init__(statistics) # seed is used for a RandomState for the random split in the LogisticRegression classifier: self.rng = np.random.RandomState(seed=seed)
[docs] def distance(self, d1, d2): """Calculates the distance between two datasets. Parameters ---------- d1: Python list Contains n1 data points. d2: Python list Contains n2 data points. Returns ------- numpy.float The distance between the two input data sets. """ s1, s2 = self._calculate_summary_stat(d1, d2) # compute distance between the statistics training_set_features = np.concatenate((s1, s2), axis=0) label_s1 = np.zeros(shape=(len(s1), 1)) label_s2 = np.ones(shape=(len(s2), 1)) training_set_labels = np.concatenate((label_s1, label_s2), axis=0).ravel() reg_inv = 1e5 log_reg_model = linear_model.LogisticRegression(C=reg_inv, penalty='l1', max_iter=1000, solver='liblinear', random_state=self.rng.randint(0, np.iinfo(np.uint32).max)) log_reg_model.fit(training_set_features, training_set_labels) score = log_reg_model.score(training_set_features, training_set_labels) distance = 2.0 * (score - 0.5) return distance
[docs] def dist_max(self): """ Returns ------- numpy.float The maximal possible value of the desired distance function. """ return 1.0
[docs]class Wasserstein(Distance): """This class implements a distance measure based on the 2-Wasserstein distance, as used in [1]. This considers the several simulations/observations in the datasets as iid samples from the model for a fixed parameter value/from the data generating model, and computes the 2-Wasserstein distance between the empirical distributions those simulations/observations define. [1] Bernton, E., Jacob, P.E., Gerber, M. and Robert, C.P. (2019), Approximate Bayesian computation with the Wasserstein distance. J. R. Stat. Soc. B, 81: 235-269. doi:10.1111/rssb.12312 """
[docs] def __init__(self, statistics, num_iter_max=100000): """ Parameters ---------- statistics_calc : abcpy.statistics.Statistics Statistics extractor object that conforms to the Statistics class. num_iter_max : integer, optional The maximum number of iterations in the linear programming algorithm to estimate the Wasserstein distance. Default to 100000. """ super(Wasserstein, self).__init__(statistics) self.num_iter_max = num_iter_max
[docs] def distance(self, d1, d2): """Calculates the distance between two datasets. Parameters ---------- d1: Python list Contains n1 data points. d2: Python list Contains n2 data points. Returns ------- numpy.float The distance between the two input data sets. """ s1, s2 = self._calculate_summary_stat(d1, d2) # compute the Wasserstein distance between the empirical distributions: return wass_dist(samples_1=s1, samples_2=s2, num_iter_max=self.num_iter_max)
[docs] def dist_max(self): """ Returns ------- numpy.float The maximal possible value of the desired distance function. """ # As the statistics are positive, the max possible value is 1 return np.inf