Source code for abcpy.approx_lhd

from abc import ABCMeta, abstractmethod

import numpy as np
from glmnet import LogitNet
from scipy.stats import gaussian_kde, norm, rankdata
from sklearn.covariance import ledoit_wolf

from abcpy.graphtools import GraphTools


class Approx_likelihood(metaclass=ABCMeta):
    """This abstract base class defines the approximate likelihood function."""

    @abstractmethod
    def __init__(self, statistics_calc):
        """The constructor of a sub-class must accept a non-optional statistics
        calculator; then, it must call the __init__ method of the parent class.
        This ensures that the object is initialized correctly, so that the
        _calculate_summary_stat private method can be called when computing the
        distances.

        Parameters
        ----------
        statistics_calc : abcpy.statistics.Statistics
            Statistics extractor object that conforms to the Statistics class.
        """
        self.statistics_calc = statistics_calc

        # Since the observations always stay the same, we store their summary
        # statistics instead of recalculating them on each call
        self.stat_obs = None
        self.data_set = None
        self.dataSame = False

    @abstractmethod
    def loglikelihood(self, y_obs, y_sim):
        """To be overwritten by any sub-class: should compute the approximate
        loglikelihood value given the observed data set y_obs and the data set
        y_sim simulated from the model set at the parameter value.

        Parameters
        ----------
        y_obs: Python list
            Observed data set.
        y_sim: Python list
            Simulated data set from model at the parameter value.

        Returns
        -------
        float
            Computed approximate loglikelihood.
        """
        raise NotImplementedError

    def likelihood(self, y_obs, y_sim):
        """Computes the likelihood by taking the exponential of the
        loglikelihood method.

        Parameters
        ----------
        y_obs: Python list
            Observed data set.
        y_sim: Python list
            Simulated data set from model at the parameter value.

        Returns
        -------
        float
            Computed approximate likelihood.
        """
        return np.exp(self.loglikelihood(y_obs, y_sim))

    def _calculate_summary_stat(self, y_obs, y_sim):
        """Helper function that extracts the summary statistics s_obs and s_sim
        from y_obs and y_sim using the statistics object stored in
        self.statistics_calc. It stores s_obs and checks whether the same
        observations are passed in subsequent calls, which avoids computing the
        statistics of the same dataset several times.

        Parameters
        ----------
        y_obs : array-like
            Contains n_obs data sets.
        y_sim : array-like
            Contains n_sim data sets.

        Returns
        -------
        tuple
            Tuple containing numpy.ndarray's with the summary statistics
            extracted from y_obs and y_sim.
        """
        if not isinstance(y_obs, list):
            raise TypeError('Observed data is not of allowed types')
        if not isinstance(y_sim, list):
            raise TypeError('Simulated data is not of allowed types')

        # Check whether y_obs is the same as the stored dataset.
        if self.data_set is not None:
            # check that the observations have the same length; if not, they cannot be the same:
            if len(y_obs) != len(self.data_set):
                self.dataSame = False
            elif len(np.array(y_obs[0]).reshape(-1, )) == 1:
                self.dataSame = self.data_set == y_obs
            else:  # compare element-wise, as the direct comparison fails when y_obs[0] is an array
                self.dataSame = all(
                    [(np.array(self.data_set[i]) == np.array(y_obs[i])).all() for i in range(len(y_obs))])

        if self.stat_obs is None or self.dataSame is False:
            self.stat_obs = self.statistics_calc.statistics(y_obs)
            self.data_set = y_obs

        # Extract summary statistics from the simulated data
        stat_sim = self.statistics_calc.statistics(y_sim)

        if self.stat_obs.shape[1] != stat_sim.shape[1]:
            raise ValueError("The dimension of summaries in the two datasets is different; check the dimension of the"
                             " provided observations and simulations.")

        return self.stat_obs, stat_sim
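
# A minimal sketch of the subclass contract described above (illustrative only;
# `GaussianApproxLikelihood` is a hypothetical name, not part of abcpy): the
# constructor takes a statistics calculator and forwards it to the parent class,
# and loglikelihood relies on _calculate_summary_stat for the summary extraction:
#
#     class GaussianApproxLikelihood(Approx_likelihood):
#         def __init__(self, statistics_calc):
#             super(GaussianApproxLikelihood, self).__init__(statistics_calc)
#
#         def loglikelihood(self, y_obs, y_sim):
#             stat_obs, stat_sim = self._calculate_summary_stat(y_obs, y_sim)
#             # toy approximation: unit-variance Gaussian around the simulated mean
#             diff = stat_obs - np.mean(stat_sim, axis=0)
#             return float(-0.5 * np.sum(diff ** 2))
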

class SynLikelihood(Approx_likelihood):
    def __init__(self, statistics_calc):
        """This class implements the approximate likelihood function which
        computes the approximate likelihood using the synthetic likelihood
        approach described in Wood [1]. For the synthetic likelihood
        approximation, we compute the robust precision matrix using the Ledoit
        and Wolf estimator [2].

        [1] S. N. Wood. Statistical inference for noisy nonlinear ecological dynamic systems.
        Nature, 466(7310):1102–1104, Aug. 2010.

        [2] O. Ledoit and M. Wolf. A well-conditioned estimator for large-dimensional covariance matrices.
        Journal of Multivariate Analysis, 88(2):365–411, February 2004.

        Parameters
        ----------
        statistics_calc : abcpy.statistics.Statistics
            Statistics extractor object that conforms to the Statistics class.
        """
        super(SynLikelihood, self).__init__(statistics_calc)

    def loglikelihood(self, y_obs, y_sim):
        """Computes the loglikelihood.

        Parameters
        ----------
        y_obs: Python list
            Observed data set.
        y_sim: Python list
            Simulated data set from model at the parameter value.

        Returns
        -------
        float
            Computed approximate loglikelihood.
        """
        stat_obs, stat_sim = self._calculate_summary_stat(y_obs, y_sim)

        # Compute the mean and the robust (Ledoit-Wolf) precision matrix of the
        # simulated summaries, together with the log-determinant of the precision matrix
        mean_sim = np.mean(stat_sim, 0)
        lw_cov_, _ = ledoit_wolf(stat_sim)
        robust_precision_sim = np.linalg.inv(lw_cov_)
        sign_logdet, robust_precision_sim_logdet = np.linalg.slogdet(robust_precision_sim)  # the sign is not needed

        # We may have several observations; compute the Gaussian loglikelihood of all
        # of them at once with a vectorized quadratic form instead of a Python loop:
        diff = stat_obs - mean_sim.reshape(1, -1)
        x_news = np.einsum('bi,ij,bj->b', diff, robust_precision_sim, diff)
        logliks = -0.5 * x_news
        logfactor = 0.5 * stat_obs.shape[0] * robust_precision_sim_logdet
        # sum the loglikelihoods of the different observations
        return np.sum(logliks) + logfactor
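
# A minimal usage sketch for SynLikelihood (illustrative only; it assumes the
# Identity statistics extractor from abcpy.statistics, which this module does
# not import, and toy Gaussian pseudo-data):
#
#     from abcpy.statistics import Identity
#     rng = np.random.RandomState(42)
#     y_obs = [rng.normal(size=5)]                       # one observed dataset
#     y_sim = [rng.normal(size=5) for _ in range(100)]   # 100 model simulations
#     approx_lhd = SynLikelihood(Identity(degree=1))
#     print(approx_lhd.loglikelihood(y_obs, y_sim))
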

class SemiParametricSynLikelihood(Approx_likelihood):
    def __init__(self, statistics_calc, bw_method_marginals="silverman"):
        """This class implements the approximate likelihood function which
        computes the approximate likelihood using the semiparametric synthetic
        likelihood (semiBSL) approach described in [1]. Specifically, it
        represents the likelihood as a product of univariate marginals and a
        copula component (exploiting Sklar's theorem). The marginals are
        approximated from simulations using a Gaussian KDE, while the copula is
        assumed to be a Gaussian copula, whose parameters are estimated from
        the data as well. This does not yet include shrinkage strategies for
        the correlation matrix.

        [1] An, Z., Nott, D. J. and Drovandi, C. (2020). Robust Bayesian synthetic likelihood
        via a semi-parametric approach. Statistics and Computing, 30(3), 543–557.

        Parameters
        ----------
        statistics_calc : abcpy.statistics.Statistics
            Statistics extractor object that conforms to the Statistics class.
        bw_method_marginals : str, scalar or callable, optional
            The method used to calculate the estimator bandwidth, passed to
            `scipy.stats.gaussian_kde`. Following the docs of that method, this
            can be 'scott', 'silverman', a scalar constant or a callable. If a
            scalar, this will be used directly as `kde.factor`. If a callable,
            it should take a `gaussian_kde` instance as its only parameter and
            return a scalar. The default for this class is 'silverman'. See the
            Notes in `scipy.stats.gaussian_kde` for more details.
        """
        super(SemiParametricSynLikelihood, self).__init__(statistics_calc)

        # create a dict in which to store the denominator of the correlation matrix
        # for the different values of n; this avoids repeating computations:
        self.corr_matrix_denominator = {}
        self.bw_method_marginals = bw_method_marginals  # the bw method to use in the gaussian_kde

    def loglikelihood(self, y_obs, y_sim):
        """Computes the loglikelihood. This implementation aims to be
        equivalent to the `BSL` R package, but the results are slightly
        different due to small differences in the way the KDE is performed.

        Parameters
        ----------
        y_obs: Python list
            Observed data set.
        y_sim: Python list
            Simulated data set from model at the parameter value.

        Returns
        -------
        float
            Computed approximate loglikelihood.
        """
        stat_obs, stat_sim = self._calculate_summary_stat(y_obs, y_sim)
        n_obs, d = stat_obs.shape
        if d < 2:
            raise RuntimeError("The dimension of the statistics needs to be at least 2 in order to apply semiBSL.")

        # first: estimate the marginal KDEs for each coordinate
        logpdf_obs = np.zeros_like(stat_obs)  # estimated log pdf at the different observation points
        u_obs = np.zeros_like(stat_obs)  # observations transformed through the estimated CDFs
        for j in range(d):
            # Estimate the KDE using the data in stat_sim for coordinate j. This leads to slightly
            # different results from the R package implementation, due to a slightly different way
            # to estimate the bandwidth factor as well as a different way to evaluate the kernel
            # (the R package uses an interpolation there).
            kde = gaussian_kde(stat_sim[:, j], bw_method=self.bw_method_marginals)
            logpdf_obs[:, j] = kde.logpdf(stat_obs[:, j])
            for i in range(n_obs):  # loop over the different observations
                u_obs[i, j] = kde.integrate_box_1d(-np.inf, stat_obs[i, j])  # compute the CDF
        etas_obs = norm.ppf(u_obs)

        # second: estimate the correlation matrix of the Gaussian copula using the Gaussian rank correlation
        R_hat = self._estimate_gaussian_correlation(stat_sim)
        R_hat_inv = np.linalg.inv(R_hat)
        R_sign_det, R_inv_logdet = np.linalg.slogdet(R_hat_inv)  # the sign is not used

        # third: combine the two to compute the loglikelihood, jointly over all observations;
        # sum the marginal log pdfs over dimensions and observations:
        loglik = np.sum(logpdf_obs)
        # add the Gaussian copula log density for each observation:
        copula_density = -0.5 * np.einsum("bi,ij,bj->b", etas_obs, R_hat_inv - np.eye(d), etas_obs)
        loglik += np.sum(copula_density) + 0.5 * n_obs * R_inv_logdet

        return loglik

    def _estimate_gaussian_correlation(self, x):
        """Estimates the correlation matrix from the data in `x` in the way
        described in [1]. This implementation gives the same results as the
        `BSL` R package.

        Parameters
        ----------
        x: np.ndarray
            Data set.

        Returns
        -------
        np.ndarray
            Estimated correlation matrix for the Gaussian copula.
        """
        n, d = x.shape
        # transform the data to normal scores through their ranks:
        r = np.zeros_like(x)
        for j in range(d):
            r[:, j] = rankdata(x[:, j])
        rqnorm = norm.ppf(r / (n + 1))

        if n not in self.corr_matrix_denominator.keys():
            # compute the denominator:
            self.corr_matrix_denominator[n] = np.sum(norm.ppf((np.arange(n) + 1) / (n + 1)) ** 2)
        denominator = self.corr_matrix_denominator[n]

        R_hat = np.einsum('ki,kj->ij', rqnorm, rqnorm) / denominator

        return R_hat
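
# A minimal usage sketch for SemiParametricSynLikelihood (illustrative only; as
# above, it assumes the Identity statistics extractor from abcpy.statistics and
# toy pseudo-data; recall that semiBSL requires summaries of dimension >= 2):
#
#     from abcpy.statistics import Identity
#     rng = np.random.RandomState(42)
#     y_obs = [rng.normal(size=2)]                       # one 2-dimensional observation
#     y_sim = [rng.normal(size=2) for _ in range(100)]   # 100 model simulations
#     approx_lhd = SemiParametricSynLikelihood(Identity(degree=1))
#     print(approx_lhd.loglikelihood(y_obs, y_sim))
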

class PenLogReg(Approx_likelihood, GraphTools):
    def __init__(self, statistics_calc, model, n_simulate, n_folds=10, max_iter=100000, seed=None):
        """This class implements the approximate likelihood function which
        computes the approximate likelihood up to a constant using penalized
        logistic regression, as described in Dutta et al. [1]. It takes one
        additional function handler defining the true model and two additional
        parameters, n_folds and n_simulate, defining respectively the number of
        folds used to estimate the prediction error via cross-validation and
        the number of simulated datasets sampled from each parameter to
        approximate the likelihood function. For the lasso penalized logistic
        regression we use glmnet of Friedman et al. [2].

        [1] Thomas, O., Dutta, R., Corander, J., Kaski, S. and Gutmann, M. U. (2020). Likelihood-free
        inference by ratio estimation. Bayesian Analysis.

        [2] Friedman, J., Hastie, T. and Tibshirani, R. (2010). Regularization paths for generalized
        linear models via coordinate descent. Journal of Statistical Software, 33(1), 1–22.

        Parameters
        ----------
        statistics_calc : abcpy.statistics.Statistics
            Statistics extractor object that conforms to the Statistics class.
        model : abcpy.models.Model
            Model object that conforms to the Model class.
        n_simulate : int
            Number of data points to simulate for the reference data set; this has to be the same as
            n_samples_per_param when calling the sampler. The reference data set is generated by drawing parameters
            from the prior and samples from the model when PenLogReg is instantiated.
        n_folds: int, optional
            Number of folds for cross-validation. The default value is 10.
        max_iter: int, optional
            Maximum passes over the data. The default is 100000.
        seed: int, optional
            Seed for the random number generator. The used glmnet solver is not deterministic; this seed is used for
            determining the cv folds. The default value is None.
        """
        super(PenLogReg, self).__init__(statistics_calc)  # call the super init to initialize correctly

        self.model = model
        self.n_folds = n_folds
        self.n_simulate = n_simulate
        self.seed = seed
        self.rng = np.random.RandomState(seed)
        self.max_iter = max_iter

        # Simulate reference data and extract summary statistics from it
        self.ref_data_stat = self._simulate_ref_data(rng=self.rng)[0]

    def loglikelihood(self, y_obs, y_sim):
        """Computes the loglikelihood.

        Parameters
        ----------
        y_obs: Python list
            Observed data set.
        y_sim: Python list
            Simulated data set from model at the parameter value.

        Returns
        -------
        float
            Computed approximate loglikelihood.
        """
        stat_obs, stat_sim = self._calculate_summary_stat(y_obs, y_sim)

        if not stat_sim.shape[0] == self.n_simulate:
            raise RuntimeError("The number of samples in the reference data set is not the same as the number of "
                               "samples in the generated data. Please check that `n_samples` in the `sample()` "
                               "method for the sampler is equal to `n_simulate` in PenLogReg.")

        # Label the simulated summaries with 0 and the reference ones with 1, then fit a
        # lasso-penalized logistic regression to discriminate between the two samples
        y = np.append(np.zeros(self.n_simulate), np.ones(self.n_simulate))
        X = np.array(np.concatenate((stat_sim, self.ref_data_stat), axis=0))
        # define here the groups for cross-validation:
        groups = np.repeat(np.arange(self.n_folds), int(np.ceil(self.n_simulate / self.n_folds)))
        groups = groups[:self.n_simulate].tolist()
        groups += groups  # duplicate it, as groups need to be defined for both datasets
        m = LogitNet(alpha=1, n_splits=self.n_folds, max_iter=self.max_iter, random_state=self.seed,
                     scoring="log_loss")
        m = m.fit(X, y, groups=groups)

        # the fitted regression function evaluated at the observed summaries approximates
        # the loglikelihood up to a constant
        result = -np.sum((m.intercept_ + np.sum(np.multiply(m.coef_, stat_obs), axis=1)), axis=0)

        return result

    def _simulate_ref_data(self, rng=None):
        """Simulates the reference data set. This code is run at the
        initialization of PenLogReg.

        Parameters
        ----------
        rng: Random number generator, optional
            Defines the random number generator to be used. If None, a newly initialized one is used.

        Returns
        -------
        list
            The simulated list of datasets.
        """
        rng = np.random.RandomState() if rng is None else rng
        ref_data_stat = [[None] * self.n_simulate for _ in range(len(self.model))]
        self.sample_from_prior(rng=rng)
        for model_index, model in enumerate(self.model):
            ind = 0
            while ref_data_stat[model_index][-1] is None:
                data = model.forward_simulate(model.get_input_values(), 1, rng=rng)
                # note: the statistics have to be computed on `data` directly; computing them on
                # `data[0].tolist()` would wrongly apply the statistics calculator independently
                # to each element of data[0]
                data_stat = self.statistics_calc.statistics(data)
                ref_data_stat[model_index][ind] = data_stat
                ind += 1
            ref_data_stat[model_index] = np.squeeze(np.asarray(ref_data_stat[model_index]))
        return ref_data_stat
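
# A minimal usage sketch for PenLogReg (illustrative only; it assumes the Normal
# and Uniform models from abcpy.continuousmodels and the Identity statistics
# extractor, and `y_obs`/`y_sim` are hypothetical placeholders, none of which this
# module requires). Note that instantiating PenLogReg already draws the reference
# data set from the prior:
#
#     from abcpy.continuousmodels import Normal, Uniform
#     from abcpy.statistics import Identity
#     mu = Uniform([[-5.0], [5.0]], name='mu')
#     model = Normal([mu, 1.0])
#     approx_lhd = PenLogReg(Identity(degree=1), [model], n_simulate=100, seed=42)
#     # y_sim must contain n_simulate simulated datasets, matching n_samples_per_param
#     loglik = approx_lhd.loglikelihood(y_obs, y_sim)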