Source code for pyleoclim.utils.correlation

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Relevant functions for correlation analysis
"""

__all__ = [
    'fdr',
    'association'
]

import numpy as np
import scipy.stats as stats
from .tsmodel import ar1_fit_evenly
#from .tsutils import phaseran


# def corr_sig(y1, y2, nsim=1000, method='isospectral', alpha=0.05):
#     """ Estimates the Pearson's correlation and associated significance between two non IID time series
    
#     The significance of the correlation is assessed using one of the following methods:
        
#     1) 'ttest': T-test adjusted for effective sample size. 
#         This is a parametric test (data are Gaussian and identically distributed) with a rather ad-hoc adjustment. 
#         It is instantaneous but makes a lot of assumptions about the data, many of which may not be met.
#     2) 'isopersistent': AR(1) modeling of x and y.
#         This is a parametric test as well (series follow an AR(1) model) but 
#         solves the issue by direct simulation. 
#     3) 'isospectral': phase randomization of original inputs. (default)
#         This is a non-parametric method, assuming only wide-sense stationarity
        
#     For 2 and 3, computational requirements scale with nsim.
#     When possible, nsim should be at least 1000. 

#     Parameters
#     ----------
#     y1 : array
#         vector of (real) numbers of same length as y2, no NaNs allowed
        
#     y2 : array
#         vector of (real) numbers of same length as y1, no NaNs allowed
        
#     nsim : int
#         the number of simulations [default: 1000]
        
#     method : str; {'ttest','isopersistent','isospectral' (default)}
#         method for significance testing
        
#     alpha : float
#         significance level for critical value estimation [default: 0.05]

#     Returns
#     -------
#     res : dict 
#         the result dictionary, containing

#         - r : float
#             correlation coefficient
#         - p : float 
#             the p-value
#         - signif : bool
#             true if significant; false otherwise
#             Note that signif = True if and only if p <= alpha.
         
#     See also
#     --------

#     pyleoclim.utils.correlation.corr_ttest : Estimates the significance of correlations between 2 time series using the classical T-test adjusted for effective sample size
#     pyleoclim.utils.correlation.corr_isopersist : Computes correlation between two timeseries, and their significance using Ar(1) modeling
#     pyleoclim.utils.correlation.corr_isospec : Estimates the significance of the correlation using phase randomization
#     pyleoclim.utils.correlation.fdr : Determine significance based on the false discovery rate
     
#     """
#     y1 = np.array(y1, dtype=float)
#     y2 = np.array(y2, dtype=float)

#     assert np.size(y1) == np.size(y2), 'The size of y1 and y2 should be the same'

#     if method == 'ttest':
#         (r, signif, p) = corr_ttest(y1, y2, alpha=alpha)
#     elif method == 'isopersistent':
#         (r, signif, p) = corr_isopersist(y1, y2, alpha=alpha, nsim=nsim)
#     elif method == 'isospectral':
#         (r, signif, p) = corr_isospec(y1, y2, alpha=alpha, nsim=nsim)
        
#     res={'r':r,'signif':signif,'p':p}    
    
#     return res

[docs]def fdr(pvals, qlevel=0.05, method='original', adj_method=None, adj_args={}): ''' Determine significance based on the false discovery rate The false discovery rate is a method of conceptualizing the rate of type I errors in null hypothesis testing when conducting multiple comparisons. Translated from fdr.R by Dr. Chris Paciorek Parameters ---------- pvals : list or array A vector of p-values on which to conduct the multiple testing. qlevel : float The proportion of false positives desired. method : {'original', 'general'} Method for performing the testing. - 'original' follows Benjamini & Hochberg (1995); - 'general' is much more conservative, requiring no assumptions on the p-values (see Benjamini & Yekutieli (2001)). 'original' is recommended, and if desired, using 'adj_method="mean"' to increase power. adj_method: {'mean', 'storey', 'two-stage'} Method for increasing the power of the procedure by estimating the proportion of alternative p-values. - 'mean', the modified Storey estimator in Ventura et al. (2004) - 'storey', the method of Storey (2002) - 'two-stage', the iterative approach of Benjamini et al. (2001) adj_args : dict Arguments for adj_method; see prop_alt() for description, but note that for "two-stage", qlevel and fdr_method are taken from the qlevel and method arguments for fdr() Returns ------- fdr_res : array or None A vector of the indices of the significant tests; None if no significant tests See also -------- References ---------- - fdr.R by Dr. Chris Paciorek: https://www.stat.berkeley.edu/~paciorek/research/code/code.html ''' n = len(pvals) a = 0 if adj_method is not None: if adj_method == 'two-stage': qlevel = qlevel / (1+qlevel) # see Benjamini et al. (2001) for proof that this controls the FDR at level qlevel adj_args['qlevel'] = qlevel adj_args['fdr_method'] = method print(f'Adjusting cutoff using two-stage method, with method: {adj_args["fdr_method"]}; qlevel: {adj_args["qlevel"]}') elif adj_method == 'mean': if adj_args == {}: # default arguments for "mean" method of Ventura et al. (2004) adj_args['edf_lower'] = 0.8 adj_args['num_steps'] = 20 print(f'Adjusting cutoff using mean method, with edf_lower: {adj_args["edf_lower"]}; num_steps: {adj_args["num_steps"]}') a = prop_alt(pvals, adj_method, adj_args) if a == 1: # all hypotheses are estimated to be alternatives fdr_res = np.arange(n) else: qlevel = qlevel / (1-a) # adjust for estimate of a; default is 0 fdr_res = fdr_master(pvals, qlevel, method) return fdr_res
#----------- # Utilities #-----------
[docs]def corr_ttest(y1, y2, alpha=0.05, df_min=10): """ Estimates the significance of correlations between 2 time series using the classical T-test adjusted for effective sample size. The degrees of freedom are adjusted following n_eff=n(1-g)/(1+g) where g is the lag-1 autocorrelation. Parameters ---------- y1 : array vectors of (real) numbers with identical length, no NaNs allowed y2 : array vectors of (real) numbers with identical length, no NaNs allowed alpha : float significance level for critical value estimation [default: 0.05] df_min : float threshold for the effective degrees of freedom, under which things are not computed Returns ------- r : float correlation between x and y signif : bool true (1) if significant; false (0) otherwise pval : float test p-value (the probability of the test statistic exceeding the observed one by chance alone) See also -------- pyleoclim.utils.correlation.fdr : Determine significance based on the false discovery rate """ r = stats.pearsonr(y1, y2)[0] g1 = ar1_fit_evenly(y1) g2 = ar1_fit_evenly(y2) N = np.size(y1) Ney1 = N * (1-g1) / (1+g1) Ney2 = N * (1-g2) / (1+g2) Ne = stats.mstats.gmean([Ney1,Ney2]) assert Ne >= df_min, 'Too few effective d.o.f. to apply this method!' df = Ne - 2 t = r*np.sqrt(df/(1-r**2)) pval = stats.t.cdf(-np.abs(t), df) # one sided T-test ; leverage symmetry of T distribution tcrit = stats.t.ppf(1-alpha,df) rcrit = np.sign(r)*np.sqrt(tcrit**2/(df*(1+tcrit**2))) signif = pval <= alpha return r, signif, pval, rcrit
# def corr_isopersist(y1, y2, alpha=0.05, nsim=1000): # ''' Computes the Pearson's correlation between two timeseries, and their significance using Ar(1) modeling. # The significance is gauged via a non-parametric (Monte Carlo) simulation of # correlations with nsim AR(1) processes with identical persistence # properties as x and y ; the measure of which is the lag-1 autocorrelation (g). # Parameters # ---------- # y1 : array # vectors of (real) numbers with identical length, no NaNs allowed # y2 : array # vectors of (real) numbers with identical length, no NaNs allowed # alpha : float # significance level for critical value estimation [default: 0.05] # nsim : int # number of simulations [default: 1000] # Returns # ------- # r : float # correlation between x and y # signif : bool # true (1) if significant; false (0) otherwise # pval : float # test p-value (the probability of the test statstic exceeding the observed one by chance alone) # Notes # ----- # The probability of obtaining a test statistic at least as extreme as the one actually observed, # assuming that the null hypothesis is true. # The test is 1 tailed on |r|: Ho = { |r| = 0 }, Ha = { |r| > 0 } # The test is rejected (signif = 1) if pval <= alpha, otherwise signif=0; # (Some Rights Reserved) Hepta Technologies, 2009 # v1.0 USC, Aug 10 2012, based on corr_signif. # See also # -------- # pyleoclim.utils.correlation.corr_sig : Estimates the Pearson's correlation and associated significance between two non IID time series # pyleoclim.utils.correlation.corr_ttest: Estimates Pearson's correlation and associated significance using a t-test # pyleoclim.utils.correlation.corr_isospec : Estimates Pearson's correlation and associated significance using # pyleoclim.utils.correlation.fdr : Determine significance based on the false discovery rate # ''' # r = stats.pearsonr(y1, y2)[0] # ra = np.abs(r) # y1_red, g1 = isopersistent_rn(y1, nsim) # y2_red, g2 = isopersistent_rn(y2, nsim) # rs = np.zeros(nsim) # for i in np.arange(nsim): # rs[i] = stats.pearsonr(y1_red[:, i], y2_red[:, i])[0] # rsa = np.abs(rs) # xi = np.linspace(0, 1.1*np.max([ra, np.max(rsa)]), 200) # kde = stats.gaussian_kde(rsa) # prob = kde(xi).T # diff = np.abs(ra - xi) # # min_diff = np.min(diff) # pos = np.argmin(diff) # pval = np.trapz(prob[pos:], xi[pos:]) # rcrit = np.percentile(rsa, 100*(1-alpha)) # signif = ra >= rcrit # return r, signif, pval # def corr_isospec(y1, y2, alpha=0.05, nsim=1000): # ''' Estimates the significance of the correlation using phase randomization # Estimates the significance of correlations between non IID # time series by phase randomization of original inputs. # This function creates 'nsim' random time series that have the same power # spectrum as the original time series but random phases. # Parameters # ---------- # y1 : array # vectors of (real) numbers with identical length, no NaNs allowed # y2 : array # vectors of (real) numbers with identical length, no NaNs allowed # alpha : float # significance level for critical value estimation [default: 0.05] # nsim : int # number of simulations [default: 1000] # Returns # ------- # r : float # correlation between y1 and y2 # signif : bool # true (1) if significant; false (0) otherwise # F : float # Fraction of time series with higher correlation coefficents than observed (approximates the p-value). # See also # -------- # pyleoclim.utils.correlation.corr_sig : Estimates the Pearson's correlation and associated significance between two non IID time series # pyleoclim.utils.correlation.corr_ttest : Estimates Pearson's correlation and associated significance using a t-test # pyleoclim.utils.correlation.corr_isopersist : Estimates Pearson's correlation and associated significance using AR(1) simulations # pyleoclim.utils.correlation.fdr : Determine significance based on the false discovery rate # References # ---------- # - Ebisuzaki, W, 1997: A method to estimate the statistical significance of a correlation when the data are serially correlated. J. of Climate, 10, 2147-2153. # - Prichard, D., Theiler, J. Generating Surrogate Data for Time Series with Several Simultaneously Measured Variables (1994) Physical Review Letters, Vol 73, Number 7 (Some Rights Reserved) USC Climate Dynamics Lab, 2012. # ''' # r = stats.pearsonr(y1, y2)[0] # # generate phase-randomized samples using the Theiler & Prichard method # Y1surr = phaseran(y1, nsim) # Y2surr = phaseran(y2, nsim) # # compute correlations # Y1s = preprocessing.scale(Y1surr) # Y2s = preprocessing.scale(Y2surr) # n = np.size(y1) # C = np.dot(np.transpose(Y1s), Y2s) / (n-1) # rSim = np.diag(C) # # compute fraction of values higher than observed # F = np.sum(np.abs(rSim) >= np.abs(r)) / nsim # # establish significance # signif = F < alpha # significant or not? # return r, signif, F ''' The FDR procedures translated from fdr.R by Dr. Chris Paciorek (https://www.stat.berkeley.edu/~paciorek/research/code/code.html) '''
[docs]def fdr_basic(pvals,qlevel=0.05): ''' The basic FDR of Benjamini & Hochberg (1995). Parameters ---------- pvals : list or array A vector of p-values on which to conduct the multiple testing. qlevel : float The proportion of false positives desired. Returns ------- fdr_res : array or None A vector of the indices of the significant tests; None if no significant tests See also -------- pyleoclim.utils.correlation.fdf : Determine significance based on the false discovery rate References ---------- Benjamini, Yoav; Hochberg, Yosef (1995). "Controlling the false discovery rate: a practical and powerful approach to multiple testing". Journal of the Royal Statistical Society, Series B. 57 (1): 289–300. MR 1325392 ''' n = len(pvals) sorted_pvals = np.sort(pvals) sort_index = np.argsort(pvals) indices = np.arange(1, n+1)*(sorted_pvals <= qlevel*np.arange(1, n+1)/n) num_reject = np.max(indices) if num_reject: indices = np.arange(num_reject) fdr_res = np.sort(sort_index[indices]) else: fdr_res = None return fdr_res
[docs]def fdr_master(pvals, qlevel=0.05, method='original'): ''' Perform various versions of the FDR procedure Parameters ---------- pvals : list or array A vector of p-values on which to conduct the multiple testing. qlevel : float The proportion of false positives desired. method : {'original', 'general'} Method for performing the testing. - 'original' follows Benjamini & Hochberg (1995); - 'general' is much more conservative, requiring no assumptions on the p-values (see Benjamini & Yekutieli (2001)). We recommend using 'original', and if desired, using 'adj_method="mean"' to increase power. Returns ------- fdr_res : array or None A vector of the indices of the significant tests; None if no significant tests See also -------- pyleoclim.utils.correlation.fdf : Determine significance based on the false discovery rate References ---------- - Benjamini, Yoav; Hochberg, Yosef (1995). "Controlling the false discovery rate: a practical and powerful approach to multiple testing". Journal of the Royal Statistical Society, Series B. 57 (1): 289–300. MR 1325392 - Benjamini, Yoav; Yekutieli, Daniel (2001). "The control of the false discovery rate in multiple testing under dependency". Annals of Statistics. 29 (4): 1165–1188. doi:10.1214/aos/1013699998 ''' if method == 'general': n = len(pvals) qlevel = qlevel / np.sum(1/np.arange(1, n+1)) fdr_res = fdr_basic(pvals, qlevel) return fdr_res
[docs]def storey(edf_quantile, pvals): ''' The basic Storey (2002) estimator of a, the proportion of alternative hypotheses. Parameters ---------- edf_quantile : float The quantile of the empirical distribution function at which to estimate a. pvals : list or array A vector of p-values on which to estimate a Returns ------- a : int estimate of a, the number of alternative hypotheses See also -------- pyleoclim.utils.correlation.fdf : Determine significance based on the false discovery rate References ---------- - Storey, J.D., 2002, A direct approach to False Discovery Rates. Journal of the Royal Statistical Society, Series B, 64, 3, 479-498 ''' if edf_quantile >= 1 or edf_quantile <= 0: raise ValueError(f'Wrong edf_quantile: {edf_quantile}; must be within (0, 1)!') pvals = np.array(pvals) a = (np.mean(pvals<=edf_quantile) - edf_quantile) / (1 - edf_quantile) a = np.max(a, 0) # set to 0 if a is negative return a
[docs]def prop_alt(pvals, adj_method='mean', adj_args={'edf_lower': 0.8, 'num_steps': 20}): ''' Calculate an estimate of a, the proportion of alternative hypotheses, using one of several methods Parameters ---------- pvals : list or array A vector of p-values on which to estimate a adj_method: str; {'mean', 'storey', 'two-stage'} Method for increasing the power of the procedure by estimating the proportion of alternative p-values. - 'mean', the modified Storey estimator that we suggest in Ventura et al. (2004) - 'storey', the method of Storey (2002) - 'two-stage', the iterative approach of Benjamini et al. (2001) adj_args : dict - for "mean", specify "edf_lower", the smallest quantile at which to estimate a, and "num_steps", the number of quantiles to use the approach uses the average of the Storey (2002) estimator for the num_steps quantiles starting at "edf_lower" and finishing just less than 1 - for "storey", specify "edf_quantile", the quantile at which to calculate the estimator - for "two-stage", the method uses a standard FDR approach to estimate which p-values are significant this number is the estimate of a; therefore the method requires specification of qlevel, the proportion of false positives and "fdr_method" ('original' or 'general'), the FDR method to be used. We do not recommend 'general' as this is very conservative and will underestimate a. Returns ------- a : int estimate of a, the number of alternative hypotheses See also -------- pyleoclim.utils.correlation.fdf : Determine significance based on the false discovery rate References ---------- - Storey, J.D. (2002). A direct approach to False Discovery Rates. Journal of the Royal Statistical Society, Series B, 64, 3, 479-498 - Benjamini, Yoav; Yekutieli, Daniel (2001). "The control of the false discovery rate in multiple testing under dependency". Annals of Statistics. 29 (4): 1165–1188. doi:10.1214/aos/1013699998 - Ventura, V., Paciorek, C., Risbey, J.S. (2004). Controlling the proportion of falsely rejected hypotheses when conducting multiple tests with climatological data. Journal of climate, 17, 4343-4356 ''' n = len(pvals) if adj_method == 'two-stage': fdr_res = fdr_master(pvals, adj_method['qlevel'], adj_args['fdr_method']) a = len(fdr_res)/n return a elif adj_method == 'storey': if 'edf_quantile' not in adj_args: raise ValueError('`edf_quantile` must be specified in `adj_args`!') a = storey(adj_args['edf_quantile'], pvals) return a elif adj_method == 'mean': if adj_args['edf_lower']>=1 or adj_args['edf_lower']<=0: raise ValueError(f'Wrong edf_lower: {adj_args["edf_lower"]}; must be within (0, 1)!') if adj_args['num_steps']<1 or type(adj_args['num_steps']) is not int: raise ValueError(f'Wrong num_steps: {adj_args["num_steps"]}; must be an integer >= 1') stepsize = (1 - adj_args['edf_lower']) / adj_args['num_steps'] edf_quantiles = np.linspace(adj_args['edf_lower'], adj_args['edf_lower']+stepsize*(adj_args['num_steps']-1), adj_args['num_steps']) a_vec = [storey(edf_q, pvals) for edf_q in edf_quantiles] a = np.mean(a_vec) return a else: raise ValueError('Wrong method: {method}!')
[docs]def cov_shrink_rblw(S, n): """Compute a shrinkage estimate of the covariance matrix using the Rao-Blackwellized Ledoit-Wolf estimator described by Chen et al. 2011 [1] Contributed by `Robert McGibbon <https://rmcgibbo.org/>`_. Parameters ---------- S : array, shape=(n, n) Sample covariance matrix (e.g. estimated with np.cov(X.T)) n : int Number of data points used in the estimate of S. Returns ------- sigma : array, shape=(p, p) Estimated shrunk covariance matrix shrinkage : float The applied covariance shrinkage intensity. Notes ----- See the `covar documentation <https://pythonhosted.org/covar/generated/covar.cov_shrink_rblw.html>`_ for math details. References ---------- - Y. Chen, A. Wiesel and A. O. Hero (2011), Robust Shrinkage Estimation of High-Dimensional Covariance Matrices, IEEE Transactions on Signal Processing, vol. 59, no. 9, pp. 4097-4107, doi:10.1109/TSP.2011.2138698 See Also -------- sklearn.covariance.ledoit_wolf : very similar approach using the same shrinkage target, :math:`T`, but a different method for estimating the shrinkage intensity, :math:`gamma`. """ p = S.shape[0] if S.shape[1] != p: raise ValueError('S must be a (p x p) matrix') alpha = (n-2)/(n*(n+2)) beta = ((p+1)*n - 2) / (n*(n+2)) trace_S = 0 # np.trace(S) trace_S2 = 0 # np.trace(S.dot(S)) for i in range(p): trace_S += S[i,i] for j in range(p): trace_S2 += S[i,j]*S[i,j] U = ((p * trace_S2 / (trace_S*trace_S)) - 1) rho = min(alpha + beta/U, 1) F = (trace_S / p) * np.eye(p) return (1-rho)*np.asarray(S) + rho*F, rho
[docs]def association(y1, y2, statistic='pearsonr',settings=None): ''' Quantify the strength of a relationship (e.g. linear) between paired observations y1 and y2. Parameters ---------- y1 : array, length n vector of (real) numbers of same length as y2, no NaNs allowed y2 : array, length n vector of (real) numbers of same length as y1, no NaNs allowed statistic : str, optional The statistic used to measure the association, to be chosen from a subset of https://docs.scipy.org/doc/scipy/reference/stats.html#association-correlation-tests ['pearsonr','spearmanr','pointbiserialr','kendalltau','weightedtau'] The default is 'pearsonr'. settings : dict, optional optional arguments to modify the behavior of the SciPy association functions Raises ------ ValueError Complains loudly if the requested statistic is not from the list above. Returns ------- res : instance result class structure containing the result. The first element (res[0]) is always the statistic. ''' y1 = np.array(y1, dtype=float) y2 = np.array(y2, dtype=float) assert np.size(y1) == np.size(y2), 'The size of y1 and y2 should be the same' args = {} if settings is None else settings.copy() acceptable_methods = ['linregress','pearsonr','spearmanr','pointbiserialr','kendalltau','weightedtau'] if statistic in acceptable_methods: func = getattr(stats, statistic) res = func(y1,y2,**args) else: raise ValueError(f'Wrong statistic: {statistic}; acceptable choices are {acceptable_methods}') return res