Source code for pyleoclim.utils.decomposition

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 25 06:29:36 2020

@author: deborahkhider
Contains eigendecomposition methods:
Principal Component Analysis, Singular Spectrum Analysis, Multi-channel SSA
"""

__all__ = [
    'mcpca',
    'pca',
    'ssa',
    'mssa',
]

import numpy as np
from sklearn.decomposition import PCA
from .tsutils import standardize
from .tsmodel import ar1_sim
from scipy.linalg import eigh, toeplitz
from nitime import algorithms as alg
from statsmodels.multivariate.pca import PCA
import copy

#------
# Main functions
#------

def mcpca(ys, nMC=200, **pca_kwargs):
    '''Monte-Carlo Principal Component Analysis 
    
    Carries out Principal Component Analysis  (most unfortunately named EOF analysis in the meteorology
    and climate literature) on a data matrix ys.  
    If NaNs are present, they will be infilled via the EM algorithm.
    
    The significance of eigenvalues is gauged against those of AR(1) surrogates fit to the data.
              
    Parameters
    ----------
    ys : 2D numpy array (nt, nrec)
        nt   = number of time samples (assumed identical for all records)
        nrec = number of records (aka variables, channels, etc)
        
    nMC : int 
        the number of Monte-Carlo simulations
    
    pca_kwargs : tuple 
        keyword arguments for the PCA method
    
    Returns
    -------
    res : dict containing:
        
        - eigvals : eigenvalues (nrec,)

        - eigvals95 : eigenvalues of the AR(1) ensemble (nrec, nMC)

        - pcs : PC series of all components (nt, rec)

        - eofs : EOFs of all components (nrec, nrec)
    
    References:
    ----------    
    Deininger, M., McDermott, F., Mudelsee, M. et al. (2017): Coherency of late Holocene 
    European speleothem δ18O records linked to North Atlantic Ocean circulation. 
    Climate Dynamics, 49, 595–618. https://doi.org/10.1007/s00382-016-3360-8

    Written by Jun Hu (Rice University).
    Adapted for pyleoclim by Julien Emile-Geay (USC)
    '''
    nt, nrec = ys.shape
    
    pc_mc = np.zeros((nt,nrec)) # principal components
    eof_mc = np.zeros((nrec,nrec))  #eof (spatial loadings)
    #eigenvalue matrices
    eigvals = np.zeros((nrec))
    eig_ar1 = np.zeros((nrec,nMC))

    # apply PCA algorithm to the data matrix     
    pc = PCA(ys,ncomp=nrec, **pca_kwargs) # TODO : implement EM infilling with missing = ‘fill-em’ 
    eigvals = pc.eigenvals
    
    # generate surrogate matrix
    y_ar1 = np.full((nt,nrec,nMC), 0, dtype=np.double)
    
    for i in range(nrec):
        yi = copy.deepcopy(ys[:,i])
        # generate nMC AR(1) surrogates
        y_ar1[:,i,:] = ar1_sim(yi, nMC)
        # assign PC and EOF matrices
        if pc.loadings[:,i][0]>0:
            eof_mc[:,i]  = pc.loadings[:,i]
            pc_mc[:,i]   = pc.factors[:,i]
        else:   # flip sign (arbitrary)
            eof_mc[:,i]  = -pc.loadings[:,i] 
            pc_mc[:,i]   = -pc.factors[:,i]
            
    # loop over Monte Carlo iterations     
    for m in range(nMC):    
        pc_ar1 = PCA(y_ar1[:,:,m],ncomp=nrec,**pca_kwargs)
        eig_ar1[:,m] = pc_ar1.eigenvals
 
    eig95 = np.percentile(eig_ar1, 95, axis=1)
                
    # assign result
    res = {'eigvals': eigvals, 'eigvals95': eig95, 'pcs': pc_mc, 'eofs': eof_mc}

    return res



[docs]def pca(ys,n_components=None,copy=True,whiten=False, svd_solver='auto',tol=0.0,iterated_power='auto',random_state=None): '''Principal Component Analysis (Empirical Orthogonal Functions) Decomposition of a signal or data set in terms of orthogonal basis functions. From scikit-learn: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html Parameters ---------- ys : array timeseries n_components : int,None,or str [default: None] Number of components to keep. if n_components is not set all components are kept: If n_components == 'mle' and svd_solver == 'full', Minka’s MLE is used to guess the dimension. Use of n_components == 'mle' will interpret svd_solver == 'auto' as svd_solver == 'full'. If 0 < n_components < 1 and svd_solver == 'full', select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. If svd_solver == 'arpack', the number of components must be strictly less than the minimum of n_features and n_samples. copy : bool,optional [default: True] If False, data passed to fit are overwritten and running fit(X).transform(X) will not yield the expected results, use fit_transform(X) instead. whiten : bool,optional [default: False] When True (False by default) the components_ vectors are multiplied by the square root of n_samples and then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances. svd_solver : str {‘auto’, ‘full’, ‘arpack’, ‘randomized’} If auto : The solver is selected by a default policy based on X.shape and n_components: if the input data is larger than 500x500 and the number of components to extract is lower than 80% of the smallest dimension of the data, then the more efficient ‘randomized’ method is enabled. Otherwise the exact full SVD is computed and optionally truncated afterwards. If full : run exact full SVD calling the standard LAPACK solver via scipy.linalg.svd and select the components by postprocessing If arpack : run SVD truncated to n_components calling ARPACK solver via scipy.sparse.linalg.svds. It requires strictly 0 < n_components < min(X.shape) If randomized : run randomized SVD by the method of Halko et al. tol : float >= 0 ,optional [default: 0] Tolerance for singular values computed by svd_solver == ‘arpack’. iterated_power : int >= 0, or string {'auto'} [default: 'auto'] Number of iterations for the power method computed by svd_solver == ‘randomized’. random_state : int, RandomState instance, or None, optional [default: None] If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random. Used when svd_solver == ‘arpack’ or ‘randomized’. Returns ------- dict Sklearn PCA object dictionary of all attributes and values. -components_array, shape (n_components, n_features) Principal axes in feature space, representing the directions of maximum variance in the data. The components are sorted by explained_variance_. -explained_variance_array, shape (n_components,) The amount of variance explained by each of the selected components. Equal to n_components largest eigenvalues of the covariance matrix of X. New in version 0.18. -explained_variance_ratio_array, shape (n_components,) Percentage of variance explained by each of the selected components. If n_components is not set then all components are stored and the sum of the ratios is equal to 1.0. -singular_values_array, shape (n_components,) The singular values corresponding to each of the selected components. The singular values are equal to the 2-norms of the n_components variables in the lower-dimensional space. New in version 0.19. -mean_array, shape (n_features,) Per-feature empirical mean, estimated from the training set. Equal to X.mean(axis=0). -n_components_int The estimated number of components. When n_components is set to ‘mle’ or a number between 0 and 1 (with svd_solver == ‘full’) this number is estimated from input data. Otherwise it equals the parameter n_components, or the lesser value of n_features and n_samples if n_components is None. -n_features_int Number of features in the training data. -n_samples_int Number of samples in the training data. -noise_variance_float The estimated noise covariance following the Probabilistic PCA model from Tipping and Bishop 1999. See “Pattern Recognition and Machine Learning” by C. Bishop, 12.2.1 p. 574 or http://www.miketipping.com/papers/met-mppca.pdf. It is required to compute the estimated data covariance and score samples. Equal to the average of (min(n_features, n_samples) - n_components) smallest eigenvalues of the covariance matrix of X. ''' if np.any(np.isnan(ys)): raise ValueError('matrix may not have null values.') pca=PCA(n_components=n_components,copy=copy,whiten=whiten,svd_solver=svd_solver,tol=tol,iterated_power=iterated_power,random_state=random_state) return pca.fit(ys).__dict__
[docs]def mssa(ys, M=None, nMC=0, f=0.3): '''Multi-channel singular spectrum analysis analysis Multivariate generalization of SSA [2], using the original algorithm of [1]. Each variable is called a channel, hence the name. Parameters ---------- ys : array multiple time series (dimension: length of time series x total number of time series) M : int window size (embedding dimension, default: 10% of the length of the series) nMC : int Number of iteration in the Monte-Carlo process [default=0, no Monte Carlo process] f : float fraction (0<f<=1) of good data points for identifying significant PCs [f = 0.3] Returns ------- res : dict Containing: - eigvals : array of eigenvalue spectrum - eigvals05 : The 5% percentile of eigenvalues - eigvals95 : The 95% percentile of eigenvalues - PC : matrix of principal components (2D array) - RC : matrix of RCs (nrec,N,nrec*M) (2D array) References ---------- [1]_ Vautard, R., and M. Ghil (1989), Singular spectrum analysis in nonlinear dynamics, with applications to paleoclimatic time series, Physica D, 35, 395–424. [2]_ Jiang, N., J. D. Neelin, and M. Ghil (1995), Quasi-quadrennial and quasi-biennial variability in the equatorial Pacific, Clim. Dyn., 12, 101-112. See Also -------- pyleoclim.utils.decomposition.ssa : Singular Spectrum Analysis (single channel) ''' N = len(ys[:, 0]) nrec = len(ys[0, :]) if M == None: M=int(N/10) Y = np.zeros((N - M + 1, nrec * M)) for irec in np.arange(nrec): for m in np.arange(0, M): Y[:, m + irec * M] = ys[m:N - M + 1 + m, irec] C = np.dot(np.nan_to_num(np.transpose(Y)), np.nan_to_num(Y)) / (N - M + 1) D, eigvecs = eigh(C) sort_tmp = np.sort(D) eigvals = sort_tmp[::-1] sortarg = np.argsort(-D) eigvecs = eigvecs[:, sortarg] # test the signifiance using Monte-Carlo Ym = np.zeros((N - M + 1, nrec * M)) noise = np.zeros((nrec, N, nMC)) for irec in np.arange(nrec): noise[irec, 0, :] = ys[0, irec] eigvals_R = np.zeros((nrec * M, nMC)) # estimate coefficents of ar1 processes, and then generate ar1 time series (noise) # TODO : update to use ar1_sim(), as in ssa() for irec in np.arange(nrec): Xs = ys[:, irec] coefs_est, var_est = alg.AR_est_YW(Xs[~np.isnan(Xs)], 1) sigma_est = np.sqrt(var_est) for jt in range(1, N): noise[irec, jt, :] = coefs_est * noise[irec, jt - 1, :] + sigma_est * np.random.randn(1, nMC) for m in range(nMC): for irec in np.arange(nrec): noise[irec, :, m] = (noise[irec, :, m] - np.mean(noise[irec, :, m])) / ( np.std(noise[irec, :, m], ddof=1)) for im in np.arange(0, M): Ym[:, im + irec * M] = noise[irec, im:N - M + 1 + im, m] Cn = np.dot(np.nan_to_num(np.transpose(Ym)), np.nan_to_num(Ym)) / (N - M + 1) # eigvals_R[:,m] = np.diag(np.dot(np.dot(eigvecs,Cn),np.transpose(eigvecs))) eigvals_R[:, m] = np.diag(np.dot(np.dot(np.transpose(eigvecs), Cn), eigvecs)) eigvals95 = np.percentile(eigvals_R, 95, axis=1) eigvals05 = np.percentile(eigvals_R, 5, axis=1) # determine principal component time series PC = np.zeros((N - M + 1, nrec * M)) PC[:, :] = np.nan for k in np.arange(nrec * M): for i in np.arange(0, N - M + 1): # modify for nan prod = Y[i, :] * eigvecs[:, k] ngood = sum(~np.isnan(prod)) # must have at least m*f good points if ngood >= M * f: PC[i, k] = sum(prod[~np.isnan(prod)]) # the columns of this matrix are Ak(t), k=1 to M (T-PCs) # compute reconstructed timeseries Np = N - M + 1 RC = np.zeros((nrec, N, nrec * M)) for k in np.arange(nrec): for im in np.arange(M): x2 = np.dot(np.expand_dims(PC[:, im], axis=1), np.expand_dims(eigvecs[0 + k * M:M + k * M, im], axis=0)) x2 = np.flipud(x2) for n in np.arange(N): RC[k, n, im] = np.diagonal(x2, offset=-(Np - 1 - n)).mean() res = {'eigvals': eigvals, 'eigvecs': eigvecs, 'q05': eigvals05, 'q95': eigvals95, 'PC': PC, 'RC': RC} return res
[docs]def ssa(y, M=None, nMC=0, f=0.5, trunc=None, var_thresh = 80): '''Singular spectrum analysis Nonparametric eigendecomposition of timeseries into orthogonal oscillations. This implementation uses the method of [1], with applications presented in [2]. Optionally (nMC>0), the significance of eigenvalues is assessed by Monte-Carlo simulations of an AR(1) model fit to X, using [3]. The method expects regular spacing, but is tolerant to missing values, up to a fraction 0<f<1 (see [4]). Parameters ---------- y : array of length N time series (evenly-spaced, possibly with up to f*N NaNs) M : int window size (default: 10% of N) nMC : int Number of iterations in the Monte-Carlo simulation (default nMC=0, bypasses Monte Carlo SSA) Currently only supported for evenly-spaced, gap-free data. f : float maximum allowable fraction of missing values. (Default is 0.5) trunc : str if present, truncates the expansion to a level K < M owing to one of 3 criteria: (1) 'kaiser': variant of the Kaiser-Guttman rule, retaining eigenvalues larger than the median (2) 'mc-ssa': Monte-Carlo SSA (use modes above the 95% threshold) (3) 'var': first K modes that explain at least var_thresh % of the variance. Default is None, which bypasses truncation (K = M) var_thresh : float variance threshold for reconstruction (only impactful if trunc is set to 'var') Returns ------- res : dict Containing: - eigvals : (M, ) array of eigenvalue spectrum - eigvecs : Matrix of temporal eigenvectors - PC : (N - M + 1, M) array of principal components - RCmat : (N, M) array of reconstructed components - RCseries : (N,) reconstructed series - pctvar: (M, ) array of the fraction of variance (%) associated with each mode - eigvals_q : (M, ) array contaitning the 5% and 95% quantiles of the Monte-Carlo eigenvalue spectrum [ if nMC >0 ] References ---------- [1]_ Vautard, R., and M. Ghil (1989), Singular spectrum analysis in nonlinear dynamics, with applications to paleoclimatic time series, Physica D, 35, 395–424. [2]_ Ghil, M., R. M. Allen, M. D. Dettinger, K. Ide, D. Kondrashov, M. E. Mann, A. Robertson, A. Saunders, Y. Tian, F. Varadi, and P. Yiou (2002), Advanced spectral methods for climatic time series, Rev. Geophys., 40(1), 1003–1052, doi:10.1029/2000RG000092. [3]_ Allen, M. R., and L. A. Smith (1996), Monte Carlo SSA: Detecting irregular oscillations in the presence of coloured noise, J. Clim., 9, 3373–3404. [4]_ Schoellhamer, D. H. (2001), Singular spectrum analysis for time series with missing data, Geophysical Research Letters, 28(16), 3187–3190, doi:10.1029/2000GL012698. See Also -------- pyleoclim.utils.decomposition.mssa : Multi-channel SSA ''' ys, mu, scale = standardize(y) N = len(ys) if M == None: M=int(N/10) c = np.zeros(M) for j in range(M): prod = ys[0:N - j] * ys[j:N] c[j] = sum(prod[~np.isnan(prod)]) / (sum(~np.isnan(prod)) - 1) C = toeplitz(c[0:M]) #form correlation matrix D, eigvecs = eigh(C) # solve eigendecomposition sort_tmp = np.sort(D) eigvals = sort_tmp[::-1] sortarg = np.argsort(-D) eigvecs = eigvecs[:, sortarg] # determine principal component time series PC = np.zeros((N - M + 1, M)) PC[:, :] = np.nan for k in np.arange(M): for i in np.arange(0, N - M + 1): # modify for nan prod = ys[i:i + M] * eigvecs[:, k] ngood = sum(~np.isnan(prod)) # must have at least m*f good points if ngood >= M * f: PC[i, k] = sum( prod[~np.isnan(prod)]) * M / ngood # the columns of this matrix are Ak(t), k=1 to M (T-PCs) pctvar = eigvals**2/np.sum(eigvals**2)*100 # percent variance if nMC > 0: # If Monte-Carlo SSA is requested. noise = ar1_sim(ys, nMC) # generate MC AR(1) surrogates of y eigvals_R = np.zeros((M,nMC)) # define eigenvalue matrix lgs = np.arange(-N + 1, N) for m in range(nMC): xn, _ , _ = standardize(noise[:, m]) # center and standardize Gn = np.correlate(xn, xn, "full") Gn = Gn / (N - abs(lgs)) Cn = toeplitz(Gn[N - 1:N - 1 + M]) eigvals_R[:, m] = np.diag(np.dot(np.dot(np.transpose(eigvecs), Cn), eigvecs)) eigvals_q = np.empty((M,2)) eigvals_q[:,0] = np.percentile(eigvals_R, 5, axis=1) eigvals_q[:,1] = np.percentile(eigvals_R, 95, axis=1) mode_idx = np.where(eigvals>=eigvals_q[:,1])[0] # modes to retain else: eigvals_q = None if trunc == 'mcssa': if nMC == 0: raise ValueError('nMC must be larger than 0 to enable MC-SSA truncation') elif trunc == 'kaiser': mval = np.median(eigvals) # median eigenvalues mode_idx = np.where(eigvals>=mval)[0] elif trunc == 'var': mode_idx = np.arange(np.argwhere(np.cumsum(pctvar)>=var_thresh)[0]+1) else: mode_idx = np.arange(M) # compute reconstructed timeseries Np = N - M + 1 RCmat = np.zeros((N, M)) for im in range(M): xdum = np.dot(np.expand_dims(PC[:, im], axis=1), np.expand_dims(eigvecs[0:M, im], axis=0)) xdum = np.flipud(xdum) for n in np.arange(N): RCmat[n, im] = np.diagonal(xdum, offset=-(Np - 1 - n)).mean() del xdum #RCmat = RCmat + np.tile(mu, reps=[N, M]) # restore the mean and variance RCseries = scale*RCmat[:,mode_idx].sum(axis=1) + mu # export results res = {'eigvals': eigvals, 'eigvecs': eigvecs, 'PC': PC, 'RCseries': RCseries, 'RCmat': RCmat, 'pctvar': pctvar, 'eigvals_q': eigvals_q, 'mode_idx': mode_idx} return res