Source code for pyleoclim.utils.decomposition

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Eigendecomposition methods:
Singular Spectrum Analysis (SSA).
Coming soon: PCA, Monte-Carlo PCA, Multi-Channel SSA.
"""

__all__ = [
    'ssa'
]

import numpy as np
#from sklearn.decomposition import PCA
#from statsmodels.multivariate.pca import PCA
from .tsutils import standardize
from .tsmodel import ar1_sim
from .correlation import cov_shrink_rblw
from scipy.linalg import eigh, toeplitz, ishermitian, eig
from kneed import KneeLocator
import warnings
#from nitime import algorithms as alg
#import copy

#------
# Main functions
#------

# def mcpca(ys, nMC=200, **pca_kwargs):
#     '''Monte-Carlo Principal Component Analysis

#     Carries out Principal Component Analysis  (most unfortunately named EOF analysis in the meteorology
#     and climate literature) on a data matrix ys.

#     The significance of eigenvalues is gauged against those of AR(1) surrogates fit to the data.

#     TODO: enable for ensembles and generally debug

#     Parameters
#     ----------
#     ys : 2D numpy array (nt, nrec)
#         nt   = number of time samples (assumed identical for all records)
#         nrec = number of records (aka variables, channels, etc)

#     nMC : int
#         the number of Monte-Carlo simulations

#     pca_kwargs : tuple
#         keyword arguments for the PCA method

#     Returns
#     -------
#     res : dict containing:

#         - eigvals : eigenvalues (nrec,)

#         - eigvals95 : eigenvalues of the AR(1) ensemble (nrec, nMC)

#         - pcs : PC series of all components (nt, nrec)

#         - eofs : EOFs of all components (nrec, nrec)

#     References:
#     ----------
#     Deininger, M., McDermott, F., Mudelsee, M. et al. (2017): Coherency of late Holocene
#     European speleothem δ18O records linked to North Atlantic Ocean circulation.
#     Climate Dynamics, 49, 595–618. https://doi.org/10.1007/s00382-016-3360-8

#     Written by Jun Hu (Rice University).
#     Adapted for pyleoclim by Julien Emile-Geay (USC)
#     '''
#     nt, nrec = ys.shape

#     ncomp = min(nt,nrec)

#     pc_mc = np.zeros((nt,nrec)) # principal components
#     eof_mc = np.zeros((nrec,nrec))  #eof (spatial loadings)
#     #eigenvalue matrices
#     eigvals = np.zeros((nrec))
#     eig_ar1 = np.zeros((nrec,nMC))

#     # apply PCA algorithm to the data matrix
#     pca_res = PCA(ys,ncomp=ncomp, **pca_kwargs)
#     eigvals = pca_res.eigenvals

#     # generate surrogate matrix
#     y_ar1 = np.full((nt,nrec,nMC), 0, dtype=np.double)

#     for i in range(nrec):
#         yi = copy.deepcopy(ys[:,i])
#         # generate nMC AR(1) surrogates
#         y_ar1[:,i,:] = ar1_sim(yi, nMC)
#         # assign PC and EOF matrices
#         if pca_res.loadings[:,i][0]>0:
#             eof_mc[:,i]  = pca_res.loadings[:,i]
#             pc_mc[:,i]   = pca_res.factors[:,i]
#         else:   # flip sign (arbitrary)
#             eof_mc[:,i]  = -pca_res.loadings[:,i]
#             pc_mc[:,i]   = -pca_res.factors[:,i]
#         # estimate effective sample size
#         #PC1 =
#         neff[i] = tsutils.eff_sample_size(PC1)

#     # loop over Monte Carlo iterations
#     for m in range(nMC):
#         pc_ar1 = PCA(y_ar1[:,:,m],ncomp=nrec,**pca_kwargs)
#         eig_ar1[:,m] = pc_ar1.eigenvals

#     eig95 = np.percentile(eig_ar1, 95, axis=1)


#     # assign result
#     #res = {'eigvals': eigvals, 'eigvals95': eig95, 'pcs': pc_mc, 'eofs': eof_mc}

#     # compute effective sample size
#     PC1  = pca_res.factors[:,0]
#     neff = tsutils.eff_sample_size(PC1)

#     # compute percent variance
#     pctvar = pca_res.eigenvals**2/np.sum(pca_res.eigenvals**2)*100

#     # assign result to the MultivariateDecomp class
#     # Note: need to grab coordinates from Series or LiPDSeries
#     res = MultivariateDecomp(name='PCA', time = self.series_list[0].time, neff= neff,
#                         pcs = pca_res.scores, pctvar = pctvar,  locs = None,
#                         eigvals = pca_res.eigenvals, eigvecs = pca_res.eigenvecs)

#     return res
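
# The gist of the Monte-Carlo eigenvalue test described in the draft above, as a
# standalone sketch (kept commented out, like the draft itself). It is an assumption,
# not part of the library: it uses a plain SVD rather than statsmodels' PCA, and
# presumes ar1_sim() accepts a 1D series and a number of surrogates, as it does in
# ssa() below. The name _mc_eigvals is hypothetical.
#
#     def _mc_eigvals(ys, nMC=200):
#         nt, nrec = ys.shape
#         ysc = (ys - ys.mean(axis=0)) / ys.std(axis=0, ddof=1)          # standardize columns
#         eigvals = np.linalg.svd(ysc, compute_uv=False)**2 / (nt - 1)   # data eigenvalues
#         eig_ar1 = np.zeros((min(nt, nrec), nMC))                       # null eigenvalue spectra
#         surr = np.stack([ar1_sim(ysc[:, i], nMC) for i in range(nrec)], axis=1)  # (nt, nrec, nMC)
#         for m in range(nMC):
#             sm = surr[:, :, m]
#             sm = (sm - sm.mean(axis=0)) / sm.std(axis=0, ddof=1)
#             eig_ar1[:, m] = np.linalg.svd(sm, compute_uv=False)**2 / (nt - 1)
#         eig95 = np.percentile(eig_ar1, 95, axis=1)   # 95% quantile of the AR(1) null
#         return eigvals, eig95                        # modes with eigvals > eig95 are deemed significant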


# def mssa(ys, M=None, nMC=0, f=0.3):
#     '''Multi-channel singular spectrum analysis (M-SSA)

#     Multivariate generalization of SSA [2], using the original algorithm of [1].
#     Each variable is called a channel, hence the name.

#     Parameters
#     ----------

#     ys : array
#           multiple time series (dimension: length of time series x total number of time series)

#     M : int
#        window size (embedding dimension, default: 10% of the length of the series)

#     nMC : int
#        Number of iterations in the Monte-Carlo process [default=0, no Monte-Carlo process]

#     f : float
#        fraction (0<f<=1) of good data points for identifying significant PCs [f = 0.3]

#     Returns
#     -------
#     res : dict
#         Containing:

#         - eigvals : array of eigenvalue spectrum

#         - eigvals05 : The 5% percentile of eigenvalues

#         - eigvals95 : The 95% percentile of eigenvalues

#         - PC : matrix of principal components (2D array)

#         - RC : matrix of reconstructed components, shape (nrec, N, nrec*M) (3D array)

#     References
#     ----------
#     [1]_ Vautard, R., and M. Ghil (1989), Singular spectrum analysis in nonlinear
#     dynamics, with applications to paleoclimatic time series, Physica D, 35,
#     395–424.

#     [2]_ Jiang, N., J. D. Neelin, and M. Ghil (1995), Quasi-quadrennial and
#     quasi-biennial variability in the equatorial Pacific, Clim. Dyn., 12, 101-112.

#     See Also
#     --------

#     pyleoclim.utils.decomposition.ssa : Singular Spectrum Analysis (single channel)

#     '''
#     N = len(ys[:, 0])
#     nrec = len(ys[0, :])
#     if M == None:
#         M=int(N/10)
#     Y = np.zeros((N - M + 1, nrec * M))
#     for irec in np.arange(nrec):
#         for m in np.arange(0, M):
#             Y[:, m + irec * M] = ys[m:N - M + 1 + m, irec]

#     C = np.dot(np.nan_to_num(np.transpose(Y)), np.nan_to_num(Y)) / (N - M + 1)
#     D, eigvecs = eigh(C)

#     sort_tmp = np.sort(D)
#     eigvals = sort_tmp[::-1]
#     sortarg = np.argsort(-D)

#     eigvecs = eigvecs[:, sortarg]

#     # test the significance using Monte-Carlo
#     Ym = np.zeros((N - M + 1, nrec * M))
#     noise = np.zeros((nrec, N, nMC))
#     for irec in np.arange(nrec):
#         noise[irec, 0, :] = ys[0, irec]
#     eigvals_R = np.zeros((nrec * M, nMC))
#     # estimate coefficients of AR(1) processes, and then generate AR(1) time series (noise)
#     # TODO : update to use ar1_sim(), as in ssa() (see the sketch after this function)
#     for irec in np.arange(nrec):
#         Xs = ys[:, irec]
#         coefs_est, var_est = alg.AR_est_YW(Xs[~np.isnan(Xs)], 1)
#         sigma_est = np.sqrt(var_est)

#         for jt in range(1, N):
#             noise[irec, jt, :] = coefs_est * noise[irec, jt - 1, :] + sigma_est * np.random.randn(1, nMC)

#     for m in range(nMC):
#         for irec in np.arange(nrec):
#             noise[irec, :, m] = (noise[irec, :, m] - np.mean(noise[irec, :, m])) / (
#                 np.std(noise[irec, :, m], ddof=1))
#             for im in np.arange(0, M):
#                 Ym[:, im + irec * M] = noise[irec, im:N - M + 1 + im, m]
#         Cn = np.dot(np.nan_to_num(np.transpose(Ym)), np.nan_to_num(Ym)) / (N - M + 1)
#         # eigvals_R[:,m] = np.diag(np.dot(np.dot(eigvecs,Cn),np.transpose(eigvecs)))
#         eigvals_R[:, m] = np.diag(np.dot(np.dot(np.transpose(eigvecs), Cn), eigvecs))

#     eigvals95 = np.percentile(eigvals_R, 95, axis=1)
#     eigvals05 = np.percentile(eigvals_R, 5, axis=1)


#     # determine principal component time series
#     PC = np.zeros((N - M + 1, nrec * M))
#     PC[:, :] = np.nan
#     for k in np.arange(nrec * M):
#         for i in np.arange(0, N - M + 1):
#             #   modify for nan
#             prod = Y[i, :] * eigvecs[:, k]
#             ngood = sum(~np.isnan(prod))
#             #   must have at least m*f good points
#             if ngood >= M * f:
#                 PC[i, k] = sum(prod[~np.isnan(prod)])  # the columns of this matrix are Ak(t), k=1 to M (T-PCs)

#     # compute reconstructed timeseries
#     Np = N - M + 1

#     RC = np.zeros((nrec, N, nrec * M))

#     for k in np.arange(nrec):
#         for im in np.arange(M):
#             x2 = np.dot(np.expand_dims(PC[:, im], axis=1), np.expand_dims(eigvecs[0 + k * M:M + k * M, im], axis=0))
#             x2 = np.flipud(x2)

#             for n in np.arange(N):
#                 RC[k, n, im] = np.diagonal(x2, offset=-(Np - 1 - n)).mean()
#     res = {'eigvals': eigvals, 'eigvecs': eigvecs, 'q05': eigvals05, 'q95': eigvals95, 'PC': PC, 'RC': RC}

#     return res
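
# Re: the TODO above — the hand-rolled AR(1) loop could be swapped for ar1_sim(),
# as in ssa() below. A sketch of that substitution (an assumption, not tested here;
# it presumes gap-free channels, since ar1_sim() returns an (N, nMC) surrogate array):
#
#     noise = np.empty((nrec, N, nMC))
#     for irec in range(nrec):
#         noise[irec, :, :] = ar1_sim(ys[:, irec], nMC)   # nMC AR(1) surrogates for channel irec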

def ssa(y, M=None, nMC=0, f=0.5, trunc=None, var_thresh=80, online=True):
    '''Singular spectrum analysis

    Nonparametric eigendecomposition of a timeseries into orthogonal oscillations.
    This implementation uses the method of [1], with applications presented in [2].
    Optionally (nMC>0), the significance of eigenvalues is assessed by Monte-Carlo
    simulations of an AR(1) model fit to y, using [3]. The method expects regular
    spacing, but is tolerant to missing values, up to a fraction 0<f<1 (see [4]).

    Parameters
    ----------
    y : array of length N
        time series (evenly-spaced, possibly with up to f*N NaNs)

    M : int
        window size (default: 10% of N)

    nMC : int
        Number of iterations in the Monte-Carlo simulation (default nMC=0, bypasses Monte Carlo SSA).
        Currently only supported for evenly-spaced, gap-free data.

    f : float
        maximum allowable fraction of missing values (default: 0.5)

    trunc : str
        if present, truncates the expansion to a level K < M owing to one of 4 criteria:

            (1) 'kaiser': variant of the Kaiser-Guttman rule, retaining eigenvalues larger than the median
            (2) 'mcssa': Monte-Carlo SSA (retain modes above the 95% quantile from an AR(1) process)
            (3) 'var': first K modes that explain at least var_thresh % of the variance
            (4) 'knee': K set wherever the "knee" of the screeplot occurs

        Default is None, which bypasses truncation (K = M).
        'knee' is recommended as a first pass at identifying significant modes, as it tends to be
        more robust than 'kaiser' or 'var', and faster than 'mcssa'. While no truncation method is
        imposed by default, if the goal is to enhance the S/N ratio and reconstruct a smooth version
        of the attractor's skeleton, the knee-finding method is a good compromise between objectivity
        and efficiency.
        See kneed's `documentation <https://kneed.readthedocs.io/en/latest/index.html>`_
        for more details on the knee-finding algorithm.

    var_thresh : float
        variance threshold for reconstruction (only impactful if trunc is set to 'var')

    online : bool; {True,False}
        Whether to conduct the knee-finding analysis online or offline.
        Only used when trunc = 'knee'. Default is True.
        See kneed's `documentation <https://kneed.readthedocs.io/en/latest/api.html#kneelocator>`_ for details.

    Returns
    -------
    res : dict containing:

        - eigvals : (M, ) array of eigenvalues

        - eigvecs : (M, M) Matrix of temporal eigenvectors (T-EOFs)

        - PC : (N - M + 1, M) array of principal components (T-PCs)

        - RCmat : (N, M) array of reconstructed components

        - RCseries : (N,) reconstructed series, with mean and variance restored

        - pctvar : (M, ) array of the fraction of variance (%) associated with each mode

        - eigvals_q : (M, 2) array containing the 5% and 95% quantiles of the Monte-Carlo eigenvalue spectrum [if nMC > 0]

        - mode_idx : array of indices of eigenvalues >= eigvals_q

    References
    ----------
    [1]_ Vautard, R., and M. Ghil (1989), Singular spectrum analysis in nonlinear
    dynamics, with applications to paleoclimatic time series, Physica D, 35,
    395–424.

    [2]_ Ghil, M., R. M. Allen, M. D. Dettinger, K. Ide, D. Kondrashov, M. E. Mann,
    A. Robertson, A. Saunders, Y. Tian, F. Varadi, and P. Yiou (2002),
    Advanced spectral methods for climatic time series, Rev. Geophys., 40(1),
    1003–1052, doi:10.1029/2000RG000092.

    [3]_ Allen, M. R., and L. A. Smith (1996), Monte Carlo SSA: Detecting irregular
    oscillations in the presence of coloured noise, J. Clim., 9, 3373–3404.

    [4]_ Schoellhamer, D. H. (2001), Singular spectrum analysis for time series
    with missing data, Geophysical Research Letters, 28(16), 3187–3190,
    doi:10.1029/2000GL012698.

    See also
    --------

    pyleoclim.core.series.Series.ssa : Singular Spectrum Analysis for timeseries objects

    pyleoclim.core.ssares.SsaRes.modeplot : plot SSA modes

    pyleoclim.core.ssares.SsaRes.screeplot : plot SSA eigenvalue spectrum

    '''
    ys, mu, scale = standardize(y)

    N = len(ys)

    if M is None:
        M = int(N/10)

    c = np.zeros(M)
    for j in range(M):
        prod = ys[0:N - j] * ys[j:N]
        c[j] = sum(prod[~np.isnan(prod)]) / (sum(~np.isnan(prod)) - 1)

    C = toeplitz(c[0:M])  # form sample correlation matrix

    d, _ = eigh(C)  # extract eigenvalues
    nmodes = np.where(np.real(d) > 0)[0].size  # effective number of modes

    if any(d <= np.finfo(d.dtype).eps):  # if C is singular
        Cr, g = cov_shrink_rblw(C, n=nmodes)  # apply Rao-Blackwellized Ledoit-Wolf estimator
        warnings.warn('Ill-conditioned covariance matrix; regularized with shrinkage factor: {:3.2f}'.format(g), stacklevel=2)
    else:
        Cr = C

    # solve eigendecomposition
    if ishermitian(Cr):
        D, eigvecs = eigh(Cr)
    else:
        D, eigvecs = eig(Cr)

    D[D < 0] = 0  # impose positive eigenvalues

    # rank eigenvalues/vectors in decreasing order
    sort_tmp = np.sort(D)
    eigvals = sort_tmp[::-1]
    sortarg = np.argsort(-D)
    eigvecs = eigvecs[:, sortarg]

    # determine principal component time series
    PC = np.zeros((N - M + 1, M))
    PC[:, :] = np.nan
    for k in np.arange(M):
        for i in np.arange(0, N - M + 1):
            #   modify for nan
            prod = ys[i:i + M] * eigvecs[:, k]
            ngood = sum(~np.isnan(prod))
            #   must have at least M*f good points
            if ngood >= M * f:
                PC[i, k] = sum(prod[~np.isnan(prod)]) * M / ngood  # the columns of this matrix are Ak(t), k=1 to M (T-PCs)

    pctvar = eigvals/np.sum(eigvals)*100  # percent variance

    if nMC > 0:  # If Monte-Carlo SSA is requested, trunc == 'mcssa'
        noise = ar1_sim(ys, nMC)  # generate MC AR(1) surrogates of y
        eigvals_R = np.zeros((M, nMC))  # define eigenvalue matrix
        lgs = np.arange(-N + 1, N)
        for m in range(nMC):
            xn, _, _ = standardize(noise[:, m])  # center and standardize
            Gn = np.correlate(xn, xn, "full")
            Gn = Gn / (N - abs(lgs))
            Cn = toeplitz(Gn[N - 1:N - 1 + M])
            eigvals_R[:, m] = np.diag(np.dot(np.dot(np.transpose(eigvecs), Cn), eigvecs))

        eigvals_q = np.empty((M, 2))
        eigvals_q[:, 0] = np.percentile(eigvals_R, 5, axis=1)
        eigvals_q[:, 1] = np.percentile(eigvals_R, 95, axis=1)
        mode_idx = np.where(eigvals >= eigvals_q[:, 1])[0]
    else:
        eigvals_q = None

    # apply the chosen truncation criterion
    if trunc is None:
        mode_idx = np.arange(M)
    elif trunc == 'kaiser':
        mval = np.median(eigvals)  # median eigenvalue
        mode_idx = np.where(eigvals >= mval)[0]
    elif trunc == 'var':
        mode_idx = np.arange(np.argwhere(np.cumsum(pctvar) >= var_thresh)[0] + 1)
    elif trunc == 'knee':
        modes = np.arange(len(eigvals))
        knee = KneeLocator(x=modes, y=eigvals, curve='convex', direction='decreasing', online=online).knee
        mode_idx = np.arange(knee + 1)

    if nMC == 0 and trunc == 'mcssa':
        raise ValueError('nMC must be larger than 0 to enable MC-SSA truncation')
    elif nMC > 0:
        mode_idx = np.where(eigvals >= eigvals_q[:, 1])[0]

    nmodes = len(mode_idx)

    # compute reconstructed timeseries
    Np = N - M + 1

    RCmat = np.zeros((N, M))

    for im in range(M):
        xdum = np.dot(np.expand_dims(PC[:, im], axis=1), np.expand_dims(eigvecs[0:M, im], axis=0))
        xdum = np.flipud(xdum)

        for n in np.arange(N):
            RCmat[n, im] = np.diagonal(xdum, offset=-(Np - 1 - n)).mean()
        del xdum

    RCmat = scale*RCmat + np.tile(mu, reps=[N, M])  # restore the mean and variance

    #RCseries = scale*RCmat[:,mode_idx].sum(axis=1) + mu
    RCmodes = RCmat[:, mode_idx]
    RCseries = (RCmodes - RCmodes.mean()).sum(axis=1) + mu
    #if nmodes > 1:
    #    RCseries -= scale*mu*(nmodes-1)

    # export results
    res = {'eigvals': eigvals, 'eigvecs': eigvecs, 'PC': PC, 'RCseries': RCseries, 'RCmat': RCmat,
           'pctvar': pctvar, 'eigvals_q': eigvals_q, 'mode_idx': mode_idx}
    return res
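
# A minimal usage sketch of ssa() (not part of the library; the synthetic series and
# parameter choices below are illustrative assumptions only). It would typically be
# run from user code with pyleoclim installed:
#
#     import numpy as np
#     from pyleoclim.utils.decomposition import ssa
#
#     t = np.arange(500)
#     y = np.sin(2 * np.pi * t / 20) + 0.5 * np.random.randn(t.size)  # noisy 20-step oscillation
#     res = ssa(y, M=50, trunc='knee')          # 50-point embedding window, knee truncation
#     clean = res['RCseries']                   # reconstruction from the retained modes
#     print(res['mode_idx'], res['pctvar'][res['mode_idx']])   # retained modes and their % variance
#
#     mc = ssa(y, M=50, nMC=200)                # Monte-Carlo SSA against an AR(1) null
#     sig = mc['mode_idx']                      # modes above the 95% AR(1) quantile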