#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 25 06:29:36 2020
@author: deborahkhider
Contains eigendecomposition methods:
Principal Component Analysis, Singular Spectrum Analysis, Multi-channel SSA
"""
__all__ = [
'mcpca',
'pca',
'ssa',
'mssa',
]
import numpy as np
from sklearn.decomposition import PCA
from .tsutils import standardize
from .tsmodel import ar1_sim
from scipy.linalg import eigh, toeplitz
from nitime import algorithms as alg
from statsmodels.multivariate.pca import PCA
import copy
#------
# Main functions
#------
def mcpca(ys, nMC=200, **pca_kwargs):
'''Monte-Carlo Principal Component Analysis
Carries out Principal Component Analysis (most unfortunately named EOF analysis in the meteorology
and climate literature) on a data matrix ys.
If NaNs are present, they will be infilled via the EM algorithm.
The significance of eigenvalues is gauged against those of AR(1) surrogates fit to the data.
Parameters
----------
ys : 2D numpy array (nt, nrec)
nt = number of time samples (assumed identical for all records)
nrec = number of records (aka variables, channels, etc)
nMC : int
the number of Monte-Carlo simulations
pca_kwargs : tuple
keyword arguments for the PCA method
Returns
-------
res : dict containing:
- eigvals : eigenvalues (nrec,)
- eigvals95 : eigenvalues of the AR(1) ensemble (nrec, nMC)
- pcs : PC series of all components (nt, rec)
- eofs : EOFs of all components (nrec, nrec)
References:
----------
Deininger, M., McDermott, F., Mudelsee, M. et al. (2017): Coherency of late Holocene
European speleothem δ18O records linked to North Atlantic Ocean circulation.
Climate Dynamics, 49, 595–618. https://doi.org/10.1007/s00382-016-3360-8
Written by Jun Hu (Rice University).
Adapted for pyleoclim by Julien Emile-Geay (USC)
'''
nt, nrec = ys.shape
pc_mc = np.zeros((nt,nrec)) # principal components
eof_mc = np.zeros((nrec,nrec)) #eof (spatial loadings)
#eigenvalue matrices
eigvals = np.zeros((nrec))
eig_ar1 = np.zeros((nrec,nMC))
# apply PCA algorithm to the data matrix
pc = PCA(ys,ncomp=nrec, **pca_kwargs) # TODO : implement EM infilling with missing = ‘fill-em’
eigvals = pc.eigenvals
# generate surrogate matrix
y_ar1 = np.full((nt,nrec,nMC), 0, dtype=np.double)
for i in range(nrec):
yi = copy.deepcopy(ys[:,i])
# generate nMC AR(1) surrogates
y_ar1[:,i,:] = ar1_sim(yi, nMC)
# assign PC and EOF matrices
if pc.loadings[:,i][0]>0:
eof_mc[:,i] = pc.loadings[:,i]
pc_mc[:,i] = pc.factors[:,i]
else: # flip sign (arbitrary)
eof_mc[:,i] = -pc.loadings[:,i]
pc_mc[:,i] = -pc.factors[:,i]
# loop over Monte Carlo iterations
for m in range(nMC):
pc_ar1 = PCA(y_ar1[:,:,m],ncomp=nrec,**pca_kwargs)
eig_ar1[:,m] = pc_ar1.eigenvals
eig95 = np.percentile(eig_ar1, 95, axis=1)
# assign result
res = {'eigvals': eigvals, 'eigvals95': eig95, 'pcs': pc_mc, 'eofs': eof_mc}
return res
[docs]def pca(ys,n_components=None,copy=True,whiten=False, svd_solver='auto',tol=0.0,iterated_power='auto',random_state=None):
'''Principal Component Analysis (Empirical Orthogonal Functions)
Decomposition of a signal or data set in terms of orthogonal basis functions.
From scikit-learn: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
Parameters
----------
ys : array
timeseries
n_components : int,None,or str
[default: None]
Number of components to keep. if n_components is not set all components are kept:
If n_components == 'mle' and svd_solver == 'full', Minka’s MLE is used to guess the dimension. Use of n_components == 'mle' will interpret svd_solver == 'auto' as svd_solver == 'full'.
If 0 < n_components < 1 and svd_solver == 'full', select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components.
If svd_solver == 'arpack', the number of components must be strictly less than the minimum of n_features and n_samples.
copy : bool,optional
[default: True]
If False, data passed to fit are overwritten and running fit(X).transform(X) will not yield the expected results, use fit_transform(X) instead.
whiten : bool,optional
[default: False]
When True (False by default) the components_ vectors are multiplied by the square root of n_samples and then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances.
svd_solver : str {‘auto’, ‘full’, ‘arpack’, ‘randomized’}
If auto :
The solver is selected by a default policy based on X.shape and n_components: if the input data is larger than 500x500 and the number of components to extract is lower than 80% of the smallest dimension of the data, then the more efficient ‘randomized’ method is enabled.
Otherwise the exact full SVD is computed and optionally truncated afterwards.
If full :
run exact full SVD calling the standard LAPACK solver via scipy.linalg.svd and select the components by postprocessing
If arpack :
run SVD truncated to n_components calling ARPACK solver via scipy.sparse.linalg.svds. It requires strictly 0 < n_components < min(X.shape)
If randomized :
run randomized SVD by the method of Halko et al.
tol : float >= 0 ,optional
[default: 0]
Tolerance for singular values computed by svd_solver == ‘arpack’.
iterated_power : int >= 0, or string {'auto'}
[default: 'auto']
Number of iterations for the power method computed by svd_solver == ‘randomized’.
random_state : int, RandomState instance, or None, optional
[default: None]
If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used by np.random.
Used when svd_solver == ‘arpack’ or ‘randomized’.
Returns
-------
dict
Sklearn PCA object dictionary of all attributes and values.
-components_array, shape (n_components, n_features)
Principal axes in feature space, representing the directions of maximum variance in the data. The components are sorted by explained_variance_.
-explained_variance_array, shape (n_components,)
The amount of variance explained by each of the selected components.
Equal to n_components largest eigenvalues of the covariance matrix of X.
New in version 0.18.
-explained_variance_ratio_array, shape (n_components,)
Percentage of variance explained by each of the selected components.
If n_components is not set then all components are stored and the sum of the ratios is equal to 1.0.
-singular_values_array, shape (n_components,)
The singular values corresponding to each of the selected components. The singular values are equal to the 2-norms of the n_components variables in the lower-dimensional space.
New in version 0.19.
-mean_array, shape (n_features,)
Per-feature empirical mean, estimated from the training set.
Equal to X.mean(axis=0).
-n_components_int
The estimated number of components. When n_components is set to ‘mle’ or a number between 0 and 1 (with svd_solver == ‘full’) this number is estimated from input data. Otherwise it equals the parameter n_components, or the lesser value of n_features and n_samples if n_components is None.
-n_features_int
Number of features in the training data.
-n_samples_int
Number of samples in the training data.
-noise_variance_float
The estimated noise covariance following the Probabilistic PCA model from Tipping and Bishop 1999. See “Pattern Recognition and Machine Learning” by C. Bishop, 12.2.1 p. 574 or http://www.miketipping.com/papers/met-mppca.pdf. It is required to compute the estimated data covariance and score samples.
Equal to the average of (min(n_features, n_samples) - n_components) smallest eigenvalues of the covariance matrix of X.
'''
if np.any(np.isnan(ys)):
raise ValueError('matrix may not have null values.')
pca=PCA(n_components=n_components,copy=copy,whiten=whiten,svd_solver=svd_solver,tol=tol,iterated_power=iterated_power,random_state=random_state)
return pca.fit(ys).__dict__
[docs]def mssa(ys, M=None, nMC=0, f=0.3):
'''Multi-channel singular spectrum analysis analysis
Multivariate generalization of SSA [2], using the original algorithm of [1].
Each variable is called a channel, hence the name.
Parameters
----------
ys : array
multiple time series (dimension: length of time series x total number of time series)
M : int
window size (embedding dimension, default: 10% of the length of the series)
nMC : int
Number of iteration in the Monte-Carlo process [default=0, no Monte Carlo process]
f : float
fraction (0<f<=1) of good data points for identifying significant PCs [f = 0.3]
Returns
-------
res : dict
Containing:
- eigvals : array of eigenvalue spectrum
- eigvals05 : The 5% percentile of eigenvalues
- eigvals95 : The 95% percentile of eigenvalues
- PC : matrix of principal components (2D array)
- RC : matrix of RCs (nrec,N,nrec*M) (2D array)
References
----------
[1]_ Vautard, R., and M. Ghil (1989), Singular spectrum analysis in nonlinear
dynamics, with applications to paleoclimatic time series, Physica D, 35,
395–424.
[2]_ Jiang, N., J. D. Neelin, and M. Ghil (1995), Quasi-quadrennial and
quasi-biennial variability in the equatorial Pacific, Clim. Dyn., 12, 101-112.
See Also
--------
pyleoclim.utils.decomposition.ssa : Singular Spectrum Analysis (single channel)
'''
N = len(ys[:, 0])
nrec = len(ys[0, :])
if M == None:
M=int(N/10)
Y = np.zeros((N - M + 1, nrec * M))
for irec in np.arange(nrec):
for m in np.arange(0, M):
Y[:, m + irec * M] = ys[m:N - M + 1 + m, irec]
C = np.dot(np.nan_to_num(np.transpose(Y)), np.nan_to_num(Y)) / (N - M + 1)
D, eigvecs = eigh(C)
sort_tmp = np.sort(D)
eigvals = sort_tmp[::-1]
sortarg = np.argsort(-D)
eigvecs = eigvecs[:, sortarg]
# test the signifiance using Monte-Carlo
Ym = np.zeros((N - M + 1, nrec * M))
noise = np.zeros((nrec, N, nMC))
for irec in np.arange(nrec):
noise[irec, 0, :] = ys[0, irec]
eigvals_R = np.zeros((nrec * M, nMC))
# estimate coefficents of ar1 processes, and then generate ar1 time series (noise)
# TODO : update to use ar1_sim(), as in ssa()
for irec in np.arange(nrec):
Xs = ys[:, irec]
coefs_est, var_est = alg.AR_est_YW(Xs[~np.isnan(Xs)], 1)
sigma_est = np.sqrt(var_est)
for jt in range(1, N):
noise[irec, jt, :] = coefs_est * noise[irec, jt - 1, :] + sigma_est * np.random.randn(1, nMC)
for m in range(nMC):
for irec in np.arange(nrec):
noise[irec, :, m] = (noise[irec, :, m] - np.mean(noise[irec, :, m])) / (
np.std(noise[irec, :, m], ddof=1))
for im in np.arange(0, M):
Ym[:, im + irec * M] = noise[irec, im:N - M + 1 + im, m]
Cn = np.dot(np.nan_to_num(np.transpose(Ym)), np.nan_to_num(Ym)) / (N - M + 1)
# eigvals_R[:,m] = np.diag(np.dot(np.dot(eigvecs,Cn),np.transpose(eigvecs)))
eigvals_R[:, m] = np.diag(np.dot(np.dot(np.transpose(eigvecs), Cn), eigvecs))
eigvals95 = np.percentile(eigvals_R, 95, axis=1)
eigvals05 = np.percentile(eigvals_R, 5, axis=1)
# determine principal component time series
PC = np.zeros((N - M + 1, nrec * M))
PC[:, :] = np.nan
for k in np.arange(nrec * M):
for i in np.arange(0, N - M + 1):
# modify for nan
prod = Y[i, :] * eigvecs[:, k]
ngood = sum(~np.isnan(prod))
# must have at least m*f good points
if ngood >= M * f:
PC[i, k] = sum(prod[~np.isnan(prod)]) # the columns of this matrix are Ak(t), k=1 to M (T-PCs)
# compute reconstructed timeseries
Np = N - M + 1
RC = np.zeros((nrec, N, nrec * M))
for k in np.arange(nrec):
for im in np.arange(M):
x2 = np.dot(np.expand_dims(PC[:, im], axis=1), np.expand_dims(eigvecs[0 + k * M:M + k * M, im], axis=0))
x2 = np.flipud(x2)
for n in np.arange(N):
RC[k, n, im] = np.diagonal(x2, offset=-(Np - 1 - n)).mean()
res = {'eigvals': eigvals, 'eigvecs': eigvecs, 'q05': eigvals05, 'q95': eigvals95, 'PC': PC, 'RC': RC}
return res
[docs]def ssa(y, M=None, nMC=0, f=0.5, trunc=None, var_thresh = 80):
'''Singular spectrum analysis
Nonparametric eigendecomposition of timeseries into orthogonal oscillations.
This implementation uses the method of [1], with applications presented in [2].
Optionally (nMC>0), the significance of eigenvalues is assessed by Monte-Carlo simulations of an AR(1) model fit to X, using [3].
The method expects regular spacing, but is tolerant to missing values, up to a fraction 0<f<1 (see [4]).
Parameters
----------
y : array of length N
time series (evenly-spaced, possibly with up to f*N NaNs)
M : int
window size (default: 10% of N)
nMC : int
Number of iterations in the Monte-Carlo simulation (default nMC=0, bypasses Monte Carlo SSA)
Currently only supported for evenly-spaced, gap-free data.
f : float
maximum allowable fraction of missing values. (Default is 0.5)
trunc : str
if present, truncates the expansion to a level K < M owing to one of 3 criteria:
(1) 'kaiser': variant of the Kaiser-Guttman rule, retaining eigenvalues larger than the median
(2) 'mc-ssa': Monte-Carlo SSA (use modes above the 95% threshold)
(3) 'var': first K modes that explain at least var_thresh % of the variance.
Default is None, which bypasses truncation (K = M)
var_thresh : float
variance threshold for reconstruction (only impactful if trunc is set to 'var')
Returns
-------
res : dict
Containing:
- eigvals : (M, ) array of eigenvalue spectrum
- eigvecs : Matrix of temporal eigenvectors
- PC : (N - M + 1, M) array of principal components
- RCmat : (N, M) array of reconstructed components
- RCseries : (N,) reconstructed series
- pctvar: (M, ) array of the fraction of variance (%) associated with each mode
- eigvals_q : (M, ) array contaitning the 5% and 95% quantiles of the Monte-Carlo eigenvalue spectrum [ if nMC >0 ]
References
----------
[1]_ Vautard, R., and M. Ghil (1989), Singular spectrum analysis in nonlinear
dynamics, with applications to paleoclimatic time series, Physica D, 35,
395–424.
[2]_ Ghil, M., R. M. Allen, M. D. Dettinger, K. Ide, D. Kondrashov, M. E. Mann,
A. Robertson, A. Saunders, Y. Tian, F. Varadi, and P. Yiou (2002),
Advanced spectral methods for climatic time series, Rev. Geophys., 40(1),
1003–1052, doi:10.1029/2000RG000092.
[3]_ Allen, M. R., and L. A. Smith (1996), Monte Carlo SSA: Detecting irregular
oscillations in the presence of coloured noise, J. Clim., 9, 3373–3404.
[4]_ Schoellhamer, D. H. (2001), Singular spectrum analysis for time series with
missing data, Geophysical Research Letters, 28(16), 3187–3190, doi:10.1029/2000GL012698.
See Also
--------
pyleoclim.utils.decomposition.mssa : Multi-channel SSA
'''
ys, mu, scale = standardize(y)
N = len(ys)
if M == None:
M=int(N/10)
c = np.zeros(M)
for j in range(M):
prod = ys[0:N - j] * ys[j:N]
c[j] = sum(prod[~np.isnan(prod)]) / (sum(~np.isnan(prod)) - 1)
C = toeplitz(c[0:M]) #form correlation matrix
D, eigvecs = eigh(C) # solve eigendecomposition
sort_tmp = np.sort(D)
eigvals = sort_tmp[::-1]
sortarg = np.argsort(-D)
eigvecs = eigvecs[:, sortarg]
# determine principal component time series
PC = np.zeros((N - M + 1, M))
PC[:, :] = np.nan
for k in np.arange(M):
for i in np.arange(0, N - M + 1):
# modify for nan
prod = ys[i:i + M] * eigvecs[:, k]
ngood = sum(~np.isnan(prod))
# must have at least m*f good points
if ngood >= M * f:
PC[i, k] = sum(
prod[~np.isnan(prod)]) * M / ngood # the columns of this matrix are Ak(t), k=1 to M (T-PCs)
pctvar = eigvals**2/np.sum(eigvals**2)*100 # percent variance
if nMC > 0: # If Monte-Carlo SSA is requested.
noise = ar1_sim(ys, nMC) # generate MC AR(1) surrogates of y
eigvals_R = np.zeros((M,nMC)) # define eigenvalue matrix
lgs = np.arange(-N + 1, N)
for m in range(nMC):
xn, _ , _ = standardize(noise[:, m]) # center and standardize
Gn = np.correlate(xn, xn, "full")
Gn = Gn / (N - abs(lgs))
Cn = toeplitz(Gn[N - 1:N - 1 + M])
eigvals_R[:, m] = np.diag(np.dot(np.dot(np.transpose(eigvecs), Cn), eigvecs))
eigvals_q = np.empty((M,2))
eigvals_q[:,0] = np.percentile(eigvals_R, 5, axis=1)
eigvals_q[:,1] = np.percentile(eigvals_R, 95, axis=1)
mode_idx = np.where(eigvals>=eigvals_q[:,1])[0] # modes to retain
else:
eigvals_q = None
if trunc == 'mcssa':
if nMC == 0:
raise ValueError('nMC must be larger than 0 to enable MC-SSA truncation')
elif trunc == 'kaiser':
mval = np.median(eigvals) # median eigenvalues
mode_idx = np.where(eigvals>=mval)[0]
elif trunc == 'var':
mode_idx = np.arange(np.argwhere(np.cumsum(pctvar)>=var_thresh)[0]+1)
else:
mode_idx = np.arange(M)
# compute reconstructed timeseries
Np = N - M + 1
RCmat = np.zeros((N, M))
for im in range(M):
xdum = np.dot(np.expand_dims(PC[:, im], axis=1), np.expand_dims(eigvecs[0:M, im], axis=0))
xdum = np.flipud(xdum)
for n in np.arange(N):
RCmat[n, im] = np.diagonal(xdum, offset=-(Np - 1 - n)).mean()
del xdum
#RCmat = RCmat + np.tile(mu, reps=[N, M]) # restore the mean and variance
RCseries = scale*RCmat[:,mode_idx].sum(axis=1) + mu
# export results
res = {'eigvals': eigvals, 'eigvecs': eigvecs, 'PC': PC, 'RCseries': RCseries, 'RCmat': RCmat, 'pctvar': pctvar, 'eigvals_q': eigvals_q, 'mode_idx': mode_idx}
return res