Source code for pyleoclim.utils.tsbase

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Basic functionalities to clean timeseries data prior to analysis
"""

import numpy as np
from typing import OrderedDict

#APIs

__all__ = [
    'clean_ts',
    'dropna',
    'sort_ts',
    'is_evenly_spaced',
    'reduce_duplicated_timestamps',
]


[docs]def clean_ts(ys, ts, verbose=False):
    ''' Cleaning the timeseries

    Delete the NaNs in the time series and sort it with time axis ascending,
    duplicate timestamps will be reduced by averaging the values.

    Parameters
    ----------
    ys : array
        A time series, NaNs allowed
    ts : array
        The time axis of the time series, NaNs allowed
    verbose : bool
        If True, will print a warning message

    Returns
    -------
    ys : array
        The time series without nans
    ts : array
        The time axis of the time series without nans

    See also
    --------

    pyleoclim.utils.tsbase.dropna : Drop NaN values

    pyleoclim.utils.tsbase.sort_ts : Sort timeseries

    pyleoclim.utils.tsbase.reduce_duplicated_timestamps : Consolidate duplicated timestamps

    '''
    ys, ts = dropna(ys, ts, verbose=verbose)
    ys, ts = sort_ts(ys, ts, verbose=verbose)
    ys, ts = reduce_duplicated_timestamps(ys, ts, verbose=verbose)

    return ys, ts


[docs]def dropna(ys, ts, verbose=False):
    '''Drop NaN values
    
    Remove entries of ys or ts that bear NaNs

    Parameters
    ----------
    ys : array
        A time series, NaNs allowed
    ts : array
        The time axis of the time series, NaNs allowed
    verbose : bool
        If True, will print a warning message

    Returns
    -------
    ys : array
        The time series without nans
    ts : array
        The time axis of the time series without nans

    '''
    ys = np.asarray(ys, dtype=float)
    ts = np.asarray(ts, dtype=float)
    assert ys.size == ts.size, 'The size of time axis and data value should be equal!'

    ys_tmp = np.copy(ys)
    ys = ys[~np.isnan(ys_tmp)]
    ts = ts[~np.isnan(ys_tmp)]
    ts_tmp = np.copy(ts)
    ys = ys[~np.isnan(ts_tmp)]
    ts = ts[~np.isnan(ts_tmp)]

    if verbose and any(np.isnan(ys_tmp)):
        print('NaNs have been detected and dropped.')

    return ys, ts

[docs]def sort_ts(ys, ts, verbose=False):
    ''' Sort timeseries
    
    Sort ts values in ascending order

    Parameters
    ----------
    ys : array
        Dependent variable
    ts : array
        Independent variable
    verbose : bool
        If True, will print a warning message

    Returns
    -------
    ys : array
        Dependent variable
    ts : array
        Independent variable, sorted in ascending order

    '''
    ys = np.asarray(ys, dtype=float)
    ts = np.asarray(ts, dtype=float)
    assert ys.size == ts.size, 'time and value arrays must be of equal length'

    # sort the time series so that the time axis will be ascending
    dt = np.median(np.diff(ts))
    if dt < 0:
        sort_ind = np.argsort(ts)
        ys = ys[sort_ind]
        ts = ts[sort_ind]
        if verbose:
            print('The time axis has been adjusted to be prograde')

    return ys, ts

[docs]def reduce_duplicated_timestamps(ys, ts, verbose=False):
    ''' Consolidate duplicated timestamps
    
    Reduce duplicated timestamps in a timeseries by averaging the values

    Parameters
    ----------
    ys : array
        Dependent variable
    ts : array
        Independent variable
    verbose : bool
        If True, will print a warning message

    Returns
    -------
    ys : array
        Dependent variable
    ts : array
        Independent variable, with duplicated timestamps reduced by averaging the values

    '''
    ys = np.asarray(ys, dtype=float)
    ts = np.asarray(ts, dtype=float)
    assert ys.size == ts.size, 'The size of time axis and data value should be equal!'

    if len(ts) != len(set(ts)):
        value = OrderedDict()
        for t, y in zip(ts, ys):
            if t not in value:
                value[t] = [y]
            else:
                value[t].append(y)

        ts = []
        ys = []
        for k, v in value.items():
            ts.append(k)
            ys.append(np.mean(v))

        ts = np.array(ts)
        ys = np.array(ys)

        if verbose:
            print('Duplicate timestamps have been combined by averaging values.')
    return ys, ts

[docs]def is_evenly_spaced(ts, tol=1e-4):
    ''' Check if a time axis is evenly spaced, within a given tolerance

    Parameters
    ----------

    ts : array
        The time axis of a time series
        
    tol : float64
        Numerical tolerance for the relative difference

    Returns
    -------

    check : bool
        True - evenly spaced; False - unevenly spaced.

    '''
    if ts is None:
        check = True
    else:
        dts = np.diff(ts)
        dt_mean = dts.mean()   
        check = all(np.abs((dt - dt_mean)/dt_mean) < tol for dt in np.diff(ts)) # compare relative spacing to the mean
        
    return check