Source code for pyleoclim.utils.tsbase

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Basic functionalities to clean timeseries data prior to analysis
"""

import numpy as np
from typing import OrderedDict
import operator
import warnings
import pandas as pd
import scipy.stats as st

#APIs
# Names exported by a star-import of this module. Only the cleaning helpers
# are listed; the other functions defined in this file remain importable by
# their explicit name.

__all__ = [
    'clean_ts',
    'dropna',
    'sort_ts',
    'is_evenly_spaced',
    'reduce_duplicated_timestamps',
]

# UDUNITS, see: http://cfconventions.org/cf-conventions/cf-conventions#time-coordinate
# Length of one year in seconds; used below to convert between year counts
# and second-resolution datetimes.
SECONDS_PER_YEAR = 31556925.974592  # 86400 * 365.24219878

# Lowercase unit tokens, grouped by the power of ten of years they stand for:
# years (10^0), thousands (10^3), millions (10^6) and billions (10^9) of years.
MATCH_A  = frozenset(['y', 'yr', 'yrs', 'year', 'years'])
MATCH_KA = frozenset(['ka', 'ky', 'kyr', 'kyrs', 'kiloyear', 'kiloyr', 'kiloyrs'])
MATCH_MA = frozenset(['ma', 'my','myr','myrs'])
MATCH_GA = frozenset(['ga', 'gy', 'gyr', 'gyrs'])

# Lowercase datum tokens (periods removed). The functions below map the CE
# group to datum 0 with forward-running time, and the BP group to datum 1950
# with backward-running time.
MATCH_CE = frozenset(['ad', 'ce'])
MATCH_BP = frozenset(['bp','bnf','b1950'])

#MATCH_NAME = frozenset(['time', 'age'])

def disambiguate_time_metadata(time_unit):
    '''
    Infer time_name and time_unit from (possibly ambiguous) time units as commonly
    provided in the field.

    The unit string is lowercased, stripped of periods and split on whitespace.
    The first token selects the unit family; the second token (if any) is then
    checked for an explicit datum, which takes precedence for ``time_name``.

    Parameters
    ----------
    time_unit : str
        time units, preferably something like "kyr BP" or 'year C.E.'. Otherwise,
        wild guesses will be attempted to decipher your meaning.

    Returns
    -------
    time_name : str
        Name of the time vector ('Time' or 'Age'). None if no guess could be
        made, including when ``time_unit`` is empty or only whitespace.
    time_unit : str
        Updated units for the time vector: 'ka', 'Ma' or 'Ga' for the
        kilo/mega/giga-year families, 'years CE' when the first token is a
        CE/AD datum, and the input string unchanged in every other case.

    '''
    time_name = None
    tu = time_unit.lower().replace(".", "").split()

    # Nothing to interpret: avoid indexing into an empty token list
    if not tu:
        return time_name, time_unit

    if tu[0] in MATCH_KA:
        time_unit = 'ka'
        time_name = 'Age'
    elif tu[0] in MATCH_MA:
        time_unit = 'Ma'
        time_name = 'Age'
    elif tu[0] in MATCH_GA:
        time_unit = 'Ga'
        time_name = 'Age'
    elif tu[0] in MATCH_CE:
        time_name = 'Time'
        time_unit = 'years CE'
    elif tu[0] in MATCH_BP:
        time_name = 'Age'
    elif tu[0] in MATCH_A:
        time_name = 'Time'
    # Unrecognized units are deliberately not an error: they are returned
    # unchanged, with time_name left as None (unless the datum below decides).

    # An explicit datum in second position overrides the guess made above
    if len(tu) > 1:
        if tu[1] in MATCH_BP:
            time_name = 'Age'
        elif tu[1] in MATCH_CE:
            time_name = 'Time'

    return time_name, time_unit


def time_unit_to_datum_exp_dir(time_unit, time_name=None, verbose=False):
    """Convert time unit (yr, ka, ma, etc) to a datum, exponent, direction
    triplet. Based on the time_unit (and optionally, the time_name) the datum
    (year zero), exponent (10^x year units), and direction (prograde/retrograde)
    can be inferred. A verbose option is included here for users who want to
    confirm the resulting inference.

    The unit string is lowercased and split on whitespace, and periods are
    removed from every token (so "B.P." and "yr." read as "bp" and "yr").
    The first token gives the exponent and a first guess at datum/direction,
    the second token (if any) can override datum/direction, and ``time_name``
    has the final say on direction.

    Parameters
    ----------
    time_unit: str
        Time unit indicates the major unit of time. Examples: annum (yr),
        kiloyear (ka, ky), megayear (ma, my), gigayear (ga, gy).
        An empty, whitespace-only or unrecognized unit triggers a warning and
        the defaults (datum 0, exponent 0, 'prograde').
    time_name: str
        (Optional) If 'age', direction is always 'retrograde'; if 'time',
        direction is always 'prograde'; if it is itself a year unit (e.g.
        'year'), datum, exponent and direction are all reset to
        0, 0, 'prograde'. Defaults to None, which is effectively unused.
    verbose: bool
        (Optional) If True, includes a print statement explaining the
        conversion.

    Returns
    -------
    datum : int
        origin point for the time scale.
    exponent : int
        Base-10 exponent for year multiplier. Dates in kyr should use 3, dates in Myr should use 6, etc.
    direction: str
        Direction of time flow, 'prograde' or 'retrograde'.

    Examples
    --------

    .. jupyter-execute::

        from pyleoclim.utils.tsbase import time_unit_to_datum_exp_dir

        time_unit = 'ga'
        (datum, exponent, direction) = time_unit_to_datum_exp_dir(time_unit)
        (datum, exponent, direction)

    """
    # set defaults ; overwrite if we find cause for it
    exponent = 0
    datum = 0
    direction = 'prograde'

    # make lowercase + strip stops from every token, so "B.P." --> "bp"
    tokens = [token.replace('.', '') for token in time_unit.lower().split()]
    unit = tokens[0] if tokens else ''
    unknown_msg = f'Time unit "{time_unit}" unknown; triggering defaults'

    # deal with statements explicit about exponents, and take a first guess at datum/direction
    if not unit:
        # empty input: nothing to match against, keep the defaults
        warnings.warn(unknown_msg, stacklevel=4)
    elif unit in MATCH_A or unit in MATCH_CE:
        pass  # plain years / CE: the defaults already describe them
    elif unit in MATCH_KA:
        datum, exponent, direction = 1950, 3, 'retrograde'
    elif unit in MATCH_MA:
        datum, exponent, direction = 1950, 6, 'retrograde'
    elif unit in MATCH_GA:
        datum, exponent, direction = 1950, 9, 'retrograde'
    elif unit in MATCH_BP or 'cal' in unit:
        # "BP" on its own, or anything mentioning "cal": years before 1950
        datum, direction = 1950, 'retrograde'
    else:
        warnings.warn(unknown_msg, stacklevel=4)

    # if provided, deal with statements about datum/direction, like kyr BP, years CE, etc
    if len(tokens) > 1:
        datum_str = tokens[1]
        if datum_str == 'b2k':
            datum = 2000
            direction = 'retrograde'
        elif datum_str in MATCH_BP:
            datum = 1950
            direction = 'retrograde'
        elif datum_str in MATCH_CE:
            datum = 0
            direction = 'prograde'

    # the name of the axis, when given, overrides whatever the unit suggested
    if time_name is not None:
        name = time_name.lower()
        if name == 'age':
            direction = 'retrograde'
        elif name == 'time':
            direction = 'prograde'
        elif name in MATCH_A:
            direction = 'prograde'
            exponent = 0
            datum = 0

    if verbose:
        print(f'Provided time metadata translated to {direction} flow, 10^{exponent} year units, and year {datum} datum')

    return (datum, exponent, direction)

def convert_datetime_index_to_time(datetime_index, time_unit, time_name):
    """ Convert a Pandas DatetimeIndex into an index of floats.

    The general formula is:

        datetime_index = datum +/- time*10**exponent

    where we assume ``time`` to use the Gregorian calendar. If dealing with other
    calendars, then conversions need to happen before reaching pyleoclim.

    The datum, exponent and direction are obtained from
    ``time_unit_to_datum_exp_dir(time_unit, time_name)``.

    Parameters
    ----------
    datetime_index: pd.DatetimeIndex
        Index to convert to floats. Must have second ('s') resolution.
    time_unit: str
        Time unit indicates the major unit of time. Examples: annum (yr),
        kiloyear (ka, ky), megayear (ma, my), gigayear (ga, gy)
    time_name: str
        If 'age', direction is always 'retrograde'.

    Returns
    -------
    pd.Index of floats holding the converted times

    Raises
    ------
    ValueError
        If ``datetime_index`` is not a ``pd.DatetimeIndex``, or if its
        resolution is not seconds.

    Examples
    --------

    .. jupyter-execute::

        from pyleoclim.utils.tsbase import convert_datetime_index_to_time
        import pandas as pd
        import numpy as np

        time_unit = 'ga'
        time_name = None
        dti = pd.date_range("2018-01-01", periods=5, freq="Y", unit='s')
        df = pd.DataFrame(np.array(range(5)), index=dti)
        time = convert_datetime_index_to_time(
                    df.index,
                    time_unit,
                    time_name=time_name,
                    )
        print(np.array(time))

    """
    datum, exponent, direction = time_unit_to_datum_exp_dir(time_unit, time_name)

    # sign of the elapsed time: ages count backwards from the datum
    if direction == 'retrograde':
        sign = -1
    elif direction == 'prograde':
        sign = 1
    else:
        raise ValueError(f'Expected one of {"prograde", "retrograde"}, got {direction}')

    if not isinstance(datetime_index, pd.DatetimeIndex):
        raise ValueError('The provided index is not a proper DatetimeIndex object')
    if datetime_index.unit != 's':
        raise ValueError(
            "Only 'second' resolution is currently supported. "
            "Please cast to second resolution with `.as_unit('s')`"
        )

    origin = np.datetime64(str(datum), "s")
    elapsed = sign * (datetime_index.to_numpy() - origin)  # timedelta64[s]
    seconds_per_time_unit = 10**exponent * SECONDS_PER_YEAR
    return pd.Index(elapsed.astype(float) / seconds_per_time_unit)


def time_to_datetime(time, datum=0, exponent=0, direction='prograde', unit='s'):
    '''
    Converts a vector of time values to datetime64 values

    The general formula is:

        datetime_index = datum +/- time*10**exponent

    where we assume ``time`` to use the Gregorian calendar. If dealing with other
    calendars, then conversions need to happen before reaching pyleoclim.

    Parameters
    ----------
    time : array-like
        the time axis to be converted. Lists, tuples and plain scalars are
        promoted to a float numpy array; objects that already provide
        ``astype`` (e.g. numpy arrays) are used as they are.
    datum : int, optional
        origin point for the time scale. The default is 0.
    exponent : int, optional
        Base-10 exponent for year multiplier. Dates in kyr should use 3, dates in Myr should use 6, etc.
        The default is 0.
    direction : str, optional
        Direction of time flow, 'prograde' [default] or 'retrograde'.
    unit : str, optional
        Units of the datetime. Default is 's', corresponding to seconds.
        Only change if you have an excellent reason to use finer resolution!
        Must be a fixed-length numpy time unit (e.g. 'ms', 'us', 'm', 'h');
        calendar units such as 'Y' or 'M' cannot be related to seconds and
        are rejected by numpy.

    Returns
    -------
    index, a datetime64[unit] object

    Raises
    ------
    ValueError
        If ``direction`` is neither 'prograde' nor 'retrograde'.
    '''
    if direction not in ('prograde', 'retrograde'):
        raise ValueError(f'Expected one of {"prograde", "retrograde"}, got {direction}')

    # Plain Python sequences/scalars support neither elementwise
    # multiplication by a float nor ``astype``; promote them. Anything that
    # already has ``astype`` is left untouched so its behavior is unchanged.
    if not hasattr(time, 'astype'):
        time = np.asarray(time, dtype=float)

    # Number of `unit` ticks in one second (exactly 1.0 for the default 's'),
    # so that the requested resolution is honored rather than silently ignored.
    ticks_per_second = np.timedelta64(1, 's') / np.timedelta64(1, unit)

    offset = (
        time * SECONDS_PER_YEAR * 10**exponent * ticks_per_second
    ).astype(f'timedelta64[{unit}]')
    origin = np.datetime64(str(datum), unit)

    op = operator.add if direction == 'prograde' else operator.sub
    return op(origin, offset)


def clean_ts(ys, ts, verbose=False):
    ''' Cleaning the timeseries

    Runs three steps in sequence: drop entries bearing NaNs, sort along an
    ascending time axis, then merge duplicated timestamps by averaging
    their values.

    Parameters
    ----------
    ys : array
        A time series, NaNs allowed
    ts : array
        The time axis of the time series, NaNs allowed
    verbose : bool
        If True, each step will print a message about what it did

    Returns
    -------
    ys : array
        The cleaned time series
    ts : array
        The cleaned time axis

    See also
    --------

    pyleoclim.utils.tsbase.dropna : Drop NaN values

    pyleoclim.utils.tsbase.sort_ts : Sort timeseries

    pyleoclim.utils.tsbase.reduce_duplicated_timestamps : Consolidate duplicated timestamps

    '''
    # each step takes and returns the (ys, ts) pair, so they chain directly
    for step in (dropna, sort_ts, reduce_duplicated_timestamps):
        ys, ts = step(ys, ts, verbose=verbose)

    return ys, ts


def dropna(ys, ts, verbose=False):
    '''Drop NaN values

    Remove every entry for which either the value (ys) or the time (ts)
    is NaN.

    Parameters
    ----------
    ys : array
        A time series, NaNs allowed
    ts : array
        The time axis of the time series, NaNs allowed
    verbose : bool
        If True, will print a message when at least one entry was dropped,
        whether the NaN was found in ys or in ts

    Returns
    -------
    ys : array
        The time series without nans
    ts : array
        The time axis of the time series without nans

    See Also
    --------
    https://pandas.pydata.org/docs/reference/api/pandas.Series.dropna.html

    '''
    ys = np.asarray(ys, dtype=float)
    ts = np.asarray(ts, dtype=float)
    assert ys.size == ts.size, 'The size of time axis and data value should be equal!'

    # keep only the positions where both the value and its timestamp are valid
    keep = ~(np.isnan(ys) | np.isnan(ts))

    if verbose and not keep.all():
        print('NaNs have been detected and dropped.')

    return ys[keep], ts[keep]

def sort_ts(ys, ts, ascending = True, verbose=False):
    ''' Sort timeseries

    Reorder the values and the time axis together so that the time axis is
    monotonic.

    Parameters
    ----------
    ys : array
        Dependent variable
    ts : array
        Independent variable
    ascending : bool
        If True (default), sort with time increasing; otherwise the
        ascending ordering is reversed so that time decreases
    verbose : bool
        If True, will print a message stating the ordering applied

    Returns
    -------
    ys : array
        Dependent variable, reordered like ts
    ts : array
        Independent variable, sorted in the requested order

    '''
    values = np.asarray(ys, dtype=float)
    times = np.asarray(ts, dtype=float)
    assert values.size == times.size, 'time and value arrays must be of equal length'

    order = np.argsort(times)
    if not ascending:
        order = order[::-1]  # descending is the ascending permutation, flipped

    if verbose:
        if ascending:
            print('Time axis values sorted in ascending order')
        else:
            print('Time axis values sorted in descending order')

    return values[order], times[order]

def reduce_duplicated_timestamps(ys, ts, verbose=False):
    ''' Consolidate duplicated timestamps

    Values sharing the same timestamp are replaced by their mean, so that
    each timestamp appears only once. Timestamps keep the order in which
    they first appear. Inputs without duplicates are returned as float
    arrays, otherwise untouched.

    Parameters
    ----------
    ys : array
        Dependent variable
    ts : array
        Independent variable
    verbose : bool
        If True, will print a message when duplicates were combined

    Returns
    -------
    ys : array
        Dependent variable
    ts : array
        Independent variable, with duplicated timestamps reduced by averaging the values

    '''
    ys = np.asarray(ys, dtype=float)
    ts = np.asarray(ts, dtype=float)
    assert ys.size == ts.size, 'The size of time axis and data value should be equal!'

    # nothing to merge when every timestamp is unique
    if len(set(ts)) == len(ts):
        return ys, ts

    # collect the values observed at each timestamp (dicts keep insertion order)
    groups = {}
    for stamp, val in zip(ts, ys):
        groups.setdefault(stamp, []).append(val)

    ts = np.array(list(groups))
    ys = np.array([np.mean(vals) for vals in groups.values()])

    if verbose:
        print('Duplicate timestamps have been combined by averaging values.')
    return ys, ts

def is_evenly_spaced(x, tol=1e-4):
    ''' Check if an axis x is evenly spaced, within a given tolerance

    Every increment of x is compared with the mean increment; the axis is
    deemed evenly spaced when all relative deviations are below ``tol``.

    Parameters
    ----------

    x : array
        The axis to check. None, or an axis with fewer than two points, is
        considered evenly spaced.

    tol : float64
        Numerical tolerance for the relative difference

    Returns
    -------

    check : bool
        True - evenly spaced; False - unevenly spaced.
        An axis whose mean increment is zero is reported as unevenly spaced,
        since the relative deviation is undefined.

    '''
    if x is None:
        return True

    dx = np.diff(x)
    if dx.size == 0:
        # fewer than two points: no increment to compare, trivially even
        return True

    dx_mean = dx.mean()
    # a zero mean yields nan/inf below, which compare False against tol;
    # silence the floating-point warnings that would come with it
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_dev = np.abs((dx - dx_mean) / dx_mean)  # relative spacing vs. the mean

    return bool(np.all(rel_dev < tol))

def resolution(x):
    '''
    Computes the resolution (increments) of an axis, and returns its descriptive statistics

    Parameters
    ----------
    x : array
        The axis whose successive differences are analyzed

    Returns
    -------
    res : array
        array of time increments

    stats : DescribeResult
        descriptive statistics of res

    sign : str
        sign of the resolution
        'positive' if all values of res are > 0
        'negative' if all values of res are < 0
        'mixed' otherwise

    See Also
    --------

    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.describe.html
    '''
    increments = np.diff(x)
    summary = st.describe(increments)

    # classify the direction of the axis from the sign of its increments
    if all(increments > 0):
        return (increments, summary, 'positive')
    if all(increments < 0):
        return (increments, summary, 'negative')
    return (increments, summary, 'mixed')