Source code for pyleoclim.utils.lipdutils

# -*- coding: utf-8 -*-
"""
Utilities to manipulate LiPD files and automate data transformation whenever possible.
These functions are used throughout Pyleoclim but are not meant for direct interaction by users.
This module also handles integration with the LinkedEarth wiki and the LinkedEarth Ontology.
"""

import lipd as lpd
import numpy as np
import os
import json
import requests
import wget
from bs4 import BeautifulSoup
import string


class CaseInsensitiveDict(dict):
    """A dictionary whose keys are stored and looked up lower-cased and with spaces removed."""

    def __setitem__(self, key, value):
        super().__setitem__(key.lower().replace(" ", ""), value)

    def __getitem__(self, key):
        return super().__getitem__(key.lower().replace(" ", ""))
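# Usage sketch (illustrative, kept commented so it does not run on import):
# lookups succeed regardless of case and spacing.
#
#     markers = CaseInsensitiveDict()
#     markers['Marine Sediment'] = ['#8A4513', 's']
#     markers['marinesediment']   # -> ['#8A4513', 's']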
PLOT_DEFAULT = {'GroundIce': ['#86CDFA', 'h'],
                'Borehole': ['#00008b', 'h'],
                'Coral': ['#FF8B00', 'o'],
                'Documents': ['#f8d568', 'p'],
                'GlacierIce': ['#86CDFA', 'd'],
                'Hybrid': ['#808000', '*'],
                'LakeSediment': ['#8A4513', '^'],
                'MarineSediment': ['#8A4513', 's'],
                'Sclerosponge': ['r', 'o'],
                'Speleothem': ['#FF1492', 'd'],
                'Wood': ['#32CC32', '^'],
                'MolluskShell': ['#FFD600', 'h'],
                'Peat': ['#2F4F4F', '*'],
                'Midden': ['#824E2B', 'o'],
                'FluvialSediment': ['#4169E0', 'd'],
                'TerrestrialSediment': ['#8A4513', 'o'],
                'Shoreline': ['#add8e6', 'o'],
                'Instrumental': ['#8f21d8', '*'],
                'Model': ['#b4a7d6', 'd'],
                'Other': ['k', 'o']}

"""
The following functions handle web scraping to grab information regarding the controlled vocabulary
"""
def get_archive_type():
    ''' Scrape the LiPDverse website to obtain the list of possible archives and associated synonyms

    Returns
    -------
    res : dict
        Keys correspond to the preferred terms and values are lists of known synonyms
    '''
    url = "https://lipdverse.org/vocabulary/archivetype/"
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the content of the request with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # Get the names of the archiveTypes
        h3_tags = soup.find_all('h3')
        archiveName = [item.get_text() for item in h3_tags]
        # Get the known synonyms
        h4_tags = soup.find_all('h4', string="Known synonyms")
        synonyms = []
        for h4_tag in h4_tags:
            next_element = h4_tag.find_next_sibling()
            found_p = False
            while next_element and next_element.name != 'div':
                if next_element.name == 'p':
                    synonyms_text = next_element.get_text()
                    words = [word.strip() for word in synonyms_text.split(',')]
                    synonyms.append(words)
                    found_p = True
                    break
                next_element = next_element.find_next_sibling()
            # If a <p> tag was not found, insert an empty list
            if not found_p:
                synonyms.append([])
        # Create a dictionary for the results
        res = {}
        for idx, item in enumerate(archiveName):
            res[item] = synonyms[idx]
    else:
        print("Failed to retrieve the webpage; returning a static list, which may be out of date")
        # Return a dictionary with empty synonym lists so the return type matches the documented one
        res = {name: [] for name in ["Borehole", "Coral", "FluvialSediment", "GlacierIce",
                                     "GroundIce", "LakeSediment", "MarineSediment", "Midden",
                                     "MolluskShell", "Peat", "Sclerosponge", "Shoreline",
                                     "Speleothem", "TerrestrialSediment", "Wood"]}
    return res
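# Usage sketch (illustrative; requires network access to lipdverse.org):
#
#     archives = get_archive_type()
#     sorted(archives.keys())   # preferred archive names, e.g. 'Coral', 'Wood'
#     archives.get('Wood')      # list of known synonyms (possibly empty)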
""" The following functions handle the LiPD files """
def enumerateLipds(lipds):
    """Enumerate the LiPD files loaded in the workspace

    Parameters
    ----------
    lipds : dict
        A dictionary of LiPD files.
    """
    print("Below are the available records")
    lipds_list = [val for val in lipds.keys()]
    for idx, val in enumerate(lipds_list):
        print(idx, ': ', val)
def getLipd(lipds):
    """Prompt for a LiPD file

    Ask the user to select a LiPD file from a list.
    Use this function in conjunction with enumerateLipds().

    Parameters
    ----------
    lipds : dict
        A dictionary of LiPD files. Can be obtained from pyleoclim.readLipd()

    Returns
    -------
    select_lipd : dict
        The LiPD object corresponding to the selection
    """
    enumerateLipds(lipds)
    lipds_list = [val for val in lipds.keys()]
    choice = int(input("Enter the number of the file: "))
    lipd_name = lipds_list[choice]
    select_lipd = lipds[lipd_name]
    return select_lipd
""" The following functions work at the variables level """
def promptForVariable():
    """Prompt for a specific variable

    Ask the user to select the variable they are interested in.
    Use this function in conjunction with readHeaders() or getTSO()

    Returns
    -------
    select_var : int
        The index of the variable
    """
    select_var = int(input("Enter the number of the variable you wish to use: "))
    return select_var
def xAxisTs(timeseries):
    """ Get the x-axis for the timeseries.

    Parameters
    ----------
    timeseries : dict
        a timeseries object

    Returns
    -------
    x_axis : array
        the values for the x-axis representation
    label : string
        returns either "age", "year", or "depth"
    """
    if ("depth" in timeseries.keys() and "age" in timeseries.keys()) or \
            ("depth" in timeseries.keys() and "year" in timeseries.keys()):
        print("Both time and depth information available, selecting time")
        if "age" in timeseries.keys() and "year" in timeseries.keys():
            print("Both age and year representation available, selecting age")
            x_axis = timeseries["age"]
            label = "age"
        elif "year" in timeseries.keys():
            x_axis = timeseries["year"]
            label = "year"
        elif "age" in timeseries.keys():
            x_axis = timeseries["age"]
            label = "age"
    elif "depth" in timeseries.keys():
        x_axis = timeseries["depth"]
        label = "depth"
    elif "age" in timeseries.keys():
        x_axis = timeseries["age"]
        label = "age"
    elif "year" in timeseries.keys():
        x_axis = timeseries["year"]
        label = "year"
    else:
        raise KeyError("No age or depth information available")

    return x_axis, label
def checkXaxis(timeseries, x_axis=None):
    """Check that an x-axis is present for the timeseries

    Parameters
    ----------
    timeseries : dict
        a timeseries
    x_axis : string
        the x-axis representation, either depth, age or year

    Returns
    -------
    x : array
        the values for the x-axis representation
    label : string
        returns either "age", "year", or "depth"
    """
    if x_axis is None:
        x, label = xAxisTs(timeseries)
        x = np.array(x, dtype='float64')
    elif x_axis == "depth":
        if "depth" not in timeseries.keys():
            raise ValueError("Depth not available for this record")
        else:
            x = np.array(timeseries['depth'], dtype='float64')
            label = "depth"
    elif x_axis == "age":
        if "age" not in timeseries.keys():
            raise ValueError("Age not available for this record")
        else:
            x = np.array(timeseries['age'], dtype='float64')
            label = "age"
    elif x_axis == "year":
        if "year" not in timeseries.keys():
            raise ValueError("Year not available for this record")
        else:
            x = np.array(timeseries['year'], dtype='float64')
            label = "year"
    else:
        raise KeyError("enter either 'depth', 'age', or 'year'")

    return x, label
def checkTimeAxis(timeseries, x_axis=None):
    """ This function makes sure that time is available for the timeseries

    Parameters
    ----------
    timeseries : dict
        A LiPD timeseries object
    x_axis : string
        The time representation to use, either 'age' or 'year'. Default is None,
        in which case the available representation is used, with 'age' taking precedence.

    Returns
    -------
    x : array
        the time values for the timeseries
    label : string
        the time representation for the timeseries
    """
    if x_axis is None:
        if 'age' not in timeseries.keys() and 'year' not in timeseries.keys():
            raise KeyError("No time information available")
        elif 'age' in timeseries.keys() and 'year' in timeseries.keys():
            print("Both age and year information are available, using age")
            label = 'age'
        elif 'age' in timeseries.keys():
            label = 'age'
        elif 'year' in timeseries.keys():
            label = 'year'
    elif x_axis == 'age':
        if 'age' not in timeseries.keys():
            raise KeyError('Age is not available for this record')
        else:
            label = 'age'
    elif x_axis == 'year':
        if 'year' not in timeseries.keys():
            raise KeyError('Year is not available for this record')
        else:
            label = 'year'
    else:
        raise KeyError('Only None, year and age are valid entries for x_axis parameter')

    x = np.array(timeseries[label], dtype='float64')

    return x, label
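# Usage sketch for the axis helpers (illustrative; 'ts' is a hypothetical minimal
# timeseries dict, kept commented so it does not run on import):
#
#     ts = {'age': [100, 200, 300], 'depth': [0.0, 0.5, 1.0]}
#     checkXaxis(ts, x_axis='depth')   # -> (array([0. , 0.5, 1. ]), 'depth')
#     checkTimeAxis(ts)                # -> (array([100., 200., 300.]), 'age')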
def searchVar(timeseries_list, key, exact=True, override=True):
    """ This function searches for keywords (exact match) for a variable

    Parameters
    ----------
    timeseries_list : dict
        A dictionary of available series, keyed by series name
    key : list
        A list of keys to search
    exact : bool
        if True, looks for an exact match.
    override : bool
        if True, override the exact match if no match is found

    Returns
    -------
    match : str
        The key of the timeseries that matches the selection criteria.
    """
    # Make sure that the keys are contained in a list
    if type(key) is not list:
        if type(key) is str:
            key = [key]
        else:
            raise TypeError("Key terms should be entered as a list")

    # Fields inspected for a match, in order of precedence; only the first
    # field present in a given series is compared.
    fields = ["variableName", "paleoData_variableName", "chronData_variableName",
              "ProxyObservationType", "paleoData_proxyObservationType",
              "chronData_proxyObservationType", "InferredVariableType",
              "paleoData_inferredVariableType", "chronData_inferredVariableType"]

    def _matches(ts_temp, keyVal, exact_match):
        for field in fields:
            if field in ts_temp.keys():
                name = ts_temp[field]
                if exact_match:
                    return keyVal.lower() == name.lower()
                return keyVal.lower() in name.lower()
        return False

    match = []
    for keyVal in key:
        for val in timeseries_list.keys():
            if _matches(timeseries_list[val], keyVal, exact):
                match.append(val)

    # Expand the search if asked
    if not match and exact == True and override == True:
        print("No match found on exact search, running partial match")
        for keyVal in key:
            for val in timeseries_list.keys():
                if _matches(timeseries_list[val], keyVal, False):
                    match.append(val)

    # Get the unique entries
    match = list(set(match))

    # Narrow down if more than one match is found by asking the user
    if len(match) > 1:
        print("More than one series match your search criteria")
        for idx, val in enumerate(match):
            print(idx, ": ", val)
        choice = int(input("Enter the number for the variable: "))
        match = match[choice]
    elif not match:
        print("No match found.")
        print("Here are the available variables: ")
        v = list(timeseries_list.keys())
        for idx, val in enumerate(v):
            print(idx, ": ", val)
        choice = input("Please select the variable you'd like to use or enter to continue: ")
        if not choice:
            match = ""
        else:
            choice = int(choice)
            match = v[choice]
    else:
        match = match[0]

    return match
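# Usage sketch (illustrative; 'ts_dict' is a hypothetical mapping of series names
# to timeseries dicts):
#
#     ts_dict = {'d18O': {'paleoData_variableName': 'd18O'},
#                'temp': {'paleoData_variableName': 'temperature'}}
#     searchVar(ts_dict, ['d18O'])               # exact match   -> 'd18O'
#     searchVar(ts_dict, ['temp'], exact=False)  # partial match -> 'temp'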
""" The following functions handle the time series objects """
def enumerateTs(timeseries_list):
    """Enumerate the available time series objects

    Parameters
    ----------
    timeseries_list : list
        a list of available timeseries objects.
    """
    available_y = []
    dataSetName = []
    at = []
    for item in timeseries_list:
        if 'dataSetName' in item.keys():
            dataSetName.append(item['dataSetName'])
        else:
            dataSetName.append('NA')
        if 'paleoData_variableName' in item.keys():
            available_y.append(item['paleoData_variableName'])
        elif 'chronData_variableName' in item.keys():
            available_y.append(item['chronData_variableName'])
        else:
            available_y.append('NA')
        if 'archiveType' in item.keys():
            at.append(item['archiveType'])
        else:
            at.append('NA')

    for idx, val in enumerate(available_y):
        print(idx, ': ', dataSetName[idx], ': ', at[idx], ': ', val)
def getTs(timeseries_list, option=None):
    """Get a specific timeseries object from a dictionary of timeseries

    Parameters
    ----------
    timeseries_list : list
        a list of available timeseries objects.
    option : string
        An expression to filter the datasets. Uses lipd.filterTs()

    Returns
    -------
    timeseries : single timeseries object or list of timeseries
        A single timeseries object if no optional filter is selected, or a
        filtered list if optional arguments are given

    See also
    --------
    pyleoclim.utils.lipdutils.enumerateTs : Enumerate the available time series objects

    pyleoclim.utils.lipdutils.promptForVariable : Prompt for a specific variable
    """
    if not option:
        enumerateTs(timeseries_list)
        select_TSO = promptForVariable()
        timeseries = timeseries_list[select_TSO]
    else:
        timeseries = lpd.filterTs(timeseries_list, option)

    return timeseries
""" Functions to handle data on the LinkedEarth Platform """
def LipdToOntology(archiveType):
    """ standardize archiveType

    Transform the archiveType from their LiPD name to their ontology counterpart

    Parameters
    ----------
    archiveType : string
        name of the archiveType from the LiPD file

    Returns
    -------
    archiveType : string
        archiveType according to the ontology
    """
    # Align with the ontology
    if archiveType is not None:
        if archiveType.lower().replace(" ", "") == "icecore":
            archiveType = 'GlacierIce'
        elif archiveType.lower().replace(" ", "") == "ice-other":
            archiveType = 'GlacierIce'
        elif archiveType.lower().replace(" ", "") == 'tree':
            archiveType = 'Wood'
        elif archiveType.lower().replace(" ", "") not in [key.lower() for key in PLOT_DEFAULT.keys()]:
            archiveType = 'Other'

    return archiveType
def timeUnitsCheck(units):
    """ This function attempts to make sense of the time units by checking for equivalence

    Parameters
    ----------
    units : string
        The units string for the timeseries

    Returns
    -------
    unit_group : string
        Whether the units belong to age_units, kage_units, year_units, ma_units,
        undefined, or unknown
    """
    age_units = ['year B.P.', 'yr B.P.', 'yr BP', 'BP', 'yrs BP', 'years B.P.',
                 'yr. BP', 'yr. B.P.', 'cal. BP', 'cal B.P.',
                 'year BP', 'years BP']
    kage_units = ['kyr BP', 'kaBP', 'ka BP', 'ky', 'kyr', 'kyr B.P.', 'ka B.P.', 'ky BP',
                  'kyrs BP', 'ky B.P.', 'kyrs B.P.', 'kyBP', 'kyrBP']
    year_units = ['AD', 'CE', 'year C.E.', 'year A.D.', 'year CE', 'year AD',
                  'years C.E.', 'years A.D.', 'yr CE', 'yr AD', 'yr C.E.',
                  'yr A.D.', 'yrs C.E.', 'yrs A.D.', 'yrs CE', 'yrs AD']
    mage_units = ['my BP', 'myr BP', 'myrs BP', 'ma BP', 'ma',
                  'my B.P.', 'myr B.P.', 'myrs B.P.', 'ma B.P.']
    undefined = ['years', 'yr', 'year', 'yrs']
    if units in age_units:
        unit_group = 'age_units'
    elif units in kage_units:
        unit_group = 'kage_units'
    elif units in year_units:
        unit_group = 'year_units'
    elif units in undefined:
        unit_group = 'undefined'
    elif units in mage_units:
        unit_group = 'ma_units'
    else:
        unit_group = 'unknown'

    return unit_group
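# Usage sketch (illustrative):
#
#     timeUnitsCheck('yr BP')     # -> 'age_units'
#     timeUnitsCheck('ka BP')     # -> 'kage_units'
#     timeUnitsCheck('CE')        # -> 'year_units'
#     timeUnitsCheck('fathoms')   # -> 'unknown'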
def whatArchives(print_response=True):
    """ Get the names for ArchiveType from LinkedEarth Ontology

    Parameters
    ----------
    print_response : bool
        Whether to print the results on the console. Default is True

    Returns
    -------
    res : list
        List of archive type names returned by the LinkedEarth wiki api
    """
    url = "http://wiki.linked.earth/store/ds/query"

    query = """PREFIX core: <http://linked.earth/ontology#>
    PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT distinct ?a
    WHERE
    {
        { ?dataset wiki:Property-3AArchiveType ?a. }
        UNION
        {
            ?w core:proxyArchiveType ?t.
            ?t rdfs:label ?a
        }
    }"""

    response = requests.post(url, data={'query': query})
    res_i = json.loads(response.text)

    if print_response == True:
        print("The following archive types are available on the wiki:")
        for item in res_i['results']['bindings']:
            print("*" + item['a']['value'])

    res = []
    for item in res_i['results']['bindings']:
        res.append(item['a']['value'])

    return res
def whatProxyObservations(print_response=True):
    """ Get the names for ProxyObservations from LinkedEarth Ontology

    Parameters
    ----------
    print_response : bool
        Whether to print the results on the console. Default is True

    Returns
    -------
    res : list
        List of proxy observation type names returned by the LinkedEarth wiki api
    """
    url = "http://wiki.linked.earth/store/ds/query"

    query = """PREFIX core: <http://linked.earth/ontology#>
    PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT distinct ?a
    WHERE
    {
        ?w core:proxyObservationType ?t.
        ?t rdfs:label ?a
    }"""

    response = requests.post(url, data={'query': query})
    res_i = json.loads(response.text)

    if print_response == True:
        print("The following proxy observation types are available on the wiki: ")
        for item in res_i['results']['bindings']:
            print(item['a']['value'])

    res = []
    for item in res_i['results']['bindings']:
        res.append(item['a']['value'])

    return res
def whatProxySensors(print_response=True):
    """ Get the names for ProxySensors from LinkedEarth Ontology

    Parameters
    ----------
    print_response : bool
        Whether to print the results on the console. Default is True

    Returns
    -------
    res : list
        List of sensor genus names returned by the LinkedEarth wiki api
        (the species are printed alongside but not returned)
    """
    url = "http://wiki.linked.earth/store/ds/query"

    query = """PREFIX core: <http://linked.earth/ontology#>
    PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT distinct ?a ?b
    WHERE
    {
        ?w core:sensorGenus ?a.
        ?w core:sensorSpecies ?b .
    }"""

    response = requests.post(url, data={'query': query})
    res_i = json.loads(response.text)

    if print_response == True:
        print("The available sensor genus/species are: ")
        for item in res_i['results']['bindings']:
            print("*" + 'Genus: ' + item['a']['value'] + ' Species: ' + item['b']['value'])

    res = []
    for item in res_i['results']['bindings']:
        res.append(item['a']['value'])

    return res
def whatInferredVariables(print_response=True):
    """ Get the names for InferredVariables from LinkedEarth Ontology

    Parameters
    ----------
    print_response : bool
        Whether to print the results on the console. Default is True

    Returns
    -------
    res : list
        List of inferred variable type names returned by the LinkedEarth wiki api
    """
    url = "http://wiki.linked.earth/store/ds/query"

    query = """PREFIX core: <http://linked.earth/ontology#>
    PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT distinct ?a
    WHERE
    {
        ?w core:inferredVariableType ?t.
        ?t rdfs:label ?a
    }"""

    response = requests.post(url, data={'query': query})
    res_i = json.loads(response.text)

    if print_response == True:
        print("The following Inferred Variable types are available on the wiki: ")
        for item in res_i['results']['bindings']:
            print("*" + item['a']['value'])

    res = []
    for item in res_i['results']['bindings']:
        res.append(item['a']['value'])

    return res
def whatInterpretations(print_response=True):
    """ Get the names for interpretations from LinkedEarth Ontology

    Parameters
    ----------
    print_response : bool
        Whether to print the results on the console. Default is True

    Returns
    -------
    res : list
        List of interpretation names returned by the LinkedEarth wiki api
        (the details are printed alongside but not returned)
    """
    url = "http://wiki.linked.earth/store/ds/query"

    query = """PREFIX core: <http://linked.earth/ontology#>
    PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT distinct ?a ?b
    WHERE
    {
        ?w core:name ?a.
        ?w core:detail ?b .
    }"""

    response = requests.post(url, data={'query': query})
    res_i = json.loads(response.text)

    if print_response == True:
        print("The following interpretations are available on the wiki: ")
        for item in res_i['results']['bindings']:
            print("*" + 'Name: ' + item['a']['value'] + ' Detail: ' + item['b']['value'])

    res = []
    for item in res_i['results']['bindings']:
        res.append(item['a']['value'])

    return res
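# Usage sketch for the wiki query helpers (illustrative; these POST SPARQL queries
# to wiki.linked.earth and only work while that endpoint is reachable):
#
#     archives = whatArchives(print_response=False)            # list of archive types
#     inferred = whatInferredVariables(print_response=False)   # list of inferred variable types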
def queryLinkedEarth(archiveType=[], proxyObsType=[], infVarType=[], sensorGenus=[],
                     sensorSpecies=[], interpName=[], interpDetail=[], ageUnits=[],
                     ageBound=[], ageBoundType=[], recordLength=[], resolution=[],
                     lat=[], lon=[], alt=[], print_response=True, download_lipd=True,
                     download_folder='default'):
    """ Query the LinkedEarth wiki for records.

    This function queries the LinkedEarth wiki for specific categories. If you have
    more than one keyword per category, enter them in a list. If you don't wish to
    use a particular category, leave the brackets empty.

    Parameters
    ----------
    archiveType : list of strings
        The type of archive (enter all query terms, separated by a comma)
    proxyObsType : list of strings
        The type of proxy observation (enter all query terms, separated by a comma)
    infVarType : list of strings
        The type of inferred variable (enter all query terms, separated by a comma)
    sensorGenus : list of strings
        The genus of the sensor (enter all query terms, separated by a comma)
    sensorSpecies : list of strings
        The species of the sensor (enter all query terms, separated by a comma)
    interpName : list of strings
        The name of the interpretation (enter all query terms, separated by a comma)
    interpDetail : list of strings
        The detail of the interpretation (enter all query terms, separated by a comma)
    ageUnits : list of strings
        The units in which the age (year) is expressed.
        Warning: Run separate queries if you need to search across multiple age units
        (i.e., yr B.P. vs kyr B.P.). If the units are different but the meaning is
        the same (e.g., yr B.P. vs yr BP), enter all search terms separated by a comma.
    ageBound : list of floats
        Enter the minimum and maximum age value to search for.
        Warning: You MUST enter a minimum AND maximum value. If you wish to perform
        a query such as "all ages before 2000 A.D.", enter a minimum value of -99999
        to cover all bases.
    ageBoundType : list of strings
        The type of querying to perform. Possible values include: "any", "entire",
        and "entirely".
        - any: overlap any portion of matching datasets (default)
        - entirely: are entirely overlapped by matching datasets
        - entire: overlap entire matching datasets but dataset can be shorter than the bounds
    recordLength : list of floats
        The minimum length the record needs to have while matching the ageBound
        criteria. For instance, "look for all records between 3000 and 6000 years BP
        with a record length of at least 1500 years".
    resolution : list of floats
        The maximum resolution of the record. Resolution has the same units as
        age/year. For instance, "look for all records with a resolution finer than
        100 years".
        Warning: Resolution applies to specific variables rather than an entire
        dataset. Imagine the case where some measurements are made every cm while
        others are made every 5 cm. If you require a specific variable to have the
        needed resolution, make sure that either the proxyObservationType,
        inferredVariableType, and/or interpretation fields are completed.
    lat : list of floats
        The minimum and maximum latitude. South is expressed with negative numbers.
        Warning: You MUST enter a minimum AND maximum value. If you wish to perform
        a query looking for records from the Northern Hemisphere, enter [0, 90].
    lon : list of floats
        The minimum and maximum longitude. West is expressed with negative numbers.
        Warning: You MUST enter a minimum AND a maximum value. If you wish to perform
        a query looking for records from the Western Hemisphere, enter [-180, 0].
    alt : list of floats
        The minimum and maximum altitude. Depth below sea level is expressed as
        negative numbers. Warning: You MUST enter a minimum AND a maximum value.
        If you wish to perform a query looking for records below a certain depth
        (e.g., 500), enter [-99999, -500].
    print_response : bool
        If True, prints the URLs to the matching LiPD files
    download_lipd : bool
        If True, download the matching LiPD files
    download_folder : string
        Location to download the LiPD files. If "default", will download in the
        current directory.

    Returns
    -------
    res : dict
        the response to the query
    """
    # Perform a lot of checks
    if len(ageBound) == 1:
        raise ValueError("You need to provide a minimum and maximum boundary.")
    if ageBound and not ageUnits:
        raise ValueError("When providing age limits, you must also enter the units")
    if recordLength and not ageUnits:
        raise ValueError("When providing a record length, you must also enter the units")
    if ageBound and ageBound[0] > ageBound[1]:
        ageBound = [ageBound[1], ageBound[0]]
    if len(ageBoundType) > 1:
        raise ValueError("Only one search possible at a time.")
    if not ageBoundType:
        # Apply the documented default when no bound type is given
        ageBoundType = ["any"]
    elif ageBoundType[0] not in ["any", "entirely", "entire"]:
        raise ValueError("ageBoundType is not recognized")
    if recordLength and ageBound and recordLength[0] > (ageBound[1] - ageBound[0]):
        raise ValueError("The required recordLength is greater than the provided age bounds")
    if len(resolution) > 1:
        raise ValueError("You can only search for a maximum resolution one at a time.")
    if len(lat) == 1:
        raise ValueError("Please enter a lower AND upper boundary for the latitude search")
    if lat and lat[1] < lat[0]:
        lat = [lat[1], lat[0]]
    if len(lon) == 1:
        raise ValueError("Please enter a lower AND upper boundary for the longitude search")
    if lon and lon[1] < lon[0]:
        lon = [lon[1], lon[0]]
    if len(alt) == 1:
        raise ValueError("Please enter a lower AND upper boundary for the altitude search")
    if alt and alt[1] < alt[0]:
        alt = [alt[1], alt[0]]

    # Perform the query
    url = "http://wiki.linked.earth/store/ds/query"

    query = """PREFIX core: <http://linked.earth/ontology#>
    PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    SELECT distinct ?dataset ?dataset_label
    WHERE {
    ?dataset rdfs:label ?dataset_label
    """

    ### Look for data field
    dataQ = ""
    if archiveType or proxyObsType or infVarType or sensorGenus or sensorSpecies \
            or interpName or interpDetail or ageUnits or ageBound or recordLength or resolution:
        dataQ = "?dataset core:includesChronData|core:includesPaleoData ?data."

    ### Look for variable
    ## measuredVariable
    measuredVarQ = ""
    if proxyObsType or archiveType or sensorGenus or sensorSpecies \
            or interpName or interpDetail or resolution:
        measuredVarQ = "?data core:foundInMeasurementTable / core:includesVariable ?v."

    ## inferredVariable
    inferredVarQ = ""
    if infVarType or interpName or interpDetail or resolution:
        inferredVarQ = "?data core:foundInMeasurementTable / core:includesVariable ?v1."

    ### Archive query
    archiveTypeQ = ""
    if len(archiveType) > 0:
        # add values for the archiveType query
        query += "VALUES ?a {"
        for item in archiveType:
            query += "\"" + item + "\" "
        query += "}\n"
        # Create the query
        archiveTypeQ = """
        #Archive Type query
        {
            ?dataset wiki:Property-3AArchiveType ?a.
        }UNION
        {
            ?p core:proxyArchiveType / rdfs:label ?a.
        }"""

    ### ProxyObservation query
    proxyObsTypeQ = ""
    if len(proxyObsType) > 0:
        # add values for the proxyObservationType
        query += "VALUES ?b {"
        for item in proxyObsType:
            query += "\"" + item + "\" "
        query += "}\n"
        # Create the query
        proxyObsTypeQ = "?v core:proxyObservationType/rdfs:label ?b."

    ### InferredVariable query
    infVarTypeQ = ""
    if len(infVarType) > 0:
        query += "VALUES ?c {"
        for item in infVarType:
            query += "\"" + item + "\" "
        query += "}\n"
        # create the query
        infVarTypeQ = """
        ?v1 core:inferredVariableType ?t.
        ?t rdfs:label ?c.
        """

    ### ProxySensor query
    sensorQ = ""
    if len(sensorGenus) > 0 or len(sensorSpecies) > 0:
        sensorQ = """
        ?p core:proxySensorType ?sensor.
        """

    ## Genus query
    genusQ = ""
    if len(sensorGenus) > 0:
        query += "VALUES ?genus {"
        for item in sensorGenus:
            query += "\"" + item + "\" "
        query += "}\n"
        # create the query
        genusQ = "?sensor core:sensorGenus ?genus."

    ## Species query
    speciesQ = ""
    if len(sensorSpecies) > 0:
        query += "VALUES ?species {"
        for item in sensorSpecies:
            query += "\"" + item + "\" "
        query += "}\n"
        # Create the query
        speciesQ = "?sensor core:sensorSpecies ?species."

    ### Proxy system query
    proxySystemQ = ""
    if len(archiveType) > 0 or len(sensorGenus) > 0 or len(sensorSpecies) > 0:
        proxySystemQ = "?v ?proxySystem ?p."

    ### Deal with interpretation
    ## Make sure there is an interpretation to begin with
    interpQ = ""
    if len(interpName) > 0 or len(interpDetail) > 0:
        interpQ = """
        {?v1 core:interpretedAs ?interpretation}
        UNION
        {?v core:interpretedAs ?interpretation}
        """

    ## Name
    interpNameQ = ""
    if len(interpName) > 0:
        query += "VALUES ?intName {"
        for item in interpName:
            query += "\"" + item + "\" "
        query += "}\n"
        # Create the query
        interpNameQ = "?interpretation core:name ?intName."

    ## Detail
    interpDetailQ = ""
    if len(interpDetail) > 0:
        query += "VALUES ?intDetail {"
        for item in interpDetail:
            query += "\"" + item + "\" "
        query += "}\n"
        # Create the query
        interpDetailQ = "?interpretation core:detail ?intDetail."

    ### Age
    ## Units
    ageUnitsQ = ""
    if len(ageUnits) > 0:
        query += "VALUES ?units {"
        for item in ageUnits:
            query += "\"" + item + "\" "
        query += "}\n"
        query += """VALUES ?ageOrYear{"Age" "Year"}\n"""
        # create the query
        ageUnitsQ = """
        ?data core:foundInMeasurementTable / core:includesVariable ?v2.
        ?v2 core:inferredVariableType ?aoy.
        ?aoy rdfs:label ?ageOrYear.
        ?v2 core:hasUnits ?units .
        """

    ## Minimum and maximum
    ageQ = ""
    if ageBoundType[0] == "entirely":
        if len(ageBound) > 0 and len(recordLength) > 0:
            ageQ = """
            ?v2 core:hasMinValue ?e1.
            ?v2 core:hasMaxValue ?e2.
            filter(?e1<=""" + str(ageBound[0]) + """ && ?e2>=""" + str(ageBound[1]) + \
                """ && abs(?e1-?e2)>=""" + str(recordLength[0]) + """).
            """
        elif len(ageBound) > 0 and len(recordLength) == 0:
            ageQ = """
            ?v2 core:hasMinValue ?e1.
            ?v2 core:hasMaxValue ?e2.
            filter(?e1<=""" + str(ageBound[0]) + """ && ?e2>=""" + str(ageBound[1]) + """).
            """
    elif ageBoundType[0] == "entire":
        if len(ageBound) > 0 and len(recordLength) > 0:
            ageQ = """
            ?v2 core:hasMinValue ?e1.
            ?v2 core:hasMaxValue ?e2.
            filter(?e1>=""" + str(ageBound[0]) + """ && ?e2<=""" + str(ageBound[1]) + \
                """ && abs(?e1-?e2)>=""" + str(recordLength[0]) + """).
            """
        elif len(ageBound) > 0 and len(recordLength) == 0:
            ageQ = """
            ?v2 core:hasMinValue ?e1.
            ?v2 core:hasMaxValue ?e2.
            filter(?e1>=""" + str(ageBound[0]) + """ && ?e2<=""" + str(ageBound[1]) + """).
            """
    elif ageBoundType[0] == "any":
        if len(ageBound) > 0 and len(recordLength) > 0:
            ageQ = """
            ?v2 core:hasMinValue ?e1.
            filter(?e1<=""" + str(ageBound[1]) + """ && abs(?e1-""" + str(ageBound[1]) + \
                """)>=""" + str(recordLength[0]) + """).
            """
        elif len(ageBound) > 0 and len(recordLength) == 0:
            ageQ = """
            ?v2 core:hasMinValue ?e1.
            filter(?e1<=""" + str(ageBound[1]) + """).
            """

    ### Resolution
    resQ = ""
    if len(resolution) > 0:
        # The requested maximum resolution is applied to both measured and inferred variables
        resQ = """
        {
            ?v core:hasResolution/(core:hasMeanValue |core:hasMedianValue) ?resValue.
            filter (xsd:float(?resValue)<""" + str(resolution[0]) + """)
        }
        UNION
        {
            ?v1 core:hasResolution/(core:hasMeanValue |core:hasMedianValue) ?resValue1.
            filter (xsd:float(?resValue1)<""" + str(resolution[0]) + """)
        }
        """

    ### Location
    locQ = ""
    if lon or lat or alt:
        locQ = "?dataset core:collectedFrom ?z."

    ## Latitude
    latQ = ""
    if len(lat) > 0:
        latQ = """
        ?z <http://www.w3.org/2003/01/geo/wgs84_pos#lat> ?lat.
        filter(xsd:float(?lat)<""" + str(lat[1]) + """ && xsd:float(?lat)>""" + str(lat[0]) + """).
        """

    ## Longitude
    lonQ = ""
    if len(lon) > 0:
        lonQ = """
        ?z <http://www.w3.org/2003/01/geo/wgs84_pos#long> ?long.
        filter(xsd:float(?long)<""" + str(lon[1]) + """ && xsd:float(?long)>""" + str(lon[0]) + """).
        """

    ## Altitude
    altQ = ""
    if len(alt) > 0:
        altQ = """
        ?z <http://www.w3.org/2003/01/geo/wgs84_pos#alt> ?alt.
        filter(xsd:float(?alt)<""" + str(alt[1]) + """ && xsd:float(?alt)>""" + str(alt[0]) + """).
        """

    query += """
    ?dataset a core:Dataset.
    """ + dataQ + """
    """ + measuredVarQ + """
    # By proxyObservationType
    """ + proxyObsTypeQ + """
    """ + inferredVarQ + """
    # By InferredVariableType
    """ + infVarTypeQ + """
    # Look for the proxy system model: needed for sensor and archive queries
    """ + proxySystemQ + """
    # Sensor query
    """ + sensorQ + """
    """ + genusQ + """
    """ + speciesQ + """
    # Archive query (looks in both places)
    """ + archiveTypeQ + """
    # Interpretation query
    """ + interpQ + """
    """ + interpNameQ + """
    """ + interpDetailQ + """
    # Age Query
    """ + ageUnitsQ + """
    """ + ageQ + """
    # Location Query
    """ + locQ + """
    #Latitude
    """ + latQ + """
    #Longitude
    """ + lonQ + """
    #Altitude
    """ + altQ + """
    #Resolution Query
    """ + resQ + """
    }"""

    # print(query)  # uncomment to inspect the assembled SPARQL query
    response = requests.post(url, data={'query': query})
    res = json.loads(response.text)

    if print_response == True:
        for item in res['results']['bindings']:
            print(item['dataset']['value'])

    # download files
    if download_lipd == True:
        for item in res['results']['bindings']:
            dataset = item['dataset_label']['value']
            download_url = 'http://wiki.linked.earth/wiki/index.php/Special:WTLiPD?op=export&lipdid=' + dataset
            # Resolve the target folder (creating it if needed), then download into it
            if download_folder == 'default':
                path = os.getcwd() + '/'
            else:
                path = download_folder if download_folder.endswith('/') else download_folder + '/'
                if not os.path.exists(path):
                    os.mkdir(path)
            wget.download(download_url, path + dataset + '.lpd')

    return res
def pre_process_list(list_str):
    """ Pre-process a series of strings for capitalized letters, spaces, and punctuation

    Parameters
    ----------
    list_str : list
        A list of strings from which to strip capitals, spaces, and other characters

    Returns
    -------
    res : list
        A list of strings with capitalization, spaces, and punctuation removed
    """
    res = []
    for item in list_str:
        res.append(pre_process_str(item))
    return res
def similar_string(list_str, search):
    """ Returns a list of indices for strings with similar values

    Parameters
    ----------
    list_str : list
        A list of strings
    search : str
        A keyword search

    Returns
    -------
    indices : list
        A list of indices with similar value as the keyword
    """
    # exact matches (proximity matching is not yet implemented)
    indices = [i for i, x in enumerate(list_str) if x == search]
    return indices
def pre_process_str(word):
    """Pre-process a string for capitalized letters, spaces, and punctuation

    Parameters
    ----------
    word : str
        A string from which to strip capitals, spaces, and other characters

    Returns
    -------
    res : str
        A string with capitalization, spaces, and punctuation removed
    """
    d = word.replace(" ", "").lower()
    stopset = list(string.punctuation)
    res = "".join([i for i in d if i not in stopset])
    return res
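# Usage sketch (illustrative):
#
#     pre_process_str('Marine Sediment!')       # -> 'marinesediment'
#     pre_process_list(['Yr B.P.', 'ka BP'])    # -> ['yrbp', 'kabp']
#     similar_string(['yrbp', 'kabp'], 'kabp')  # -> [1] (exact matches only)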
""" Deal with models """
def isModel(csvName, lipd):
    """Check for the presence of a model in the same object as the measurement table

    Parameters
    ----------
    csvName : string
        The name of the csv file corresponding to the measurement table
    lipd : dict
        A LiPD object

    Returns
    -------
    model : list
        List of models already available
    objectName : string
        The name of the paleoData or chronData object in which the model(s) are stored
    """
    csvNameSplit = csvName.split('.')
    tableName = None
    for val in csvNameSplit:
        if "chron" in val or "paleo" in val:
            tableName = val
    if tableName is None:
        raise KeyError("Key name should only include 'chron' or 'paleo'")
    if tableName[0] == 'c':
        objectName = 'chron' + tableName.split('chron')[1][0]
        dataObject = lipd["chronData"][objectName]
    elif tableName[0] == 'p':
        objectName = 'paleo' + tableName.split('paleo')[1][0]
        dataObject = lipd["paleoData"][objectName]
    else:
        raise KeyError("Key name should only include 'chron' or 'paleo'")
    if "model" in dataObject.keys():
        model_list = dataObject["model"]
        model = list(model_list.keys())
    else:
        model = []

    return model, objectName
def modelNumber(model):
    """Assign a new or existing model number

    Parameters
    ----------
    model : list
        List of possible model numbers. Obtained from isModel

    Returns
    -------
    modelNum : int
        The number of the model
    """
    if model:
        print("There are " + str(len(model)) + " model(s) already available.")
        print("Creating a new model...")
        modelNum = len(model)
        print("Your new model number is " + str(modelNum))
    else:
        print("No previous model available. Creating a new model...")
        modelNum = 0
        print("Your model number is " + str(modelNum))

    return modelNum
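# Usage sketch (illustrative; 'lipd' is a hypothetical LiPD dict holding one chron model):
#
#     lipd = {'chronData': {'chron0': {'model': {'model0': {}}}}}
#     model, objectName = isModel('MyRecord.chron0measurement0.csv', lipd)
#     # model -> ['model0'], objectName -> 'chron0'
#     modelNumber(model)   # prints a message and returns 1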
""" Get entire tables """
def isMeasurement(csv_dict):
    """ Check whether measurement tables are available

    Parameters
    ----------
    csv_dict : dict
        Dictionary of available csv

    Returns
    -------
    chronMeasurementTables : list
        List of available chronMeasurementTables
    paleoMeasurementTables : list
        List of available paleoMeasurementTables
    """
    chronMeasurementTables = []
    paleoMeasurementTables = []

    for val in csv_dict.keys():
        if "measurement" in val and "chron" in val:
            chronMeasurementTables.append(val)
        if "measurement" in val and "paleo" in val:
            paleoMeasurementTables.append(val)

    return chronMeasurementTables, paleoMeasurementTables
def whichMeasurement(measurementTableList):
    """Select a measurement table from a list

    Use in conjunction with the function isMeasurement

    Parameters
    ----------
    measurementTableList : list
        List of measurement tables contained in the LiPD file. Output from the isMeasurement function

    Returns
    -------
    csvName : string
        the name of the csv file
    """
    if len(measurementTableList) > 1:
        print("More than one table is available.")
        for idx, val in enumerate(measurementTableList):
            print(idx, ": ", val)
        csvName = measurementTableList[int(input("Which one would you like to use? "))]
    else:
        csvName = measurementTableList[0]

    return csvName
def getMeasurement(csvName, lipd):
    """Extract the dictionary corresponding to the measurement table

    Parameters
    ----------
    csvName : string
        The name of the csv file
    lipd : dict
        The LiPD object from which to extract the data

    Returns
    -------
    ts_list : dict
        A dictionary containing data and metadata for each column in the csv file.
    """
    csvNameSplit = csvName.split('.')
    tableName = None
    for val in csvNameSplit:
        if "chron" in val or "paleo" in val:
            tableName = val
    if tableName is None:
        raise KeyError("Key name should only include 'chron' or 'paleo'")
    if tableName[0] == 'c':
        objectName = 'chron' + tableName.split('chron')[1][0]
        ts_list = lipd["chronData"][objectName]["measurementTable"][tableName]["columns"]
    elif tableName[0] == 'p':
        objectName = 'paleo' + tableName.split('paleo')[1][0]
        ts_list = lipd["paleoData"][objectName]["measurementTable"][tableName]["columns"]
    else:
        raise KeyError("Key name should only include 'chron' or 'paleo'")

    return ts_list
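# Usage sketch (illustrative workflow; 'csv_dict' maps csv file names to tables and
# 'lipd' is the corresponding LiPD dict, both hypothetical here):
#
#     chron_tables, paleo_tables = isMeasurement(csv_dict)
#     csvName = whichMeasurement(paleo_tables)   # prompts if more than one table
#     ts_list = getMeasurement(csvName, lipd)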
""" Deal with ensembles """
def isEnsemble(csv_dict):
    """ Check whether ensembles are available

    Parameters
    ----------
    csv_dict : dict
        Dictionary of available csv

    Returns
    -------
    chronEnsembleTables : list
        List of available chronEnsembleTables
    paleoEnsembleTables : list
        List of available paleoEnsembleTables
    """
    chronEnsembleTables = []
    paleoEnsembleTables = []

    for val in csv_dict.keys():
        if "ensemble" in val and "chron" in val:
            chronEnsembleTables.append(val)
        elif "ensemble" in val and "paleo" in val:
            paleoEnsembleTables.append(val)

    return chronEnsembleTables, paleoEnsembleTables
def whichEnsemble(ensembleTableList):
    """Select an ensemble table from a list

    Use in conjunction with the function isEnsemble

    Parameters
    ----------
    ensembleTableList : list
        List of ensemble tables contained in the LiPD file. Output from the isEnsemble function

    Returns
    -------
    csvName : string
        the name of the csv file
    """
    if len(ensembleTableList) > 1:
        print("More than one table is available.")
        for idx, val in enumerate(ensembleTableList):
            print(idx, ": ", val)
        csvName = ensembleTableList[int(input("Which one would you like to use? "))]
    else:
        csvName = ensembleTableList[0]

    return csvName
def getEnsemble(csv_dict, csvName):
    """ Extracts the ensemble values and depth vector from the dictionary and
    returns them as two numpy arrays.

    Parameters
    ----------
    csv_dict : dict
        dictionary containing the available tables
    csvName : str
        Name of the csv

    Returns
    -------
    depth : array
        Vector of depth
    ensembleValues : array
        The matrix of ensemble values
    """
    ensemble_dict = csv_dict[csvName]
    ensembleValues = []
    depth = None
    for val in ensemble_dict.keys():
        if 'depth' in val:
            depth = ensemble_dict[val]["values"]
        else:
            ensembleValues.append(ensemble_dict[val]["values"])
    if depth is None:
        raise KeyError("No depth column found in the ensemble table")
    ensembleValues = np.transpose(np.array(ensembleValues))

    return depth, ensembleValues
def mapAgeEnsembleToPaleoData(ensembleValues, depthEnsemble, depthPaleo):
    """ Map the depth for the ensemble age values to the paleo depth

    Parameters
    ----------
    ensembleValues : array
        A matrix of possible age models. Realizations should be stored in columns
    depthEnsemble : array
        A vector of depth. The vector should have the same length as the number
        of rows in the ensembleValues
    depthPaleo : array
        A vector corresponding to the depth at which there is paleodata information

    Returns
    -------
    ensembleValuesToPaleo : array
        A matrix of age ensemble on the PaleoData scale
    """
    if len(depthEnsemble) != np.shape(ensembleValues)[0]:
        raise ValueError("Depth and age need to have the same length")

    # Make sure that numpy arrays were given
    ensembleValues = np.array(ensembleValues)
    depthEnsemble = np.array(depthEnsemble)
    depthPaleo = np.array(depthPaleo)

    # Interpolate each realization (column) onto the paleo depth scale
    ensembleValuesToPaleo = np.zeros((len(depthPaleo), np.shape(ensembleValues)[1]))  # placeholder
    for i in np.arange(0, np.shape(ensembleValues)[1]):
        ensembleValuesToPaleo[:, i] = np.interp(depthPaleo, depthEnsemble, ensembleValues[:, i])

    return ensembleValuesToPaleo
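# Usage sketch (illustrative, with synthetic data):
#
#     depthEnsemble = np.array([0., 1., 2.])
#     ensembleValues = np.column_stack([[0., 10., 20.], [0., 12., 24.]])  # 2 realizations
#     depthPaleo = np.array([0.5, 1.5])
#     mapAgeEnsembleToPaleoData(ensembleValues, depthEnsemble, depthPaleo)
#     # -> array([[ 5.,  6.],
#     #           [15., 18.]])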
def gen_dict_extract(key, var):
    '''Recursively searches for all the values in nested dictionaries corresponding
    to a particular key

    Parameters
    ----------
    key : str
        The key to search for
    var : dict
        The dictionary to search

    Yields
    ------
    The values associated with the key, at any level of nesting
    '''
    if hasattr(var, 'items'):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, dict):
                for result in gen_dict_extract(key, v):
                    yield result
            elif isinstance(v, list):
                for d in v:
                    for result in gen_dict_extract(key, d):
                        yield result
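# Usage sketch (illustrative):
#
#     d = {'a': 1, 'b': {'a': 2, 'c': [{'a': 3}]}}
#     list(gen_dict_extract('a', d))   # -> [1, 2, 3]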