# -*- coding: utf-8 -*-
"""
Utilities to manipulate LiPD files and automate data transformation whenever possible.
These functions are used throughout Pyleoclim but are not meant for direct interaction by users.
Also handles integration with the LinkedEarth wiki and the LinkedEarth Ontology.
"""
import lipd as lpd
import numpy as np
import os
import json
import requests
import wget
from bs4 import BeautifulSoup
import string
class CaseInsensitiveDict(dict):
    """A dict whose keys are normalized to lowercase with spaces removed."""
def __setitem__(self, key, value):
super().__setitem__(key.lower().replace(" ", ""), value)
def __getitem__(self, key):
return super().__getitem__(key.lower().replace(" ", ""))
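# A minimal usage sketch (illustrative values, not from a real LiPD file):
# keys are normalized on both insertion and lookup, so spacing and case
# do not matter.
#   d = CaseInsensitiveDict()
#   d['Marine Sediment'] = '#8A4513'
#   d['marinesediment']   # -> '#8A4513'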
PLOT_DEFAULT = {'GroundIce': ['#86CDFA', 'h'],
'Borehole': ['#00008b', 'h'],
'Coral': ['#FF8B00', 'o'],
'Documents': ['#f8d568', 'p'],
'GlacierIce': ['#86CDFA', 'd'],
'Hybrid': ['#808000', '*'],
'LakeSediment': ['#8A4513', '^'],
'MarineSediment': ['#8A4513', 's'],
'Sclerosponge': ['r', 'o'],
'Speleothem': ['#FF1492', 'd'],
'Wood': ['#32CC32', '^'],
'MolluskShell': ['#FFD600', 'h'],
'Peat': ['#2F4F4F', '*'],
'Midden': ['#824E2B', 'o'],
'FluvialSediment': ['#4169E0','d'],
'TerrestrialSediment': ['#8A4513','o'],
'Shoreline': ['#add8e6','o'],
'Instrumental' : ['#8f21d8', '*'],
'Model' : ['#b4a7d6', "d"],
'Other': ['k', 'o']
}
"""
The followng functions handle web scrapping to grab information regarding the controlled vocabulary
"""
def get_archive_type():
'''
Scrape the LiPDverse website to obtain the list of possible archives and associated synonyms
Returns
-------
res : Dictionary
Keys correspond to the preferred terms and values represent known synonyms
'''
url = "https://lipdverse.org/vocabulary/archivetype/"
response = requests.get(url)
if response.status_code == 200:
# Parse the content of the request with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Get the names of the archiveTypes
h3_tags = soup.find_all('h3')
archiveName = []
for item in h3_tags:
archiveName.append(item.get_text())
# Get the known synonyms
h4_tags = soup.find_all('h4', string="Known synonyms")
synonyms=[]
for h4_tag in h4_tags:
next_element = h4_tag.find_next_sibling()
found_p = False
while next_element and next_element.name != 'div':
if next_element.name == 'p':
synonyms_text = next_element.get_text()
words = [word.strip() for word in synonyms_text.split(',')]
synonyms.append(words)
found_p = True
break
next_element = next_element.find_next_sibling()
# If a <p> tag was not found, insert an empty string
if not found_p:
synonyms.append([])
#create a dictionary for the results
res= {}
for idx,item in enumerate(archiveName):
res[item]=synonyms[idx]
else:
print("failed to retrieve the webpage; returning static list, which may be out of date")
res = ["Borehole",
"Coral",
"FluvialSediment",
"GlacierIce",
"GroundIce",
"LakeSediment",
"MarineSediment",
"Midden",
"MolluskShell",
"Peat",
"Scelorosponge",
"Shoreline",
"Spleleothem",
"TerrestrialSediment",
"Wood"]
return res
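# A hedged usage sketch: the call below requires network access to
# lipdverse.org, and the exact keys depend on the live vocabulary page.
#   archives = get_archive_type()
#   archives.get('Wood', [])   # -> list of known synonyms, e.g. ['tree', ...]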
"""
The following functions handle the LiPD files
"""
def enumerateLipds(lipds):
"""Enumerate the LiPD files loaded in the workspace
Parameters
----------
lipds : dict
A dictionary of LiPD files.
"""
print("Below are the available records")
lipds_list = [val for val in lipds.keys()]
for idx, val in enumerate(lipds_list):
print(idx,': ',val)
def getLipd(lipds):
"""Prompt for a LiPD file
Ask the user to select a LiPD file from a list
Use this function in conjunction with enumerateLipds()
Parameters
----------
lipds : dict
A dictionary of LiPD files. Can be obtained from
pyleoclim.readLipd()
Returns
-------
    select_lipd : dict
        The selected LiPD file
"""
enumerateLipds(lipds)
lipds_list = [val for val in lipds.keys()]
choice = int(input("Enter the number of the file: "))
lipd_name = lipds_list[choice]
select_lipd = lipds[lipd_name]
return select_lipd
"""
The following functions work at the variables level
"""
def promptForVariable():
"""Prompt for a specific variable
Ask the user to select the variable they are interested in.
Use this function in conjunction with readHeaders() or getTSO()
Returns
-------
select_var : int
The index of the variable
"""
select_var = int(input("Enter the number of the variable you wish to use: "))
return select_var
def xAxisTs(timeseries):
""" Get the x-axis for the timeseries.
Parameters
----------
timeseries : dict
a timeseries object
Returns
-------
x_axis : array
the values for the x-axis representation
label : string
returns either "age", "year", or "depth"
"""
if "depth" in timeseries.keys() and "age" in timeseries.keys() or\
"depth" in timeseries.keys() and "year" in timeseries.keys():
print("Both time and depth information available, selecting time")
if "age" in timeseries.keys() and "year" in timeseries.keys():
print("Both age and year representation available, selecting age")
x_axis = timeseries["age"]
label = "age"
elif "year" in timeseries.keys():
x_axis = timeseries["year"]
label = "year"
elif "age" in timeseries.keys():
x_axis = timeseries["age"]
elif "depth" in timeseries.keys():
x_axis = timeseries["depth"]
label = "depth"
elif "age" in timeseries.keys():
x_axis = timeseries["age"]
label = "age"
elif "year" in timeseries.keys():
x_axis = timeseries["year"]
label = "year"
else:
raise KeyError("No age or depth information available")
return x_axis, label
def checkXaxis(timeseries, x_axis= None):
    """Check that an x-axis is present for the timeseries
Parameters
----------
timeseries : dict
a timeseries
x_axis : string
the x-axis representation, either depth, age or year
Returns
-------
x : array
the values for the x-axis representation
label : string
returns either "age", "year", or "depth"
"""
if x_axis is None:
x, label = xAxisTs(timeseries)
x = np.array(x, dtype = 'float64')
elif x_axis == "depth":
if not "depth" in timeseries.keys():
raise ValueError("Depth not available for this record")
else:
x = np.array(timeseries['depth'], dtype = 'float64')
label = "depth"
elif x_axis == "age":
if not "age" in timeseries.keys():
raise ValueError("Age not available for this record")
else:
x = np.array(timeseries['age'], dtype = 'float64')
label = "age"
elif x_axis == "year":
if not "year" in timeseries.keys():
raise ValueError("Year not available for this record")
else:
x = np.array(timeseries['year'], dtype = 'float64')
label = "year"
else:
raise KeyError("enter either 'depth','age',or 'year'")
return x, label
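# A minimal sketch with a hypothetical timeseries dict ('ts' is illustrative):
#   ts = {'age': [100, 200, 300], 'depth': [0.5, 1.0, 1.5]}
#   x, label = checkXaxis(ts, x_axis='age')   # -> array([100., 200., 300.]), 'age'
#   x, label = checkXaxis(ts)                 # falls back to xAxisTs(ts)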
def checkTimeAxis(timeseries, x_axis = None):
    """ This function makes sure that time is available for the timeseries
    Parameters
    ----------
    timeseries : dict
        A LiPD timeseries object
    x_axis : string
        The time representation to use, either 'age' or 'year'. Default is None.
Returns
-------
x : array
the time values for the timeseries
label : string
the time representation for the timeseries
"""
if x_axis is None:
if not 'age' in timeseries.keys() and not 'year' in timeseries.keys():
raise KeyError("No time information available")
elif 'age' in timeseries.keys() and 'year' in timeseries.keys():
print("Both age and year information are available, using age")
label = 'age'
elif 'age' in timeseries.keys():
label = 'age'
elif 'year' in timeseries.keys():
label = 'year'
elif x_axis == 'age':
if not 'age' in timeseries.keys():
raise KeyError('Age is not available for this record')
else:
label = 'age'
elif x_axis == 'year':
if not 'year' in timeseries.keys():
raise KeyError('Year is not available for this record')
else:
label='year'
else:
raise KeyError('Only None, year and age are valid entries for x_axis parameter')
x = np.array(timeseries[label], dtype = 'float64')
return x, label
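# A minimal sketch, assuming a hypothetical timeseries dict with only 'year':
#   ts = {'year': [1850, 1900, 1950]}
#   x, label = checkTimeAxis(ts)       # -> array([1850., 1900., 1950.]), 'year'
#   checkTimeAxis(ts, x_axis='age')    # raises KeyError: age not available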
def searchVar(timeseries_list, key, exact = True, override = True):
    """ This function searches for keywords (exact match by default) for a variable
Parameters
----------
timeseries_list : list
A list of available series
key : list
A list of keys to search
    exact : bool
        if True, looks for an exact match.
    override : bool
        if True, falls back to a partial match when the exact search returns nothing
Returns
-------
match : list
A list of keys for the timeseries that match the selection
criteria.
"""
    # Make sure that the keys are contained in a list
if type(key) is not list:
if type(key) is str:
key = [key]
else:
raise TypeError("Key terms should be entered as a list")
    match = []
    # Metadata fields that may carry a variable name, in the order the
    # original elif chain inspected them (only the first field present is used)
    fields = ["variableName",
              "paleoData_variableName",
              "chronData_variableName",
              "ProxyObservationType",
              "paleoData_proxyObservationType",
              "chronData_proxyObservationType",
              "InferredVariableType",
              "paleoData_inferredVariableType",
              "chronData_inferredVariableType"]
    def find_matches(partial):
        """Collect the keys whose first available name field matches a key term."""
        found = []
        for keyVal in key:
            for val in timeseries_list.keys():
                ts_temp = timeseries_list[val]
                for field in fields:
                    if field in ts_temp.keys():
                        name = ts_temp[field]
                        if partial:
                            if keyVal.lower() in name.lower():
                                found.append(val)
                        else:
                            if keyVal.lower() == name.lower():
                                found.append(val)
                        break # mimic the elif chain: stop at the first field present
        return found
    # Search for an exact or partial match with the key
    match = find_matches(partial=not exact)
    # Expand the search if asked
    if not match and exact and override:
        print("No match found on exact search, running partial match")
        match = find_matches(partial=True)
# Get the unique entries
match = list(set(match))
# Narrow down if more than one match is found by asking the user
if len(match) > 1:
print("More than one series match your search criteria")
for idx, val in enumerate(match):
print(idx,": ", val)
choice = int(input("Enter the number for the variable: "))
match = match[choice]
elif not match:
print("No match found.")
print("Here are the available variables: ")
v = list(timeseries_list.keys())
for idx, val in enumerate(v):
print(idx,": ",val)
choice = input("Please select the variable you'd like to use or enter to continue: ")
if not choice:
match =""
else:
choice = int(choice)
match = v[choice]
else:
match = match[0]
return match
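# A minimal sketch with a hypothetical two-entry timeseries dictionary:
#   ts_list = {'Dataset_d18O': {'paleoData_variableName': 'd18O'},
#              'Dataset_SST':  {'paleoData_variableName': 'sst'}}
#   searchVar(ts_list, 'd18O')   # -> 'Dataset_d18O' (single exact match)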
"""
The following functions handle the time series objects
"""
def enumerateTs(timeseries_list):
"""Enumerate the available time series objects
Parameters
----------
timeseries_list : list
a list of available timeseries objects.
"""
available_y = []
dataSetName =[]
at =[]
for item in timeseries_list:
if 'dataSetName' in item.keys():
dataSetName.append(item['dataSetName'])
else:
dataSetName.append('NA')
if 'paleoData_variableName' in item.keys():
available_y.append(item['paleoData_variableName'])
elif 'chronData_variableName' in item.keys():
available_y.append(item['chronData_variableName'])
else:
available_y.append('NA')
if 'archiveType' in item.keys():
at.append(item['archiveType'])
else:
at.append('NA')
for idx,val in enumerate(available_y):
print(idx,': ',dataSetName[idx], ': ',at[idx],': ', val)
def getTs(timeseries_list, option = None):
    """Get a specific timeseries object from a list of timeseries
Parameters
----------
timeseries_list : list
a list of available timeseries objects.
option : string
An expression to filter the datasets. Uses lipd.filterTs()
Returns
-------
    timeseries : single timeseries object or list of timeseries
        A single timeseries object if no filter is given, or a filtered
        list if a filter expression is provided
See also
--------
pyleoclim.utils.lipdutils.enumerateTs : Enumerate the available time series objects
pyleoclim.utils.lipdutils.promptForVariable : Prompt for a specific variable
"""
if not option:
enumerateTs(timeseries_list)
select_TSO = promptForVariable()
timeseries = timeseries_list[select_TSO]
else:
timeseries = lpd.filterTs(timeseries_list, option)
return timeseries
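# A hedged sketch: with an option string, the call is delegated to
# lipd.filterTs; the expression below follows the lipd package's filter
# syntax (see its documentation for the full grammar).
#   filtered = getTs(ts_list, option='paleoData_variableName == d18O')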
"""
Functions to handle data on the LinkedEarth Platform
"""
def LipdToOntology(archiveType):
""" standardize archiveType
Transform the archiveType from their LiPD name to their ontology counterpart
Parameters
----------
archiveType : string
name of the archiveType from the LiPD file
Returns
-------
archiveType : string
archiveType according to the ontology
"""
    #Align with the ontology
    if archiveType is not None:
        if archiveType.lower().replace(" ", "") in ("icecore", "ice-other"):
            archiveType = 'GlacierIce'
        elif archiveType.lower().replace(" ", "") == 'tree':
            archiveType = 'Wood'
        elif archiveType.lower().replace(" ", "") not in [key.lower() for key in PLOT_DEFAULT.keys()]:
            archiveType = 'Other'
return archiveType
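# A minimal sketch: LiPD spellings are mapped onto ontology terms, and
# anything unrecognized collapses to 'Other'.
#   LipdToOntology('ice core')    # -> 'GlacierIce'
#   LipdToOntology('Tree')        # -> 'Wood'
#   LipdToOntology('stalagmite')  # -> 'Other' (not in PLOT_DEFAULT)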
def timeUnitsCheck(units):
""" This function attempts to make sense of the time units by checking for equivalence
Parameters
----------
units : string
The units string for the timeseries
Returns
-------
unit_group : string
        Whether the units belong to age_units, kage_units, year_units, ma_units, or undefined
"""
age_units = ['year B.P.','yr B.P.','yr BP','BP','yrs BP','years B.P.',\
'yr. BP','yr. B.P.', 'cal. BP', 'cal B.P.', \
'year BP','years BP']
kage_units = ['kyr BP','kaBP','ka BP','ky','kyr','kyr B.P.', 'ka B.P.', 'ky BP',\
'kyrs BP','ky B.P.', 'kyrs B.P.', 'kyBP', 'kyrBP']
    year_units = ['AD','CE','year C.E.','year A.D.', 'year CE','year AD',\
                  'years C.E.','years A.D.','yr CE','yr AD','yr C.E.',\
                  'yr A.D.', 'yrs C.E.', 'yrs A.D.', 'yrs CE', 'yrs AD']
mage_units = ['my BP', 'myr BP', 'myrs BP', 'ma BP', 'ma',\
'my B.P.', 'myr B.P.', 'myrs B.P.', 'ma B.P.']
undefined = ['years', 'yr','year','yrs']
if units in age_units:
unit_group = 'age_units'
elif units in kage_units:
unit_group = 'kage_units'
elif units in year_units:
unit_group = 'year_units'
elif units in undefined:
unit_group = 'undefined'
elif units in mage_units:
unit_group = 'ma_units'
else:
unit_group = 'unknown'
return unit_group
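# A minimal sketch:
#   timeUnitsCheck('yr BP')   # -> 'age_units'
#   timeUnitsCheck('ka BP')   # -> 'kage_units'
#   timeUnitsCheck('years')   # -> 'undefined'
#   timeUnitsCheck('m')       # -> 'unknown'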
def whatArchives(print_response=True):
""" Get the names for ArchiveType from LinkedEarth Ontology
Parameters
----------
print_response : bool
Whether to print the results on the console. Default is True
Returns
-------
    res : list of the archive type names retrieved from the LinkedEarth wiki API
"""
url = "http://wiki.linked.earth/store/ds/query"
query = """PREFIX core: <http://linked.earth/ontology#>
PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?a
WHERE {
{
?dataset wiki:Property-3AArchiveType ?a.
}UNION
{
?w core:proxyArchiveType ?t.
?t rdfs:label ?a
}
}"""
response = requests.post(url, data = {'query': query})
res_i = json.loads(response.text)
if print_response == True:
print("The following archive types are available on the wiki:")
for item in res_i['results']['bindings']:
print ("*" + item['a']['value'])
res=[]
for item in res_i['results']['bindings']:
res.append(item['a']['value'])
return res
def whatProxyObservations(print_response=True):
""" Get the names for ProxyObservations from LinkedEarth Ontology
Parameters
----------
print_response : bool
Whether to print the results on the console. Default is True
Returns
-------
    res : list of the proxy observation type names retrieved from the LinkedEarth wiki API
"""
url = "http://wiki.linked.earth/store/ds/query"
query = """PREFIX core: <http://linked.earth/ontology#>
PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?a
WHERE
{
?w core:proxyObservationType ?t.
?t rdfs:label ?a
}"""
response = requests.post(url, data = {'query': query})
res_i = json.loads(response.text)
if print_response==True:
print("The following proxy observation types are available on the wiki: ")
for item in res_i['results']['bindings']:
print (item['a']['value'])
res=[]
for item in res_i['results']['bindings']:
res.append(item['a']['value'])
return res
def whatProxySensors(print_response=True):
""" Get the names for ProxySensors from LinkedEarth Ontology
Parameters
----------
print_response : bool
Whether to print the results on the console. Default is True
Returns
-------
    res : list of the sensor genus names retrieved from the LinkedEarth wiki API
"""
url = "http://wiki.linked.earth/store/ds/query"
query = """PREFIX core: <http://linked.earth/ontology#>
PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?a ?b
WHERE
{
?w core:sensorGenus ?a.
?w core:sensorSpecies ?b .
}"""
response = requests.post(url, data = {'query': query})
res_i = json.loads(response.text)
if print_response == True:
print("The available sensor genus/species are: ")
for item in res_i['results']['bindings']:
print ("*" + 'Genus: '+item['a']['value']+' Species: ' +item['b']['value'])
res=[]
for item in res_i['results']['bindings']:
res.append(item['a']['value'])
return res
def whatInferredVariables(print_response=True):
""" Get the names for InferredVariables from LinkedEarth Ontology
Parameters
----------
print_response : bool
Whether to print the results on the console. Default is True
Returns
-------
    res : list of the inferred variable type names retrieved from the LinkedEarth wiki API
"""
url = "http://wiki.linked.earth/store/ds/query"
query = """PREFIX core: <http://linked.earth/ontology#>
PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?a
WHERE
{
?w core:inferredVariableType ?t.
?t rdfs:label ?a
}"""
response = requests.post(url, data = {'query': query})
res_i = json.loads(response.text)
if print_response == True:
print("The following Inferred Variable types are available on the wiki: ")
for item in res_i['results']['bindings']:
print ("*" + item['a']['value'])
res=[]
for item in res_i['results']['bindings']:
res.append(item['a']['value'])
return res
def whatInterpretations(print_response=True):
""" Get the names for interpretations from LinkedEarth Ontology
Parameters
----------
print_response : bool
Whether to print the results on the console. Default is True
Returns
-------
    res : list of the interpretation names retrieved from the LinkedEarth wiki API
"""
url = "http://wiki.linked.earth/store/ds/query"
query = """PREFIX core: <http://linked.earth/ontology#>
PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?a ?b
WHERE
{
?w core:name ?a.
?w core:detail ?b .
}"""
response = requests.post(url, data = {'query': query})
res_i = json.loads(response.text)
if print_response==True:
print("The following interpretation are available on the wiki: ")
for item in res_i['results']['bindings']:
print ("*" + 'Name: '+item['a']['value']+' Detail: ' +item['b']['value'])
res=[]
for item in res_i['results']['bindings']:
res.append(item['a']['value'])
return res
def queryLinkedEarth(archiveType=[ ], proxyObsType=[ ], infVarType = [ ], sensorGenus=[ ],
sensorSpecies=[ ], interpName =[ ], interpDetail =[ ], ageUnits = [ ],
ageBound = [ ], ageBoundType = [ ], recordLength = [ ], resolution = [ ],
lat = [ ], lon = [ ], alt = [ ], print_response = True, download_lipd = True,
download_folder = 'default'):
""" This function allows to query the LinkedEarth wiki for records.
This function allows to query the LinkedEarth wiki for specific catagories.
If you have more than one keyword per catagory, enter them in a list. If you don't
wish to use a particular terms, leave a blank in-between the brackets.
Parameters
----------
archiveType : list of strings
The type of archive (enter all query terms, separated by a comma)
proxyObsType : list of strings
The type of proxy observation (enter all query terms, separated by a comma)
infVarType : list of strings
The type of inferred variable (enter all query terms, separated by a comma)
sensorGenus : list of strings
The Genus of the sensor (enter all query terms, separated by a comma)
sensorSpecies : list of strings
The Species of the sensor (enter all query terms, separated by a comma)
interpName : list of strings
The name of the interpretation (enter all query terms, separated by a comma)
interpDetail : list of strings
The detail of the interpretation (enter all query terms, separated by a comma)
    ageUnits : list of strings
        The units in which the age (year) is expressed.
        Warning: Run a separate query for each distinct unit system (i.e., yr B.P. vs kyr B.P.). If the units are written differently but mean the same thing (e.g., yr B.P. vs yr BP), enter all search terms separated by a comma.
ageBound : list of floats
Enter the minimum and maximum age value to search for.
Warning: You MUST enter a minimum AND maximum value. If you wish to perform a query such as "all ages before 2000 A.D.", enter a minimum value of -99999 to cover all bases.
ageBoundType : list of strings
The type of querying to perform. Possible values include: "any", "entire", and "entirely".
- any: Overlap any portions of matching datasets (default)
- entirely: are entirely overlapped by matching datasets
- entire: overlap entire matching datasets but dataset can be shorter than the bounds
recordLength : list of floats
The minimum length the record needs to have while matching the ageBound criteria. For instance, "look for all records between 3000 and 6000 year BP with a record length of at least 1500 year".
resolution : list of floats
        The maximum resolution of the record. Resolution has the same units as age/year. For instance, "look for all records with a resolution of at least 100 years".
Warning: Resolution applies to specific variables rather than an entire dataset. Imagine the case where some measurements are made every cm while others are made every 5cm. If you require a specific variable to have the needed resolution, make sure that either the proxyObservationType, inferredVariableType, and/or Interpretation fields are completed.
lat : list of floats
The minimum and maximum latitude. South is expressed with negative numbers.
Warning: You MUST enter a minimum AND maximum value. If you wish to perform a query looking for records from the Northern Hemisphere, enter [0,90].
lon : list of floats
The minimum and maximum longitude. West is expressed with negative numbers.
Warning: You MUST enter a minimum AND a maximum value. If you wish to perform a query looking for records from the Western Hemisphere, enter [-180,0].
alt : list of floats
The minimum and maximum altitude. Depth below sea level is expressed as negative numbers.
Warning: You MUST enter a minimum AND a maximum value. If you wish to perform a query looking for records below a certain depth (e.g., 500), enter [-99999,-500].
print_response : bool
If True, prints the URLs to the matching LiPD files
download_lipd : bool
If True, download the matching LiPD files
download_folder : string
Location to download the LiPD files. If "default", will download in the current directory.
Returns
-------
res : the response to the query
"""
# Perform a lot of checks
if len(ageBound)==1:
raise ValueError("You need to provide a minimum and maximum boundary.")
if ageBound and not ageUnits:
raise ValueError("When providing age limits, you must also enter the units")
if recordLength and not ageUnits:
raise ValueError("When providing a record length, you must also enter the units")
if ageBound and ageBound[0]>ageBound[1]:
ageBound = [ageBound[1],ageBound[0]]
    if len(ageBoundType)>1:
        raise ValueError("Only one search possible at a time.")
    if not ageBoundType:
        ageBoundType = ["any"] # default, per the docstring
    if ageBoundType[0] not in ["any","entirely","entire"]:
        raise ValueError("ageBoundType is not recognized")
if recordLength and ageBound and recordLength[0] > (ageBound[1]-ageBound[0]):
raise ValueError("The required recordLength is greater than the provided age bounds")
if len(resolution)>1:
raise ValueError("You can only search for a maximum resolution one at a time.")
if len(lat)==1:
raise ValueError("Please enter a lower AND upper boundary for the latitude search")
if lat and lat[1]<lat[0]:
lat = [lat[1],lat[0]]
if len(lon)==1:
raise ValueError("Please enter a lower AND upper boundary for the longitude search")
if lon and lon[1]<lon[0]:
lon = [lon[1],lon[0]]
if len(alt)==1:
raise ValueError("Please enter a lower AND upper boundary for the altitude search")
if alt and alt[1]<alt[0]:
alt = [alt[1],alt[0]]
# Perform the query
url = "http://wiki.linked.earth/store/ds/query"
query = """PREFIX core: <http://linked.earth/ontology#>
PREFIX wiki: <http://wiki.linked.earth/Special:URIResolver/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?dataset ?dataset_label
WHERE {
?dataset rdfs:label ?dataset_label
"""
### Look for data field
dataQ=""
if archiveType or proxyObsType or infVarType or sensorGenus or sensorSpecies or interpName or interpDetail or ageUnits or ageBound or recordLength or resolution:
dataQ = "?dataset core:includesChronData|core:includesPaleoData ?data."
### Look for variable
## measuredVariable
measuredVarQ=""
if proxyObsType or archiveType or sensorGenus or sensorSpecies or interpName or interpDetail or resolution:
measuredVarQ = "?data core:foundInMeasurementTable / core:includesVariable ?v."
## InferredVar
inferredVarQ=""
if infVarType or interpName or interpDetail or resolution:
inferredVarQ = "?data core:foundInMeasurementTable / core:includesVariable ?v1."
### Archive Query
archiveTypeQ=""
if len(archiveType)>0:
#add values for the archiveType
query += "VALUES ?a {"
for item in archiveType:
query +="\""+item+"\" "
query += "}\n"
# Create the query
archiveTypeQ = """
#Archive Type query
{
?dataset wiki:Property-3AArchiveType ?a.
}UNION
{
?p core:proxyArchiveType / rdfs:label ?a.
}"""
### ProxyObservationQuery
proxyObsTypeQ=""
if len(proxyObsType)>0:
# add values for the proxyObservationType
query+="VALUES ?b {"
for item in proxyObsType:
query += "\""+item+"\""
query += "}\n"
# Create the query
proxyObsTypeQ="?v core:proxyObservationType/rdfs:label ?b."
### InferredVariableQuery
infVarTypeQ=""
if len(infVarType)>0:
query+="VALUES ?c {"
for item in infVarType:
query+="\""+item+"\""
query+="}\n"
# create the query
infVarTypeQ="""
?v1 core:inferredVariableType ?t.
?t rdfs:label ?c.
"""
### ProxySensorQuery
sensorQ=""
if len(sensorGenus)>0 or len(sensorSpecies)>0:
sensorQ="""
?p core:proxySensorType ?sensor.
"""
## Genus query
genusQ=""
if len(sensorGenus)>0:
query+="VALUES ?genus {"
for item in sensorGenus:
query+="\""+item+"\""
query+="}\n"
# create the query
genusQ = "?sensor core:sensorGenus ?genus."
## Species query
speciesQ=""
if len(sensorSpecies)>0:
query+= "VALUES ?species {"
for item in sensorSpecies:
query+="\""+item+"\""
query+="}\n"
#Create the query
speciesQ = "?sensor core:sensorSpecies ?species."
### Proxy system query
proxySystemQ = ""
if len(archiveType)>0 or len(sensorGenus)>0 or len(sensorSpecies)>0:
proxySystemQ="?v ?proxySystem ?p."
### Deal with interpretation
## Make sure there is an interpretation to begin with
interpQ = ""
if len(interpName)>0 or len(interpDetail)>0:
interpQ= """
{?v1 core:interpretedAs ?interpretation}
UNION
{?v core:interpretedAs ?interpretation}
"""
## Name
interpNameQ=""
if len(interpName)>0:
query+= "VALUES ?intName {"
for item in interpName:
query+="\""+item+"\""
query+= "}\n"
#Create the query
interpNameQ = "?interpretation core:name ?intName."
## detail
interpDetailQ = ""
if len(interpDetail)>0:
query+= "VALUES ?intDetail {"
for item in interpDetail:
query+="\""+item+"\""
query+="}\n"
#Create the query
interpDetailQ = "?interpretation core:detail ?intDetail."
### Age
## Units
ageUnitsQ = ""
if len(ageUnits)>0:
query+= "VALUES ?units {"
for item in ageUnits:
query+="\""+item+"\""
query+="}\n"
query+="""VALUES ?ageOrYear{"Age" "Year"}\n"""
# create the query
ageUnitsQ ="""
?data core:foundInMeasurementTable / core:includesVariable ?v2.
?v2 core:inferredVariableType ?aoy.
?aoy rdfs:label ?ageOrYear.
?v2 core:hasUnits ?units .
"""
## Minimum and maximum
ageQ = ""
if ageBoundType[0] == "entirely":
if len(ageBound)>0 and len(recordLength)>0:
ageQ="""
?v2 core:hasMinValue ?e1.
?v2 core:hasMaxValue ?e2.
filter(?e1<=""" +str(ageBound[0])+ """&& ?e2>="""+str(ageBound[1])+""" && abs(?e1-?e2)>="""+str(recordLength[0])+""").
"""
elif len(ageBound)>0 and len(recordLength)==0:
ageQ="""
?v2 core:hasMinValue ?e1.
?v2 core:hasMaxValue ?e2.
filter(?e1<=""" +str(ageBound[0])+ """&& ?e2>="""+str(ageBound[1])+""").
"""
elif ageBoundType[0] == "entire":
if len(ageBound)>0 and len(recordLength)>0:
ageQ="""
?v2 core:hasMinValue ?e1.
?v2 core:hasMaxValue ?e2.
filter(?e1>=""" +str(ageBound[0])+ """&& ?e2<="""+str(ageBound[1])+""" && abs(?e1-?e2)>="""+str(recordLength[0])+""").
"""
elif len(ageBound)>0 and len(recordLength)==0:
ageQ="""
?v2 core:hasMinValue ?e1.
?v2 core:hasMaxValue ?e2.
filter(?e1>=""" +str(ageBound[0])+ """&& ?e2<="""+str(ageBound[1])+""").
"""
elif ageBoundType[0] == "any":
if len(ageBound)>0 and len(recordLength)>0:
ageQ="""
?v2 core:hasMinValue ?e1.
filter(?e1<=""" +str(ageBound[1])+ """ && abs(?e1-"""+str(ageBound[1])+""")>="""+str(recordLength[0])+""").
"""
elif len(ageBound)>0 and len(recordLength)==0:
ageQ="""
?v2 core:hasMinValue ?e1.
filter(?e1<=""" +str(ageBound[1])+ """).
"""
### Resolution
resQ=""
if len(resolution)>0:
resQ = """
{
?v core:hasResolution/(core:hasMeanValue |core:hasMedianValue) ?resValue.
        filter (xsd:float(?resValue)<"""+str(resolution[0])+""")
}
UNION
{
?v1 core:hasResolution/(core:hasMeanValue |core:hasMedianValue) ?resValue1.
filter (xsd:float(?resValue1)<"""+str(resolution[0])+""")
}
"""
### Location
locQ=""
if lon or lat or alt:
locQ = "?dataset core:collectedFrom ?z."
    ## Latitude
latQ=""
if len(lat)>0:
latQ="""
?z <http://www.w3.org/2003/01/geo/wgs84_pos#lat> ?lat.
filter(xsd:float(?lat)<"""+str(lat[1])+""" && xsd:float(?lat)>"""+str(lat[0])+""").
"""
##Longitude
lonQ=""
if len(lon)>0:
lonQ = """
?z <http://www.w3.org/2003/01/geo/wgs84_pos#long> ?long.
filter(xsd:float(?long)<"""+str(lon[1])+""" && xsd:float(?long)>"""+str(lon[0])+""").
"""
## Altitude
altQ=""
if len(alt)>0:
altQ="""
?z <http://www.w3.org/2003/01/geo/wgs84_pos#alt> ?alt.
filter(xsd:float(?alt)<"""+str(alt[1])+""" && xsd:float(?alt)>"""+str(alt[0])+""").
"""
query += """
?dataset a core:Dataset.
"""+dataQ+"""
"""+measuredVarQ+"""
# By proxyObservationType
"""+proxyObsTypeQ+"""
"""+inferredVarQ+"""
# By InferredVariableType
"""+infVarTypeQ+"""
# Look for the proxy system model: needed for sensor and archive queries
"""+proxySystemQ+"""
# Sensor query
"""+sensorQ+"""
"""+genusQ+"""
"""+speciesQ+"""
# Archive query (looks in both places)
"""+archiveTypeQ+"""
# Interpretation query
"""+interpQ+"""
"""+interpNameQ+"""
"""+interpDetailQ+"""
# Age Query
"""+ageUnitsQ+"""
"""+ageQ+"""
# Location Query
"""+locQ+"""
#Latitude
"""+latQ+"""
#Longitude
"""+lonQ+"""
#Altitude
"""+altQ+"""
#Resolution Query
"""+resQ+"""
}"""
#print(query)
response = requests.post(url, data = {'query': query})
res = json.loads(response.text)
if print_response == True:
for item in res['results']['bindings']:
print (item['dataset']['value'])
#download files
if download_lipd == True:
for item in res['results']['bindings']:
dataset = (item['dataset_label']['value'])
download_url = 'http://wiki.linked.earth/wiki/index.php/Special:WTLiPD?op=export&lipdid='+dataset
            if download_folder == 'default':
                path = os.getcwd()+'/'
            else:
                path = download_folder if download_folder.endswith('/') else download_folder+'/'
                if not os.path.exists(path):
                    os.makedirs(path)
            wget.download(download_url, path+dataset+'.lpd')
return res
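# A hedged usage sketch (requires network access to wiki.linked.earth; the
# terms below are illustrative query values, not a tested recipe):
#   res = queryLinkedEarth(archiveType=['marine sediment'],
#                          proxyObsType=['Mg/Ca'],
#                          ageUnits=['yr BP'],
#                          ageBound=[3000, 6000],
#                          download_lipd=False)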
def pre_process_list(list_str):
""" Pre-process a series of strings for capitalized letters, space, and punctuation
Parameters
----------
list_str : list
A list of strings from which to strip capitals, spaces, and other characters
Returns
-------
res : list
A list of strings with capitalization, spaces, and punctuation removed
"""
res=[]
for item in list_str:
res.append(pre_process_str(item))
return res
def similar_string(list_str, search):
""" Returns a list of indices for strings with similar values
Parameters
----------
list_str : list
A list of strings
search : str
A keyword search
Returns
-------
indices: list
A list of indices with similar value as the keyword
"""
#exact matches
indices = [i for i, x in enumerate(list_str) if x == search]
    # proximity matches are not implemented yet; only exact matches are returned
return indices
def pre_process_str(word):
"""Pre-process a string for capitalized letters, space, and punctuation
Parameters
----------
    word : str
A string from which to strip capitals, spaces, and other characters
Returns
-------
res : str
A string with capitalization, spaces, and punctuation removed
"""
d=word.replace(" ","").lower()
stopset=list(string.punctuation)
res="".join([i for i in d if i not in stopset])
return res
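# A minimal sketch:
#   pre_process_str('Sea Surface Temperature')  # -> 'seasurfacetemperature'
#   pre_process_list(['Mg/Ca', 'd18O'])         # -> ['mgca', 'd18o']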
"""
Deal with models
"""
def isModel(csvName, lipd):
"""Check for the presence of a model in the same object as the measurement table
Parameters
----------
csvName : string
The name of the csv file corresponding to the measurement table
lipd : dict
A LiPD object
Returns
-------
model : list
List of models already available
    objectName : string
        The name of the paleoData or chronData
        object in which the model(s) are stored
"""
    csvNameSplit = csvName.split('.')
    tableName = None
    for val in csvNameSplit:
        if "chron" in val or "paleo" in val:
            tableName = val
    if tableName is None:
        raise KeyError("Key name should include 'chron' or 'paleo'")
if tableName[0] == 'c':
objectName = 'chron'+tableName.split('chron')[1][0]
dataObject = lipd["chronData"][objectName]
elif tableName[0] == 'p':
objectName = 'paleo'+tableName.split('paleo')[1][0]
dataObject = lipd["paleoData"][objectName]
else:
raise KeyError("Key name should only include 'chron' or 'paleo'")
if "model" in dataObject.keys():
model_list = dataObject["model"]
model = list(model_list.keys())
else:
model=[]
return model, objectName
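# A minimal sketch with a hypothetical LiPD object holding one chron model:
#   lipd = {'chronData': {'chron0': {'model': {'chron0model0': {}}}}}
#   isModel('chron0measurement0.csv', lipd)   # -> (['chron0model0'], 'chron0')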
def modelNumber(model):
"""Assign a new or existing model number
Parameters
----------
model : list
List of possible model number. Obtained from isModel
Returns
-------
modelNum : int
The number of the model
"""
if model:
print("There is " + str(len(model)) + " model(s) already available.")
print("creating a new model...")
modelNum = len(model)
print("Your new model number is "+ str(modelNum))
else:
print("No previous model available. Creating a new model...")
modelNum = 0
print("Your model number is "+ str(modelNum))
return modelNum
"""
Get entire tables
"""
def isMeasurement(csv_dict):
""" Check whether measurement tables are available
Parameters
----------
csv_dict : dict
Dictionary of available csv
Returns
-------
paleoMeasurementTables : list
List of available paleoMeasurementTables
chronMeasurementTables : list
List of available chronMeasurementTables
"""
chronMeasurementTables = []
paleoMeasurementTables =[]
for val in csv_dict.keys():
if "measurement" in val and "chron" in val:
chronMeasurementTables.append(val)
if "measurement" in val and "paleo" in val:
paleoMeasurementTables.append(val)
return chronMeasurementTables, paleoMeasurementTables
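# A minimal sketch with hypothetical csv keys:
#   csvs = {'chron0measurement0.csv': {}, 'paleo0measurement0.csv': {}}
#   isMeasurement(csvs)  # -> (['chron0measurement0.csv'], ['paleo0measurement0.csv'])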
def whichMeasurement(measurementTableList):
"""Select a measurement table from a list
Use in conjunction with the function isMeasurement
Parameters
----------
measurementTableList : list
List of measurement tables contained in the LiPD file. Output from the isMeasurement function
Returns
-------
csvName : string
the name of the csv file
"""
if len(measurementTableList)>1:
print("More than one table is available.")
for idx, val in enumerate(measurementTableList):
print(idx, ": ", val)
csvName = measurementTableList[int(input("Which one would you like to use? "))]
else:
csvName = measurementTableList[0]
return csvName
def getMeasurement(csvName, lipd):
"""Extract the dictionary corresponding to the measurement table
Parameters
----------
csvName : string
The name of the csv file
lipd : dict
The LiPD object from which to extract the data
Returns
-------
ts_list : dict
A dictionary containing data and metadata for each column in the
csv file.
"""
    csvNameSplit = csvName.split('.')
    tableName = None
    for val in csvNameSplit:
        if "chron" in val or "paleo" in val:
            tableName = val
    if tableName is None:
        raise KeyError("Key name should include 'chron' or 'paleo'")
if tableName[0] == 'c':
objectName = 'chron'+tableName.split('chron')[1][0]
ts_list = lipd["chronData"][objectName]["measurementTable"][tableName]["columns"]
elif tableName[0] == 'p':
objectName = 'paleo'+tableName.split('paleo')[1][0]
ts_list = lipd["paleoData"][objectName]["measurementTable"][tableName]["columns"]
else:
raise KeyError("Key name should only include 'chron' or 'paleo'")
return ts_list
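# A minimal sketch with a hypothetical LiPD object; the 'columns' dict
# carries the data and metadata for each variable.
#   lipd = {'paleoData': {'paleo0': {'measurementTable':
#              {'paleo0measurement0': {'columns': {'d18O': {'values': [1.2]}}}}}}}
#   getMeasurement('paleo0measurement0.csv', lipd)  # -> {'d18O': {'values': [1.2]}}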
"""
Deal with ensembles
"""
def isEnsemble(csv_dict):
""" Check whether ensembles are available
Parameters
----------
csv_dict : dict
Dictionary of available csv
Returns
-------
paleoEnsembleTables : list
List of available paleoEnsembleTables
chronEnsembleTables : list
        List of available chronEnsembleTables
"""
chronEnsembleTables =[]
paleoEnsembleTables =[]
for val in csv_dict.keys():
if "ensemble" in val and "chron" in val:
chronEnsembleTables.append(val)
elif "ensemble" in val and "paleo" in val:
paleoEnsembleTables.append(val)
return chronEnsembleTables, paleoEnsembleTables
def whichEnsemble(ensembleTableList):
    """Select an ensemble table from a list
    Use in conjunction with the function isEnsemble
    Parameters
    ----------
    ensembleTableList : list
        List of ensemble tables contained in the LiPD file. Output from the isEnsemble function
Returns
-------
csvName : string
the name of the csv file
"""
if len(ensembleTableList)>1:
print("More than one table is available.")
for idx, val in enumerate(ensembleTableList):
print(idx, ": ", val)
csvName = ensembleTableList[int(input("Which one would you like to use? "))]
else:
csvName = ensembleTableList[0]
return csvName
def getEnsemble(csv_dict, csvName):
    """ Extract the ensemble values and depth vector from the dictionary and
    return them as two numpy arrays.
Parameters
----------
csv_dict : dict
dictionary containing the availableTables
csvName : str
Name of the csv
Returns
-------
depth : array
Vector of depth
ensembleValues : array
The matrix of Ensemble values
"""
ensemble_dict=csv_dict[csvName]
ensembleValues=[]
for val in ensemble_dict.keys():
if 'depth' in val:
depth = ensemble_dict[val]["values"]
else:
ensembleValues.append(ensemble_dict[val]["values"])
ensembleValues= np.transpose(np.array(ensembleValues))
return depth, ensembleValues
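# A minimal sketch with a hypothetical ensemble csv dictionary:
#   csvs = {'chron0ensemble0.csv': {'depth': {'values': [0, 1, 2]},
#                                   'age1':  {'values': [10, 20, 30]},
#                                   'age2':  {'values': [11, 21, 31]}}}
#   depth, ens = getEnsemble(csvs, 'chron0ensemble0.csv')
#   ens.shape   # -> (3, 2): rows are depths, columns are realizations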
def mapAgeEnsembleToPaleoData(ensembleValues, depthEnsemble, depthPaleo):
""" Map the depth for the ensemble age values to the paleo depth
Parameters
----------
ensembleValues : array
A matrix of possible age models. Realizations
should be stored in columns
depthEnsemble : array
A vector of depth. The vector should have the same
length as the number of rows in the ensembleValues
depthPaleo : array
A vector corresponding to the depth at which there
are paleodata information
Returns
-------
ensembleValuesToPaleo : array
A matrix of age ensemble on the PaleoData scale
"""
if len(depthEnsemble)!=np.shape(ensembleValues)[0]:
raise ValueError("Depth and age need to have the same length")
#Make sure that numpy arrays were given
ensembleValues=np.array(ensembleValues)
depthEnsemble=np.array(depthEnsemble)
depthPaleo = np.array(depthPaleo)
#Interpolate
ensembleValuesToPaleo = np.zeros((len(depthPaleo),np.shape(ensembleValues)[1])) #placeholder
for i in np.arange(0,np.shape(ensembleValues)[1]):
ensembleValuesToPaleo[:,i]=np.interp(depthPaleo,depthEnsemble,ensembleValues[:,i])
return ensembleValuesToPaleo
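# A minimal sketch with synthetic arrays: two age realizations on a coarse
# depth grid are interpolated onto finer paleo depths.
#   ens = np.array([[0, 5], [100, 110], [200, 220]])   # 3 depths x 2 realizations
#   mapAgeEnsembleToPaleoData(ens, depthEnsemble=[0, 1, 2],
#                             depthPaleo=[0.5, 1.5])
#   # -> array([[ 50. ,  57.5], [150. , 165. ]])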