import os
import sys
import shlex
from subprocess import Popen, PIPE
import kosh
# Path of the SQL file backing the example store used throughout this file
kosh_example_sql_file = "kosh_example.sql"
# Create a new store (erase if exists): delete_all_contents=True wipes any
# previous contents of that file
store = kosh.connect(kosh_example_sql_file, delete_all_contents=True)
from kosh import connect
import os
# Connect to the same store file again, this time without
# delete_all_contents; `store` is rebound to this new connection
store = connect(kosh_example_sql_file)
Adding datasets to the store¶
Let's add the first 10 runs.
import glob
# tqdm only provides a progress bar. If it cannot be imported, fall back to
# `list`, which takes the same iterable argument and can be looped over.
try:
    from tqdm.autonotebook import tqdm
except Exception:
    tqdm = list

# glob.glob() returns matches in arbitrary order; sort so that "the first 10
# runs" (and the remaining runs[10:]) are the same files on every execution.
runs = sorted(glob.glob("sample_files/run*hdf5"))
print("we found: {} runs".format(len(runs)))
for run in tqdm(runs[:10]):
    # The dataset name is the file name up to its first "."
    name = os.path.basename(run).split(".")[0]
    print("DATASET NAME:", name)
    # let's make sure it is unique, in case we run this cell multiple times
    datasets = list(store.find(name=name))
    if not datasets:
        store.create(name)
    else:
        print("we found {} datasets already matching this name".format(len(datasets)))
        print(datasets[0])
we found: 125 runs DATASET NAME: run_000 DATASET NAME: run_001 DATASET NAME: run_002 DATASET NAME: run_003 DATASET NAME: run_004 DATASET NAME: run_005 DATASET NAME: run_006 DATASET NAME: run_007 DATASET NAME: run_008 DATASET NAME: run_009
Adding attributes to a dataset¶
For each of these runs let's add metadata
import random
def create_metadata():
    """Build a dictionary of random example attributes for one dataset.

    Returns:
        dict: ``param1`` to ``param5`` are random floats scaled by 2, 1.5,
        5, 3 and 2.5 respectively, ``param6`` is a random uppercase ASCII
        letter and ``project`` is always ``"Kosh Tutorial"``.
    """
    metadata = {
        "param1": random.random() * 2.,
        "param2": random.random() * 1.5,
        "param3": random.random() * 5,
        "param4": random.random() * 3,
        "param5": random.random() * 2.5,
        # randint() includes both endpoints, so the upper bound must be "Z"
        # (90); a bound of 91 would occasionally produce "[".
        "param6": chr(random.randint(ord("A"), ord("Z"))),
        "project": "Kosh Tutorial",
    }
    return metadata
pbar = tqdm(runs[:10])
for run in pbar:
    # Same naming rule as at creation time: file name up to its first "."
    name = os.path.basename(run).split(".")[0]
    # Retrieve dataset via name (first match), using `find` like every other
    # query in this file
    dataset = list(store.find(name=name))[0]
    # Let's create a few random attributes
    metadata = create_metadata()
    # Attributes are assigned one at a time here, i.e. one setattr call per
    # attribute on the dataset object
    for attribute, value in metadata.items():
        setattr(dataset, attribute, value)
# Show the last dataset that was updated
print(dataset)
KOSH DATASET id: e64b14fbd2a942b3b7407d91a46b2bfe name: run_009 creator: cdoutrix --- Attributes --- creator: cdoutrix name: run_009 param1: 0.6871634189142024 param2: 0.8973919877284344 param3: 0.2798480016106958 param4: 1.7072707195701757 param5: 0.8422236756200085 param6: B project: Kosh Tutorial --- Associated Data (0)--- --- Ensembles (0)--- [] --- Ensemble Attributes ---
Creating datasets with all the metadata at once.¶
Writing dataset attributes one at a time means accessing and editing the store every single time. This can be slow.
Let's speed things up by writing all the attributes at once.
We will also turn asynchronous mode on to speed things up further. This means we will only write to the store when the user says so. At that time Kosh will double-check that nobody else changed any of these attributes while you were in async mode.
# Switch the store to asynchronous mode; the explicit store.sync() below is
# what commits the work done in this loop.
store.synchronous(False)
pbar = tqdm(runs[10:])
for run in pbar:
    name = os.path.basename(run).split(".")[0]
    # No per-name uniqueness lookup is done in this loop: every remaining run
    # gets a new dataset, created with all of its metadata in a single call.
    metadata = create_metadata()
    dataset = store.create(name, metadata=metadata)
# Show the last dataset that was created
print(dataset)
# We need to sync the store to ensure it's written to the database
store.sync()
KOSH DATASET id: b155e7995e35403d94ce56bd9faa85a3 name: run_106 creator: cdoutrix --- Attributes --- creator: cdoutrix name: run_106 param1: 1.322292257762755 param2: 1.1538768595778675 param3: 0.469006113943895 param4: 1.4952803942290576 param5: 0.6610797559461854 param6: Z project: Kosh Tutorial --- Associated Data (0)--- --- Ensembles (0)--- [] --- Ensemble Attributes ---
Adding/Modifying/Deleting Dataset attributes¶
# List the names of the attributes currently set on the dataset
print(dataset.listattributes())
['creator', 'id', 'name', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project']
# Create a new attribute by plain assignment, then check that it is listed
# and readable
dataset.new_attribute = "new"
print(dataset.listattributes())
print(dataset.new_attribute)
['creator', 'id', 'name', 'new_attribute', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project'] new
# Modify an existing attribute by assigning to it again
dataset.new_attribute = "changed"
print(dataset.new_attribute)
changed
# Modify/add many attributes at once (less db access, faster):
# "new_attribute" already exists and is overwritten, while
# "yet_another_new_attribute" is created by the same call
dataset.update({"new_attribute": "changed_again", "yet_another_new_attribute":"yana"})
print(dataset.listattributes())
print(dataset.new_attribute)
print(dataset.yet_another_new_attribute)
['creator', 'id', 'name', 'new_attribute', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project', 'yet_another_new_attribute'] changed_again yana
# Deleting attributes: `del` is a statement, so no call-style parentheses
del dataset.new_attribute
del dataset.yet_another_new_attribute
# Both names should no longer appear in the listing
print(dataset.listattributes())
['creator', 'id', 'name', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project']
Deleting datasets from the store¶
# Remove the dataset from the store
store.delete(dataset)
Querying the store¶
When querying the store, use keyword arguments to specify the key/value pairs we want to look for:
# Find the datasets whose param6 attribute equals 'B'
datasets = list(store.find(param6='B', ids_only=True)) # Only their ids (faster)
print(len(datasets))
6
Using sina's query capabilities we can use ranges
(more on sina utils here).
from sina.utils import DataRange
# Find the datasets whose param1 lies in a range that starts at 1.7 and has
# no upper bound specified
datasets = list(store.find(param1=DataRange(min=1.7)))
print(len(datasets))
25
We can also search for datasets having a specific attribute (independently of its type or value).
# Passing the attribute name positionally looks for datasets that have a
# `param1` attribute, whatever its type or value
datasets = list(store.find('param1'))
# or using sina's tools
from sina.utils import exists
datasets = list(store.find(param1=exists()))
print(len(datasets))
125
Closing the store¶
Once you are done with the store you should close it.
# Close the store now that we are done with it
store.close()
Context Managers¶
A Kosh store can be opened with a context manager.
# Use the store only for the duration of the `with` block, creating one
# dataset with no explicit name or metadata
with kosh.connect(kosh_example_sql_file) as store:
    dataset = store.create()
For example you could use this with a thread pool.
import concurrent
def kosh_task(store_name):
    """Read the dataset named 'Dataset1' from a store and format two attributes.

    Args:
        store_name: Path of the store file; it is opened with
            ``read_only=True``.

    Returns:
        str: The ``attr1`` and ``attr2`` values of the first matching
        dataset, separated by a single space.
    """
    with kosh.KoshStore(store_name, read_only=True) as ro_store:
        matches = list(ro_store.find(name='Dataset1'))
        first = matches[0]
        # Attributes are read while the store is still open
        return "{} {}".format(first.attr1, first.attr2)
# `import concurrent` alone does not load the `futures` submodule, so import
# it explicitly instead of relying on another module having done so.
import concurrent.futures

# Create the store and the dataset that the worker task looks up
with kosh.connect("threads.sql", delete_all_contents=True) as store:
    ds = store.create(name="Dataset1", metadata={"attr1": "1", "attr2": "2"})

# Run the read-only task in a worker thread; leaving the `with` block waits
# for submitted work to finish.
with concurrent.futures.ThreadPoolExecutor() as pool:
    kosh_thread = pool.submit(kosh_task, "threads.sql")
print("kosh:", kosh_thread.result())
kosh: 1 2