import os
import sys
import shlex
from subprocess import Popen, PIPE
import kosh
# File name of the example store used throughout this file.
kosh_example_sql_file = "kosh_example.sql"
# Create a new store (erase if exists)
store = kosh.connect(kosh_example_sql_file, delete_all_contents=True)
from kosh import connect
import os
# connect to store
# This opens the same file again, without delete_all_contents, and rebinds `store`.
# NOTE(review): the handle created above is replaced without an explicit
# store.close() -- confirm that leaving it open is intended.
store = connect(kosh_example_sql_file)
Connection Types¶
You can also select from the following connection_types ['write', 'append', 'read'] to prevent users from modifying the store.
'write' (default): allows the user to add, modify, and delete data
'append': allows the user to only add data but not modify or delete data
'read': allows the user to only read data but not add, modify, or delete data
# write (default)
# store = connect(kosh_example_sql_file)
# append
# store = connect(kosh_example_sql_file, connection_type='append')
# read
# store = connect(kosh_example_sql_file, connection_type='read')
Adding datasets to the store¶
Let's add the first 10 runs.
import glob

try:
    from tqdm.autonotebook import tqdm
except Exception:
    # tqdm only provides a progress bar; `list` is a pass-through stand-in
    # so that `for run in tqdm(...)` still iterates when tqdm is unavailable.
    tqdm = list

# glob.glob() returns its matches in arbitrary order. Sort them so that
# "the first 10 runs" here and the `runs[10:]` slice used further down
# always refer to the same files.
runs = sorted(glob.glob("sample_files/run*hdf5"))
print("we found: {} runs".format(len(runs)))
for run in tqdm(runs[:10]):
    # The dataset name is the file name up to its first dot.
    name = os.path.basename(run).split(".")[0]
    print("DATASET NAME:", name)
    # let's make sure it is unique, in case we run this cell multiple times
    datasets = list(store.find(name=name))
    if not datasets:
        store.create(name)
    else:
        print("we found {} datasets already matching this name".format(len(datasets)))
        print(datasets[0])
we found: 125 runs
/tmp/ipykernel_1755666/1930883886.py:3: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console) from tqdm.autonotebook import tqdm
0%| | 0/10 [00:00<?, ?it/s]
DATASET NAME: run_000 DATASET NAME: run_001 DATASET NAME: run_002 DATASET NAME: run_003 DATASET NAME: run_004 DATASET NAME: run_005 DATASET NAME: run_006 DATASET NAME: run_007 DATASET NAME: run_008 DATASET NAME: run_009
Adding attributes to a dataset¶
For each of these runs let's add metadata
import random
def create_metadata():
    """Build a dictionary of random example attributes for one dataset.

    Returns:
        dict: ``param1`` .. ``param5`` are random floats obtained by scaling
        ``random.random()`` by 2, 1.5, 5, 3 and 2.5 respectively;
        ``param6`` is a single random uppercase ASCII letter; ``project``
        is always ``"Kosh Tutorial"``.
    """
    metadata = {
        "param1": random.random() * 2.,
        "param2": random.random() * 1.5,
        "param3": random.random() * 5,
        "param4": random.random() * 3,
        "param5": random.random() * 2.5,
        # randint() includes BOTH endpoints, so the upper bound has to be
        # ord("Z") (90); an upper bound of 91 would occasionally yield "[".
        "param6": chr(random.randint(ord("A"), ord("Z"))),
    }
    metadata["project"] = "Kosh Tutorial"
    return metadata
pbar = tqdm(runs[:10])
for run in pbar:
    # Same naming rule as when the datasets were created: file name up to
    # the first dot.
    name = os.path.basename(run).split(".")[0]
    # Retrieve dataset via name, using find() like every other query in
    # this file. Indexing with [0] assumes the dataset already exists.
    dataset = list(store.find(name=name))[0]
    # Let's create a few random attributes
    metadata = create_metadata()
    # Attributes are assigned one at a time here; each assignment is a
    # separate edit of the dataset.
    for attribute, value in metadata.items():
        setattr(dataset, attribute, value)
# Show the last dataset that was updated.
print(dataset)
0%| | 0/10 [00:00<?, ?it/s]
KOSH DATASET id: df4cb24c25fc4514a1212949866b76e9 name: run_009 creator: moreno45 --- Attributes --- creator: moreno45 name: run_009 param1: 1.1081383271323024 param2: 1.1352872949668016 param3: 2.6503527789141454 param4: 2.488883773006854 param5: 1.4491023080970376 param6: Y project: Kosh Tutorial --- Associated Data (0)--- --- Ensembles (0)--- [] --- Ensemble Attributes --- --- Alias Feature Dictionary ---
Creating datasets with all the metadata at once.¶
Writing dataset attributes one at a time means accessing the store and editing it every single time. This can be slow.
Let's speed things up by writing all the attributes at once.
We will also turn asynchronous mode on to speed things up further. This means we will only write to the store when the user says so. At that time Kosh will double check that nobody else changed any of these attributes while you were in async mode.
# Turn synchronous mode off: changes are held back until store.sync() below.
store.synchronous(False)
pbar = tqdm(runs[10:])
for run in pbar:
    # The dataset name is the file name up to its first dot.
    name = os.path.basename(run).split(".")[0]
    # No uniqueness lookup is done here (unlike the loop over runs[:10]):
    # each dataset is created directly, with all of its attributes passed
    # in a single call.
    dataset = store.create(name, metadata=create_metadata())
# Show the last dataset that was created.
print(dataset)
# We need to sync the store to ensure it's written to the database
store.sync()
0%| | 0/115 [00:00<?, ?it/s]
KOSH DATASET id: 378dfd7e04b748aabffc9e1269117d77 name: run_124 creator: moreno45 --- Attributes --- creator: moreno45 name: run_124 param1: 1.686446463186504 param2: 0.5321867931776765 param3: 4.720563146590528 param4: 1.1314785287463742 param5: 1.7971765969075726 param6: Z project: Kosh Tutorial --- Associated Data (0)--- --- Ensembles (0)--- [] --- Ensemble Attributes --- --- Alias Feature Dictionary ---
Adding/Modifying/Deleting Dataset attributes¶
# List existing attributes
print(dataset.listattributes())
['creator', 'id', 'name', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project']
# Create a new attribute
dataset.new_attribute = "new"
print(dataset.listattributes())
print(dataset.new_attribute)
['creator', 'id', 'name', 'new_attribute', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project'] new
# modify an attribute
dataset.new_attribute = "changed"
print(dataset.new_attribute)
changed
# Modify/add many attributes at once (less db access, faster)
dataset.update({"new_attribute": "changed_again", "yet_another_new_attribute":"yana"})
print(dataset.listattributes())
print(dataset.new_attribute)
print(dataset.yet_another_new_attribute)
['creator', 'id', 'name', 'new_attribute', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project', 'yet_another_new_attribute'] changed_again yana
# Deleting attributes
# `del` is a statement, so the targets are written without call-style
# parentheses; the remaining attributes are listed afterwards.
del dataset.new_attribute
del dataset.yet_another_new_attribute
print(dataset.listattributes())
['creator', 'id', 'name', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project']
Deleting datasets from the store¶
store.delete(dataset)
Querying the store¶
When querying the store, pass the key/value pairs we want to look for as keyword arguments:
datasets = list(store.find(param6='B', ids_only=True)) # Only their ids (faster)
print(len(datasets))
2
Using sina's query capabilities we can use ranges (more on sina utils here).
from sina.utils import DataRange
datasets = list(store.find(param1=DataRange(min=1.7)))
print(len(datasets))
13
We can also search for datasets having a specific attribute (independently of its type or value).
# First form: pass the attribute name positionally.
datasets = list(store.find('param1'))
# or using sina's tools
from sina.utils import exists
# Second form: keyword argument with sina's exists() marker.
# NOTE: this rebinds `datasets`, so the count printed below comes from this
# second query only; the result of the first query is discarded.
datasets = list(store.find(param1=exists()))
print(len(datasets))
125
Closing the store¶
Once you are done with the store you should close it.
store.close()
Context Managers¶
Kosh store can be opened with a context manager.
with kosh.connect(kosh_example_sql_file) as store:
dataset = store.create()
For example you could use this with a thread pool.
# Import the submodule explicitly: a bare `import concurrent` does not load
# `concurrent.futures`, so `concurrent.futures.ThreadPoolExecutor` would only
# resolve if some other module had already imported it.
import concurrent.futures


def kosh_task(store_name):
    """Open a store read-only and format two attributes of "Dataset1".

    Args:
        store_name: name of the store file to open.

    Returns:
        str: ``attr1`` and ``attr2`` of the first dataset named
        ``"Dataset1"``, separated by a single space.

    Raises:
        IndexError: if no dataset named ``"Dataset1"`` is found.
    """
    with kosh.KoshStore(store_name, read_only=True) as store:
        dataset = list(store.find(name='Dataset1'))[0]
        # Read the attributes while the store is still open.
        return "{} {}".format(dataset.attr1, dataset.attr2)


# Create the store and its single dataset; the context manager is exited
# before the worker thread opens the same file.
with kosh.connect("threads.sql", delete_all_contents=True) as store:
    ds = store.create(name="Dataset1", metadata={"attr1": "1", "attr2": "2"})

# Run the read-only task on a worker thread and print its result.
with concurrent.futures.ThreadPoolExecutor() as pool:
    kosh_thread = pool.submit(kosh_task, "threads.sql")
    print("kosh:", kosh_thread.result())
kosh: 1 2