Connecting to a Store and Adding Datasets¶

In this Notebook we create a new store and add a few datasets to it.

Connect to store (using sina local file)¶

First, let's create an empty database (with you as the single user).

In a real application, only an admin user should have write permission to the file.

In [1]:

Copied!





import os
import sys
import shlex
from subprocess import Popen, PIPE
import kosh

kosh_example_sql_file = "kosh_example.sql"

# Create a new store (erase if exists)
store = kosh.connect(kosh_example_sql_file, delete_all_contents=True)
import os
import sys
import shlex
from subprocess import Popen, PIPE
import kosh

kosh_example_sql_file = "kosh_example.sql"

# Create a new store (erase if exists)
store = kosh.connect(kosh_example_sql_file, delete_all_contents=True)

In [2]:

Copied!

from  kosh import connect
import os

# connect to store
store = connect(kosh_example_sql_file)
from  kosh import connect
import os

# connect to store
store = connect(kosh_example_sql_file)

Connection Types¶

You can also select from the following connection_types ['write', 'append', 'read'] to prevent users from modifying the store.

'write' (default): allows the user to add, modify, and delete data
'append': allows the user to only add data but not modify or delete data
'read': allows the user to only read data but not add, modify, or delete data

In [3]:

Copied!





# write (default)
# store = connect(kosh_example_sql_file)

# append
# store = connect(kosh_example_sql_file, connection_type='append')

# read
# store = connect(kosh_example_sql_file, connection_type='read')
# write (default)
# store = connect(kosh_example_sql_file)

# append
# store = connect(kosh_example_sql_file, connection_type='append')

# read
# store = connect(kosh_example_sql_file, connection_type='read')

Adding datasets to the store¶

Let's add the first 10 runs.

In [4]:

Copied!





import glob
try:
    from tqdm.autonotebook import tqdm
except Exception:
    tqdm = list

# glob.glob() returns matches in arbitrary order, so sort them: "the first 10
# runs" (and the runs[:10] / runs[10:] split used later) is then the same on
# every machine and on every rerun.
runs = sorted(glob.glob("sample_files/run*hdf5"))
print("we found: {} runs".format(len(runs)))

for run in tqdm(runs[:10]):
    # The dataset name is the file's base name up to its first "."
    name = os.path.basename(run).split(".")[0]
    print("DATASET NAME:", name)
    # let's make sure it is unique, in case we run this cell multiple times
    datasets = list(store.find(name=name))
    if not datasets:
        store.create(name)
    else:
        print("we found {} datasets already matching this name".format(len(datasets)))
        print(datasets[0])
import glob
try:
    from tqdm.autonotebook import tqdm
except Exception:
    tqdm = list

# glob.glob() returns matches in arbitrary order, so sort them: "the first 10
# runs" (and the runs[:10] / runs[10:] split used later) is then the same on
# every machine and on every rerun.
runs = sorted(glob.glob("sample_files/run*hdf5"))
print("we found: {} runs".format(len(runs)))

for run in tqdm(runs[:10]):
    # The dataset name is the file's base name up to its first "."
    name = os.path.basename(run).split(".")[0]
    print("DATASET NAME:", name)
    # let's make sure it is unique, in case we run this cell multiple times
    datasets = list(store.find(name=name))
    if not datasets:
        store.create(name)
    else:
        print("we found {} datasets already matching this name".format(len(datasets)))
        print(datasets[0])

we found: 125 runs

/tmp/ipykernel_1755666/1930883886.py:3: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
  from tqdm.autonotebook import tqdm

  0%|          | 0/10 [00:00<?, ?it/s]

DATASET NAME: run_000
DATASET NAME: run_001
DATASET NAME: run_002
DATASET NAME: run_003
DATASET NAME: run_004
DATASET NAME: run_005
DATASET NAME: run_006
DATASET NAME: run_007
DATASET NAME: run_008
DATASET NAME: run_009

Adding attributes to a dataset¶

For each of these runs let's add metadata

In [5]:

Copied!





import random

def create_metadata():
    """Build a dictionary of random example attributes for one dataset.

    Returns:
        dict: ``param1`` .. ``param5`` are random floats (``random.random()``
        scaled by 2, 1.5, 5, 3 and 2.5 respectively), ``param6`` is a single
        random uppercase letter ``'A'``..``'Z'``, and ``project`` is always
        ``"Kosh Tutorial"``.
    """
    metadata = {"param1": random.random() * 2.,
                "param2": random.random() * 1.5,
                "param3": random.random() * 5,
                "param4": random.random() * 3,
                "param5": random.random() * 2.5,
                # randint() includes BOTH endpoints, so stop at "Z" (90);
                # an upper bound of 91 would occasionally yield "[".
                "param6": chr(random.randint(ord("A"), ord("Z"))),
               }
    metadata["project"] = "Kosh Tutorial"
    return metadata

# Attach freshly generated metadata to each of the first 10 runs.
pbar = tqdm(runs[:10])
for run in pbar:
    name = os.path.basename(run).split(".")[0]
    # Retrieve dataset via name (store.find, as in the other cells)
    dataset = list(store.find(name=name))[0]
    # Let's create a few random attributes
    metadata = create_metadata()
    # One assignment per attribute: each one is a separate edit of the dataset
    for attribute, value in metadata.items():
        setattr(dataset, attribute, value)
# Show the last dataset that was updated
print(dataset)
import random

def create_metadata():
    """Build a dictionary of random example attributes for one dataset.

    Returns:
        dict: ``param1`` .. ``param5`` are random floats (``random.random()``
        scaled by 2, 1.5, 5, 3 and 2.5 respectively), ``param6`` is a single
        random uppercase letter ``'A'``..``'Z'``, and ``project`` is always
        ``"Kosh Tutorial"``.
    """
    metadata = {"param1": random.random() * 2.,
                "param2": random.random() * 1.5,
                "param3": random.random() * 5,
                "param4": random.random() * 3,
                "param5": random.random() * 2.5,
                # randint() includes BOTH endpoints, so stop at "Z" (90);
                # an upper bound of 91 would occasionally yield "[".
                "param6": chr(random.randint(ord("A"), ord("Z"))),
               }
    metadata["project"] = "Kosh Tutorial"
    return metadata

# Attach freshly generated metadata to each of the first 10 runs.
pbar = tqdm(runs[:10])
for run in pbar:
    name = os.path.basename(run).split(".")[0]
    # Retrieve dataset via name (store.find, as in the other cells)
    dataset = list(store.find(name=name))[0]
    # Let's create a few random attributes
    metadata = create_metadata()
    # One assignment per attribute: each one is a separate edit of the dataset
    for attribute, value in metadata.items():
        setattr(dataset, attribute, value)
# Show the last dataset that was updated
print(dataset)

  0%|          | 0/10 [00:00<?, ?it/s]

KOSH DATASET
	id: df4cb24c25fc4514a1212949866b76e9
	name: run_009
	creator: moreno45

--- Attributes ---
	creator: moreno45
	name: run_009
	param1: 1.1081383271323024
	param2: 1.1352872949668016
	param3: 2.6503527789141454
	param4: 2.488883773006854
	param5: 1.4491023080970376
	param6: Y
	project: Kosh Tutorial
--- Associated Data (0)---
--- Ensembles (0)---
	[]
--- Ensemble Attributes ---
--- Alias Feature Dictionary ---

Creating datasets with all the metadata at once.¶

Writing dataset attributes one at a time means accessing and editing the store every single time. This can be slow.

Let's speed things up by writing all the attributes at once.

We will also turn asynchronous mode on to speed things up further. This means we will only write to the store when the user says so. At that time Kosh will double-check that nobody else changed any of these attributes while you were in async mode.

In [6]:

Copied!





# Switch to asynchronous mode: nothing is written to the store until sync()
store.synchronous(False)
pbar = tqdm(runs[10:])
for run in pbar:
    name = os.path.basename(run).split(".")[0]
    # No uniqueness check here (unlike for the first 10 runs): every
    # remaining run gets a new dataset, created with all its metadata at once.
    metadata = create_metadata()
    dataset = store.create(name, metadata=metadata)
# Show the last dataset that was created
print(dataset)
# We need to sync the store to ensure it's written to the database
store.sync()
# Switch to asynchronous mode: nothing is written to the store until sync()
store.synchronous(False)
pbar = tqdm(runs[10:])
for run in pbar:
    name = os.path.basename(run).split(".")[0]
    # No uniqueness check here (unlike for the first 10 runs): every
    # remaining run gets a new dataset, created with all its metadata at once.
    metadata = create_metadata()
    dataset = store.create(name, metadata=metadata)
# Show the last dataset that was created
print(dataset)
# We need to sync the store to ensure it's written to the database
store.sync()

  0%|          | 0/115 [00:00<?, ?it/s]

KOSH DATASET
	id: 378dfd7e04b748aabffc9e1269117d77
	name: run_124
	creator: moreno45

--- Attributes ---
	creator: moreno45
	name: run_124
	param1: 1.686446463186504
	param2: 0.5321867931776765
	param3: 4.720563146590528
	param4: 1.1314785287463742
	param5: 1.7971765969075726
	param6: Z
	project: Kosh Tutorial
--- Associated Data (0)---
--- Ensembles (0)---
	[]
--- Ensemble Attributes ---
--- Alias Feature Dictionary ---

Adding/Modifying/Deleting Dataset attributes¶

In [7]:

Copied!

# List existing attributes
print(dataset.listattributes())
# List existing attributes
print(dataset.listattributes())

['creator', 'id', 'name', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project']

In [8]:

Copied!





# Create a new attribute
dataset.new_attribute = "new"
print(dataset.listattributes())
print(dataset.new_attribute)
# Create a new attribute
dataset.new_attribute = "new"
print(dataset.listattributes())
print(dataset.new_attribute)

['creator', 'id', 'name', 'new_attribute', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project']
new

In [9]:

Copied!

# modify an attribute
dataset.new_attribute = "changed"
print(dataset.new_attribute)
# modify an attribute
dataset.new_attribute = "changed"
print(dataset.new_attribute)

changed

In [10]:

Copied!





# Modify/add many attributes at once (less db access, faster)
dataset.update({"new_attribute": "changed_again", "yet_another_new_attribute":"yana"})
print(dataset.listattributes())
print(dataset.new_attribute)
print(dataset.yet_another_new_attribute)
# Modify/add many attributes at once (less db access, faster)
dataset.update({"new_attribute": "changed_again", "yet_another_new_attribute":"yana"})
print(dataset.listattributes())
print(dataset.new_attribute)
print(dataset.yet_another_new_attribute)

['creator', 'id', 'name', 'new_attribute', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project', 'yet_another_new_attribute']
changed_again
yana

In [11]:

Copied!





# Deleting attributes
del(dataset.new_attribute)
del(dataset.yet_another_new_attribute)
print(dataset.listattributes())
# Deleting attributes
del(dataset.new_attribute)
del(dataset.yet_another_new_attribute)
print(dataset.listattributes())

['creator', 'id', 'name', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'project']

Deleting datasets from the store¶

In [12]:

Copied!

store.delete(dataset)
store.delete(dataset)

Querying the store¶

When querying the store, pass the key/value pairs we want to look for as keyword arguments:

In [13]:

Copied!

datasets = list(store.find(param6='B', ids_only=True))  # Only their ids (faster)
print(len(datasets))
datasets = list(store.find(param6='B', ids_only=True))  # Only their ids (faster)
print(len(datasets))

Using sina's query capabilities we can use ranges (more on sina utils here).

In [14]:

Copied!

from sina.utils import DataRange
datasets = list(store.find(param1=DataRange(min=1.7)))
print(len(datasets))
from sina.utils import DataRange
datasets = list(store.find(param1=DataRange(min=1.7)))
print(len(datasets))

We can also search for datasets having a specific attribute (independently of its type or value).

In [15]:

Copied!





datasets = list(store.find('param1'))
# or using sina's tools
from sina.utils import exists
datasets = list(store.find(param1=exists()))
print(len(datasets))
datasets = list(store.find('param1'))
# or using sina's tools
from sina.utils import exists
datasets = list(store.find(param1=exists()))
print(len(datasets))

Closing the store¶

Once you are done with the store you should close it.

In [16]:

Copied!

store.close()
store.close()

Context Managers¶

A Kosh store can be opened with a context manager.

In [17]:

Copied!

with kosh.connect(kosh_example_sql_file) as store:
    dataset = store.create()
with kosh.connect(kosh_example_sql_file) as store:
    dataset = store.create()

For example you could use this with a thread pool.

In [18]:

Copied!





# "import concurrent" alone does not load the "futures" submodule; import it
# explicitly so concurrent.futures.ThreadPoolExecutor is always available.
import concurrent.futures


def kosh_task(store_name):
    """Look up the dataset named 'Dataset1' and return its two attributes.

    The store is opened with ``read_only=True`` inside the task itself, so the
    task only needs the store's file name.

    Args:
        store_name: path of the Kosh store file to open.

    Returns:
        str: ``"<attr1> <attr2>"`` of the first dataset named ``'Dataset1'``.

    Raises:
        IndexError: if no dataset named ``'Dataset1'`` is found.
    """
    with kosh.KoshStore(store_name, read_only=True) as store:
        dataset = list(store.find(name='Dataset1'))[0]
        # Read the attributes while the store is still open
        return "{} {}".format(dataset.attr1, dataset.attr2)


# Create a fresh store containing the single dataset the task looks for
with kosh.connect("threads.sql", delete_all_contents=True) as store:
    ds = store.create(name="Dataset1", metadata={"attr1": "1", "attr2": "2"})

# Run the task in a worker thread and print its result
with concurrent.futures.ThreadPoolExecutor() as pool:
    kosh_thread = pool.submit(kosh_task, "threads.sql")
    print("kosh:", kosh_thread.result())
# "import concurrent" alone does not load the "futures" submodule; import it
# explicitly so concurrent.futures.ThreadPoolExecutor is always available.
import concurrent.futures


def kosh_task(store_name):
    """Look up the dataset named 'Dataset1' and return its two attributes.

    The store is opened with ``read_only=True`` inside the task itself, so the
    task only needs the store's file name.

    Args:
        store_name: path of the Kosh store file to open.

    Returns:
        str: ``"<attr1> <attr2>"`` of the first dataset named ``'Dataset1'``.

    Raises:
        IndexError: if no dataset named ``'Dataset1'`` is found.
    """
    with kosh.KoshStore(store_name, read_only=True) as store:
        dataset = list(store.find(name='Dataset1'))[0]
        # Read the attributes while the store is still open
        return "{} {}".format(dataset.attr1, dataset.attr2)


# Create a fresh store containing the single dataset the task looks for
with kosh.connect("threads.sql", delete_all_contents=True) as store:
    ds = store.create(name="Dataset1", metadata={"attr1": "1", "attr2": "2"})

# Run the task in a worker thread and print its result
with concurrent.futures.ThreadPoolExecutor() as pool:
    kosh_thread = pool.submit(kosh_task, "threads.sql")
    print("kosh:", kosh_thread.result())

kosh: 1 2