from __future__ import print_function # For Python 2 compatibility
import kosh
import os
import random
def prep_stores(source_name="my_source_store.sql", dest_name="my_dest_store.sql", data_dir="my_data_dir"):
    """
    Create two fresh Kosh stores and populate the first ("source") store with
    a dataset that has 3 associated files.

    :param source_name: path to the source store's SQL file (recreated from scratch)
    :param dest_name: path to the destination store's SQL file (recreated from scratch)
    :param data_dir: directory where the 3 files to associate are written
    :return: None (side effects only: stores and data files on disk)
    """
    # Start clean: remove any leftover store files from a previous run.
    for store_file in (source_name, dest_name):
        try:
            os.remove(store_file)
        except OSError:
            # File did not exist (or could not be removed) -- nothing to clean.
            pass
    # Let's create a "source" and a "destination" store
    source_store = kosh.connect(source_name, delete_all_contents=True)
    dest_store = kosh.connect(dest_name, delete_all_contents=True)
    # Let's create a dataset we'd like to transfer
    dataset = source_store.create(name="a_dataset", metadata={"int_attr": 1, "float_attr": 2., "str_attr": "string"})
    # let's create some files to associate
    # first a directory (may already exist from a previous run)
    try:
        os.makedirs(data_dir)
    except OSError:
        pass
    filenames = ["a.txt", "b.txt", "c.py"]
    filenames = [os.path.join(data_dir, f) for f in filenames]
    dataset.associate(filenames, "test")
    for filename in filenames:
        with open(filename, "w") as f:
            print("some data", file=f)
            print(random.randint(0, 10000000), file=f)  # to ensure unique SHAs
Transferring Datasets from one store to another without moving the data itself.¶
In this case a simple Python script will suffice, see This Example for more details.
# Prepare both stores (the source gets one dataset with 3 associated files)
prep_stores()
# Open the source store...
my_store = kosh.connect("my_source_store.sql")
# ...and the target store
target_store = kosh.connect("my_dest_store.sql")
# Locate the dataset(s) of interest in the source store
datasets = my_store.find(name="a_dataset")
# Export each dataset from the source and import it into the target;
# only the metadata moves, the associated files stay where they are.
for ds in datasets:
    target_store.import_dataset(ds.export())
# Voila! Sanity check: the dataset is now visible in the target store
transferred = list(target_store.find(name="a_dataset"))
print(transferred[0])
KOSH DATASET id: 5711bbd74ec7465ca105396d6e31fbaf name: a_dataset creator: cdoutrix --- Attributes --- creator: cdoutrix float_attr: 2.0 int_attr: 1 name: a_dataset str_attr: string --- Associated Data (3)--- Mime_type: test my_data_dir/a.txt ( 2b7622d2728b4070afc0835ce7f4b724 ) my_data_dir/b.txt ( 21f0c796ed0742e39439e59a24d152bb ) my_data_dir/c.py ( 45911f91b38f4ab0ba0c7ad429409331 ) --- Ensembles (0)--- [] --- Ensemble Attributes ---
Data needs to be moved or copied.¶
On the same file system¶
If you need to move some files simply use kosh mv
Example: moving file.py to new_named_file.py
kosh mv --stores store1.sql --sources file.py --destination new_named_file.py
usage: kosh mv [-h] --stores STORES [--destination-stores DESTINATION_STORES] --sources SOURCES [SOURCES ...]
[--dataset_record_type DATASET_RECORD_TYPE] [--dataset_matching_attributes DATASET_MATCHING_ATTRIBUTES]
--destination DESTINATION [--version] [--merge_strategy {conservative,preserve,overwrite}] [--mk_dirs]
OR within Python itself
store.mv('file.py', 'new_named_file.py')
You can also copy files to another place and store
kosh cp --stores store1.sql --sources file.py --destination new_named_file.py
usage: kosh cp [-h] --stores STORES [--destination-stores DESTINATION_STORES] --sources SOURCES [SOURCES ...]
[--dataset_record_type DATASET_RECORD_TYPE] [--dataset_matching_attributes DATASET_MATCHING_ATTRIBUTES]
--destination DESTINATION [--version] [--merge_strategy {conservative,preserve,overwrite}] [--mk_dirs]
OR within Python itself
store.cp('file.py', 'new_named_file.py')
Kosh should handle properly directories and patterns (*)
After the fact¶
Oops! You moved the files to a new place but forgot to do so via kosh mv
Fear not! Kosh can probably help you fix your stores
usage: kosh reassociate --stores STORES --new_uris NEW_URIS [NEW_URIS ...] [--original_uris ORIGINAL_URIS [ORIGINAL_URIS ...]]
[--no_absolute_path]
Option 1: just point to the new files¶
kosh reassociate --stores store.sql --new_uris new_named_file.py
Kosh will compute the "short sha" on the target(s) and try to find a match.
The new_uris can be a directory or pattern
Option 2: Using the old name¶
kosh reassociate --stores store.sql --new_uris new_named_file.py --original_uris file.py
Option 3: I know the fast sha¶
kosh reassociate --stores store.sql --new_uris new_named_file.py --original_uris c6a15fa59ae2d070a88a6a96503543d4baeb8f381f247854ef04adb67f79d818
Moving files across filesystem (remote host)¶
Here we assume that we need to bring data from a remote machine
Because Kosh will need to do a LOT of talking with the remote host it is preferable to set up an ssh agent so you do not need to type your password over and over
Please see this guide to set up your keys and agent properly
# Prompt for the ssh passphrase and start an ssh agent so the many ssh
# connections Kosh makes do not each ask for the password again.
import getpass
password = getpass.getpass() + "\n"
from subprocess import Popen, PIPE
import shlex
agent = Popen("ssh-agent", stdin=PIPE, stdout=PIPE, stderr=PIPE)
agent_out, agent_err = agent.communicate()
# ssh-agent prints lines such as "SSH_AUTH_SOCK=/tmp/...; export SSH_AUTH_SOCK;"
# export each VARIABLE=value pair into our own environment.
for line in agent_out.decode().split("\n"):
    pieces = line.split("=")
    if len(pieces) > 1:
        os.environ[pieces[0]] = pieces[1].split(";")[0]
# Hand the passphrase to ssh-add so the key is unlocked for the agent.
add = Popen("ssh-add", stdin=PIPE, stdout=PIPE, stderr=PIPE)
add.communicate(password.encode())
(b'', b'Enter passphrase for /g/g19/cdoutrix/.ssh/id_rsa: Identity added: /g/g19/cdoutrix/.ssh/id_rsa (/g/g19/cdoutrix/.ssh/id_rsa)\n')
# Let's prepare our data (recreates both stores -- helper defined above)
prep_stores()
# Source store: holds the dataset and its associated files
my_store = kosh.connect("my_source_store.sql")
# Destination store: will receive the dataset once the files are copied
target_store = kosh.connect("my_dest_store.sql")
Now let's fake our "remote host"
import socket
# Use our own user and hostname so the local machine stands in for the
# "remote host" in this example (real use: a genuinely remote user@host).
user = getpass.getuser()
hostname = socket.gethostname()
Ok all we need to do is to copy the data from the remote host to a new local directory
# Let's cleanup first: start from an empty destination directory.
import shutil
# ignore_errors swallows the "directory does not exist" case, with the
# same effect as the previous try/except around rmtree.
shutil.rmtree("my_new_data_dir", ignore_errors=True)
os.makedirs("my_new_data_dir")
Let's build the command line to copy the data over
import sys
# Build the "kosh cp" command line: pull my_data_dir from the (fake)
# remote host into my_new_data_dir, updating my_dest_store.sql as well.
cmd = (
    "{}/bin/kosh cp --stores my_source_store.sql"
    " --destination_stores my_dest_store.sql"
    " --sources {}@{}:{}/my_data_dir"
    " --destination my_new_data_dir".format(sys.prefix, user, hostname, os.getcwd())
)
print("We will be executing:\n{}".format(cmd))
We will be executing: /g/g19/cdoutrix/miniconda3/envs/kosh/bin/kosh cp --stores my_source_store.sql --destination_stores my_dest_store.sql --sources cdoutrix@pascal83:/g/g19/cdoutrix/git/kosh/examples/my_data_dir --destination my_new_data_dir
# Run the copy; all of the ssh traffic happens inside this call.
p = Popen(shlex.split(cmd), stdin=PIPE, stdout=PIPE, stderr=PIPE)
o, e = p.communicate()
# Now let's check our second store (on the remote) contains data
remote_store = kosh.connect("my_dest_store.sql")
# find() with no filter -- presumably lists every dataset in the store
print(list(remote_store.find()))
[]
Moving files across disconnected filesystems¶
Let's assume you have a LOT of data, you need to move it to another computer but you have a VERY slow connection to the other computer.
Using scp/rsync will take months and you can't wait.
Kosh solution at this point is to tar
(or htar
) your data on the original machine, manually transfer the data to the other machine (USB stick, DVD, etc...) and run tar again on the other end
Kosh will look for the datasets referencing the files you're tarring and add them to the tarball.
When extracting, Kosh will add these datasets (with the new local paths) to your destination store.
The syntax is the same as your regular tar
/htar
(you can pass any command accepted by tar
/htar
) except you need to point to the kosh store and the tarball name must be specified via -f
Example:
kosh tar cv --stores store1.sql store2.sql -f my_big_tar.tgz *.hdf5
Once on the destination machine you can do:
kosh tar xv --stores destination_store.sql -f my_big_tar.tgz
Your files are untarred and the dataset originally in store1 and store2 that pointed to these files are now in destination_store
Cleaning up store for dead files¶
Sometimes files are gone either because someone else removed them or a disk failed, etc...
Whatever the reason Kosh stores and datasets have capability to self clean.
kosh cleanup_files -s 'cmd_line.sql' # cleans up all non existing uri
Beware: if you have URIs pointing to non-file sources they would be dissociated; it is recommended to use a filter on the datasets' sources, e.g:
kosh cleanup_files -s 'cmd_line.sql' mime_type='hdf5' # cleans up all non existing uri pointing to hdf5 mime_type sources
You can also accomplish the same thing from Python
# Grab the first dataset in the source store
dataset = next(iter(my_store.find()))
# Let's delete one of the files on disk to simulate a "dead" uri
associated = next(dataset.find())
print("Removing {}".format(associated.uri))
os.remove(associated.uri)
# Dry run: report what would be dissociated without touching the store
print(dataset.cleanup_files(dry_run=True))
print(dataset)
# Real run: actually dissociate the now-missing file
print(dataset.cleanup_files())
Removing /g/g19/cdoutrix/git/kosh/examples/my_data_dir/a.txt ['/g/g19/cdoutrix/git/kosh/examples/my_data_dir/a.txt'] KOSH DATASET id: edfd986619a640ea8d6e48be5f303a9b name: a_dataset creator: cdoutrix --- Attributes --- creator: cdoutrix float_attr: 2.0 int_attr: 1 name: a_dataset str_attr: string --- Associated Data (3)--- Mime_type: test /g/g19/cdoutrix/git/kosh/examples/my_data_dir/a.txt ( 622c1b84b9af49f18db6651019c9523d ) /g/g19/cdoutrix/git/kosh/examples/my_data_dir/b.txt ( 66af741682fd4ea1959a24e9db9d70db ) /g/g19/cdoutrix/git/kosh/examples/my_data_dir/c.py ( a7d5169831ea4f39bcf1b7f7972a3b18 ) --- Ensembles (0)--- [] --- Ensemble Attributes --- ['/g/g19/cdoutrix/git/kosh/examples/my_data_dir/a.txt']