Ensemble

`KoshEnsemble`

Bases: KoshDataset

Source code in kosh/ensemble.py

class KoshEnsemble(KoshDataset):
    def __init__(self, id, store, schema=None, record=None):
        """Kosh Ensemble

        An ensemble allows linking together many datasets.
        These datasets will inherit attributes and associated sources from the ensemble.

        :param id: dataset's unique Id
        :type id: str
        :param store: store containing the dataset
        :type store: KoshSinaStore
        :param schema: Kosh schema validator
        :type schema: KoshSchema
        :param record: to avoid looking up in sina pass sina record
        :type record: Record
        """
        super(KoshEnsemble, self).__init__(id, store,
                                           schema=schema, record=record,
                                           kosh_type=store._ensembles_type)
        # Both lists are written straight into the instance __dict__
        # (presumably to bypass the parent's attribute-setting logic; TODO confirm)
        self.__dict__["__protected__"] = ["__name__", "__creator__", "__store__",
                                          "_associated_data_", "__features__",
                                          "_associated_datasets_", "__ok_duplicates__"]
        # Attributes that the members can have on their own
        self.__dict__["__ok_duplicates__"] = ["creator", "id", "name"]

    def __str__(self):
        """string representation

        Starts from the dataset representation, relabels it as an ensemble,
        drops everything from the "--- Ensembles" section onward and appends
        the list of member datasets.

        :return: printable description of this ensemble
        :rtype: str
        """
        st = super(KoshEnsemble, self).__str__()
        st = st.replace("KOSH DATASET", "KOSH ENSEMBLE")
        cut = st.find("--- Ensembles")
        # str.find returns -1 when the section is absent; slicing with that
        # value would silently chop the last two characters of the text
        if cut != -1:
            st = st[:max(cut - 1, 0)]
        if self._associated_datasets_ is not None:
            st += "\n--- Member Datasets ({})---\n".format(
                len(self._associated_datasets_))
            st += "\t{}".format(self._associated_datasets_)
        return st

    def cleanup_files(self, dry_run=False, interactive=False, **search_keys):
        """Cleanup the ensemble and its members from references to dead files.
        You can filter associated objects by passing key=values
        e.g mime_type=hdf5 will only dissociate non-existing files associated with mime_type hdf5
        some_att=some_val will only dissociate non-existing files associated and having the attribute
        'some_att' with value of 'some_val'
        returns list of uris to be removed.

        :param dry_run: Only does a dry_run
        :type dry_run: bool
        :param interactive: interactive mode, ask before dissociating
        :type interactive: bool
        :returns: list of uris (to be) removed.
        :rtype: list
        """
        # First the files associated with the ensemble itself...
        missings = super(
            KoshEnsemble,
            self).cleanup_files(
            dry_run=dry_run,
            interactive=interactive,
            **search_keys)
        # ... then the ones associated with each member, using the same filters
        for dataset in self.get_members():
            missings += dataset.cleanup_files(dry_run=dry_run,
                                              interactive=interactive, **search_keys)
        return missings

    def export(self, file=None):
        """Exports this ensemble, its member datasets and their membership relationships

        :param file: export datasets to a file
        :type file: None or str
        :return: dictionary with the kosh version, the sources type,
                 the exported records and the relationships
        :rtype: dict"""
        # The ensemble's own record comes first, followed by one record per member
        records = [cleanup_sina_record_from_kosh_sync(self.get_record()), ]
        for dataset_id in self.get_members(ids_only=True):
            records.append(
                cleanup_sina_record_from_kosh_sync(
                    self.__store__.get_record(dataset_id)))
        # We also need to export the relationships
        rels = self.get_sina_store().relationships.find(
            None, self.__store__._ensemble_predicate, self.id)
        relationships = [rel.to_json() for rel in rels]
        output_dict = {
            "minimum_kosh_version": None,
            "kosh_version": kosh.version(comparable=True),
            "sources_type": self.__store__._sources_type,
            "records": records,
            "relationships": relationships
        }

        update_json_file_with_records_and_relationships(file, output_dict)
        return output_dict

    def create(self, name="Unnamed Dataset", id=None,
               metadata={}, schema=None, sina_type=None, **kargs):
        """create a new (possibly named) dataset as a member of this ensemble.

        :param name: name for the dataset, defaults to 'Unnamed Dataset'
        :type name: str, optional
        :param id: unique Id, defaults to None which means use uuid4()
        :type id: str, optional
        :param metadata: dictionary of attribute/value pair for the dataset, defaults to {}
        :type metadata: dict, optional
        :param schema: a KoshSchema object to validate datasets and when setting attributes
        :type schema: KoshSchema
        :param sina_type: If you want to query the store for a specific sina record type, not just a dataset
        :type sina_type: str
        :param kargs: extra keyword arguments, forwarded to the store's create function
        :type kargs: dict
        :raises ValueError: sina_type is the ensemble type, or metadata redefines
                            an attribute of this ensemble with a different value
        :raises RuntimeError: Dataset already exists
        :return: KoshDataset
        :rtype: KoshDataset
        """
        if sina_type == self.__store__._ensembles_type:
            raise ValueError("You cannot create an ensemble from an ensemble")

        # NOTE: the shared default `metadata` dict is only read and forwarded here
        attributes = self.list_attributes()
        for key in metadata:
            if key in attributes:
                if metadata[key] != getattr(self, key):
                    raise ValueError(
                        "'{}' is an attribute of this ensemble and "
                        "therefore cannot be an attribute of its descendants".format(key))
                else:
                    warnings.warn(
                        "'{}' is an attribute of this ensemble and "
                        "therefore cannot be an attribute of its descendants"
                        ". Values match so we will accept it here.".format(key), UserWarning)

        ds = self.__store__.create(
            name=name,
            id=id,
            metadata=metadata,
            schema=schema,
            sina_type=sina_type,
            **kargs)
        self.add(ds)
        return ds

    def add(self, dataset):
        """Adds a dataset to this ensemble

        Nothing is done if the dataset is already a member of this ensemble.
        Otherwise the ensemble attributes are copied onto the dataset and
        the membership is recorded as a relationship in the sina store.

        :param dataset: The dataset (or its id) to add to this ensemble
        :type dataset: KoshDataset or str
        :raises ValueError: the dataset belongs to another ensemble providing one of
                            this ensemble's attributes, or the dataset already has one
                            of this ensemble's attributes with a different value
        """
        if isinstance(dataset, KoshDataset):
            dataset_id = dataset.id
        else:
            dataset_id = dataset
            dataset = self.__store__._load(dataset_id)

        # Step1 look at the ensembles the dataset already belongs to
        relationships = self.get_sina_store().relationships.find(
            dataset_id, self.__store__._ensemble_predicate, None)
        # Attributes this ensemble imposes on its members (computed once, reused below)
        ensemble_attributes = self.list_attributes(no_duplicate=True)
        for rel in relationships:
            if rel.object_id == self.id:
                # ok it's already done, no need to do anything else
                return
            # Ok... Already a member of another ensemble.
            # let's make sure there are no conflict here
            other_attributes = self.__store__.open(rel.object_id).list_attributes()
            for att in ensemble_attributes:
                if att in other_attributes:
                    raise ValueError(
                        "Dataset {} is already part of ensemble {} "
                        "which already provides support for attribute: {}. Bailing".format(
                            dataset_id, rel.object_id, att))

        # Ok we're good, let's now makes sure attributes are ok
        attributes = self.list_attributes(dictionary=True)
        dataset_attributes = dataset.list_attributes(dictionary=True)
        for att in dataset.list_attributes():
            if att in self.__dict__["__ok_duplicates__"]:
                continue
            if att in attributes and dataset_attributes[att] != attributes[att]:
                raise ValueError(
                    "Dataset {} has attribute `{}` with value {}, this ensemble ({}) has value `{}`".format(
                        dataset_id, att, dataset_attributes[att], self.id, attributes[att]))
        # At this point we need to add the ensemble attributes to the dataset
        for att in ensemble_attributes:
            dataset.___setattr___(att, getattr(self, att), force=True)
        # Ok We are clear let's create the relationship
        rel = sina.model.Relationship(
            self.id, dataset_id, self.__store__._ensemble_predicate)
        self.get_sina_store().relationships.insert(rel)

    def remove(self, dataset):
        """Removes a dataset from this ensemble. Does not delete the dataset.

        Only the membership relationship is deleted from the sina store.
        A warning is issued if the dataset is not a member of this ensemble.

        :param dataset: The dataset (or its id) to remove
        :type dataset: KoshDataset or str
        """
        if isinstance(dataset, KoshDataset):
            dataset_id = dataset.id
        else:
            dataset_id = dataset
            # The loaded object is not used afterwards; the call is kept because it
            # presumably fails on ids unknown to the store (TODO confirm)
            self.__store__._load(dataset_id)
        # Look for the relationship tying this dataset to this ensemble
        relationships = self.get_sina_store().relationships.find(
            dataset_id, self.__store__._ensemble_predicate, self.id)
        if len(relationships) == 0:
            warnings.warn(
                "Dataset {} is not a member of ensemble {}".format(
                    dataset_id, self.id))
            return

        rel = relationships[0]
        self.get_sina_store().relationships.delete(rel.subject_id, rel.predicate, rel.object_id)

    # `delete` is an alias of `remove`
    delete = remove

    def get_members(self, ids_only=False):
        """Generator for member datasets

        :param ids_only: generator will return ids if True Kosh datasets otherwise
        :type ids_only: bool
        :returns: generator of dataset (or ids)
        :rtype: str or KoshDataset
        """
        member_ids = self._associated_datasets_
        # __str__ treats a None member list as possible: yield nothing in that case
        if member_ids is None:
            return
        for member_id in member_ids:
            if ids_only:
                yield member_id
            else:
                yield self.__store__.open(member_id)

    def find_datasets(self, *atts, **keys):
        """Find datasets members of this ensemble that are matching some metadata.
        Arguments are the metadata names we are looking for e.g
        find_datasets("attr1", "attr2")
        you can further restrict by specifying exact value for a metadata
        via key=value
        you can return ids only by using: ids_only=True
        range can be specified via: sina.utils.DataRange(min, max)

        :return: generator of matching datasets in this ensemble
        :rtype: generator
        """
        # Restrict the store-wide search to the ids of this ensemble's members
        members_ids = list(self.get_members(ids_only=True))
        return self.__store__.find(*atts, id_pool=members_ids, **keys)

    def clone(self, *atts, **keys):
        """We cannot clone an ensemble

        :raises NotImplementedError: always
        """
        raise NotImplementedError("Ensembles objects cannot clone themselves")

    def list_attributes(self, dictionary=False, no_duplicate=False):
        """list_attributes list all non protected attributes

        :param dictionary: return a dictionary of value/pair rather than just attributes names
        :type dictionary: bool

        :param no_duplicate: return only attributes that cannot be duplicated in members
        :type no_duplicate: bool

        :return: list of attributes set on object (a dictionary when `dictionary` is True)
        :rtype: list or dict
        """

        attributes = super(KoshEnsemble, self).list_attributes(dictionary)
        if not no_duplicate:
            return attributes
        ok_duplicates = self.__dict__["__ok_duplicates__"]
        if dictionary:
            # Keep the name/value mapping that was asked for instead of
            # collapsing it to a list of names
            return {name: value for name, value in attributes.items()
                    if name not in ok_duplicates}
        return [name for name in attributes if name not in ok_duplicates]

`__init__(id, store, schema=None, record=None)`

Kosh Ensemble. An ensemble allows linking together many datasets. These datasets will inherit attributes and associated sources from the ensemble.

    :param id: dataset's unique Id
    :type id: str
    :param store: store containing the dataset
    :type store: KoshSinaStore
    :param schema: Kosh schema validator
    :type schema: KoshSchema
    :param record: to avoid looking up in sina pass sina record
    :type record: Record

Source code in kosh/ensemble.py

    def __init__(self, id, store, schema=None, record=None):
        """Build a Kosh ensemble, i.e. an object linking many datasets together.

        Member datasets inherit attributes and associated sources from the ensemble.

        :param id: dataset's unique Id
        :type id: str
        :param store: store containing the dataset
        :type store: KoshSinaStore
        :param schema: Kosh schema validator
        :type schema: KoshSchema
        :param record: sina record, pass it to avoid looking it up in sina
        :type record: Record
        """
        # The parent constructor is told to use the store's ensemble record type
        super(KoshEnsemble, self).__init__(
            id, store,
            schema=schema,
            record=record,
            kosh_type=store._ensembles_type)
        internals = self.__dict__
        # Names stored under "__protected__"
        internals["__protected__"] = [
            "__name__", "__creator__", "__store__",
            "_associated_data_", "__features__",
            "_associated_datasets_", "__ok_duplicates__",
        ]
        # Attributes that the members are allowed to define on their own
        internals["__ok_duplicates__"] = ["creator", "id", "name"]

`__str__()`

string representation

Source code in kosh/ensemble.py

def __str__(self):
    """string representation

    Starts from the dataset representation, relabels it as an ensemble,
    drops everything from the "--- Ensembles" section onward and appends
    the list of member datasets.

    :return: printable description of this ensemble
    :rtype: str
    """
    st = super(KoshEnsemble, self).__str__()
    st = st.replace("KOSH DATASET", "KOSH ENSEMBLE")
    cut = st.find("--- Ensembles")
    # str.find returns -1 when the section is absent; slicing with that
    # value would silently chop the last two characters of the text
    if cut != -1:
        st = st[:max(cut - 1, 0)]
    if self._associated_datasets_ is not None:
        st += "\n--- Member Datasets ({})---\n".format(
            len(self._associated_datasets_))
        st += "\t{}".format(self._associated_datasets_)
    return st

`add(dataset)`

Adds a dataset to this ensemble

Parameters:

Name	Type	Description	Default
`dataset`	`KoshDataset \| str`	The dataset (or its id) to add to this ensemble	required

Source code in kosh/ensemble.py

def add(self, dataset):
    """Make a dataset a member of this ensemble.

    Returns without doing anything when the dataset is already a member.
    Otherwise the ensemble attributes are copied onto the dataset and the
    membership is recorded as a relationship in the sina store.

    :param dataset: The dataset (or its id) to add to this ensemble
    :type dataset: KoshDataset or str
    :raises ValueError: the dataset belongs to another ensemble providing one of
                        this ensemble's attributes, or the dataset already has one
                        of this ensemble's attributes with a different value
    """
    # Accept either a dataset object or its id
    if isinstance(dataset, KoshDataset):
        dataset_id = dataset.id
    else:
        dataset_id = dataset
        dataset = self.__store__._load(dataset_id)

    memberships = self.get_sina_store().relationships.find(
        dataset_id, self.__store__._ensemble_predicate, None)
    shareable = self.__dict__["__ok_duplicates__"]
    for membership in memberships:
        if membership.object_id == self.id:
            # Already a member of this ensemble: nothing left to do
            return
        # Member of another ensemble: both ensembles must not provide the same attribute
        other_ensemble = self.__store__.open(membership.object_id)
        for att in self.list_attributes():
            if att not in shareable and att in other_ensemble.list_attributes():
                raise ValueError(
                    "Dataset {} is already part of ensemble {} "
                    "which already provides support for attribute: {}. Bailing".format(
                        dataset_id, membership.object_id, att))

    # The dataset's own attributes must agree with the ensemble's values
    ensemble_values = self.list_attributes(dictionary=True)
    dataset_values = dataset.list_attributes(dictionary=True)
    for att in dataset.list_attributes():
        if att in shareable or att not in ensemble_values:
            continue
        if dataset_values[att] != ensemble_values[att]:
            raise ValueError(
                "Dataset {} has attribute `{}` with value {}, this ensemble ({}) has value `{}`".format(
                    dataset_id, att, dataset_values[att], self.id, ensemble_values[att]))

    # Push the ensemble attributes down to the new member
    for att in self.list_attributes():
        if att not in shareable:
            dataset.___setattr___(att, getattr(self, att), force=True)

    # Everything checked out: record the membership
    membership = sina.model.Relationship(
        self.id, dataset_id, self.__store__._ensemble_predicate)
    self.get_sina_store().relationships.insert(membership)

`cleanup_files(dry_run=False, interactive=False, **search_keys)`

Cleanup the ensemble's members from references to dead files. You can filter associated objects by passing key=value pairs: e.g. `mime_type=hdf5` will only dissociate non-existing files associated with mime_type hdf5; `some_att=some_val` will only dissociate non-existing files that are associated and have the attribute 'some_att' with a value of 'some_val'. Returns the list of uris to be removed.

Parameters:

Name	Type	Description	Default
`dry_run`	`bool`	Only does a dry_run	`False`
`interactive`	`bool`	interactive mode, ask before dissociating	`False`

Returns:

Type	Description
`list`	list of uris (to be) removed.

Source code in kosh/ensemble.py

def cleanup_files(self, dry_run=False, interactive=False, **search_keys):
    """Dissociate dead files from this ensemble and from each of its members.

    Associated objects can be filtered by passing key=value pairs,
    e.g mime_type=hdf5 will only dissociate non-existing files associated with mime_type hdf5
    some_att=some_val will only dissociate non-existing files associated and having the attribute
    'some_att' with value of 'some_val'

    :param dry_run: Only does a dry_run
    :type dry_run: bool
    :param interactive: interactive mode, ask before dissociating
    :type interactive: bool
    :returns: list of uris (to be) removed.
    :rtype: list
    """
    # The very same options are used for the ensemble and for every member
    options = {"dry_run": dry_run, "interactive": interactive}
    options.update(search_keys)

    missings = super(KoshEnsemble, self).cleanup_files(**options)
    for member in self.get_members():
        missings += member.cleanup_files(**options)
    return missings

`clone(*atts, **keys)`

We cannot clone an ensemble

Source code in kosh/ensemble.py

def clone(self, *atts, **keys):
    """Cloning is not supported for ensembles.

    :raises NotImplementedError: always, whatever the arguments
    """
    reason = "Ensembles objects cannot clone themselves"
    raise NotImplementedError(reason)

`create(name='Unnamed Dataset', id=None, metadata={}, schema=None, sina_type=None, **kargs)`

create a new (possibly named) dataset as a member of this ensemble.

Parameters:

Name	Type	Description	Default
`name`	`str, optional`	name for the dataset, defaults to 'Unnamed Dataset'	`'Unnamed Dataset'`
`id`	`str, optional`	unique Id, defaults to None which means use uuid4()	`None`
`metadata`	`dict, optional`	dictionary of attribute/value pair for the dataset, defaults to {}	`{}`
`schema`	`KoshSchema`	a KoshSchema object to validate datasets and when setting attributes	`None`
`sina_type`	`str`	If you want to query the store for a specific sina record type, not just a dataset	`None`
`kargs`	`dict`	extra keyword arguments, forwarded to the store's create function	`{}`

Returns:

Type	Description
`KoshDataset`	KoshDataset

Raises:

Type	Description
`RuntimeError`	Dataset already exists

Source code in kosh/ensemble.py

def create(self, name="Unnamed Dataset", id=None,
           metadata={}, schema=None, sina_type=None, **kargs):
    """Create a new dataset in the store and make it a member of this ensemble.

    :param name: name for the dataset, defaults to 'Unnamed Dataset'
    :type name: str, optional
    :param id: unique Id, defaults to None which means use uuid4()
    :type id: str, optional
    :param metadata: dictionary of attribute/value pair for the dataset, defaults to {}
    :type metadata: dict, optional
    :param schema: a KoshSchema object to validate datasets and when setting attributes
    :type schema: KoshSchema
    :param sina_type: If you want to query the store for a specific sina record type, not just a dataset
    :type sina_type: str
    :param kargs: extra keyword arguments, forwarded to the store's create function
    :type kargs: dict
    :raises ValueError: sina_type is the ensemble type, or metadata redefines
                        an attribute of this ensemble with a different value
    :raises RuntimeError: Dataset already exists
    :return: KoshDataset
    :rtype: KoshDataset
    """
    store = self.__store__
    if sina_type == store._ensembles_type:
        raise ValueError("You cannot create an ensemble from an ensemble")

    clash = ("'{}' is an attribute of this ensemble and "
             "therefore cannot be an attribute of its descendants")
    ensemble_attributes = self.list_attributes()
    for key in metadata:
        if key not in ensemble_attributes:
            continue
        if metadata[key] != getattr(self, key):
            raise ValueError(clash.format(key))
        # Same value as the ensemble: tolerated, but the user is told about it
        accepted = clash + ". Values match so we will accept it here."
        warnings.warn(accepted.format(key), UserWarning)

    member = store.create(
        name=name,
        id=id,
        metadata=metadata,
        schema=schema,
        sina_type=sina_type,
        **kargs)
    self.add(member)
    return member

`export(file=None)`

Exports this ensemble datasets

Parameters:

Name	Type	Description	Default
`file`	`None \| str`	export datasets to a file	`None`

Returns:

Type	Description
`dict`	dataset and its associated data

Source code in kosh/ensemble.py

def export(self, file=None):
    """Export this ensemble, its member datasets and their membership relationships.

    :param file: export datasets to a file
    :type file: None or str
    :return: dictionary with the kosh version, the sources type,
             the exported records and the relationships
    :rtype: dict"""
    store = self.__store__

    # The ensemble's own record first, then one record per member
    records = [cleanup_sina_record_from_kosh_sync(self.get_record())]
    records.extend(
        cleanup_sina_record_from_kosh_sync(store.get_record(member_id))
        for member_id in self.get_members(ids_only=True))

    # The relationships pointing at this ensemble are exported as well
    links = self.get_sina_store().relationships.find(
        None, store._ensemble_predicate, self.id)
    relationships = [link.to_json() for link in links]

    payload = {
        "minimum_kosh_version": None,
        "kosh_version": kosh.version(comparable=True),
        "sources_type": store._sources_type,
        "records": records,
        "relationships": relationships,
    }
    update_json_file_with_records_and_relationships(file, payload)
    return payload

`find_datasets(*atts, **keys)`

Find the member datasets of this ensemble that match some metadata. Positional arguments are the names of the metadata we are looking for, e.g. `find_datasets("attr1", "attr2")`. You can further restrict the search by specifying an exact value for a metadata via key=value. You can return ids only by using `ids_only=True`. A range can be specified via `sina.utils.DataRange(min, max)`.

Returns:

Type	Description
`generator`	generator of matching datasets in this ensemble

Source code in kosh/ensemble.py

def find_datasets(self, *atts, **keys):
    """Search the members of this ensemble for datasets matching some metadata.

    Positional arguments are the metadata names we are looking for e.g
    find_datasets("attr1", "attr2")
    you can further restrict by specifying exact value for a metadata
    via key=value
    you can return ids only by using: ids_only=True
    range can be specified via: sina.utils.DataRange(min, max)

    :return: generator of matching datasets in this ensemble
    :rtype: generator
    """
    # The store-wide search is restricted to the ids of the members
    pool = []
    pool.extend(self.get_members(ids_only=True))
    search = self.__store__.find
    return search(*atts, id_pool=pool, **keys)

`get_members(ids_only=False)`

Generator for member datasets

Parameters:

Name	Type	Description	Default
`ids_only`	`bool`	generator will return ids if True Kosh datasets otherwise	`False`

Returns:

Type	Description
`str \| KoshDataset`	generator of dataset (or ids)

Source code in kosh/ensemble.py

def get_members(self, ids_only=False):
    """Generator for member datasets

    Yields nothing when the ensemble has no member list
    (`_associated_datasets_` is None).

    :param ids_only: generator will return ids if True Kosh datasets otherwise
    :type ids_only: bool
    :returns: generator of dataset (or ids)
    :rtype: str or KoshDataset
    """
    member_ids = self._associated_datasets_
    # A None member list is treated as "no members" instead of
    # failing with a TypeError when iterating over it
    if member_ids is None:
        return
    for member_id in member_ids:
        if ids_only:
            yield member_id
        else:
            # Each member is opened through the store only when it is requested
            yield self.__store__.open(member_id)

`list_attributes(dictionary=False, no_duplicate=False)`

list_attributes list all non protected attributes

Parameters:

Name	Type	Description	Default
`dictionary`	`bool`	return a dictionary of value/pair rather than just attributes names	`False`
`no_duplicate`	`bool`	return only attributes that cannot be duplicated in members	`False`

Returns:

Type	Description
`list`	list of attributes set on object

Source code in kosh/ensemble.py

def list_attributes(self, dictionary=False, no_duplicate=False):
    """list_attributes list all non protected attributes

    :param dictionary: return a dictionary of value/pair rather than just attributes names
    :type dictionary: bool

    :param no_duplicate: return only attributes that cannot be duplicated in members
    :type no_duplicate: bool

    :return: list of attributes set on object (a dictionary when `dictionary` is True)
    :rtype: list or dict
    """

    attributes = super(KoshEnsemble, self).list_attributes(dictionary)
    if not no_duplicate:
        return attributes
    ok_duplicates = self.__dict__["__ok_duplicates__"]
    if dictionary:
        # Keep the name/value mapping that was asked for instead of
        # collapsing it to a list of names
        # (assumes the parent returns a dict-like object here; TODO confirm)
        return {name: value for name, value in attributes.items()
                if name not in ok_duplicates}
    return [name for name in attributes if name not in ok_duplicates]

`remove(dataset)`

Removes a dataset from this ensemble. Does not delete the dataset.

Parameters:

Name	Type	Description	Default
`dataset`	`KoshDataset \| str`	The dataset to remove	required

Source code in kosh/ensemble.py

def remove(self, dataset):
    """Take a dataset out of this ensemble. The dataset itself is not deleted.

    Only the membership relationship is deleted from the sina store.
    A warning is issued if the dataset is not a member of this ensemble.

    :param dataset: The dataset (or its id) to remove
    :type dataset: KoshDataset or str
    """
    # Accept either a dataset object or its id
    if isinstance(dataset, KoshDataset):
        dataset_id = dataset.id
    else:
        dataset_id = dataset
        # The loaded object is not used afterwards; the call is kept because it
        # presumably fails on ids unknown to the store (TODO confirm)
        self.__store__._load(dataset_id)

    # Relationship(s) tying this dataset to this ensemble
    memberships = self.get_sina_store().relationships.find(
        dataset_id, self.__store__._ensemble_predicate, self.id)
    if len(memberships) > 0:
        link = memberships[0]
        self.get_sina_store().relationships.delete(
            link.subject_id, link.predicate, link.object_id)
        return

    warnings.warn(
        "Dataset {} is not a member of ensemble {}".format(
            dataset_id, self.id))

Ensemble

KoshEnsemble

__init__(id, store, schema=None, record=None)

__str__()

add(dataset)

cleanup_files(dry_run=False, interactive=False, **search_keys)

clone(*atts, **keys)

create(name='Unnamed Dataset', id=None, metadata={}, schema=None, sina_type=None, **kargs)

export(file=None)

find_datasets(*atts, **keys)

get_members(ids_only=False)

list_attributes(dictionary=False, no_duplicate=False)

remove(dataset)

`KoshEnsemble`

`__init__(id, store, schema=None, record=None)`

`__str__()`

`add(dataset)`

`cleanup_files(dry_run=False, interactive=False, **search_keys)`

`clone(*atts, **keys)`

`create(name='Unnamed Dataset', id=None, metadata={}, schema=None, sina_type=None, **kargs)`

`export(file=None)`

`find_datasets(*atts, **keys)`

`get_members(ids_only=False)`

`list_attributes(dictionary=False, no_duplicate=False)`

`remove(dataset)`