Kosh Transformers using SciKit-Learn¶
This notebook introduces some of sklearn
-based Kosh's transformers.
Splitter¶
This transformer allows splitting extracted data between training, test and validation splits.
It also returns n_splits
(default 1) variations
It can use any scikit-learn model_selection splitter class (see: here for more details)
The base class is passed via the splitter
keyword and defaults to ShuffleSplit
A good example of what these splitters do can be found here
Default values are:
splitter
: ShuffleSplit
n_splits
: 1
random_state
: None
(pass an int for reproducibility)
train_size
: 0.9
test_size
: 0.1
validation_size
: 0.
A basic example with fake data:
import kosh
import numpy
class FakeLoader(kosh.loaders.KoshLoader):
    """Loader that fabricates data for any source with the ``fake`` mime type."""

    # Mime types handled by this loader, and the formats each can be exported to.
    types = {"fake": ["numpy", ]}

    def extract(self):
        """Return the fake data: 1000 consecutive float32 values."""
        return numpy.arange(1000, dtype=numpy.float32)

    def list_features(self):
        """Return the features this loader exposes.

        Fix: Kosh expects an iterable of feature names.  Returning the bare
        string "range_1000" would be iterated character by character,
        producing bogus one-letter features.
        """
        return ["range_1000", ]
# Create a scratch store, register the fake loader, and wire a dataset
# to a (non-existent) file that the loader will service.
store = kosh.connect("skl.sql", delete_all_contents=True)
store.add_loader(FakeLoader)
dataset = store.create()
dataset.associate("fake_file.fake", mime_type="fake")

# Without any transformer, the feature comes back whole (1000 values).
feature = dataset["range_1000"]
print("Default return")
print(len(feature()))
Default return 1000
# Fix: the feature is named "range_1000" (see FakeLoader.list_features);
# "range_100" does not exist and the request would fail.
splits = dataset.get_execution_graph("range_1000", transformers=[kosh.transformers.Splitter(random_state=73)])
# Length of split is 1 because we asked for one variation (n_splits=1)
train, test = splits()[0]  # No validation by default
print(len(train), len(test))
900 100
# Fix: use the real feature name "range_1000" ("range_100" does not exist).
# train_size + test_size < 1. leaves the remainder as a validation split.
splits = dataset.get("range_1000", transformers=[kosh.transformers.Splitter(random_state=73, test_size=.15, train_size=.75)])
train, test, validation = splits[0]  # Now we have validation (train+test < 1.)
print(len(train), len(test), len(validation))
750 150 100
Scalers¶
These allow processing scikit-learn scalers on a dataset's data (numpy array)
- SKL: can return an SKL estimator (skl_class.fit(input)) or the input fitted to each label from the class; all arguments used to initialize the class are passed through to skl_class
# Excerpt of Kosh's SKL transformer source, shown here for illustration
# (indentation was lost in the export; this is not meant to run as-is).
class SKL(KoshTransformer):
# Accepts numpy input; can produce either a fitted estimator or numpy output.
types = {"numpy": ["estimator", "numpy"]}
def __init__(self, *args, **kargs):
# Pull out the kwargs handled by the transformer itself (sampling control);
# anything else is forwarded to the scikit-learn class.
kw = {}
for arg in ["n_samples", "sampling_method"]:
# Stored on the instance and mirrored in kw; missing keys default to None.
setattr(self, arg, kargs.pop(arg, None))
kw[arg] = getattr(self, arg)
# The scikit-learn estimator class to wrap, e.g. DBSCAN or KMeans;
# it is instantiated immediately with the remaining arguments.
skl_class = kargs.pop("skl_class")
self.skl_class = skl_class(*args, **kargs)
kw.update(kargs)
super(SKL, self).__init__(*args, **kargs)
For convenience, scikit-learn's DBSCAN and KMeans are available
from kosh.transformers import DBSCAN
import numpy
class FakeLoader(kosh.loaders.KoshLoader):
    """Loader that fabricates a small 2-D point cloud for clustering demos."""

    # Mime types handled by this loader, and the formats each can be exported to.
    types = {"fake": ["numpy", ]}

    def extract(self):
        """Return six 2-D points: two tight clusters plus one outlier."""
        return numpy.array([[1, 2], [2, 2], [2, 3],
                            [8, 7], [8, 8], [25, 80]])

    def list_features(self):
        """Return the features this loader exposes.

        Fix: Kosh expects an iterable of feature names; a bare string
        would be iterated character by character.
        """
        return ["data", ]
# Consistency fix: use kosh.connect(..., delete_all_contents=True) as in the
# Splitter example; kosh.utils.create_new_db is the legacy API for the same thing.
store = kosh.connect("skl.sql", delete_all_contents=True)
store.add_loader(FakeLoader)
dataset = store.create()
dataset.associate("fake_file.fake", mime_type="fake")
# eps/min_samples are forwarded to sklearn.cluster.DBSCAN.
clustering_transformer = DBSCAN(eps=3, min_samples=2)
# Let's get the clustered data back (format='numpy')
clustered = dataset.get("data", transformers=[clustering_transformer, ])
print(clustered)
# Let's get back the estimator
estimator = dataset.get("data", transformers=[clustering_transformer, ], format="estimator")
print(estimator)
([-1, 0, 1], [array([[25, 80]]), array([[1, 2], [2, 2], [2, 3]]), array([[8, 7], [8, 8]])]) ([-1, 0, 1], [array([[25, 80]]), array([[1, 2], [2, 2], [2, 3]]), array([[8, 7], [8, 8]])])