koshClustering

`KoshCluster`

Bases: KoshOperator

Clusters together similar samples from a dataset, and then returns cluster representatives to form a non-redundant subsample of the original dataset. The datasets need to be of shape (n_samples, n_features). All datasets must have the same number of features. If the datasets have more than two dimensions, there is an option to flatten them.

Source code in kosh/operators/koshClustering.py

class KoshCluster(KoshOperator):
    """Clusters together similar samples from a dataset, and then
    returns cluster representatives to form a non-redundant
    subsample of the original dataset. The datasets need to be of
    shape (n_samples, n_features). All datasets must have the same
    number of features. If the datasets have more than two dimensions
    there is an option to flatten them.
    """
    types = {"numpy": ["numpy", "pandas"]}

    def __init__(self, *args, **options):
        """
        :param inputs: One or more arrays of size (n_samples, n_features).
        datasets must have same number of n_features.
        :type inputs: kosh datasets
        :param flatten: Flattens data to two dimensions.
        (n_samples, n_features_1*n_features_2* ... *n_features_m)
        :type flatten: bool
        :param distance_function: distance metric 'euclidean', 'seuclidean',
        'sqeuclidean', 'beuclidean', or user defined function. Defaults to
        'euclidean'
        :type distance_function: string or user defined function
        :param scaling_function: Scaling function to use on data before it
        is clustered.
        :type scaling_function: string or user defined function
        :param batch: Whether to cluster data in batches
        :type batch: bool
        :param batch_size: Size of the batches
        :type batch_size: int
        :param gather_to: Which process to gather data to if samples are
        smaller than number of processes or batch size.
        :type gather_to: int
        :param convergence_num: If int, converged after the data size is the same for
        'num' iterations. The default is 2. If float, converged after the change in data
        size is less than convergence_num*100 percent of the original data size.
        :type convergence_num: int or float between 0 and 1
        :param core_sample: Whether to retain a sample from the center of
        the cluster (core sample), or a randomly chosen sample.
        :type core_sample: bool
        :param eps: The distance around a sample that defines its neighbors.
        :type eps: float
        :param auto_eps: Use the algorithm to find the epsilon distance for
        clustering based on the desired information loss.
        :type auto_eps: bool
        :param eps_0: The initial epsilon guess for the auto eps algorithm.
        :type eps_0: float
        :param min_samples: The minimum number of samples to form a cluster.
        :type min_samples: int
        :param target_loss: The proportion of information loss allowed from removing
        samples from the original dataset. The default is .01 or 1% loss.
        :type target_loss: float
        :param verbose: Verbose message
        :type verbose: bool
        :param output: The retained data or the indices to get the retained
        data from the original dataset.
        :type output: string
        :param format: Returns the indices as numpy array ('numpy') or
        defaults to pandas dataframe.
        :type format: string
        :returns: A list containing: 1. The reduced dataset or indices to reduce the original
        dataset. 2. The estimated information loss or if using the auto eps algorithm (eps=-1)
        the second item in the list will be the epsilon value found with auto eps.
        :rtype: list with elements in the list being either numpy array or pandas dataframe
        """

        super(KoshCluster, self).__init__(*args, **options)

        # ``self.options`` and ``options`` are the same dict object, so the
        # defaults written back below are visible through either name.
        self.options = options

        # In case they don't have mpi4py
        try:
            from mpi4py import MPI
            comm = MPI.COMM_WORLD
        except ImportError:
            class Comm():
                # Minimal serial stand-in: reports one process with rank 0.
                def Get_size(self):
                    return 1

                def Get_rank(self):
                    return 0
            comm = Comm()

        self.comm = comm

        # Decide on parallel or serial batch clustering
        self.rank = self.comm.Get_rank()
        self.nprocs = self.comm.Get_size()
        self.do_parallel = (self.nprocs > 1)

        # Verbose options
        self.verbose = options.get("verbose", False)
        options['verbose'] = self.verbose
        # Define primary rank
        self.primary = options.get("gather_to", 0)
        options['gather_to'] = self.primary
        # Only the primary rank prints verbose messages
        self.pverbose = (self.rank == self.primary) and self.verbose

        if self.pverbose:
            print("Number of ranks: %s" % self.nprocs)

        # Check batching options
        self.batching = options.get("batch", False)
        self.batch_size = options.get("batch_size", 3000)
        # Guarantees value exists in options
        options['batch_size'] = self.batch_size

        # Check for automatic loss-based subsampling
        self.target_loss = options.get('target_loss', .01)
        self.autoEPS = options.get('auto_eps', False)

    def operate(self, *inputs, **kargs):
        """
        Checks for serial or parallel clustering and calls
        those functions.

        The batching/parallel decisions depend on the size of the inputs
        of this particular call, so they are made on local variables:
        ``self.batching`` and ``self.do_parallel`` keep the values set in
        ``__init__`` and a call with few samples does not change how a
        later call is dispatched.

        :param inputs: datasets to cluster; ``shape[0]`` of each one is
        used as its number of samples.
        :returns: the result of the serial or parallel clustering helper,
        ``[data, eps]`` when ``auto_eps`` is enabled, or ``[None, ]`` on
        idled non-primary ranks and when auto eps yields no data.
        :rtype: list
        """

        if self.pverbose:
            print("Reading in %s datasets." % len(inputs))

        # Get the sizes of each kosh dataset
        input_sizes = [input_.shape[0] for input_ in inputs]
        total_sample_size = sum(input_sizes)

        # Per-call copies of the configured switches (never written back)
        batching = self.batching
        do_parallel = self.do_parallel

        # Logical checks for batch and parallel clustering
        if total_sample_size <= self.batch_size:
            batching = False
            if self.pverbose:
                print("Total sample size is less than batch size.")

        # Case where user has multiple processes but no batching
        if not batching and do_parallel:
            batching = True
            if self.pverbose:
                print("Parallel requires batch=True;")
                print("Switching to batch clustering.")

        # Case where data size is smaller than number of processes
        if total_sample_size <= self.nprocs:
            do_parallel = False
            if self.pverbose:
                print("Total sample size is less than number of processors.")
                print("Switching to serial clustering.")
                print("Idling all non-primary processors.")
            if self.rank != self.primary:
                return [None, ]

        if self.autoEPS:
            # AutoEPS will compute needed EPS for the desired loss
            #   and return a list with the data and found EPS value
            data, eps_actual = _koshAutoEPS_(inputs,
                                             self.options,
                                             self.target_loss,
                                             input_sizes,
                                             self.comm,
                                             do_parallel)
            # When data is None return None instead of list
            if data is None:
                return [None, ]
            return [data, eps_actual]

        # Standard calls to operator just calls either option
        if batching and do_parallel:
            return _koshParallelClustering_(inputs,
                                            self.options,
                                            self.comm,
                                            input_sizes)
        return _koshSerialClustering_(inputs, self.options)

`init(*args, **options)`

Parameters:

Name	Type	Description	Default
`inputs`	`kosh datasets`	One or more arrays of size (n_samples, n_features). datasets must have same number of n_features.	required
`flatten`	`bool`	Flattens data to two dimensions. `(n_samples, n_features_1*n_features_2* ... *n_features_m)`	required
`distance_function`	`string \| user defined function`	distance metric 'euclidean', 'seuclidean', 'sqeuclidean', 'beuclidean', or user defined function. Defaults to 'euclidean'	required
`scaling_function`	`string \| user defined function`	Scaling function to use on data before it is clustered.	required
`batch`	`bool`	Whether to cluster data in batches	required
`batch_size`	`int`	Size of the batches	required
`gather_to`	`int`	Which process to gather data to if samples are smaller than number of processes or batch size.	required
`convergence_num`	`int \| float between 0 and 1`	If int, converged after the data size is the same for 'num' iterations. The default is 2. If float, converged after the change in data size is less than convergence_num*100 percent of the original data size.	required
`core_sample`	`bool`	Whether to retain a sample from the center of the cluster (core sample), or a randomly chosen sample.	required
`eps`	`float`	The distance around a sample that defines its neighbors.	required
`auto_eps`	`bool`	Use the algorithm to find the epsilon distance for clustering based on the desired information loss.	required
`eps_0`	`float`	The initial epsilon guess for the auto eps algorithm.	required
`min_samples`	`int`	The minimum number of samples to form a cluster.	required
`target_loss`	`float`	The proportion of information loss allowed from removing samples from the original dataset. The default is .01 or 1% loss.	required
`verbose`	`bool`	Verbose message	required
`output`	`string`	The retained data or the indices to get the retained data from the original dataset.	required
`format`	`string`	Returns the indices as numpy array ('numpy') or defaults to pandas dataframe.	required

Returns:

Type	Description
`list with elements in the list being either numpy array \| pandas dataframe`	A list containing: 1. The reduced dataset or indices to reduce the original dataset. 2. The estimated information loss or if using the auto eps algorithm (eps=-1) the second item in the list will be the epsilon value found with auto eps.

Source code in kosh/operators/koshClustering.py

def __init__(self, *args, **options):
    """
    :param inputs: One or more arrays of size (n_samples, n_features).
    datasets must have same number of n_features.
    :type inputs: kosh datasets
    :param flatten: Flattens data to two dimensions.
    (n_samples, n_features_1*n_features_2* ... *n_features_m)
    :type flatten: bool
    :param distance_function: distance metric 'euclidean', 'seuclidean',
    'sqeuclidean', 'beuclidean', or user defined function. Defaults to
    'euclidean'
    :type distance_function: string or user defined function
    :param scaling_function: Scaling function to use on data before it
    is clustered.
    :type scaling_function: string or user defined function
    :param batch: Whether to cluster data in batches
    :type batch: bool
    :param batch_size: Size of the batches
    :type batch_size: int
    :param gather_to: Which process to gather data to if samples are
    smaller than number of processes or batch size.
    :type gather_to: int
    :param convergence_num: If int, converged after the data size is the same for
    'num' iterations. The default is 2. If float, converged after the change in data
    size is less than convergence_num*100 percent of the original data size.
    :type convergence_num: int or float between 0 and 1
    :param core_sample: Whether to retain a sample from the center of
    the cluster (core sample), or a randomly chosen sample.
    :type core_sample: bool
    :param eps: The distance around a sample that defines its neighbors.
    :type eps: float
    :param auto_eps: Use the algorithm to find the epsilon distance for
    clustering based on the desired information loss.
    :type auto_eps: bool
    :param eps_0: The initial epsilon guess for the auto eps algorithm.
    :type eps_0: float
    :param min_samples: The minimum number of samples to form a cluster.
    :type min_samples: int
    :param target_loss: The proportion of information loss allowed from removing
    samples from the original dataset. The default is .01 or 1% loss.
    :type target_loss: float
    :param verbose: Verbose message
    :type verbose: bool
    :param output: The retained data or the indices to get the retained
    data from the original dataset.
    :type output: string
    :param format: Returns the indices as numpy array ('numpy') or
    defaults to pandas dataframe.
    :type format: string
    :returns: A list containing: 1. The reduced dataset or indices to reduce the original
    dataset. 2. The estimated information loss or if using the auto eps algorithm (eps=-1)
    the second item in the list will be the epsilon value found with auto eps.
    :rtype: list with elements in the list being either numpy array or pandas dataframe
    """

    super(KoshCluster, self).__init__(*args, **options)

    # ``self.options`` and ``options`` are the same dict object, so the
    # defaults written back below are visible through either name.
    self.options = options

    # In case they don't have mpi4py
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
    except ImportError:
        class Comm():
            # Minimal serial stand-in: reports one process with rank 0.
            def Get_size(self):
                return 1

            def Get_rank(self):
                return 0
        comm = Comm()

    self.comm = comm

    # Decide on parallel or serial batch clustering
    self.rank = self.comm.Get_rank()
    self.nprocs = self.comm.Get_size()
    self.do_parallel = (self.nprocs > 1)

    # Verbose options
    self.verbose = options.get("verbose", False)
    options['verbose'] = self.verbose
    # Define primary rank
    self.primary = options.get("gather_to", 0)
    options['gather_to'] = self.primary
    # Only the primary rank prints verbose messages
    self.pverbose = (self.rank == self.primary) and self.verbose

    if self.pverbose:
        print("Number of ranks: %s" % self.nprocs)

    # Check batching options
    self.batching = options.get("batch", False)
    self.batch_size = options.get("batch_size", 3000)
    # Guarantees value exists in options
    options['batch_size'] = self.batch_size

    # Check for automatic loss-based subsampling
    self.target_loss = options.get('target_loss', .01)
    self.autoEPS = options.get('auto_eps', False)

`operate(*inputs, **kargs)`

Checks for serial or parallel clustering and calls those functions

Source code in kosh/operators/koshClustering.py

def operate(self, *inputs, **kargs):
    """
    Checks for serial or parallel clustering and calls
    those functions.

    The batching/parallel decisions depend on the size of the inputs
    of this particular call, so they are made on local variables:
    ``self.batching`` and ``self.do_parallel`` are only read here, and a
    call with few samples does not change how a later call is dispatched.

    :param inputs: datasets to cluster; ``shape[0]`` of each one is
    used as its number of samples.
    :returns: the result of the serial or parallel clustering helper,
    ``[data, eps]`` when ``auto_eps`` is enabled, or ``[None, ]`` on
    idled non-primary ranks and when auto eps yields no data.
    :rtype: list
    """

    if self.pverbose:
        print("Reading in %s datasets." % len(inputs))

    # Get the sizes of each kosh dataset
    input_sizes = [input_.shape[0] for input_ in inputs]
    total_sample_size = sum(input_sizes)

    # Per-call copies of the configured switches (never written back)
    batching = self.batching
    do_parallel = self.do_parallel

    # Logical checks for batch and parallel clustering
    if total_sample_size <= self.batch_size:
        batching = False
        if self.pverbose:
            print("Total sample size is less than batch size.")

    # Case where user has multiple processes but no batching
    if not batching and do_parallel:
        batching = True
        if self.pverbose:
            print("Parallel requires batch=True;")
            print("Switching to batch clustering.")

    # Case where data size is smaller than number of processes
    if total_sample_size <= self.nprocs:
        do_parallel = False
        if self.pverbose:
            print("Total sample size is less than number of processors.")
            print("Switching to serial clustering.")
            print("Idling all non-primary processors.")
        if self.rank != self.primary:
            return [None, ]

    if self.autoEPS:
        # AutoEPS will compute needed EPS for the desired loss
        #   and return a list with the data and found EPS value
        data, eps_actual = _koshAutoEPS_(inputs,
                                         self.options,
                                         self.target_loss,
                                         input_sizes,
                                         self.comm,
                                         do_parallel)
        # When data is None return None instead of list
        if data is None:
            return [None, ]
        return [data, eps_actual]

    # Standard calls to operator just calls either option
    if batching and do_parallel:
        return _koshParallelClustering_(inputs,
                                        self.options,
                                        self.comm,
                                        input_sizes)
    return _koshSerialClustering_(inputs, self.options)

`KoshClusterLossPlot`

Bases: KoshOperator

Source code in kosh/operators/koshClustering.py

class KoshClusterLossPlot(KoshOperator):
    """Calculates sample size and estimated information loss
    for a range of distance values.
    """
    types = {"numpy": ["mpl", "mpl/png", "numpy"]}

    def __init__(self, *args, **options):
        """Forwards all arguments to the base operator and keeps the
        keyword options for use in ``operate``.
        """
        super(KoshClusterLossPlot, self).__init__(*args, **options)
        self.options = options

    def operate(self, *inputs, **kargs):
        """
        :param inputs: One or more arrays of size (n_samples, n_features).
        Datasets must have same number of n_features.
        :type inputs: kosh datasets
        :param method: DBSCAN, HDBSCAN, or HAC
        (Hierarchical Agglomerative Clustering)
        :type method: string
        :param flatten: Flattens data to two dimensions.
        (n_samples, n_features_1*n_features_2* ... *n_features_m)
        :type flatten: bool
        :param val_range: Range of distance values to use for
        clustering/subsampling
        :type val_range: array
        :param val_type: Choose the type of value range for clustering:
        raw distance ('raw'), scaled distance ('scaled'), or number of
        clusters ('Nclusters').
        :type val_type: string
        :param scaling_function: Scaling function to use on data before
        it is clustered.
        :type scaling_function: string or user defined function
        :param distance_function: A valid pairwise distance option from
        scipy.spatial.distance, or a user defined distance function.
        :type distance_function: string, or callable
        :param draw_plot: Whether to plot the plt object. otherwise it
        returns a list of three arrays: the distance value range,
        loss estimate, and sample size. You can pass a matplotlib Axes
        instance if desired.
        :type draw_plot: bool or matplotlib.pyplot.Axes object
        :param outputFormat: Returns the information as matplotlib pyplot
        object ('mpl'), png file ('mpl/png'),
                             or numpy array ('numpy')
        :type outputFormat: string
        :param min_samples: The minimum number of samples to form a cluster.
        (Only for DBSCAN)
        :type min_samples: int
        :param n_jobs: The number of parallel jobs to run. -1 means
        using all processors.
        :type n_jobs: int
        :return: plt object showing loss/sample size information, location
        of the saved file, or an array with val_range, loss estimate, and
        sample size
        :rtype: object, string, array
        """

        # Stack every input along the sample axis. A single concatenate
        # avoids re-copying the accumulated data once per extra input.
        data = inputs[0][:]
        if len(inputs) > 1:
            data = np.concatenate(
                [data] + [input_[:] for input_ in inputs[1:]], axis=0)

        self.fileNameTemplate = self.options.get(
            "fileNameTemplate", "./clusterLossPlot")
        method = self.options.get("method", "DBSCAN")
        flatten = self.options.get("flatten", False)
        val_range = self.options.get("val_range", np.linspace(1e-4, 1.5, 30))
        val_type = self.options.get("val_type", "raw")
        distance_function = self.options.get("distance_function", "euclidean")
        # options are: 'mpl', 'mpl/png', 'numpy'
        outputFormat = self.options.get("outputFormat", 'mpl')
        min_samples = self.options.get("min_samples", 2)
        n_jobs = self.options.get("n_jobs", 1)
        scaling_function = self.options.get("scaling_function", '')

        cluster_object = Cluster(
            data,
            method=method,
            scaling_function=scaling_function,
            flatten=flatten)

        # Unless set explicitly, draw a plot for both matplotlib formats
        draw_plot = self.options.get("draw_plot",
                                     (outputFormat == 'mpl') or
                                     (outputFormat == 'mpl/png'))

        output = cluster_object.lossPlot(
            val_range=val_range,
            val_type=val_type,
            distance_function=distance_function,
            draw_plot=draw_plot,
            min_samples=min_samples,
            n_jobs=n_jobs)

        if outputFormat == 'mpl/png':
            # File name encodes the distance function and the value range ends
            fileName = "{}_{}_{:.2g}_{:.2g}.png".format(
                self.fileNameTemplate,
                distance_function, val_range[0], val_range[-1])
            output.savefig(fileName)
            return fileName
        else:  # return output for obj or array options
            return output

`types = {'numpy': ['mpl', 'mpl/png', 'numpy']}` `class-attribute` `instance-attribute`

Calculates sample size and estimated information loss for a range of distance values.

`operate(*inputs, **kargs)`

Parameters:

Name	Type	Description	Default
`inputs`	`kosh datasets`	One or more arrays of size (n_samples, n_features). Datasets must have same number of n_features.	`()`
`method`	`string`	DBSCAN, HDBSCAN, or HAC (Hierarchical Agglomerative Clustering)	required
`flatten`	`bool`	Flattens data to two dimensions. `(n_samples, n_features_1*n_features_2* ... *n_features_m)`	required
`val_range`	`array`	Range of distance values to use for clustering/subsampling	required
`val_type`	`string`	Choose the type of value range for clustering: raw distance ('raw'), scaled distance ('scaled'), or number of clusters ('Nclusters').	required
`scaling_function`	`string \| user defined function`	Scaling function to use on data before it is clustered.	required
`distance_function`	`string, \| callable`	A valid pairwise distance option from scipy.spatial.distance, or a user defined distance function.	required
`draw_plot`	`bool \| matplotlib.pyplot.Axes object`	Whether to plot the plt object. otherwise it returns a list of three arrays: the distance value range, loss estimate, and sample size. You can pass a matplotlib Axes instance if desired.	required
`outputFormat`	`string`	Returns the information as matplotlib pyplot object ('mpl'), png file ('mpl/png'), or numpy array ('numpy')	required
`min_samples`	`int`	The minimum number of samples to form a cluster. (Only for DBSCAN)	required
`n_jobs`	`int`	The number of parallel jobs to run. -1 means using all processors.	required

Returns:

Type	Description
`object, string, array`	plt object showing loss/sample size information, location of the saved file, or an array with val_range, loss estimate, and sample size

Source code in kosh/operators/koshClustering.py

def operate(self, *inputs, **kargs):
    """
    :param inputs: One or more arrays of size (n_samples, n_features).
    Datasets must have same number of n_features.
    :type inputs: kosh datasets
    :param method: DBSCAN, HDBSCAN, or HAC
    (Hierarchical Agglomerative Clustering)
    :type method: string
    :param flatten: Flattens data to two dimensions.
    (n_samples, n_features_1*n_features_2* ... *n_features_m)
    :type flatten: bool
    :param val_range: Range of distance values to use for
    clustering/subsampling
    :type val_range: array
    :param val_type: Choose the type of value range for clustering:
    raw distance ('raw'), scaled distance ('scaled'), or number of
    clusters ('Nclusters').
    :type val_type: string
    :param scaling_function: Scaling function to use on data before
    it is clustered.
    :type scaling_function: string or user defined function
    :param distance_function: A valid pairwise distance option from
    scipy.spatial.distance, or a user defined distance function.
    :type distance_function: string, or callable
    :param draw_plot: Whether to plot the plt object. otherwise it
    returns a list of three arrays: the distance value range,
    loss estimate, and sample size. You can pass a matplotlib Axes
    instance if desired.
    :type draw_plot: bool or matplotlib.pyplot.Axes object
    :param outputFormat: Returns the information as matplotlib pyplot
    object ('mpl'), png file ('mpl/png'),
                         or numpy array ('numpy')
    :type outputFormat: string
    :param min_samples: The minimum number of samples to form a cluster.
    (Only for DBSCAN)
    :type min_samples: int
    :param n_jobs: The number of parallel jobs to run. -1 means
    using all processors.
    :type n_jobs: int
    :return: plt object showing loss/sample size information, location
    of the saved file, or an array with val_range, loss estimate, and
    sample size
    :rtype: object, string, array
    """

    # Stack every input along the sample axis. A single concatenate
    # avoids re-copying the accumulated data once per extra input.
    data = inputs[0][:]
    if len(inputs) > 1:
        data = np.concatenate(
            [data] + [input_[:] for input_ in inputs[1:]], axis=0)

    self.fileNameTemplate = self.options.get(
        "fileNameTemplate", "./clusterLossPlot")
    method = self.options.get("method", "DBSCAN")
    flatten = self.options.get("flatten", False)
    val_range = self.options.get("val_range", np.linspace(1e-4, 1.5, 30))
    val_type = self.options.get("val_type", "raw")
    distance_function = self.options.get("distance_function", "euclidean")
    # options are: 'mpl', 'mpl/png', 'numpy'
    outputFormat = self.options.get("outputFormat", 'mpl')
    min_samples = self.options.get("min_samples", 2)
    n_jobs = self.options.get("n_jobs", 1)
    scaling_function = self.options.get("scaling_function", '')

    cluster_object = Cluster(
        data,
        method=method,
        scaling_function=scaling_function,
        flatten=flatten)

    # Unless set explicitly, draw a plot for both matplotlib formats
    draw_plot = self.options.get("draw_plot",
                                 (outputFormat == 'mpl') or
                                 (outputFormat == 'mpl/png'))

    output = cluster_object.lossPlot(
        val_range=val_range,
        val_type=val_type,
        distance_function=distance_function,
        draw_plot=draw_plot,
        min_samples=min_samples,
        n_jobs=n_jobs)

    if outputFormat == 'mpl/png':
        # File name encodes the distance function and the value range ends
        fileName = "{}_{}_{:.2g}_{:.2g}.png".format(
            self.fileNameTemplate,
            distance_function, val_range[0], val_range[-1])
        output.savefig(fileName)
        return fileName
    else:  # return output for obj or array options
        return output

`KoshHopkins`

Bases: KoshOperator

Calculates the Hopkins statistic or cluster tendency of the data

Source code in kosh/operators/koshClustering.py

class KoshHopkins(KoshOperator):
    """Calculates the Hopkins statistic or cluster tendency of the data

    """
    types = {"numpy": ["numpy", ]}

    def __init__(self, *args, **options):
        """Forwards all arguments to the base operator and keeps the
        keyword options for use in ``operate``.
        """
        super(KoshHopkins, self).__init__(*args, **options)
        self.options = options

    def operate(self, *inputs, **kargs):
        """
        Calculates the Hopkins statistic from a sample of the dataset.
        A value close to 0 means uniformly distributed, .5 means randomly
        distributed, and a value close to 1 means highly clustered.

        :param inputs: One or more arrays of size (n_samples, n_features).
        Datasets must have same number of n_features.
        :type inputs: kosh datasets
        :param sample_ratio: Proportion of data for sample
        :type sample_ratio: float, between zero and one
        :param scaling_function: Scaling function to use on data before
        it is clustered.
        :type scaling_function: string or user defined function
        :param flatten: Flattens data to two dimensions.
        (n_samples, n_features_1*n_features_2* ... *n_features_m)
        :type flatten: bool
        :return: Hopkins statistic
        :rtype: float
        """

        # Stack every input along the sample axis. A single concatenate
        # avoids re-copying the accumulated data once per extra input.
        data = inputs[0][:]
        if len(inputs) > 1:
            data = np.concatenate(
                [data] + [input_[:] for input_ in inputs[1:]], axis=0)

        sample_ratio = self.options.get("sample_ratio", .1)
        scaling_function = self.options.get("scaling_function", '')
        flatten = self.options.get("flatten", False)

        cluster_object = Cluster(data,
                                 scaling_function=scaling_function,
                                 flatten=flatten)
        return cluster_object.hopkins(sample_ratio=sample_ratio)

`operate(*inputs, **kargs)`

Calculates the Hopkins statistic from a sample of the dataset. A value close to 0 means uniformly distributed, .5 means randomly distributed, and a value close to 1 means highly clustered.

Parameters:

Name	Type	Description	Default
`inputs`	`kosh datasets`	One or more arrays of size (n_samples, n_features). Datasets must have same number of n_features.	`()`
`sample_ratio`	`float, between zero and one`	Proportion of data for sample	required
`scaling_function`	`string \| user defined function`	Scaling function to use on data before it is clustered.	required
`flatten`	`bool`	Flattens data to two dimensions. `(n_samples, n_features_1*n_features_2* ... *n_features_m)`	required

Returns:

Type	Description
`float`	Hopkins statistic

Source code in kosh/operators/koshClustering.py

def operate(self, *inputs, **kargs):
    """
    Calculates the Hopkins statistic from a sample of the dataset.
    A value close to 0 means uniformly distributed, .5 means randomly
    distributed, and a value close to 1 means highly clustered.

    :param inputs: One or more arrays of size (n_samples, n_features).
    Datasets must have same number of n_features.
    :type inputs: kosh datasets
    :param sample_ratio: Proportion of data for sample
    :type sample_ratio: float, between zero and one
    :param scaling_function: Scaling function to use on data before
    it is clustered.
    :type scaling_function: string or user defined function
    :param flatten: Flattens data to two dimensions.
    (n_samples, n_features_1*n_features_2* ... *n_features_m)
    :type flatten: bool
    :return: Hopkins statistic
    :rtype: float
    """

    # Stack every input along the sample axis. A single concatenate
    # avoids re-copying the accumulated data once per extra input.
    data = inputs[0][:]
    if len(inputs) > 1:
        data = np.concatenate(
            [data] + [input_[:] for input_ in inputs[1:]], axis=0)

    sample_ratio = self.options.get("sample_ratio", .1)
    scaling_function = self.options.get("scaling_function", '')
    flatten = self.options.get("flatten", False)

    cluster_object = Cluster(data,
                             scaling_function=scaling_function,
                             flatten=flatten)
    return cluster_object.hopkins(sample_ratio=sample_ratio)

koshClustering

KoshCluster

__init__(*args, **options)

operate(*inputs, **kargs)

KoshClusterLossPlot

types = {'numpy': ['mpl', 'mpl/png', 'numpy']} class-attribute instance-attribute

operate(*inputs, **kargs)

KoshHopkins

operate(*inputs, **kargs)

`KoshCluster`

`init(*args, **options)`

`operate(*inputs, **kargs)`

`KoshClusterLossPlot`

`types = {'numpy': ['mpl', 'mpl/png', 'numpy']}` `class-attribute` `instance-attribute`

`operate(*inputs, **kargs)`

`KoshHopkins`

`operate(*inputs, **kargs)`