
"""
OpenEnsembles is a resource for performing and analyzing ensemble clustering
This file contains calls to clustering algorithms. Please refer to this 
documentation for specifics about the variable parameters and their defaults, but interact with clustering 
through the openensembles.clustering() class. 

OpenEnsembles is a resource for performing and analyzing ensemble clustering

Copyright (C) 2017 Naegle Lab

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import numpy as np 
import pandas as pd 
import sklearn.cluster as skc
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn import datasets
import scipy.cluster.hierarchy as sch
from sklearn import preprocessing
import scipy.stats as stats
from types import FunctionType
import re
import warnings
from sklearn import mixture

class clustering_algorithms:
    """
    The calls to all clustering algorithms reside here. Each call takes on the form of explicitly
    encoding the default sklearn parameters, overwriting any passed in as kwargs. The act of
    clustering creates a list of assignments of objects in data assigned to clustering classes.
    var_params contains all the final parameters used in the act of clustering.

    Parameters
    ----------
    data: matrix
        Data matrix, assumes feature dimensions are found column wise (objects in rows)
    kwargs: dict
        Variable parameters passed in as a dict. These are specific to each algorithm.
        The keyword 'distance' is used by OpenEnsembles to generalize distance/similarity calls
        across algorithms. Use 'distance' and a string for the distance metric. Algorithms requiring
        similarity will use the conversion of distance (D) to similarity (S) according to:
        S = np.exp(-D / D.std()). Use 'affinity' and 'precomputed' to exert greater control over usage.
    K: int
        Number of clusters to create. Required for most, but not all, algorithms. Default K=2

    Attributes
    ----------
    var_params: dict
        The dict of all parameters, including defaults assumed and those overwritten by kwargs

    See Also
    --------
    openensembles.cluster()
    """
    def __init__(self, data, kwargs, K=2):
        self.data = data
        self.out = []
        self.var_params = kwargs
        self.K = K
        #args should have K, even if a default value (should it? not all algorithms need this, use default)
        #if 'K' not in self.args:
        #    raise ValueError('clustering_algorithms should have an instantiated K as part of kwargs key, pair')
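    # A minimal usage sketch (illustrative only, not part of the class API). It assumes a small
    # NumPy data matrix and shows how kwargs such as 'distance' are passed through; OpenEnsembles
    # normally constructs this object for you via openensembles.cluster().
    #
    #   data = np.random.rand(50, 2)                          # 50 objects, 2 features
    #   c = clustering_algorithms(data, {'distance': 'euclidean'}, K=3)
    #   c.agglomerative()                                     # populates c.out with labels
    #   labels, used_params = c.out, c.var_params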
    def clustering_algorithms_available(self):
        """
        Report available algorithms

        Returns
        -------
        methods: dict
            Dictionary with keys equal to available function calls
        """
        methods = [method for method in dir(self) if callable(getattr(self, method))]
        methods.remove('clustering_algorithms_available')
        methodDict = {}
        for method in methods:
            if not re.match('__', method):
                methodDict[method] = ''
        return methodDict
    def kmeans(self):
        """
        kmeans clustering, see `skc.KMeans <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html>`_

        **Defaults and var_params:** skc.KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)

        Other Parameters
        ----------------
        var_params: dict
            Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

        Returns
        -------
        labels: list of ints
            Solution of clustering labels for each object (updated in object.out)
        """
        params = {}
        params['init'] = 'k-means++'
        params['n_init'] = 10
        params['max_iter'] = 300
        params['tol'] = 0.0001
        params['precompute_distances'] = 'auto'
        params['verbose'] = 0
        params['random_state'] = None
        params['copy_x'] = True
        params['n_jobs'] = 1

        if not self.K:
            raise ValueError('kmeans clustering requires an argument K=<integer value>')

        #for anything in self.var_params that may replace defaults, update the param list
        params = returnParams(self.var_params, params, 'kmeans')
        #random_state is expected as a numpy RandomState tuple (e.g. np.random.get_state());
        #the first entry of its key array is used as the integer seed
        seed = params['random_state'][1][0]

        solution = skc.KMeans(n_clusters=self.K, init=params['init'], n_init=params['n_init'],
                              max_iter=params['max_iter'], tol=params['tol'],
                              precompute_distances=params['precompute_distances'], verbose=params['verbose'],
                              random_state=seed, copy_x=params['copy_x'], n_jobs=params['n_jobs'])
        solution.fit(self.data)
        self.out = solution.labels_
        self.var_params = params #update dictionary of parameters to match that used.
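    # Illustrative sketch of calling kmeans directly (hypothetical usage, not part of the API):
    # the 'random_state' kwarg is assumed to be a numpy RandomState tuple such as
    # np.random.get_state(), from which a single integer seed is extracted above.
    #
    #   c = clustering_algorithms(np.random.rand(100, 3), {'random_state': np.random.get_state()}, K=4)
    #   c.kmeans()
    #   labels = c.out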
    def spectral(self):
        """
        Spectral clustering, see `skc.SpectralClustering <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html>`_

        **Defaults and var_params:** skc.SpectralClustering(n_clusters=2, eigen_solver=None, random_state=None, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=1)

        Other Parameters
        ----------------
        var_params: dict
            Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

        Returns
        -------
        labels: list of ints
            Solution of clustering labels for each object (updated in object.out)
        """
        params = {}
        params['affinity'] = 'rbf'
        params['assign_labels'] = 'kmeans'
        params['coef0'] = 1
        params['degree'] = 3
        params['eigen_solver'] = None
        params['eigen_tol'] = 0.0
        params['gamma'] = 1.0
        params['kernel_params'] = None
        params['n_init'] = 10
        params['n_jobs'] = 1
        params['n_neighbors'] = 10
        params['random_state'] = None
        #Not used directly by spectral; the true default is affinity with rbf
        params['distance'] = 'euclidean'
        params['M'] = []

        if not self.K:
            raise ValueError('spectral clustering requires an argument K=<integer value>')

        #for anything in self.var_params that may replace defaults, update the param list
        params = returnParams(self.var_params, params, 'spectral')
        seed = params['random_state'][1][0]

        #handle the cases of: affinity set; affinity as precomputed with a matrix; distance as a string
        #that needs to be converted; and distance as precomputed, which should fail
        if 'affinity' in self.var_params:
            if self.var_params['affinity'] == 'precomputed':
                solution = skc.SpectralClustering(n_clusters=self.K, n_neighbors=params['n_neighbors'], gamma=params['gamma'],
                                                  eigen_solver=params['eigen_solver'], random_state=seed, n_init=params['n_init'],
                                                  affinity='precomputed', coef0=params['coef0'], kernel_params=params['kernel_params'],
                                                  eigen_tol=params['eigen_tol'], assign_labels=params['assign_labels'], n_jobs=params['n_jobs'])
                solution.fit(self.var_params['M'])
                params['M'] = self.var_params['M']
            else: #it's an affinity that's not precomputed, but overrides the default
                solution = skc.SpectralClustering(n_clusters=self.K, n_neighbors=params['n_neighbors'], gamma=params['gamma'],
                                                  eigen_solver=params['eigen_solver'], random_state=seed, n_init=params['n_init'],
                                                  affinity=params['affinity'], coef0=params['coef0'], kernel_params=params['kernel_params'],
                                                  eigen_tol=params['eigen_tol'], assign_labels=params['assign_labels'], n_jobs=params['n_jobs'])
                solution.fit(self.data)
        elif 'distance' in self.var_params:
            if self.var_params['distance'] == 'precomputed':
                raise ValueError("If precomputing a matrix for Spectral clustering, it must be a similarity matrix")
            params['affinity'] = 'precomputed'
            D = returnDistanceMatrix(self.data, self.var_params['distance'])
            S = convertDistanceToSimilarity(D)
            solution = skc.SpectralClustering(n_clusters=self.K, n_neighbors=params['n_neighbors'], gamma=params['gamma'],
                                              eigen_solver=params['eigen_solver'], random_state=seed, n_init=params['n_init'],
                                              affinity='precomputed', coef0=params['coef0'], kernel_params=params['kernel_params'],
                                              eigen_tol=params['eigen_tol'], assign_labels=params['assign_labels'], n_jobs=params['n_jobs'])
            solution.fit(S)
        else: #use the default affinity, which is not precomputed
            solution = skc.SpectralClustering(n_clusters=self.K, n_neighbors=params['n_neighbors'], gamma=params['gamma'],
                                              eigen_solver=params['eigen_solver'], random_state=seed, n_init=params['n_init'],
                                              affinity=params['affinity'], coef0=params['coef0'], kernel_params=params['kernel_params'],
                                              eigen_tol=params['eigen_tol'], assign_labels=params['assign_labels'], n_jobs=params['n_jobs'])
            solution.fit(self.data)

        self.out = solution.labels_
        self.var_params = params #update dictionary of parameters to match that used.
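    # Illustrative sketch of spectral clustering on a precomputed similarity matrix
    # (hypothetical usage; 'affinity' and 'M' are the kwargs handled above):
    #
    #   D = returnDistanceMatrix(data, 'euclidean')
    #   S = convertDistanceToSimilarity(D)
    #   c = clustering_algorithms(data, {'affinity': 'precomputed', 'M': S,
    #                                    'random_state': np.random.get_state()}, K=3)
    #   c.spectral()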
    ######### BELOW HERE ARE ALGORITHMS that accept DISTANCE METRICS
    def agglomerative(self):
        """
        Uses sklearn's `AgglomerativeClustering algorithm <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html>`_

        **Defaults and var_params:** sklearn.cluster.AgglomerativeClustering(n_clusters=2, affinity='euclidean', connectivity=None, compute_full_tree='auto', linkage='ward', pooling_func=<function mean>)

        Other Parameters
        ----------------
        var_params: dict
            Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

        Returns
        -------
        labels: list of ints
            Solution of clustering labels for each object (updated in object.out)
        """
        params = {}
        params['distance'] = 'euclidean'
        params['affinity'] = params['distance']
        #params['memory'] = 'Memory(cachedir=None)'
        params['connectivity'] = None
        #params['n_components'] = None #gone in newest version
        params['compute_full_tree'] = 'auto'
        params['linkage'] = 'ward'
        params['pooling_func'] = np.mean

        params = returnParams(self.var_params, params, 'agglomerative')
        params['affinity'] = params['distance']

        if not self.K:
            raise ValueError('agglomerative clustering requires an argument K=<integer value>')

        solution = skc.AgglomerativeClustering(n_clusters=self.K, affinity=params['affinity'], connectivity=params['connectivity'],
                                               compute_full_tree=params['compute_full_tree'], linkage=params['linkage'],
                                               pooling_func=params['pooling_func'])
        solution.fit(self.data)
        self.out = solution.labels_
        self.var_params = params #update dictionary of parameters to match that used.
    def DBSCAN(self):
        """
        Uses `sklearn's DBSCAN <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_

        **Defaults and var_params:** sklearn.cluster.DBSCAN(eps=0.5, min_samples=5, metric='euclidean', algorithm='auto', leaf_size=30, p=None, n_jobs=1)

        Other Parameters
        ----------------
        var_params: dict
            Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

        Returns
        -------
        labels: list of ints
            Solution of clustering labels for each object (updated in object.out)
        """
        params = {}
        params['distance'] = 'euclidean'
        params['eps'] = 0.5
        params['min_samples'] = 5
        params['metric'] = 'precomputed' #DBSCAN is fit on a distance matrix built according to params['distance']
        params['algorithm'] = 'auto'
        params['leaf_size'] = 30
        params['p'] = None
        params['n_jobs'] = 1

        params = returnParams(self.var_params, params, 'DBSCAN')

        if 'distance' in self.var_params and self.var_params['distance'] == 'precomputed':
            d = self.var_params['M']
        else:
            d = returnDistanceMatrix(self.data, params['distance'])

        solution = skc.DBSCAN(eps=params['eps'], min_samples=params['min_samples'], metric=params['metric'],
                              algorithm=params['algorithm'], leaf_size=params['leaf_size'], p=params['p'],
                              n_jobs=params['n_jobs'])
        solution.fit(d)
        self.out = solution.labels_
        self.var_params = params #update dictionary of parameters to match that used.
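    # Illustrative sketch of DBSCAN on a precomputed distance matrix (hypothetical usage):
    #
    #   D = returnDistanceMatrix(data, 'cityblock')
    #   c = clustering_algorithms(data, {'distance': 'precomputed', 'M': D, 'eps': 0.4})
    #   c.DBSCAN()   # K is not used by DBSCAN; the number of clusters is data-driven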
    def AffinityPropagation(self):
        """
        Uses `sklearn's Affinity Propagation <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html>`_

        **Defaults and var_params:** sklearn.cluster.AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False)

        Other Parameters
        ----------------
        var_params: dict
            Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

        Returns
        -------
        labels: list of ints
            Solution of clustering labels for each object (updated in object.out)
        """
        params = {}
        params['distance'] = 'euclidean'
        params['affinity'] = 'precomputed'
        params['damping'] = 0.5
        params['max_iter'] = 200
        params['convergence_iter'] = 15
        params['copy'] = True
        params['preference'] = None
        params['verbose'] = False

        params = returnParams(self.var_params, params, 'AffinityPropagation')
        #params['distance'] says what to precompute on
        params['affinity'] = 'precomputed'
        d = returnDistanceMatrix(self.data, params['distance'])

        solution = skc.AffinityPropagation(damping=params['damping'], max_iter=params['max_iter'],
                                           convergence_iter=params['convergence_iter'], copy=params['copy'],
                                           preference=params['preference'], affinity=params['affinity'],
                                           verbose=params['verbose'])
        solution.fit(d) #operates on distance matrix
        self.out = solution.labels_
        self.var_params = params #update dictionary of parameters to match that used.
    def Birch(self):
        """
        Uses `sklearn's Birch <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html>`_

        **Defaults and var_params:** sklearn.cluster.Birch(threshold=0.5, branching_factor=50, compute_labels=True, copy=True)

        Other Parameters
        ----------------
        var_params: dict
            Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

        Returns
        -------
        labels: list of ints
            Solution of clustering labels for each object (updated in object.out)
        """
        params = {}
        params['distance'] = 'euclidean' #not mutable
        params['threshold'] = 0.5
        params['branching_factor'] = 50
        params['n_clusters'] = self.K
        params['compute_labels'] = True
        params['copy'] = True

        if not self.K:
            raise ValueError('Birch clustering requires an argument K=<integer value>')

        params = returnParams(self.var_params, params, 'Birch')
        d = returnDistanceMatrix(self.data, params['distance'])

        solution = skc.Birch(threshold=params['threshold'], branching_factor=params['branching_factor'],
                             n_clusters=params['n_clusters'], compute_labels=params['compute_labels'],
                             copy=params['copy'])
        solution.fit(d) #fit on the precomputed distance matrix
        self.out = solution.labels_
        self.var_params = params
    def MeanShift(self):
        """
        Uses `sklearn's MeanShift <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html>`_

        **Defaults and var_params:** sklearn.cluster.MeanShift(bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=1)

        Other Parameters
        ----------------
        var_params: dict
            Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

        Returns
        -------
        labels: list of ints
            Solution of clustering labels for each object (updated in object.out)
        """
        params = {}
        params['bandwidth'] = None
        params['seeds'] = None
        params['bin_seeding'] = False
        params['min_bin_freq'] = 1
        params['cluster_all'] = True
        params['n_jobs'] = 1
        params['quantile'] = 0.3

        params = returnParams(self.var_params, params, 'MeanShift')
        #estimate the bandwidth from the data using the requested quantile
        bandwidth = skc.estimate_bandwidth(self.data, quantile=params['quantile'])
        params['bandwidth'] = bandwidth

        solution = skc.MeanShift(bandwidth=params['bandwidth'], seeds=params['seeds'], bin_seeding=params['bin_seeding'],
                                 min_bin_freq=params['min_bin_freq'], cluster_all=params['cluster_all'],
                                 n_jobs=params['n_jobs'])
        labels = solution.fit_predict(self.data)
        self.out = labels
        self.var_params = params
    def GaussianMixture(self):
        """
        Uses `sklearn's GaussianMixture <http://scikit-learn.org/stable/modules/mixture.html>`_

        **Defaults and var_params:** sklearn.mixture.GaussianMixture(n_components=2, covariance_type='full', tol=0.001, reg_covar=1e-06, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, verbose=0, verbose_interval=10)

        Other Parameters
        ----------------
        var_params: dict
            Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

        Returns
        -------
        labels: list of ints
            Solution of clustering labels for each object (updated in object.out)
        """
        params = {}
        params['n_components'] = self.K
        params['covariance_type'] = 'full'
        params['tol'] = 0.001
        params['reg_covar'] = 1e-06
        params['max_iter'] = 100
        params['n_init'] = 1
        params['init_params'] = 'kmeans'
        params['weights_init'] = None
        params['means_init'] = None
        params['precisions_init'] = None
        params['random_state'] = None
        params['warm_start'] = False
        params['verbose'] = 0
        params['verbose_interval'] = 10

        if not self.K:
            raise ValueError('GaussianMixture clustering requires an argument K=<integer value>')

        params = returnParams(self.var_params, params, 'GaussianMixture')
        #random_state is expected as a numpy RandomState tuple (e.g. np.random.get_state())
        seed = params['random_state'][1][0]

        solution = mixture.GaussianMixture(n_components=params['n_components'], covariance_type=params['covariance_type'],
                                           tol=params['tol'], reg_covar=params['reg_covar'], max_iter=params['max_iter'],
                                           n_init=params['n_init'], init_params=params['init_params'],
                                           weights_init=params['weights_init'], means_init=params['means_init'],
                                           precisions_init=params['precisions_init'], random_state=seed,
                                           warm_start=params['warm_start'], verbose=params['verbose'],
                                           verbose_interval=params['verbose_interval'])
        solution.fit(self.data)
        labels = solution.predict(self.data)
        self.out = labels
        self.var_params = params
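
# The helper below is an illustrative, hypothetical example (not part of the OpenEnsembles API)
# showing how clustering_algorithms is typically driven: build the object with data, kwargs, and K,
# call one method by name, and read labels from .out and the final parameters from .var_params.
# It assumes the sklearn version this module targets and the RandomState-tuple convention above.
def _example_usage(algorithm='kmeans', n_objects=60, K=3):
    """Run one clustering_algorithms method on random data and return (labels, params)."""
    data = np.random.rand(n_objects, 2)
    kwargs = {'random_state': np.random.get_state()}  #consumed by the seed extraction in kmeans/spectral/GaussianMixture
    c = clustering_algorithms(data, kwargs, K=K)
    getattr(c, algorithm)()   #e.g. c.kmeans()
    return c.out, c.var_params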
def returnParams(paramsSent, paramsExpected, algorithm):
    """
    A utility for variable parameter setting in clustering algorithms.
    Takes two dictionaries of parameter key, value pairs and replaces anything in paramsExpected
    with the value given in paramsSent. This also checks that, if precomputed distances or
    affinities have been selected, a distance or similarity matrix was passed as 'M'.

    Returns
    -------
    params: dict
        Dict of final parameters: paramsExpected overwritten by paramsSent

    Warnings
    --------
    Will warn users if a key in sent does not appear in expected.
    """
    if 'distance' in paramsSent:
        if paramsSent['distance'] == 'precomputed':
            if 'M' not in paramsSent:
                raise ValueError("Precomputed distances require a distance matrix passed as 'M'")

    if 'affinity' in paramsSent:
        if paramsSent['affinity'] == 'precomputed':
            if 'M' not in paramsSent:
                raise ValueError("Precomputed affinities require a similarity matrix passed as 'M'")

    overlap = set(paramsSent.keys()) & set(paramsExpected.keys())
    params = paramsExpected.copy()
    paramsToCheck = paramsSent.copy()
    for key in overlap:
        params[key] = paramsSent[key] #if it was sent, overwrite the default
        del paramsToCheck[key]

    #warn if there are keys left in paramsToCheck (they were sent, but not expected); covers the distance metric as well
    if 'random_state' in paramsToCheck:
        del paramsToCheck['random_state']
    for key in paramsToCheck:
        warnings.warn("Parameter %s was not expected in algorithm %s and will be ignored"%(key, algorithm), UserWarning)

    # Warn if the paramsSent contain a distance, but paramsExpected does not
    #if 'distance' in paramsSent:
    #    if 'distance' not in paramsExpected:
    #        warnings.warn("Algorithm does not take a distance metric and distance metric %s will be ignored"%(paramsSent['distance']), UserWarning)

    return params
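
# A small illustrative sketch (hypothetical values, not part of the API) of how returnParams
# merges defaults with user-supplied kwargs: expected keys are overwritten, unexpected keys
# trigger a UserWarning and are dropped from the returned dict.
def _example_returnParams():
    defaults = {'eps': 0.5, 'min_samples': 5}
    sent = {'eps': 0.3, 'bogus_option': 1}   #'bogus_option' is not expected and will be warned about
    final = returnParams(sent, defaults, 'DBSCAN')
    return final                             #{'eps': 0.3, 'min_samples': 5}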
def returnDistanceMatrix(data, distance):
    """
    A utility to calculate a distance matrix on the data array, according to the metric named in <distance>.

    Parameters
    ----------
    data: matrix
        Data matrix to calculate distances from
    distance: string
        Distance metric. See `sklearn's pairwise distances <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html>`_

    Returns
    -------
    d: matrix
        The distance matrix computed by <distance>

    Raises
    ------
    ValueError:
        if the distance metric is not available.
    """
    #distDict = sk.metrics.pairwise.pairwise_distances()
    #if distance not in distDict:
    #    raise ValueError("ERROR: the distance you requested, %s, is not available. Please see sklearn.metrics.pairwise.distance_metrics()"%(distance))
    #d = distDict[distance](data)
    d = sk.metrics.pairwise.pairwise_distances(data, metric=distance)
    return d
def convertDistanceToSimilarity(D, beta=1.0):
    """
    A utility to convert a distance matrix to a similarity matrix.

    Parameters
    ----------
    D: matrix of floats
        A matrix of distances, such as returned by returnDistanceMatrix(data, distanceType)
    beta: float
        A variable for mapping distance to similarity

    Returns
    -------
    S: matrix of floats
        A matrix of similarity values, according to S = np.exp(-beta * D / D.std())
    """
    S = np.exp(-beta * D / D.std())
    return S
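
# An illustrative sketch (hypothetical helper, not part of the API) of the distance-to-similarity
# pipeline used by spectral clustering: compute pairwise distances with a named metric, then map
# them to similarities via S = np.exp(-beta * D / D.std()).
def _example_distance_to_similarity(metric='euclidean', beta=1.0):
    data = np.random.rand(20, 4)
    D = returnDistanceMatrix(data, metric)        #20 x 20 pairwise distance matrix
    S = convertDistanceToSimilarity(D, beta=beta)
    return S                                      #similarities in (0, 1], with S = 1 on the diagonal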