Source code for openensembles.clustering_algorithms

"""
OpenEnsembles is a resource for performing and analyzing ensemble clustering
This file contains calls to clustering algorithms. Please refer to this 
documentation for specifics about the variable parameters and their defaults, but interact with clustering 
through the openensembles.clustering() class. 

OpenEnsembles is a resource for performing and analyzing ensemble clustering

Copyright (C) 2017 Naegle Lab

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import numpy as np 
import pandas as pd 
import sklearn.cluster as skc
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn import datasets
import scipy.cluster.hierarchy as sch
from sklearn import preprocessing
import scipy.stats as stats
from types import FunctionType
import re
import warnings
from sklearn import mixture
import hdbscan

class clustering_algorithms:
	"""
	The calls to all clustering algorithms reside here. Each call takes on the form of explicitly encoding the default 
	sklearn parameters, overwriting any passed in as kwargs.

	The act of clustering creates a list of assignments of objects in data assigned to clustering classes
	var_params contains all the final parameters used in the act of clustering

	Parameters
	----------
	data: matrix
		Data matrix, assumes feature dimensions are found column wise (objects in rows)
	kwargs: dict
		Variable parameters passed in as a dict. These are specific to each algorithm
		The keyword 'distance' is used by OpenEnsembles to generalize distance/similarity calls across algorithms. Use 'distance' and a string for distance metric. 
		Algorithms requiring similarity will use the conversion of Distance (D) to Similarity (S) according to: S = np.exp(-D / D.std()). 
		Use 'affinity' and 'precomputed' to exert greater control over usage. 
	K: int
		Number of clusters to create, required for most, but not all algorithms. Default K=2

	Attributes
	----------
	var_params: dict
		The dict of all parameters, including defaults assumed and those overwritten by kwargs

	See Also
	--------
	openensembles.cluster()
	"""
	def __init__(self, data, kwargs, K=2):
		self.data = data
		self.out = []
		self.var_params = kwargs
		self.K = K
		#args should have K, even if a default value (should it? not all algorithms need this, use default)
		#if 'K' not in self.args:
		#    raise ValueError('clustering_algorithms should have an instantiated K as part of kwargs key, pair')

	def clustering_algorithms_available(self):
		"""
		Report available algorithms
		
		Returns
		-------
		methods: dict
			Dictionary with keys equal to available function calls

		"""
		methods =  [method for method in dir(self) if callable(getattr(self, method))]
		methods.remove('clustering_algorithms_available')
		methodDict = {}
		for method in methods:
			if not re.match('__', method):
				methodDict[method] = ''
		return methodDict


	def kmeans(self):
		"""
		kmeans clustering see `skc.KMeans <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html>`_

		**Defaults and var_params:** skc.KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)

	
		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)
			
		"""
		params={}
		params['init'] = 'k-means++'
		params['n_init'] = 10
		params['max_iter'] = 300
		params['tol'] = 0.0001
		params['precompute_distances'] = 'auto'
		params['verbose'] = 0
		params['random_state'] = None
		params['copy_x'] = True
		params['n_jobs'] = 1
		if not self.K:
			raise ValueError('kmeans clustering requires an argument K=<intiger value>')

		#for anything in self.var_params that may replace defaults, update the param list
		params = returnParams(self.var_params, params, 'kmeans')

		seed = params['random_state'][1][0]
		solution=skc.KMeans(n_clusters=self.K, init=params['init'], 
			n_init=params['n_init'], max_iter=params['max_iter'], tol=params['tol'],
			precompute_distances=params['precompute_distances'], verbose=params['verbose'],
			random_state=seed, copy_x=params['copy_x'], n_jobs=params['n_jobs'])
		solution.fit(self.data)
		self.out = solution.labels_
		self.var_params = params #update dictionary of parameters to match that used.


	def spectral(self):
		""" 
		Spectral clustering, see `skc.SpectralClustering <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html>`_

		**Defaults and var_params:** skc.SpectralClustering(n_clusters=2, eigen_solver=None, random_state=None, n_init=10, gamma=1.0, affinity=’rbf’, n_neighbors=10, eigen_tol=0.0, assign_labels=’kmeans’, degree=3, coef0=1, kernel_params=None, n_jobs=1)
		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)       
		"""
		params = {}

		params['affinity'] = 'rbf'
		params['assign_labels'] = 'kmeans'
		params['coef0'] = 1
		params['degree'] = 3
		params['eigen_solver']=None
		params['eigen_tol'] = '0.0'
		params['gamma'] = 1.0
		params['kernel_params']=None
		params['n_init'] = 10
		params['n_jobs']=1
		params['n_neighbors'] = 10
		params['random_state'] = None

		#NOt used directly by spectral, the true default is affinity with rbf
		params['distance'] = 'euclidean'
		params['M'] = []

		if not self.K:
			raise ValueError('spectral clustering requires an argument K=<intiger value>')

		#for anything in self.var_params that may replace defaults, update the param list
		params = returnParams(self.var_params, params, 'spectral')
 
		seed = params['random_state'][1][0]

		# handle the cases of affinity set, affinity as precomputed with a matrix, distance as a string that needs to be converted and distance as precomputed, which shoudl fail

		if 'affinity' in self.var_params:
			if self.var_params['affinity'] == 'precomputed':
				solution = skc.SpectralClustering(n_clusters=self.K, n_neighbors=params['n_neighbors'], gamma=params['gamma'],
					eigen_solver=params['eigen_solver'], random_state=seed, n_init=params['n_init'],
					affinity='precomputed', coef0=params['coef0'], kernel_params=params['kernel_params'],
					eigen_tol=params['eigen_tol'], assign_labels=params['assign_labels'], n_jobs=params['n_jobs'])
				x = np.shape(self.var_params['M'])
				solution.fit(self.var_params['M'])
				params['M'] = self.var_params['M']
			else: #it's affinity, that's not precomputed, but overrides the default
				solution = skc.SpectralClustering(n_clusters=self.K, n_neighbors=params['n_neighbors'], gamma=params['gamma'],
							eigen_solver=params['eigen_solver'], random_state=seed, n_init=params['n_init'],
							affinity=params['affinity'], coef0=params['coef0'], kernel_params=params['kernel_params'],
							eigen_tol=params['eigen_tol'], assign_labels=params['assign_labels'], n_jobs=params['n_jobs'])
				solution.fit(self.data)

		elif 'distance' in self.var_params:
			if self.var_params['distance'] == 'precomputed':
				raise ValueError("If precomputing a matrix for Spectral clustering, it must be a similarity matrix")

			params['affinity'] = 'precomputed'
			D = returnDistanceMatrix(self.data, self.var_params['distance'])
			S = convertDistanceToSimilarity(D)
			solution = skc.SpectralClustering(n_clusters=self.K, n_neighbors=params['n_neighbors'], gamma=params['gamma'],
						eigen_solver=params['eigen_solver'], random_state=seed, n_init=params['n_init'],
						affinity='precomputed', coef0=params['coef0'], kernel_params=params['kernel_params'],
						eigen_tol=params['eigen_tol'], assign_labels=params['assign_labels'], n_jobs=params['n_jobs'])
			solution.fit(S)
		else: #else it's an affinity that is not precomputed.
			solution = skc.SpectralClustering(n_clusters=self.K, n_neighbors=params['n_neighbors'], gamma=params['gamma'],
							eigen_solver=params['eigen_solver'], random_state=seed, n_init=params['n_init'],
							affinity=params['affinity'], coef0=params['coef0'], kernel_params=params['kernel_params'],
							eigen_tol=params['eigen_tol'], assign_labels=params['assign_labels'], n_jobs=params['n_jobs'])
			solution.fit(self.data)
		self.out = solution.labels_
		self.var_params = params #update dictionary of parameters to match that used.        

######### BELOW HERE ARE ALGORITHMS that accept DISTANCE METRICS

	def agglomerative(self):
		"""
		Uses sklearns `AgglomerativeClustering algorithm <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html>`_ 

		**Defaults and var_params: ** sklearn.cluster.AgglomerativeClustering(n_clusters=2, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', distance_threshold=None)

		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)  
		"""
		params = {}
		params['distance'] = 'euclidean'
		params['affinity'] = params['distance']
		params['memory'] = None
		params['connectivity']= None
		#params['n_components'] = None #gone in newest version
		params['compute_full_tree'] = 'auto'
		params['linkage'] = 'ward'
		params['distance_threshold'] = None
		#params['pooling_func'] = np.mean #Removed in sklearn 0.22.1

		params = returnParams(self.var_params, params, 'agglomerative')
		params['affinity'] = params['distance']

		if not self.K:
			raise ValueError('agglomerative clustering requires an argument K=<intiger value>')

		solution = skc.AgglomerativeClustering(n_clusters=self.K, affinity=params['affinity'],
			connectivity=params['connectivity'],
			compute_full_tree=params['compute_full_tree'], linkage=params['linkage'] , pooling_func=params['pooling_func'])
		solution.fit(self.data)
		self.out = solution.labels_
		self.var_params = params #update dictionary of parameters to match that used.


	def DBSCAN(self):
		"""
		Uses `sklearn's DBSCAN <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_

		**Defaults and var_params:** sklearn.cluster.DBSCAN(eps=0.5, min_samples=5, metric='euclidean', algorithm='auto', leaf_size=30, p=None, n_jobs=1)
		
		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)
		
		"""
		params = {}
		params['distance'] = 'euclidean'
		params['eps']=0.5
		params['min_samples']=5
		params['metric']='precomputed'
		params['algorithm']='auto'
		params['leaf_size']=30
		params['p']=None, 
		params['n_jobs'] = 1

		params = returnParams(self.var_params, params, 'DBSCAN')

		if 'distance' in self.var_params:
			if self.var_params['distance'] == 'precomputed':
				d = self.var_params['M']
			else:
				d = returnDistanceMatrix(self.data, params['distance'])        
		else:
			d = returnDistanceMatrix(self.data, params['distance'])        

		solution = skc.DBSCAN(eps=params['eps'], min_samples=params['min_samples'], metric=params['metric'], 
			algorithm=params['algorithm'], leaf_size=params['leaf_size'], 
			p=params['p'], n_jobs=params['n_jobs']) 
		solution.fit(d)
		self.out = solution.labels_
		self.var_params = params #update dictionary of parameters to match that used.

	def HDBSCAN(self):
		"""
		Hieararchical Density-Based Spatial Clustering of Applications with Noise
		Uses `HDBSCAN <https://github.com/scikit-learn-contrib/hdbscan>`_

		**Defaults and var_params:** hdbscan.HDBSCAN(eps=0.5, min_samples=5, metric='euclidean', algorithm='auto', leaf_size=30, p=None, n_jobs=1)
		
		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)
		
		"""

		params = {}
		params['min_cluster_size'] = 5
		params['distance'] = 'euclidean'
		params['alpha']=1.0
		params['min_samples']=None
		params['metric']='precomputed' #always uses precomputed, with calculation of euclidean as distance matrix, by default
		params['algorithm']='best'
		params['leaf_size']=40
		params['p']=None, 
		params['approx_min_span_tree'] = True
		params['gen_min_span_tree'] = False
		params['core_dist_n_jobs'] = 4
		params['cluster_selection_method'] = 'eom'
		params['allow_single_cluster'] = False
		params['prediction_data'] = False
		params['match_reference_implementation'] = False

		params = returnParams(self.var_params, params, 'HDBSCAN')

		if 'distance' in self.var_params:
			if self.var_params['distance'] == 'precomputed':
				d = self.var_params['M']
			else:
				d = returnDistanceMatrix(self.data, params['distance'])        
		else:
			d = returnDistanceMatrix(self.data, params['distance'])        

		solution = hdbscan.HDBSCAN(min_cluster_size=params['min_cluster_size'], min_samples=params['min_samples'], metric=params['metric'], 
			alpha=params['alpha'], p=params['p'], algorithm=params['algorithm'], 
			leaf_size=params['leaf_size'], approx_min_span_tree=params['approx_min_span_tree'], 
			gen_min_span_tree=params['gen_min_span_tree'], core_dist_n_jobs=params['core_dist_n_jobs'], 
			cluster_selection_method=params['cluster_selection_method'], allow_single_cluster=params['allow_single_cluster'], 
			prediction_data=params['prediction_data'], match_reference_implementation=params['match_reference_implementation'])
		self.out = solution.fit_predict(d)
		self.var_params = params #update dictionary of parameters to match that used.


	def AffinityPropagation(self):
		"""
		Uses `sklearn's Affinity Propagation <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html>`_

		**Defaults and var_params:** sklearn.cluster.AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False)

		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)
		"""
		params = {}
		params['distance'] = 'euclidean'
		params['affinity'] = 'precomputed'
		params['damping'] = 0.5
		params['max_iter'] = 200
		params['convergence_iter'] = 15
		params['copy'] = True
		params['preference'] = None

		params['verbose'] = False
		params = returnParams(self.var_params, params, 'AffinityPropagation')

		#params['distance'] says what to precompute on
		params['affinity'] = 'precomputed'
		d = returnDistanceMatrix(self.data, params['distance'])
		
		solution = skc.AffinityPropagation(damping=params['damping'], max_iter=params['max_iter'], convergence_iter=params['convergence_iter'], 
			copy=params['copy'], preference=params['preference'], affinity=params['affinity'], verbose=params['verbose'])
		solution.fit(d) #operates on distance matrix

		self.out = solution.labels_
		self.var_params = params #update dictionary of parameters to match that used.

	def Birch(self):
		"""
		Uses `sklearn's Birch <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html>`_
		**Defaults and var_params:** sklearn.cluster.Birch(threshold=0.5, branching_factor=50, compute_labels=True, copy=True)

		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)
		"""
		params = {}
		params['distance'] = 'euclidean' #not mutable
		params['threshold'] = 0.5
		params['branching_factor'] = 50
		params['n_clusters'] = self.K
		params['compute_labels'] = True
		params['copy'] = True
		
		if not self.K:
			raise ValueError('Birch clustering requires an argument K=<intiger value>')

		params = returnParams(self.var_params, params, 'Birch')
		d = returnDistanceMatrix(self.data, params['distance'])

		solution = skc.Birch(threshold=params['threshold'], branching_factor=params['branching_factor'], n_clusters=params['n_clusters'],
			compute_labels=params['compute_labels'], copy=params['copy'])
		solution.fit(d)

		self.out = solution.labels_
		self.var_params = params


	def MeanShift(self):
		"""
		Uses `sklearn's MeanShift <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html>`_

		 **Defaults and var_params:** sklearn.cluster.MeanShift(bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=1)

		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)

		"""

		params = {}
		params['bandwidth']=None
		params['seeds']=None
		params['bin_seeding']=False 
		params['min_bin_freq']=1
		params['cluster_all']=True
		params['n_jobs']=1
		params['quantile'] = 0.3

		params = returnParams(self.var_params, params, 'GaussianMixture')

		bandwidth = skc.estimate_bandwidth(self.data, quantile=params['quantile'])
		params['bandwidth'] = bandwidth
		solution = skc.MeanShift(bandwidth=params['bandwidth'], seeds=params['seeds'], bin_seeding=params['bin_seeding'], 
			min_bin_freq=params['min_bin_freq'], cluster_all=params['cluster_all'], n_jobs=params['n_jobs'])
		solution.fit(self.data)
		labels = solution.fit_predict(self.data)
		self.out = labels
		self.var_params = params

	def GaussianMixture(self):
		"""
		Uses `sklearn's GuassianMixture <http://scikit-learn.org/stable/modules/mixture.html>`_

		 **Defaults and var_params:** (n_components=2, covariance_type=’full’, tol=0.001, reg_covar=1e-06, max_iter=100, n_init=1, init_params=’kmeans’, weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, verbose=0, verbose_interval=10)

		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)

		"""
		params = {}
		params['n_components'] = self.K
		params['covariance_type']='full'
		params['tol']=0.001
		params['reg_covar']=1e-06
		params['max_iter']=100
		params['n_init']=1
		params['init_params']='kmeans'
		params['weights_init']=None
		params['means_init']=None 
		params['precisions_init']=None
		params['random_state']=None
		params['warm_start']=False 
		params['verbose']=0
		params['verbose_interval']=10

		if not self.K:
			raise ValueError('GaussinaMixture clustering requires an argument K=<intiger value>')

		params = returnParams(self.var_params, params, 'GaussianMixture')

		seed = params['random_state'][1][0]


		solution=mixture.GaussianMixture(n_components=params['n_components'], covariance_type=params['covariance_type'], 
			tol=params['tol'], reg_covar=params['reg_covar'], max_iter=params['max_iter'], n_init=params['n_init'], 
			init_params=params['init_params'], weights_init=params['weights_init'], means_init=params['means_init'], 
			precisions_init=params['precisions_init'], random_state = seed, warm_start=params['warm_start'], 
			verbose=params['verbose'], verbose_interval=params['verbose_interval'])
		solution.fit(self.data)
		labels = solution.predict(self.data)
		self.out = labels
		self.var_params = params

[docs]def returnParams(paramsSent, paramsExpected, algorithm): """ A utility for variable parameter setting in clustering algorithms Takes two dictionaries of parameter key, value pairs and replaces that in paramsExpected with anything in paramsSent. Returns ------- params: dict Dict of parameters that represent the final parameters, overwritten in paramsExpected by paramsSent This will handle checking to make sure that if precomputed distances have been selected, that a distance or similarity matrix is also passed. Warnings -------- Will warn users if a key in sent does not appear in expected. """ if 'distance' in paramsSent: if paramsSent['distance'] == 'precomputed': if 'M' not in paramsSent: raise ValueError("Precomputed distances require a distance matrix passed as 'M' ") if 'affinity' in paramsSent: if paramsSent['affinity'] == 'precomputed': if 'M' not in paramsSent: raise ValueError("Precomputed affinity require a similarity matrix passed as 'M' ") overlap = set(paramsSent.keys()) & set(paramsExpected.keys()) params = paramsExpected.copy() paramsToCheck = paramsSent.copy() for key in overlap: params[key] = paramsSent[key] #if it was sent, overwrite default. del paramsToCheck[key] #warn if there are keys in paramsToCheck (means they were sent, but not expected) (covers distance metric as well) if 'random_state' in paramsToCheck: del paramsToCheck['random_state'] for key in paramsToCheck: warnings.warn("Parameter %s was not expected in algorithm %s and will be ignored"%(key, algorithm), UserWarning) # Warn if the paramsSent contain a distance, but paramsExpected does not #if 'distance' in paramsSent: # if 'distance' not in paramsExpected: # warnings.warn("Algorithm does not take a distance metric and distance metric %s will be ignored"%(paramsSent['distance']), UserWarning) return params
[docs]def returnDistanceMatrix(data, distance): """ A utility to calculate a distance matrix, according to type in <distance> on the data array. Parameters ---------- data: matrix Data matrix to calculate distances from distance: string Distance metric. See `sklearn's pairwise distances <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html>`_ Returns ------- d: matrix the distance matrix computed by distance Raises ------ ValueError: if the distance metric is not available. """ #distDict = sk.metrics.pairwise.pairwise_distances() #if distance not in distDict: # raise ValueError("ERROR: the distance you requested, %s, is not available. Please see sklearn.metrics.pairwise.distance_metrics()"%(distance)) #d = distDict[distance](data) d = sk.metrics.pairwise.pairwise_distances(data, metric=distance) return d
[docs]def convertDistanceToSimilarity(D, beta=1.0): """ A utility to convert a distance matrix to a similarity matrix Parameters ---------- D: matrix of floats A matrix of distances, such as returned by returnDistanceMatrix(data,distanceType) beta: float A variable for mapping distance to similarity. Returns ------- S: a matrix of floats A matrix of similarity values. according to S = np.exp(-beta * D / D.std()) """ S = np.exp(-beta*D/D.std()) return S