"""
OpenEnsembles is a resource for performing and analyzing ensemble clustering
Copyright (C) 2017 Naegle Lab
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import numpy as np
import pandas as pd
import pprint as pp
import copy
import matplotlib.pyplot as plt
import sklearn.cluster
import sys
import os
import networkx as nx
from collections import defaultdict
class mixture_model:
"""
    Implementation of the mixture model for ensemble clustering described in:
    Topchy, Jain, and Punch, "A mixture model for clustering ensembles," Proc. SIAM Int. Conf. Data Mining (2004)
Written by Pedro da Silva Tavares and adapted by Kristen M. Naegle
Parameters
----------
parg: list of lists
Solutions of assignments of objects to clusters across an ensemble
N: int
Number of data points to cluster
nEnsCluster: int
Number of clusters to create in mixture model
Default is 2
    iterations: int
Number of expectation maximization iterations
Default is 10
Attributes
----------
labels: list of ints
Final Mixture Model partitions of objects
See Also
--------
openensembles.cluster.mixture_model
    References
    ----------
    Topchy, Jain, and Punch, "A mixture model for clustering ensembles," Proc. SIAM Int. Conf. Data Mining (2004)
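    Examples
    --------
    A minimal usage sketch; ``solutions`` is a hypothetical ensemble of two
    labeling solutions over four objects:
    >>> solutions = [[0, 0, 1, 1], [0, 0, 0, 1]]
    >>> mm = mixture_model(solutions, 4, nEnsCluster=2, iterations=10)
    >>> labels = mm.emProcess()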
"""
def __init__(self, parg, N, nEnsCluster=2, iterations=10):
self.parg = parg #list of lists of solutions
self.N = N# number of data points
self.nEnsCluster = nEnsCluster #number of clusters to make from ensemble
self.iterations = iterations
        self.y = self.gatherPartitions()
self.K = self.genKj()
self.alpha, self.v, self.ExpZ = self.initParameters()
self.labels = []
self.piFinishing = {}
    def gatherPartitions(self):
        '''
        Returns the y DataFrame (objects x partitions), assembled from
        self.parg (the list of H labeling solutions) and self.N (the number of objects).
        '''
H = len(self.parg)
listParts = np.concatenate(self.parg).reshape(H,self.N)
        y = [listParts[:, i] for i in range(self.N)]
        y = pd.DataFrame(y, columns=np.arange(H))
y.index.name = 'objs'
y.columns.name = 'partition'
return y
    def genKj(self):
'''
Generates the K(j) H-array that contains the tuples of unique
clusters of each j-th partition, eg: K = [(X,Y), (A,B)]
'''
        K = []
        for i in range(self.y.shape[1]):
            vals = np.unique(self.y.iloc[:, i].values)
            if 'NaN' in vals:
                K.append(tuple(x for x in vals if x != 'NaN'))
            else:
                K.append(tuple(vals))
return K
    def initParameters(self):
'''
The function initializes the parameters of the mixture model.
'''
def initAlpha():
return np.ones(self.nEnsCluster) / self.nEnsCluster
def initV():
v = []
[v.append([]) for j in range(self.y.shape[1])]
for j in range(self.y.shape[1]):
for m in range(self.nEnsCluster):
aux = abs(np.random.randn(len(self.K[j])))
v[j].append( aux / sum(aux) )
return v
        def initExpZ():
            return np.zeros((self.y.shape[0], self.nEnsCluster))
alpha = initAlpha()
v = initV()
ExpZ = initExpZ()
return alpha, v, ExpZ
    def expectation(self):
'''
Compute the Expectation (ExpZ) according to parameters.
Obs: y(N,H) Kj(H) alpha(M) v(H,M,K(j)) ExpZ(N,M)
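        The quantity computed here for each object i and component m is
            ExpZ[i][m] = alpha[m] * prod_j v[j][m](y[i][j]) / sum_n ( alpha[n] * prod_j v[j][n](y[i][j]) )
        where v[j][m](k) is the probability that partition j emits label k under mixture component m.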
'''
def sigma(a,b):
return 1 if a == b else 0
M = self.ExpZ.shape[1]
nElem = self.y.shape[0]
        for m in range(M):
            for i in range(nElem):
                prod1 = 1
                for j in range(self.y.shape[1]):
                    ix1 = 0
                    for k in self.K[j]:
                        prod1 *= ( self.v[j][m][ix1] ** sigma(self.y.iloc[i][j],k) )
                        ix1 += 1
                #weight the full product over all partitions by alpha[m], mirroring the denominator below
                num = self.alpha[m] * prod1
den = 0
for n in range(M):
prod2 = 1
for j2 in range(self.y.shape[1]):
ix2 = 0
for k in self.K[j2]:
prod2 *= ( self.v[j2][n][ix2] ** sigma(self.y.iloc[i][j2],k) )
ix2 += 1
den += self.alpha[n] * prod2
self.ExpZ[i][m] = float(num) / float(den)
return self.ExpZ
    def maximization(self):
'''
Update the parameters taking into account the ExpZ computed in the
Expectation (ExpZ) step.
Obs: y(N,H) Kj(H) alpha(M) v(H,M,K(j)) ExpZ(N,M)
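        The updates implemented below are
            alpha[m] = sum_i ExpZ[i][m] / sum_{i,n} ExpZ[i][n]
            v[j][m](k) = sum_i sigma(y[i][j],k) * ExpZ[i][m] / sum_{k'} ( sum_i sigma(y[i][j],k') * ExpZ[i][m] )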
'''
        def vecSigma(vec, k):
            '''
            Compare each element of vec to k, returning a vector that is
            1 where vec[i] == k and 0 otherwise.
            '''
            return np.asarray([1 if i == k else 0 for i in vec])
def updateAlpha():
for m in range(self.alpha.shape[0]):
self.alpha[m] = float(sum(self.ExpZ[:,m])) / float(sum(sum(self.ExpZ)))
return self.alpha
def updateV():
for j in range(self.y.shape[1]):
for m in range(self.alpha.shape[0]):
ix = 0
for k in self.K[j]:
num = sum(vecSigma(self.y.iloc[:,j],k) * np.array(self.ExpZ[:,m]))
den = 0
for kx in self.K[j]:
den += sum(vecSigma(self.y.iloc[:,j],kx) * np.asarray(self.ExpZ[:,m]))
self.v[j][m][ix] = float(num) / float(den)
ix += 1
return self.v
self.alpha = updateAlpha()
self.v = updateV()
return self.alpha, self.v
def emProcess(self):
def piConsensus():
'''
The function outputs the final ensemble solution based on ExpZ values.
'''
maxExpZValues = {}
piFinishing = {}
labels = []
            for i in range(self.ExpZ.shape[0]):
                maxExpZValues[i] = max(self.ExpZ[i,:])
                piFinishing[i] = []
                for j in range(self.ExpZ.shape[1]):
                    if (maxExpZValues[i] == self.ExpZ[i,j]):
                        piFinishing[i].append(j + 1)
                #in the case of ties in ExpZ[i,:], the first component with the max value wins,
                #so labels gets exactly one entry per object
                labels.append(piFinishing[i][0])
return piFinishing, labels
        for _ in range(self.iterations):
            self.ExpZ = self.expectation()
            self.alpha, self.v = self.maximization()
piFinishing, labels = piConsensus()
self.piFinishing = piFinishing
self.labels = np.asarray(labels)
return self.labels
class co_occurrence_linkage:
"""
    Returns a final solution to the ensemble: the agglomerative clustering of the co-occurrence matrix,
    cut at the given threshold, according to the linkage type passed (default is 'average').
Parameters
----------
co_occ_object: openensembles.coMat object
The co-occurrence object to operate on
threshold: float
The threshold to cut the linkage at to create hard partitions
linkage: string
linkage type. See `scipy.cluster.hierarchy <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage>`_
Returns
-------
labels: list of ints
The final solution
See Also
--------
openensembles.cluster.finish_co_occ_linkage()
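    Examples
    --------
    A minimal sketch; ``co_occ_object`` stands in for an existing openensembles.coMat object:
    >>> c = co_occurrence_linkage(co_occ_object, threshold=0.5, linkage='average')
    >>> labels = c.finish()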
"""
def __init__(self, co_occ_object, threshold, linkage='average'):
self.coMat = co_occ_object
self.N = len(self.coMat.co_matrix)
self.labels = np.empty(self.N)
self.K= 0 #number of clusters made by the cut, to be replaced
self.linkage = linkage
self.threshold = threshold
def finish(self):
"""
This is the function that is called ona co_occurrence_linkage object that agglomeratively clusters the co_occurrence_matrix (self.coMat.co_matrix)
According to self.linkage (linkage parameter set in initialization of object) with clusters equal to self.K (also set in intialization)
"""
#first get linkage, then cut
lnk = self.coMat.link(linkage=self.linkage)
labels = self.coMat.cut(lnk, self.threshold)
self.K = len(np.unique(labels))
self.labels = labels
return self.labels
class majority_vote:
"""
    Based on Ana Fred's 2001 paper, this algorithm assigns objects to the same cluster if they co-cluster at least 50% of the time. It
    greedily joins clusters on the evidence that at least one pair of items from two different clusters co-clusters
    a majority of the time. Outliers will get their own cluster. The original algorithm chose 0.5 as the threshold for a
    majority vote, but here you can override the default parameter.
Parameters
----------
    co_occ_matrix: pandas DataFrame
        The co-occurrence matrix to operate on
    threshold: float
        The fraction of times a pair of objects must cluster together to be considered a majority.
        For example, threshold=0.5 means 50% of the time or more.
Attributes
----------
K: int
Number of clusters created
labels: list of ints
Solution of cluster assignments
References
----------
    Fred, Ana. “Finding Consistent Clusters in Data Partitions.” In Multiple Classifier Systems, edited by Josef Kittler and Fabio Roli, LNCS 2096, 309–18. Springer, 2001.
See Also
--------
openensembles.cluster.finish_majority_vote()
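    Examples
    --------
    A minimal sketch on a hand-built co-occurrence matrix (illustrative values only):
    >>> import pandas as pd
    >>> co = pd.DataFrame([[1.0, 0.9, 0.1],
    ...                    [0.9, 1.0, 0.2],
    ...                    [0.1, 0.2, 1.0]])
    >>> mv = majority_vote(co, threshold=0.5)
    >>> mv.finish()
    >>> mv.labels # array([1, 1, 2])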
"""
def __init__(self, co_occ_matrix, threshold=0.5):
self.co_matrix = co_occ_matrix
self.K= 0 #number of clusters made
self.N = len(co_occ_matrix)
self.labels = np.empty(self.N)
self.threshold=threshold
def finish(self):
"""
Finish the ensemble by majority vote
"""
labels = np.zeros(self.N).astype(int)
currCluster = 1
        x = self.co_matrix.values #DataFrame.as_matrix() was removed in pandas 1.0
for i in range(0, self.N):
for j in range(i+1, self.N):
if x[i,j] > self.threshold:
# the clusters should be set to the same value and if both belong to an existing cluster, all
# members of the clusters should be joined
if labels[i] and labels[j]:
cluster_num = min(labels[i] , labels[j])
cluster_toChange = max(labels[i] , labels[j])
indices = [k for k, x in enumerate(labels) if x == cluster_toChange]
labels[indices] = cluster_num
elif not labels[i] and not labels[j]: #a new cluster
labels[i] = currCluster
labels[j] = currCluster
currCluster += 1
                    else: #one is already in a cluster and one is not; assign both the existing number (a no-op for the one already labeled)
cluster_num = max(labels[i] , labels[j])
labels[i] = cluster_num
labels[j] = cluster_num
else: #else don't join them and give them an assignment the first time they are traversed
if not labels[i]:
labels[i] = currCluster
currCluster += 1
if not labels[j]:
labels[j] = currCluster
currCluster += 1
        #Cleanup stage: if any cluster numbers jump, move them down so the labels are contiguous
clusters = np.sort(np.unique(labels))
for ind in range(0,len(clusters)-1): #operating on ind+1
if clusters[ind+1] != clusters[ind]+1:
cluster_num = clusters[ind] + 1
#print("updating cluster num %d to %d")%(clusters[ind+1], cluster_num)
indices = [k for k, x in enumerate(labels) if x == clusters[ind+1]]
labels[indices] = cluster_num
clusters[ind+1] = cluster_num
self.labels = labels
class graph_closure:
    """
    Returns a final solution of the ensemble based on treating the co-occurrence matrix as a weighted graph whose
    solution is found by identifying the percolated k-cliques within the graph.
    Parameters
    ----------
    co_occ_matrix: pandas DataFrame
        The co-occurrence matrix to operate on
    threshold: float
        Value at which to binarize the co-occurrence matrix into an adjacency matrix
    clique_size: int
        The minimum clique size k used for percolation. Default is 3.
    Notes
    -----
    This algorithm first enumerates all cliques in the graph. These are stored in memory,
    which for large graphs can consume large amounts of memory.
References
----------
    * Based on the method outlined in Palla et al., Nature 435, 814-818 (2005)
* Based on Code for `Percolation From ConradLee <https://gist.github.com/conradlee/1341985>`_
See Also
--------
openensembles.cluster.finish_graph_closure()
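    Examples
    --------
    A minimal sketch, where ``co_matrix`` stands in for an existing co-occurrence matrix with values in [0, 1]:
    >>> gc = graph_closure(co_matrix, threshold=0.5, clique_size=3)
    >>> labels = gc.finish()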
"""
def __init__(self, co_occ_matrix, threshold, clique_size = 3):
self.co_matrix = co_occ_matrix
self.K= 0 #number of clusters made
self.N = len(co_occ_matrix)
        self.labels = np.full(self.N, -1) #-1 marks objects not assigned to any percolated clique
self.threshold = threshold
self.coMat_binary = np.array(self.co_matrix >= threshold).astype(int)
self.clique_size = clique_size
    def finish(self):
        """ Finishes the ensemble by taking the binary adjacency matrix, defined in initialization according to the threshold given,
        and percolating the cliques
"""
        # From ConradLee on GitHub
def get_percolated_cliques(G, k):
perc_graph = nx.Graph()
cliques = [frozenset(c) for c in nx.find_cliques(G) if len(c) >= k]
perc_graph.add_nodes_from(cliques)
# First index which nodes are in which cliques
membership_dict = defaultdict(list)
for clique in cliques:
for node in clique:
membership_dict[node].append(clique)
# For each clique, see which adjacent cliques percolate
for clique in cliques:
for adj_clique in get_adjacent_cliques(clique, membership_dict):
if len(clique.intersection(adj_clique)) >= (k - 1):
perc_graph.add_edge(clique, adj_clique)
# Connected components of clique graph with perc edges
# are the percolated cliques
for component in nx.connected_components(perc_graph):
yield(frozenset.union(*component))
def get_adjacent_cliques(clique, membership_dict):
adjacent_cliques = set()
for n in clique:
for adj_clique in membership_dict[n]:
if clique != adj_clique:
adjacent_cliques.add(adj_clique)
return adjacent_cliques
        G = nx.from_numpy_array(self.coMat_binary) #from_numpy_matrix was removed in networkx 3.0
y = get_percolated_cliques(G, self.clique_size)
z = list(y)
clusterNum = 0
while z:
l = list(z.pop())
self.labels[l] = int(clusterNum)
clusterNum+=1
self.labels = self.labels.astype(int)
return self.labels