Source code for openensembles.finishing

"""
OpenEnsembles is a resource for performing and analyzing ensemble clustering

Copyright (C) 2017 Naegle Lab

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
										   
"""
import numpy as np
import pandas as pd
import pprint as pp
import copy
import matplotlib.pyplot as plt
import sklearn.cluster
import sys
import os
import networkx as nx
from collections import defaultdict

class mixture_model:
    """
    Implementation of the mixture model for ensemble clustering described in
    Topchy, Jain, and Punch, "A mixture model for clustering ensembles",
    Proc. SIAM Int. Conf. Data Mining (2004).
    Written by Pedro da Silva Tavares and adapted by Kristen M. Naegle

    Parameters
    ----------
    parg: list of lists
        Solutions of assignments of objects to clusters across an ensemble
    N: int
        Number of data points to cluster
    nEnsCluster: int
        Number of clusters to create in the mixture model
        Default is 2
    iterations: int
        Number of expectation maximization iterations
        Default is 10

    Attributes
    ----------
    labels: list of ints
        Final mixture model partition of objects

    See Also
    --------
    openensembles.cluster.mixture_model

    References
    ----------
    Topchy, Jain, and Punch, "A mixture model for clustering ensembles",
    Proc. SIAM Int. Conf. Data Mining (2004)
    """
    def __init__(self, parg, N, nEnsCluster=2, iterations=10):
        self.parg = parg                # list of lists of solutions
        self.N = N                      # number of data points
        self.nEnsCluster = nEnsCluster  # number of clusters to make from the ensemble
        self.iterations = iterations
        self.y = self.gatherPartitions()
        self.K = self.genKj()
        self.alpha, self.v, self.ExpZ = self.initParameters()
        self.labels = []
        self.piFinishing = {}

    def gatherPartitions(self):
        '''
        Returns the y DataFrame of shape (N objects x H partitions), where
        entry (i, j) is the label that object i received in the j-th
        ensemble solution.

        parg: list of H labeling solutions
        N: number of objects
        '''
        H = len(self.parg)
        listParts = np.concatenate(self.parg).reshape(H, self.N)

        y = [listParts[:, i] for i in range(self.N)]
        y = pd.DataFrame(y, columns=np.arange(H))
        y.index.name = 'objs'
        y.columns.name = 'partition'
        return y

    def genKj(self):
        '''
        Generates the K(j) H-array that contains the tuples of unique
        clusters of each j-th partition, e.g. K = [(X, Y), (A, B)].
        Entries labeled 'NaN' (missing assignments) are excluded.
        '''
        K = []
        for i in range(self.y.shape[1]):
            uniq = np.unique(self.y.iloc[:, i].values)
            if 'NaN' in uniq:
                # drop the missing-value marker but keep the unique labels
                K.append(tuple(x for x in uniq if x != 'NaN'))
            else:
                K.append(tuple(uniq))
        return K

    def initParameters(self):
        '''
        The function initializes the parameters of the mixture model.
        '''
        def initAlpha():
            # mixing weights, uniform across the ensemble clusters
            return np.ones(self.nEnsCluster) / self.nEnsCluster

        def initV():
            # per-partition label probabilities, random and normalized to sum to 1
            v = [[] for j in range(self.y.shape[1])]
            for j in range(self.y.shape[1]):
                for m in range(self.nEnsCluster):
                    aux = abs(np.random.randn(len(self.K[j])))
                    v[j].append(aux / sum(aux))
            return v

        def initExpZ():
            # expected cluster memberships, N x M
            return np.zeros((self.y.shape[0], self.nEnsCluster))

        alpha = initAlpha()
        v = initV()
        ExpZ = initExpZ()
        return alpha, v, ExpZ

    def expectation(self):
        '''
        Computes the expectation ExpZ[i][m], the posterior probability that
        object i was drawn from mixture component m, given the current
        parameters (the formula is written out after this class).

        Shapes: y(N,H) Kj(H) alpha(M) v(H,M,K(j)) ExpZ(N,M)
        '''
        def sigma(a, b):
            return 1 if a == b else 0

        M = self.ExpZ.shape[1]
        nElem = self.y.shape[0]

        for m in range(M):
            for i in range(nElem):
                # numerator: alpha_m times the product of component-m label
                # probabilities across all H partitions
                prod1 = 1
                for j in range(self.y.shape[1]):
                    ix1 = 0
                    for k in self.K[j]:
                        prod1 *= (self.v[j][m][ix1] ** sigma(self.y.iloc[i][j], k))
                        ix1 += 1
                num = self.alpha[m] * prod1

                # denominator: the same quantity summed over all components
                den = 0
                for n in range(M):
                    prod2 = 1
                    for j2 in range(self.y.shape[1]):
                        ix2 = 0
                        for k in self.K[j2]:
                            prod2 *= (self.v[j2][n][ix2] ** sigma(self.y.iloc[i][j2], k))
                            ix2 += 1
                    den += self.alpha[n] * prod2

                self.ExpZ[i][m] = float(num) / float(den)
        return self.ExpZ

    def maximization(self):
        '''
        Updates the parameters alpha and v given the expectations ExpZ
        computed in the expectation step (the update formulas are written
        out after this class).

        Shapes: y(N,H) Kj(H) alpha(M) v(H,M,K(j)) ExpZ(N,M)
        '''
        def vecSigma(vec, k):
            '''
            Compares each element of vec to k, returning a vector with 1
            where the element equals k and 0 otherwise.
            '''
            return np.asarray([1 if i == k else 0 for i in vec])

        def updateAlpha():
            for m in range(self.alpha.shape[0]):
                self.alpha[m] = float(sum(self.ExpZ[:, m])) / float(sum(sum(self.ExpZ)))
            return self.alpha

        def updateV():
            for j in range(self.y.shape[1]):
                for m in range(self.alpha.shape[0]):
                    ix = 0
                    for k in self.K[j]:
                        num = sum(vecSigma(self.y.iloc[:, j], k) * np.array(self.ExpZ[:, m]))
                        den = 0
                        for kx in self.K[j]:
                            den += sum(vecSigma(self.y.iloc[:, j], kx) * np.asarray(self.ExpZ[:, m]))
                        self.v[j][m][ix] = float(num) / float(den)
                        ix += 1
            return self.v

        self.alpha = updateAlpha()
        self.v = updateV()
        return self.alpha, self.v

    def emProcess(self):
        '''
        Runs the EM loop for self.iterations rounds and derives the final
        hard partition from the resulting expectations.
        '''
        def piConsensus():
            '''
            Outputs the final ensemble solution based on the ExpZ values:
            each object is assigned to the component with the largest
            expected membership.
            '''
            maxExpZValues = {}
            piFinishing = {}
            labels = []
            for i in range(self.ExpZ.shape[0]):
                maxExpZValues[i] = max(self.ExpZ[i, :])
                piFinishing[i] = [j + 1 for j in range(self.ExpZ.shape[1])
                                  if maxExpZValues[i] == self.ExpZ[i, j]]
                # ties in ExpZ[i,:] are broken by taking the first maximizing
                # component, so exactly one label is emitted per object
                labels.append(piFinishing[i][0])
            return piFinishing, labels

        for i in range(self.iterations):
            self.ExpZ = self.expectation()
            self.alpha, self.v = self.maximization()

        piFinishing, labels = piConsensus()
        self.piFinishing = piFinishing
        self.labels = np.asarray(labels)
        return self.labels
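
# The expectation and maximization steps above implement the multinomial
# mixture updates of Topchy et al. (2004). In the shape notation used in the
# docstrings, with sigma(y_ij, k) = 1 when object i received label k in
# partition j and 0 otherwise, the quantities the code computes are:
#
#   E-step:   ExpZ[i][m] = alpha[m] * prod_j prod_k v[j][m][k] ** sigma(y_ij, k)
#                          / sum_n ( alpha[n] * prod_j prod_k v[j][n][k] ** sigma(y_ij, k) )
#
#   M-step:   alpha[m]   = sum_i ExpZ[i][m] / sum_{i,n} ExpZ[i][n]
#             v[j][m][k] = sum_i sigma(y_ij, k) * ExpZ[i][m]
#                          / sum_{k'} sum_i sigma(y_ij, k') * ExpZ[i][m]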
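
# A minimal usage sketch (not part of the original module): two toy labeling
# solutions over six objects are reduced to a two-cluster consensus. Labels
# may be any hashable values; the string 'NaN' marks a missing assignment.
#
#     parg = [[0, 0, 0, 1, 1, 1],    # solution 1
#             [1, 1, 0, 0, 2, 2]]    # solution 2
#     mm = mixture_model(parg, N=6, nEnsCluster=2, iterations=10)
#     labels = mm.emProcess()        # final cluster assignments, 1-based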

class co_occurrence_linkage:
    """
    Returns a final solution for the ensemble by agglomeratively clustering
    the co-occurrence matrix according to the linkage type passed in
    (default is average linkage).

    Parameters
    ----------
    co_occ_object: openensembles.coMat object
        The co-occurrence object to operate on
    threshold: float
        The threshold to cut the linkage at to create hard partitions
    linkage: string
        Linkage type. See `scipy.cluster.hierarchy <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage>`_

    Returns
    -------
    labels: list of ints
        The final solution

    See Also
    --------
    openensembles.cluster.finish_co_occ_linkage()
    """
    def __init__(self, co_occ_object, threshold, linkage='average'):
        self.coMat = co_occ_object
        self.N = len(self.coMat.co_matrix)
        self.labels = np.empty(self.N)
        self.K = 0  # number of clusters made by the cut, to be replaced
        self.linkage = linkage
        self.threshold = threshold

    def finish(self):
        """
        Called on a co_occurrence_linkage object to agglomeratively cluster
        the co-occurrence matrix (self.coMat.co_matrix) according to
        self.linkage and cut the resulting dendrogram at self.threshold
        (both set at initialization of the object). A standalone sketch of
        this step appears after the majority_vote class below.
        """
        # first get the linkage, then cut it
        lnk = self.coMat.link(linkage=self.linkage)
        labels = self.coMat.cut(lnk, self.threshold)
        self.K = len(np.unique(labels))
        self.labels = labels
        return self.labels


class majority_vote:
    """
    Based on Ana Fred's 2001 paper, this algorithm assigns objects to the
    same class if they co-cluster a majority of the time. It greedily joins
    clusters when at least one pair of items from two different clusters
    co-clusters a majority of the time. Outliers will get their own cluster.
    The original algorithm fixed the majority threshold at 0.5, but here you
    can override that default.

    Parameters
    ----------
    co_occ_matrix: pandas DataFrame
        The co-occurrence matrix to operate on
    threshold: float
        The fraction of times a pair of objects must cluster together to be
        considered a majority. For example, threshold=0.5 joins pairs that
        co-cluster more than 50% of the time.

    Attributes
    ----------
    K: int
        Number of clusters created
    labels: list of ints
        Solution of cluster assignments

    References
    ----------
    Fred, Ana. "Finding Consistent Clusters in Data Partitions." In Multiple
    Classifier Systems, edited by Josef Kittler and Fabio Roli, LNCS 2096,
    309-18. Springer, 2001.

    See Also
    --------
    openensembles.cluster.finish_majority_vote()
    """
    def __init__(self, co_occ_matrix, threshold=0.5):
        self.co_matrix = co_occ_matrix
        self.K = 0  # number of clusters made
        self.N = len(co_occ_matrix)
        self.labels = np.empty(self.N)
        self.threshold = threshold

    def finish(self):
        """ Finish the ensemble by majority vote """
        labels = np.zeros(self.N).astype(int)
        currCluster = 1
        x = self.co_matrix.values  # DataFrame.as_matrix() was removed from pandas; .values is equivalent
        for i in range(0, self.N):
            for j in range(i + 1, self.N):
                if x[i, j] > self.threshold:
                    # the objects belong together; if both already belong to
                    # existing clusters, merge all members of the two clusters
                    if labels[i] and labels[j]:
                        cluster_num = min(labels[i], labels[j])
                        cluster_toChange = max(labels[i], labels[j])
                        indices = [k for k, val in enumerate(labels) if val == cluster_toChange]
                        labels[indices] = cluster_num
                    elif not labels[i] and not labels[j]:
                        # neither is assigned yet: start a new cluster
                        labels[i] = currCluster
                        labels[j] = currCluster
                        currCluster += 1
                    else:
                        # one of them is in a cluster and one is not; assigning
                        # both the max works either way and saves an if
                        cluster_num = max(labels[i], labels[j])
                        labels[i] = cluster_num
                        labels[j] = cluster_num
                else:
                    # don't join them; give each an assignment the first time
                    # it is traversed
                    if not labels[i]:
                        labels[i] = currCluster
                        currCluster += 1
                    if not labels[j]:
                        labels[j] = currCluster
                        currCluster += 1

        # cleanup pass: if any cluster numbers jump, move them down so the
        # final labels are consecutive
        clusters = np.sort(np.unique(labels))
        for ind in range(0, len(clusters) - 1):
            # operating on ind+1
            if clusters[ind + 1] != clusters[ind] + 1:
                cluster_num = clusters[ind] + 1
                indices = [k for k, val in enumerate(labels) if val == clusters[ind + 1]]
                labels[indices] = cluster_num
                clusters[ind + 1] = cluster_num

        self.K = len(np.unique(labels))
        self.labels = labels
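
# A standalone sketch of the co_occurrence_linkage step above (not part of
# the original module), assuming the usual convention that coMat.link() and
# coMat.cut() wrap scipy's linkage and fcluster with distance defined as one
# minus the co-occurrence fraction:
#
#     import scipy.cluster.hierarchy as sch
#     from scipy.spatial.distance import squareform
#
#     dist = 1 - co_matrix                       # co_matrix: NxN array in [0, 1]
#     np.fill_diagonal(dist, 0)
#     lnk = sch.linkage(squareform(dist), method='average')
#     labels = sch.fcluster(lnk, t=0.5, criterion='distance')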
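
# A minimal usage sketch for majority_vote (not part of the original module),
# on a toy 4x4 co-occurrence matrix whose entries are the fraction of
# ensemble solutions in which each pair of objects co-clustered:
#
#     co = pd.DataFrame([[1.0, 0.9, 0.1, 0.0],
#                        [0.9, 1.0, 0.0, 0.1],
#                        [0.1, 0.0, 1.0, 0.8],
#                        [0.0, 0.1, 0.8, 1.0]])
#     mv = majority_vote(co, threshold=0.5)
#     mv.finish()
#     mv.labels                                  # array([1, 1, 2, 2])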

class graph_closure:
    """
    Returns a final solution for the ensemble by treating the co-occurrence
    matrix as a weighted graph and finding the percolated k-cliques within
    it. The co-occurrence matrix is binarized at the given threshold, all
    cliques of at least clique_size nodes are enumerated, and cliques that
    share clique_size - 1 nodes are merged into the same component. Note
    that all cliques are stored in memory, which for large graphs can
    consume large amounts of memory.

    Parameters
    ----------
    co_occ_matrix: matrix
        The co-occurrence matrix to operate on
    threshold: float
        Entries at or above this value become edges in the binary graph
    clique_size: int
        Minimum clique size k for percolation. Default is 3.

    References
    ----------
    * Based on the method outlined in Palla et al., Nature 435, 814-818 (2005)
    * Based on the code for `Percolation from ConradLee <https://gist.github.com/conradlee/1341985>`_

    See Also
    --------
    openensembles.cluster.finish_graph_closure()
    """
    def __init__(self, co_occ_matrix, threshold, clique_size=3):
        self.co_matrix = co_occ_matrix
        self.K = 0  # number of clusters made
        self.N = len(co_occ_matrix)
        self.labels = np.empty(self.N)
        self.threshold = threshold
        self.coMat_binary = np.array(self.co_matrix >= threshold).astype(int)
        self.clique_size = clique_size

    def finish(self):
        """
        Finishes the ensemble by taking the binary adjacency matrix defined
        at initialization (the co-occurrence matrix thresholded at
        self.threshold) and percolating its cliques.
        """
        # From ConradLee on GitHub
        def get_percolated_cliques(G, k):
            perc_graph = nx.Graph()
            cliques = [frozenset(c) for c in nx.find_cliques(G) if len(c) >= k]
            perc_graph.add_nodes_from(cliques)

            # First index which nodes are in which cliques
            membership_dict = defaultdict(list)
            for clique in cliques:
                for node in clique:
                    membership_dict[node].append(clique)

            # For each clique, see which adjacent cliques percolate
            for clique in cliques:
                for adj_clique in get_adjacent_cliques(clique, membership_dict):
                    if len(clique.intersection(adj_clique)) >= (k - 1):
                        perc_graph.add_edge(clique, adj_clique)

            # Connected components of the clique graph with percolation edges
            # are the percolated cliques
            for component in nx.connected_components(perc_graph):
                yield frozenset.union(*component)

        def get_adjacent_cliques(clique, membership_dict):
            adjacent_cliques = set()
            for n in clique:
                for adj_clique in membership_dict[n]:
                    if clique != adj_clique:
                        adjacent_cliques.add(adj_clique)
            return adjacent_cliques

        # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is equivalent
        G = nx.from_numpy_array(self.coMat_binary)
        y = get_percolated_cliques(G, self.clique_size)
        z = list(y)

        clusterNum = 0
        while z:
            members = list(z.pop())
            self.labels[members] = int(clusterNum)
            clusterNum += 1

        self.K = clusterNum
        self.labels = self.labels.astype(int)
        return self.labels
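
# A minimal usage sketch for graph_closure (not part of the original module):
# two groups of objects that frequently co-cluster become two 3-cliques in the
# thresholded graph, and clique percolation returns them as two components.
#
#     co = np.zeros((6, 6))
#     co[np.ix_([0, 1, 2], [0, 1, 2])] = 0.9     # objects 0-2 co-cluster often
#     co[np.ix_([3, 4, 5], [3, 4, 5])] = 0.8     # objects 3-5 co-cluster often
#     gc = graph_closure(co, threshold=0.5, clique_size=3)
#     gc.finish()                                # e.g. array([0, 0, 0, 1, 1, 1])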