"""
OpenEnsembles is a resource for performing and analyzing ensemble clustering
Copyright (C) 2017 Naegle Lab
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import matplotlib.pyplot as plt
import numpy as np
import pylab
import pandas as pd
import scipy.cluster.hierarchy as sch
from scipy.spatial import distance as ssd
class coMat:
    """
    A class that allows you to create and operate on a co-occurrence matrix

    Parameters
    ----------
    cObj: openensembles.cluster object
        The clustering object and all contained solutions of interest
    data_source_name: string
        The name of the data source of interest

    Attributes
    ----------
    cObj: openensembles.cluster object
        The clustering object passed at construction
    data_source_name: string
        The name of the data source passed at construction
    parg: list of arrays of ints
        One label vector per clustering solution found in cObj.labels
    data: array-like
        The data matrix cObj.dataObj.D[data_source_name]
    N: int
        Number of objects (rows of data)
    nEnsembles: int
        Number of clustering solutions
    co_matrix: pandas dataframe
        The co-occurrence matrix (square). An entry indicates the fraction of times any pair of
        objects co-clusters across the ensemble
    avg_dist: float
        The mean pairwise distance (1 - co-occurrence) over all distinct pairs of objects
        (self distances are not included)

    See Also
    --------
    openensembles.cluster.co_occurrence_matrix()
    """

    def __init__(self, cObj, data_source_name):
        self.cObj = cObj
        self.data_source_name = data_source_name
        # cObj.labels maps a solution name to its label vector; keep the vectors in iteration order
        self.parg = [cObj.labels[name] for name in cObj.labels]
        self.data = cObj.dataObj.D[data_source_name]
        self.N = self.data.shape[0]
        self.nEnsembles = len(self.parg)
        self.co_matrix = self.gather_partitions()
        # squareform() condenses the symmetric distance matrix to its upper triangle,
        # which is what drops the (zero) self distances from the mean
        self.avg_dist = np.mean(ssd.squareform(1 - self.co_matrix.values))

    def gather_partitions(self):
        """
        Gather partitions sums the number of times all pairs of objects fall within the same cluster
        across the ensemble and normalizes by the number of solutions.

        Returns
        -------
        co_matrix_df: pandas dataframe
            a dataframe of object names in column and header, wrapped around the co-occurrence matrix

        Raises
        ------
        ValueError:
            if a solution does not contain exactly one label per object in the data matrix
        """
        dim = self.N
        co_matrix = np.zeros(shape=(dim, dim))
        for solution in self.parg:
            co_bin = self.gather_single_partition(solution)
            if co_bin.shape != (dim, dim):
                raise ValueError(
                    "ERROR: a clustering solution has %d labels but the data matrix has %d objects"
                    % (co_bin.shape[0], dim))
            co_matrix += co_bin
        co_matrixF = co_matrix / self.nEnsembles
        # Index.get_values() was removed in pandas 1.0; .values is available in every version
        header = self.cObj.dataObj.df.index.values
        co_matrix_df = pd.DataFrame(index=header, data=co_matrixF, columns=header)
        return co_matrix_df

    def gather_single_partition(self, solution):
        """
        For an individual solution (set of labels), create a binary cooccurrence matrix that has an entry of 1 if
        both objects are in the same cluster and 0 if not.

        Parameters
        ----------
        solution: list or array of ints
            A single solution vector of clustering labels

        Returns
        -------
        co_matrix: numpy array of floats
            a square matrix the size of the length of solution with boolean values (0,1)
        """
        # Convert first: comparing a plain Python list to a scalar yields a single False,
        # which would silently produce an all-zero matrix.
        labels = np.asarray(solution).ravel()
        # Broadcasting compares every label against every other label in one step
        return (labels[:, None] == labels[None, :]).astype(float)

    def pairwise_list(self):
        """
        Reshapes the co-occurrence dataframe matrix into a list, so it can easily be ordered and explored

        Returns
        -------
        df: pandas dataframe
            one row per unordered pair of objects, indexed by 'object1; object2', with columns
            'site_1', 'site_2' and the co-occurrence column labeled 'pairwise'
        """
        co_df = self.co_matrix
        names = list(co_df)
        values = co_df.values
        # Upper triangle without the diagonal, in row-major order: (0,1), (0,2), ..., (1,2), ...
        first, second = np.triu_indices(len(names), k=1)
        site_1 = [names[i] for i in first]
        site_2 = [names[j] for j in second]
        # Build the frame in one shot; DataFrame.append was removed in pandas 2.0 and
        # appending row by row is quadratic in the number of pairs.
        df = pd.DataFrame(
            {'site_1': site_1, 'site_2': site_2, 'pairwise': values[first, second]},
            index=['%s; %s' % (a, b) for a, b in zip(site_1, site_2)],
            columns=['site_1', 'site_2', 'pairwise'])
        return df

    def link(self, linkage='average'):
        """
        Link a co-occurrence matrix. This is required so that co-occurrence is properly treated as a distance matrix
        during scipy.cluster.hierarchy.linkage

        Parameters
        ----------
        linkage: string
            type of linkage to use, see scipy.cluster.hierarchy.linkage for options

        Returns
        -------
        lnk: scipy.cluster.hierarchy.linkage object
        """
        # Co-occurrence is a similarity; 1 - co-occurrence is the matching distance (zero diagonal)
        dist = 1 - self.co_matrix.values
        # A condensed distance vector is passed, so no metric is needed: scipy only
        # applies a metric when it is handed raw observation vectors.
        lnk = sch.linkage(ssd.squareform(dist), method=linkage)
        return lnk

    def cut(self, lnk, threshold):
        """
        **A finishing technique**

        Given the calculation of a linkage (e.g. self.link(linkage='average')), cut the resulting linkage
        at the given threshold and return the labels from the resulting cut of the hierarchy.
        This is a type of finishing for deterministic clustering from an ensemble.

        Parameters
        ----------
        lnk: scipy.cluster.hierarchy.linkage object
            a linkage object, that can be generated by self.link()
        threshold: float
            the distance threshold to cut the groups at. See self.plot() to visually explore the cuts
            at different thresholds.

        Returns
        -------
        labels: vector of ints
            assignment of each object into a group
        """
        ind = sch.fcluster(lnk, threshold, 'distance')
        return ind

    def plot(self, threshold='avg', linkage='average', add_labels=True, **kwargs):
        """
        Plot the co_occurrence matrix with a dendrogram and heatmap

        By default add_labels=True, set to False to suppress labels in graph.
        By default the labels are the index list of the dataObj dataframe. Otherwise, you can pass in an
        alternate naming scheme as label_vec; its length must equal the number of objects.

        Parameters
        ----------
        threshold: float
            Use threshold to color the dendrogram
            This is useful for identifying visually how to call .cut()
            Default is the average distance (self.avg_dist), which is substituted whenever a string such
            as 'avg' is passed
        linkage: string
            Linkage type to use for dendrogram. Default is average
        add_labels: bool
            If you wish to shut off printing of labels pass False, else this will print labels according to
            the dataObj dataframe index (or label_vec, if given)

        Other Parameters
        ----------------
        label_vec: list
            If you want to add labels, but not the same as in the dataframe index, then pass those here

        Returns
        -------
        fig: matplotlib.pyplot figure
            The figure handle

        Raises
        ------
        ValueError:
            if label_vec in **kwargs is a different size than the number of objects

        Examples
        --------
        >>> coMat = c.co_occurrence_matrix()
        >>> coMat.plot(threshold=1, linkage='average', add_labels=False)
        """
        if isinstance(threshold, str):
            threshold = self.avg_dist
        if add_labels:
            if "label_vec" in kwargs:  # use this if you have different labels than in c.dataObj.df.index.values
                label_vec = kwargs['label_vec']
                if len(label_vec) != len(self.co_matrix):
                    raise ValueError("ERROR: the length of label vector does not equal the number of objects in the co_occurrence matrix")
            else:
                label_vec = self.cObj.dataObj.df.index.values.tolist()  # using parent just to get row names
        else:
            label_vec = []
        fig = plot_matrix_sorted(self.co_matrix, label_vec, threshold, self.link(linkage=linkage))
        return fig
def plot_matrix_sorted(matrix, label_vec, threshold, lnk1):
    """
    A heatmap plotting function, for both co-occurrence and mutual information

    The rows and columns of the matrix are reordered to follow the leaf order of the
    dendrogram drawn from lnk1, which is shown to the left of the heatmap.

    Parameters
    ----------
    matrix: pandas dataframe or np.array
        A square array of values to plot as a heatmap
    label_vec: list of strings
        A label_vec to label the row-wise objects. Empty if labels not requested
    threshold: float
        Threshold to use for coloring of dendrogram
    lnk1: linkage object
        A pre-calculated linkage object to use to sort.

    Returns
    -------
    fig: matplotlib.pyplot figure
        The figure handle
    """
    fig = pylab.figure(figsize=(10, 10))
    # Invisible full-figure panel; every visible axes is positioned relative to it
    panel3 = fig.add_axes([0, 0, 1, 1])
    panel3.axis('off')

    # len() rather than truthiness so array-like label vectors do not raise
    add_labels = label_vec is not None and len(label_vec) > 0

    # Add dendrogram
    if add_labels:
        ax1 = add_subplot_axes(panel3, [0.0, 0.3, 0.11, .6])
        Z_pp = sch.dendrogram(lnk1, orientation='left', color_threshold=threshold,
                              labels=label_vec, ax=ax1)
    else:
        ax1 = add_subplot_axes(panel3, [0.16, 0.3, 0.11, .6])
        Z_pp = sch.dendrogram(lnk1, orientation='left', color_threshold=threshold, ax=ax1)
        ax1.set_yticks([])
    idx_pp = Z_pp['leaves']

    ax1.invert_yaxis()  # must couple with matshow origin='upper'
    ax1.set_xticks([])
    for side in ['top', 'right', 'bottom', 'left']:
        ax1.spines[side].set_visible(False)

    # plot heatmap
    axmatrix = add_subplot_axes(panel3, [0.28, 0.3, 0.7, .6])
    # Dendrogram leaves are positions, not labels, so reorder positionally.
    # (.ix was removed in pandas 1.0 and was ambiguous for integer-labeled frames.)
    if isinstance(matrix, pd.DataFrame):
        hm = matrix.iloc[idx_pp, idx_pp]
    else:
        hm = np.asarray(matrix)[np.ix_(idx_pp, idx_pp)]
    im = axmatrix.matshow(hm, aspect='auto', origin='upper', cmap='afmhot')
    axmatrix.axis('off')

    # Plot colorbar indicating scale
    axcolor = add_subplot_axes(panel3, [0.28, 0.2, 0.7, .02])  # [xmin, ymin, dx, and dy]
    h = fig.colorbar(im, cax=axcolor, orientation='horizontal')
    h.ax.tick_params(labelsize=10)
    h.set_ticks([0.0, .25, .50, .75, 1])
    return fig
def add_subplot_axes(ax, rect, facecolor='w'):
    """
    A non-class function to handle subaxes: add a new axes placed relative to an existing one

    Parameters
    ----------
    ax: matplotlib axes
        The parent axes that rect is expressed relative to
    rect: sequence of four floats
        [xmin, ymin, dx, dy] of the new axes, as fractions of the parent axes
    facecolor: matplotlib color
        Background color of the new axes. Default is white

    Returns
    -------
    subax: matplotlib axes
        The newly created axes, added to the same figure as ax
    """
    # Use the figure that owns ax rather than whichever figure happens to be current
    fig = ax.figure
    parent_box = ax.get_position()
    # Lower-left corner: parent-axes fraction -> display coordinates -> figure fraction
    corner_display = ax.transAxes.transform(rect[0:2])
    corner_fig = fig.transFigure.inverted().transform(corner_display)
    x = corner_fig[0]
    y = corner_fig[1]
    # Scale the requested size by the parent's own extent
    width = parent_box.width * rect[2]
    height = parent_box.height * rect[3]
    subax = fig.add_axes([x, y, width, height], facecolor=facecolor)
    return subax