import numpy as np
import pandas as pd
from enum import Enum
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.colors import LinearSegmentedColormap, Normalize
import matplotlib.cm as cm
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
class OrientationError(Exception):
    """
    Raised when a plotting method is given an orientation it does not support.

    Parameters
    ----------
    message : str, optional
        Prefix of the error message; the valid orientations are appended to it.
    valid_orientations : iterable of str, optional
        Orientations accepted by the method that raised the error.
        default : ('left', 'right', 'top', 'bottom')
    """

    def __init__(self, message="Orientation Invalid. Valid Orientations are : ",
                 valid_orientations=('left', 'right', 'top', 'bottom')):
        self.message = message + ', '.join(valid_orientations)
        # Hand the final message to Exception so that `args` and `repr` carry it too.
        super().__init__(self.message)

    def __str__(self):
        return self.message
class DotPlot:
    """
    The DotPlot class is used for plotting dotplots, with the option to add clustering and context plots.
    The size of each dot is based on the values dataframe: the dot area is value * dotsize.

    Parameters
    ----------
    values : pandas DataFrame instance
        values to plot
    fpr : pandas DataFrame instance
        false positive rates associated with the values being plotted. It is reordered to the
        index and columns of `values`.
    alpha : float, optional
        fpr value that defines the significance cutoff to use when plotting
        default : 0.05
    inclusive_alpha : boolean, optional
        whether to include the alpha (significance <= alpha), or not (significance < alpha).
        default : True
    binary_sig : boolean, optional
        indicates whether to color dots by binary significance or by a hue derived from the fpr
        default : True
    dotsize : float, optional
        multiplier to use for scaling size of dots
    colormap : dict, optional
        maps color values to the actual color to use in plotting
        default : {0: '#6b838f', 1: '#FF3300'}
    facecolor : color, optional
        Background color of dotplot
        default : 'white'
    labelmap : dict, optional
        maps color values to their legend labels. If None, labels describing the FPR cutoff are generated.
        default : None
    legend_title : str, optional
        Legend title for dot sizes, default is 'p-value'
    size_number : int, optional
        Number of dots to attempt to generate for dot size legend
    size_color : color, optional
        Size legend color to use
    color_title : str, optional
        Legend title for the color legend
    markersize : int, optional
        Size of dots for color legend
    legend_distance : float, optional
        relative distance to place legends
    figsize : tuple, optional
        size of dotplot figure
    title : str, optional
        Title of dotplot
    xlabel : bool, optional
        Show x-axis labels on graph if True
    ylabel : bool, optional
        Show y-axis labels on graph if True
    x_label_dict : dict, optional
        Mapping dictionary of labels as they appear in values dataframe (keys) to how they should appear on plot (values)
    kinase_dict : dict, optional
        Mapping dictionary of kinase names as they appear in values dataframe (keys) to how they should appear on plot (values)

    Attributes
    ----------
    values : pandas dataframe
        a copy of the original values dataframe
    fpr : pandas dataframe
        a copy of the original fpr dataframe, ordered like values
    alpha : float
        cutoff used for significance, default 0.05
    inclusive_alpha : boolean
        whether to include the alpha (significance <= alpha), or not (significance < alpha)
    significance : pandas dataframe
        1 where the fpr passes the alpha cutoff, otherwise 0
    colors : pandas dataframe
        dataframe indicating the color to use when plotting: the significance dataframe when
        binary_sig is True, otherwise the fpr dataframe
    binary_sig : boolean
        indicates whether coloring will be done based on binary significance or fpr values. Default True
    labelmap : dict
        indicates how to label each significance color
    figsize : tuple
        size of the outputted figure, which is not used if axes is provided for dotplot
    title : string
        title of the dotplot
    xlabel : boolean
        indicates whether to plot x-axis labels
    ylabel : boolean
        indicates whether to plot y-axis labels
    colormap : dict
        colors to be used when plotting
    facecolor : string
        background color of dotplot
    multiplier, offset : int
        spacing between neighbouring dots and distance of the first dot from the axis origin (10 and 5)
    column_labels, index_labels : list
        display labels currently used for the x and y ticks
    columns, index : list
        display labels as computed when the object was constructed
    """

    def __init__(self, values, fpr, alpha=0.05, inclusive_alpha=True,
                 binary_sig=True, dotsize=5,
                 colormap=None, facecolor='white',
                 labelmap=None,
                 legend_title='p-value', size_number=5, size_color='gray',
                 color_title='Significant', markersize=10,
                 legend_distance=1.0, figsize=(20, 4), title=None,
                 xlabel=True, ylabel=True, x_label_dict=None, kinase_dict=None):
        self.values = values.copy()
        # Reorder the fpr dataframe so its index and columns match the values dataframe.
        self.fpr = fpr.copy().loc[self.values.index, self.values.columns]

        self.alpha = alpha
        self.inclusive_alpha = inclusive_alpha
        # Binary dataframe (1/0) indicating significance based on the provided fpr cutoff.
        if inclusive_alpha:
            self.significance = (self.fpr <= alpha) * 1
        else:
            self.significance = (self.fpr < alpha) * 1

        # Colors come from the binary significance or from the raw fpr values.
        self.binary_sig = binary_sig
        self.colors = self.significance if binary_sig else self.fpr

        # Keep a caller-supplied labelmap; otherwise describe the FPR cutoff in the legend.
        if labelmap is None:
            if inclusive_alpha:
                labelmap = {0: 'FPR > %0.2f' % (alpha), 1: 'FPR <= %0.2f' % (alpha)}
            else:
                labelmap = {0: 'FPR >= %0.2f' % (alpha), 1: 'FPR < %0.2f' % (alpha)}
        self.labelmap = labelmap

        # Build the default per instance so instances never share one mutable dict.
        if colormap is None:
            colormap = {0: '#6b838f', 1: '#FF3300'}
        self.colormap = colormap

        self.figsize = figsize
        self.title = title
        self.xlabel = xlabel
        self.ylabel = ylabel
        self.facecolor = facecolor
        self.dotsize = dotsize
        self.legend_title = legend_title
        self.size_number = size_number
        self.size_color = size_color
        self.markersize = markersize
        self.color_title = color_title
        self.legend_distance = legend_distance

        # Dot centres sit at offset, offset + multiplier, offset + 2 * multiplier, ...
        self.multiplier = 10
        self.offset = 5

        self.columns = self.set_column_labels(values, x_label_dict)
        self.index = self.set_index_labels(values, kinase_dict)

    def set_values(self, values):
        """Replace the dataframe that determines dot sizes (stored as given, not copied)."""
        self.values = values

    def set_colors(self, colors):
        """Replace the dataframe that determines dot colors (stored as given, not copied)."""
        self.colors = colors

    def set_column_labels(self, values, x_label_dict):
        """
        Compute the x tick labels for the current columns of `self.values`.

        Parameters
        ----------
        values : pandas dataframe
            unused; labels are always derived from `self.values`
        x_label_dict : dict or None
            maps column names to display labels. If None, a mapping is built by
            removing 'data:' from each column name.

        Returns
        -------
        list
            the display labels, also stored in `self.column_labels`

        Raises
        ------
        ValueError
            if x_label_dict is given but lacks a label for one of the columns
        """
        columns = list(self.values.columns)
        if x_label_dict is None:
            # No custom names: just strip the 'data:' string from each column name.
            x_label_dict = {col: str(col).replace('data:', '') for col in columns}
        elif any(col not in x_label_dict for col in columns):
            raise ValueError("The x_label_dict must contain a label for every column in values")
        self.column_labels = [x_label_dict[col] for col in columns]
        self.x_label_dict = x_label_dict
        return self.column_labels

    def set_index_labels(self, values, kinase_dict):
        """
        Compute the y tick labels for the current index of `self.values`.

        Parameters
        ----------
        values : pandas dataframe
            unused; labels are always derived from `self.values`
        kinase_dict : dict or None
            maps index entries to display labels. If None, the index entries are used as is.

        Returns
        -------
        list
            the display labels, also stored in `self.index_labels`

        Raises
        ------
        ValueError
            if kinase_dict does not contain every entry of the values index
        TypeError
            if kinase_dict is neither None nor a dict
        """
        kinases = list(self.values.index)
        if kinase_dict is None:
            labels = kinases
        elif isinstance(kinase_dict, dict):
            # Extra entries in the dictionary are fine; missing ones are not.
            if not set(kinases).issubset(kinase_dict):
                raise ValueError("The kinase_dict must contain at least all the kinases found in values")
            labels = [kinase_dict[kinase] for kinase in kinases]
        else:
            raise TypeError("If wanting to do a custom naming system, a custom dictionary must be provided in the 'kinase_dict' parameter")
        self.index_labels = labels
        self.kinase_dict = kinase_dict
        return labels

    def dotplot(self, ax=None, orientation='left', size_legend=True, color_legend=True, max_size=None):
        """
        Generates the dotplot, where size is determined by the values dataframe and color is determined by the colors dataframe

        Parameters
        -----------
        ax : matplotlib Axes instance, optional
            axes dotplot will be plotted on. If None then a new figure of size `figsize` is generated
        orientation : str, optional
            side used in the legend `loc` strings ('upper <orientation>' / 'lower <orientation>');
            must be 'left' or 'right'
        size_legend : bool, optional
            whether to add the dot size legend
        color_legend : bool, optional
            whether to add the color legend
        max_size : number, optional
            if given, the size legend shows `size_number` evenly spaced values up to max_size
            instead of values derived from the plotted dots

        Returns
        -------
        ax : matplotlib Axes instance
            the axes containing the dotplot

        Raises
        ------
        OrientationError
            if orientation is not 'left' or 'right'
        """
        valid_orientations = ['left', 'right']
        if orientation not in valid_orientations:
            raise OrientationError(valid_orientations=valid_orientations)

        if ax is None:
            _, ax = plt.subplots(figsize=self.figsize)
        ax.set_facecolor(self.facecolor)
        ax.set_title(self.title)

        # Transform Data
        # Coordinates are computed from positions, in column-major order (all rows of the
        # first column, then the second, ...), without modifying the stored dataframes.
        n_rows, n_cols = self.values.shape
        row_positions = np.arange(n_rows) * self.multiplier + self.offset
        col_positions = np.arange(n_cols) * self.multiplier + self.offset
        x = np.repeat(col_positions, n_rows)
        # Rows are placed in reverse so the first row of the dataframe is drawn at the top.
        y = np.tile(row_positions[::-1], n_cols)
        s = self.values.to_numpy(dtype=float).ravel(order='F') * self.dotsize
        color_values = self.colors.to_numpy().ravel(order='F')

        if self.binary_sig:
            # get color for each datapoint based on significance; unknown values are drawn black
            c = [self.colormap.get(value, 'black') for value in color_values.tolist()]
        else:
            cmap = LinearSegmentedColormap.from_list("sig_cmap", list(zip([0, 1], [self.colormap[0], self.colormap[1]])))
            norm = Normalize(vmin=0, vmax=2, clip=True)
            mapper = cm.ScalarMappable(norm=norm, cmap=cmap)
            # replace 0 with 0.01 to avoid log10 errors, then transform the fprs with a log transform
            fpr_values = color_values.astype(float)
            fpr_values[fpr_values == 0] = 0.01
            # get color for each datapoint based on fpr value
            c = [mapper.to_rgba(value) for value in -np.log10(fpr_values)]

        # Plot Data
        scatter = ax.scatter(x, y, c=c, s=s)

        # Add Color Legend
        if color_legend:
            if self.binary_sig:
                color_handles = [
                    Line2D([0], [0], marker='o', color='w',
                           label=self.labelmap.get(color_key, str(color_key)),
                           markerfacecolor=color, markersize=self.markersize)
                    for color_key, color in self.colormap.items()
                ]
                color_legend_title = self.color_title
            else:
                # choose which values to show in the legend
                legend_vals = [1, 0.5, 0.05, 0.01]
                color_handles = [
                    Line2D([0], [0], marker='o', color='w', label=str(val),
                           markerfacecolor=mapper.to_rgba(-np.log10(val)), markersize=self.markersize)
                    for val in legend_vals
                ]
                color_legend_title = 'FPR'
            legend1 = ax.legend(handles=color_handles, loc=f'upper {orientation}',
                                bbox_to_anchor=(self.legend_distance, 1), title=color_legend_title)
            # Registered as an artist so it stays visible when the size legend is created below.
            ax.add_artist(legend1)

        # Add Size Legend
        if size_legend:
            if max_size is not None:
                # custom legend: size_number evenly spaced values up to max_size
                step = max_size / self.size_number
                size_labels = np.arange(step, max_size + 1, step).astype(int)
                legend_elements = [
                    Line2D([0], [0], marker='o', color='w',
                           markersize=(label * self.dotsize) ** 0.5,
                           markerfacecolor=self.size_color, label=label)
                    for label in size_labels
                ]
                ax.legend(handles=legend_elements, loc=f'lower {orientation}',
                          title=self.legend_title, bbox_to_anchor=(self.legend_distance, 0))
            else:
                # func converts the plotted dot area back into the value it represents
                kw = dict(prop="sizes", num=self.size_number, color=self.size_color,
                          func=lambda size: size / self.dotsize)
                ax.legend(*scatter.legend_elements(**kw), loc=f'lower {orientation}',
                          title=self.legend_title, bbox_to_anchor=(self.legend_distance, 0))

        # Add Additional Plotting Information
        ax.tick_params(axis='x', rotation=90)
        ax.yaxis.set_ticks(row_positions)
        ax.xaxis.set_ticks(col_positions)

        # refresh both label lists in case values has changed
        self.set_column_labels(self.values, self.x_label_dict)
        self.set_index_labels(self.values, self.kinase_dict)
        ax.set_xticklabels(self.column_labels)
        ax.set_yticklabels(self.index_labels[::-1])

        # adjust x and y scale so that data is always equally spaced
        ax.set_ylim([0, n_rows * self.multiplier])
        ax.set_xlim([0, n_cols * self.multiplier])

        if not self.xlabel:
            ax.axes.xaxis.set_visible(False)
        if not self.ylabel:
            ax.axes.yaxis.set_visible(False)

        return ax

    def cluster(self, ax, method='single', metric='euclidean', orientation='top', color_threshold=-np.inf):
        """
        Performs hierarchical clustering on data and plots result to provided Axes.
        values and colors dataframes are reordered according to the clustering

        Parameters
        ---------
        ax : matplotlib Axes instance
            Axes to plot dendrogram to
        method : str, optional
            The linkage algorithm to use.
        metric : str or function, optional
            The distance metric to use in the case that y is a collection of observation vectors;
            ignored otherwise. See the pdist function for a list of valid distance metrics. A custom distance function can also be used.
        orientation : str, optional
            The direction to plot the dendrogram, which can be any of the following strings:
            'top': Plots the root at the top, and plot descendent links going downwards. (default).
            'bottom': Plots the root at the bottom, and plot descendent links going upwards.
            'left': Plots the root at the left, and plot descendent links going right.
            'right': Plots the root at the right, and plot descendent links going left.
            'left' and 'right' cluster the rows; 'top' and 'bottom' cluster the columns.
        color_threshold : float, optional
            passed to scipy's dendrogram; links above it are drawn black

        Raises
        ------
        OrientationError
            if orientation is not one of the four strings above
        """
        cluster_rows = orientation in ('left', 'right')
        if not cluster_rows and orientation not in ('top', 'bottom'):
            raise OrientationError()

        data = self.values if cluster_rows else self.values.T
        den = dendrogram(linkage(data, method=method, metric=metric),
                         ax=ax,
                         orientation=orientation,
                         labels=list(data.index),
                         color_threshold=color_threshold,
                         above_threshold_color='black',
                         no_labels=True,
                         show_leaf_counts=False)
        leaves = den['leaves']

        # colors is reordered by position, so it is assumed to be ordered like values.
        if cluster_rows:
            # NOTE(review): dotplot draws the first row at the top, while the dendrogram
            # appears to place the first leaf at the lowest coordinate - confirm that
            # rows line up when both are shown side by side.
            self.values = self.values.iloc[leaves].copy()
            self.colors = self.colors.iloc[leaves].copy()
            self.set_index_labels(self.values, self.kinase_dict)
        else:
            self.values = self.values.iloc[:, leaves].copy()
            self.colors = self.colors.iloc[:, leaves].copy()
            self.set_column_labels(self.values, self.x_label_dict)

        ax.tick_params(axis='both', which='both', length=0)

    def drop_kinases_with_no_significance(self):
        """
        Drop kinases from the values dataframe (inplace) when plotting if they are never observed as significant
        """
        never_significant = self.significance.index[self.significance.sum(axis=1) == 0]
        # drop_kinases ignores any of these that are no longer in the values dataframe
        self.drop_kinases(list(never_significant))

    def drop_kinases(self, kinase_list):
        """
        Given a list of kinases, drop these from the dot.values dataframe in all future plotting of this object. Removal
        is in place

        Parameters
        ----------
        kinase_list : list
            list of kinase names to remove, as they appear in the values index. Names that are
            not in the values dataframe are ignored.
        """
        # Compare against the dataframe index, not the display labels, which differ
        # from the index whenever a kinase_dict renames kinases.
        present = set(self.values.index)
        to_drop = list(dict.fromkeys(kin for kin in kinase_list if kin in present))
        if not to_drop:
            return
        self.values.drop(index=to_drop, inplace=True)
        self.colors.drop(index=to_drop, inplace=True, errors='ignore')
        # update index_labels property as well
        self.set_index_labels(self.values, self.kinase_dict)

    def context(self, ax, info, id_column, context_columns, dotsize=200, markersize=20, orientation='left', color_palette='colorblind', margin=0.2, make_legend=True):
        """
        Draws the context plot on the given axes. The context plot contains the categorical data used for describing the data.

        Parameters
        ----------
        ax : matplotlib axis
            where to map subtype information to
        info : pandas df
            Dataframe where context information is pulled from
        id_column : str
            Column of info whose entries match the values index (orientation 'left'/'right')
            or the values columns (orientation 'top'/'bottom')
        context_columns : list
            list of columns to pull context information from
        dotsize : int, optional
            size of context dots
        markersize : int, optional
            size of legend markers
        orientation : str, optional
            orientation to plot context plots to - determines where legends are placed
            options : left, right, top, bottom
        color_palette : str, optional
            seaborn color palette to use
        margin : float, optional
            axes margin along the context-column axis
        make_legend : bool, optional
            whether to create legend for context colors

        Raises
        ------
        OrientationError
            if orientation is not one of the four options
        ValueError
            if id_column contains an entry that is not in the values index/columns
        """
        orientation_values = {
            'left': -1,
            'right': 1,
            'bottom': -1,
            'top': 1
        }
        vertical = orientation in ('left', 'right')
        if vertical:
            index = list(self.values.index)
        elif orientation in ('top', 'bottom'):
            index = list(self.values.columns)
        else:
            raise OrientationError()

        # record the number of different context types to include
        num_context = len(context_columns)

        # Naming the melted columns explicitly keeps them independent of info.columns.name.
        melted = info[[id_column] + list(context_columns)].melt(
            id_vars=id_column, var_name='variable', value_name='value')

        # Same coordinate scheme as the dotplot; a repeated label keeps its first position.
        positions = {}
        for pos, label in enumerate(index):
            positions.setdefault(label, pos * self.multiplier + self.offset)
        unknown = [entry for entry in melted[id_column].unique() if entry not in positions]
        if unknown:
            raise ValueError(f"{unknown} is not in list")
        melted['var'] = melted[id_column].map(positions)

        color_labels = melted['value'].unique()
        rgb_values = sns.color_palette(color_palette, len(color_labels))
        color_map = dict(zip(color_labels, rgb_values))
        point_colors = melted['value'].map(color_map)

        if vertical:
            ax.scatter(x=melted['variable'], y=melted['var'], c=point_colors, s=dotsize)
            ax.tick_params(axis='x', rotation=90)
            ax.axes.get_yaxis().set_visible(False)
            ax.margins(margin, 0.05)
        else:
            ax.scatter(x=melted['var'], y=melted['variable'], c=point_colors, s=dotsize)
            ax.axes.get_xaxis().set_visible(False)
            ax.margins(0.05, margin)
            ax.set_ylim([-0.5, num_context + 0.5 - 1])

        # Add legends
        if make_legend:
            # Each legend is offset by the share of entries taken up by the legends before it.
            total = len(color_labels) + len(info.columns) - 1
            running_total = 0
            for col in context_columns:
                ids = info[col].unique()
                handles = [
                    Line2D([0], [0], marker='o', color='w', label=label,
                           markerfacecolor=color_map[label], markersize=markersize)
                    for label in ids
                ]
                if vertical:
                    leg = ax.legend(
                        handles=handles,
                        bbox_to_anchor=(orientation_values[orientation], 1 - running_total / total),
                        title=col)
                else:
                    leg = ax.legend(
                        handles=handles,
                        bbox_to_anchor=(running_total / total, orientation_values[orientation]),
                        loc='lower left',
                        title=col)
                ax.add_artist(leg)
                running_total += len(ids) + 1