# Source code for kstar.plot

import numpy as np
import pandas as pd
from enum import Enum


import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.colors import LinearSegmentedColormap, Normalize
import matplotlib.cm as cm
from scipy.cluster.hierarchy import dendrogram, linkage

import seaborn as sns

class OrientationError(Exception):
    """
    Raised when a plotting method receives an orientation it does not support.

    Parameters
    ----------
    message : str, optional
        text placed in front of the list of valid orientations
    valid_orientations : list of str, optional
        orientations accepted by the caller; they are joined with ', ' and
        appended to message
    """
    def __init__(self, message = "Orientation Invalid. Valid Orientations are : ", valid_orientations = ['left', 'right', 'top', 'bottom']):
        allowed = ', '.join(valid_orientations)
        self.message = message + allowed

    def __str__(self):
        # the full text is assembled once, at construction time
        return self.message
        
        

class DotPlot:
    """
    The DotPlot class is used for plotting dotplots, with the option to add clustering and context plots.
    The size of the dots based on the values dataframe, where the size of the dot is the area of the value * dotsize
           
    Parameters
    ----------
    values: pandas DataFrame instance
        values to plot 
    fpr : pandas DataFrame instance 
        false positive rates associated with values being plotted
    alpha: float, optional
        fpr value that defines the significance cutoff to use when plotting
        default : 0.05
    inclusive_alpha: boolean
        whether to include the alpha (significance <= alpha), or not (significance < alpha).
        default: True
    binary_sig: boolean, optional
        indicates whether to plot fpr with binary significance or as a change color hue
        default : True
    dotsize : float, optional
        multiplier to use for scaling size of dots
    colormap : dict, optional
        maps color values to actual color to use in plotting
        default : {0: '#6b838f', 1: '#FF3300'}
    labelmap : dict, optional
        maps color values to the labels shown in the color legend; the default is to indicate the FPR cutoff in the legend
        default : None
    facecolor : color, optional
        Background color of dotplot
        default : 'white'
    legend_title : str, optional
        Legend Title for dot sizes, default is 'p-value'
    size_number : int, optional 
        Number of dots to attempt to generate for dot size legend
    size_color : color, optional
        Size Legend Color to use 
    color_title : str, optional
        Legend Title for the Color Legend
    markersize : int, optional
        Size of dots for Color Legend
    legend_distance : int, optional
        relative distance to place legends 
    figsize : tuple, optional 
        size of dotplot figure
    title : str, optional
        Title of dotplot
    xlabel : bool, optional
        Show xlabel on graph if True
    ylabel : bool, optional
        Show ylabel on graph if True
    x_label_dict: dict, optional
        Mapping dictionary of labels as they appear in values dataframe (keys) to how they should appear on plot (values)
    kinase_dict: dict, optional
        Mapping dictionary of kinase names as they appear in values dataframe (keys) to how they should appear on plot (values)
    
    Attributes
    ----------
    values: pandas dataframe
        a copy of the original values dataframe
    fpr: pandas dataframe
        a copy of the original fpr dataframe
    alpha: float
        cutoff used for significance, default 0.05
    inclusive_alpha: boolean
        whether to include the alpha (significance <= alpha), or not (significance < alpha)
    significance: pandas dataframe
        indicates whether a particular kinases activity is significant, where fpr <= alpha is significant, otherwise it is insignificant
    colors: pandas dataframe
        dataframe indicating the color to use when plotting: either a copy of the fpr or significance dataframe
    binary_sig: boolean
        indicates whether coloring will be done based on binary significance or fpr values. Default True
    labelmap: dict
        indicates how to label each significance color
    figsize: tuple
        size of the outputted figure, which is overridden if axes is provided for dotplot
    title: string
        title of the dotplot
    xlabel: boolean
        indicates whether to plot x-axis labels
    ylabel: boolean
        indicates whether to plot y-axis labels
    colormap: dict
        colors to be used when plotting
    facecolor: string
        background color of dotplot
    """
    
    
    def __init__(self, values, fpr, alpha = 0.05, inclusive_alpha = True,
                 binary_sig = True, dotsize = 5, 
                 colormap={0: '#6b838f', 1: '#FF3300'}, facecolor = 'white',
                 labelmap = None,
                 legend_title = 'p-value', size_number = 5, size_color = 'gray', 
                 color_title = 'Significant', markersize = 10, 
                 legend_distance = 1.0, figsize = (20,4), title = None,
                 xlabel = True, ylabel = True, x_label_dict = None, kinase_dict = None):


        self.values = values.copy()
        self.fpr = fpr.copy()
        #make sure that fpr dataframe has the same index as values dataframe. If not, reindex
        self.fpr = self.fpr.loc[self.values.index,self.values.columns]
        self.alpha = alpha
        #create binary dataframe that indicates significance based on provided fpr cutoff.
        if inclusive_alpha:
            self.significance = (self.fpr <= alpha) * 1
        else:
            self.significance = (self.fpr < alpha) * 1
        #Assign either fpr or significance to colors dataframe based on 
        self.binary_sig = binary_sig
        if binary_sig:
            self.colors = self.significance
            if labelmap is None:
                if inclusive_alpha:
                    self.labelmap = {0: 'FPR > %0.2f'%(alpha), 1:'FPR <= %0.2f'%(alpha)}
                else: 
                    self.labelmap = {0: 'FPR >= %0.2f'%(alpha), 1:'FPR < %0.2f'%(alpha)}

        else:
            self.colors = self.fpr
      
        
        self.figsize =  figsize
        self.title = title
        self.xlabel = xlabel
        self.ylabel= ylabel

        self.colormap = colormap
        
        self.facecolor = facecolor

        self.dotsize = dotsize
        
        self.legend_title = legend_title
        self.size_number = size_number
        self.size_color = size_color
        self.markersize = markersize

        self.color_title = color_title
        self.legend_distance = legend_distance

        self.multiplier = 10
        self.offset = 5

        self.columns = self.set_column_labels(values, x_label_dict)
        self.index = self.set_index_labels(values, kinase_dict)
    
    def set_values(self, values):
        """
        Replace the dataframe that determines the dot sizes.

        The object is stored as given (no copy is made) and the cached
        column_labels / index_labels are not recomputed here.

        Parameters
        ----------
        values : pandas DataFrame instance
            new values to plot
        """
        self.values = values

    def set_colors(self, colors):
        """
        Replace the dataframe that determines the dot colors.

        The object is stored as given (no copy is made); no check is done
        here that it matches the shape of the values dataframe.

        Parameters
        ----------
        colors : pandas DataFrame instance
            new color values to use when plotting
        """
        self.colors = colors

    def set_column_labels(self, values, x_label_dict):
        self.column_labels = list(self.values.columns)

        if x_label_dict is None: #just strip the data: string
            self.x_label_dict = {}
            
            #build an x_label_dict 
            for col in self.column_labels:
                self.x_label_dict[col] = col.replace('data:','')
            self.column_labels = [x.replace('data:','') for x in self.column_labels]

        else:
            #check that the label dictionary keys matches the columns
            labels = x_label_dict.keys()
            if set(labels) != set(self.column_labels):
                raise ValueError("The x_label_dict must have the same elements as the value columns")
            else:
                label_arr = []
                for col in self.column_labels:
                    label_arr.append(x_label_dict[col])
            self.column_labels = label_arr
            self.x_label_dict = x_label_dict
            
    def set_index_labels(self, values, kinase_dict):
        self.index_labels = list(self.values.index)
        if kinase_dict is None:
            self.kinase_dict = kinase_dict
        elif isinstance(kinase_dict, dict):
            #if custom dictionary is provided, make sure the appropriate elements are found inside it (needs to be at least
            names = kinase_dict.keys()
            if not set(self.index_labels).issubset(set(names)):
                raise ValueError("The kinase_dict must contain at least all the kinases found in values")
            else:
                label_arr = []
                for index in self.index_labels:
                    label_arr.append(kinase_dict[index])
            
                self.index_labels = label_arr
                self.kinase_dict = kinase_dict
        else:
            raise TypeError("If wanting to do a custom naming system, a custom dictionary must be provided in the 'kinase_dict' parameter")
        

[docs]    def dotplot(self, ax = None, orientation = 'left', size_legend = True, color_legend = True, max_size = None):
        """
        Generates the dotplot plot, where size is determined by values dataframe and color is determined by significant dataframe
        
        Parameters
        -----------
        ax : matplotlib Axes instance, optional
            axes dotplot will be plotted on. If None then new plot generated
        """
        valid_orientations = ['left', 'right']
        if orientation not in valid_orientations:
            raise OrientationError(valid_orientations = valid_orientations)
            
        if not ax:
            fig, ax = plt.subplots(figsize=self.figsize)
        ax.set_facecolor(self.facecolor)
        ax.set_title(self.title)
        
        # Transform Data
        columns = list(self.values.columns)
        self.values['row_index'] = np.arange(len(self.values)) * self.multiplier + self.offset
        self.colors['row_index'] = np.arange(len(self.colors)) * self.multiplier + self.offset
    
        melt = self.values.melt(id_vars = 'row_index')
        self.values.drop(columns = ['row_index'], inplace = True)
        melt['var'] = melt.apply(lambda row : columns.index(row[1]) * self.multiplier + self.offset, axis = 1)
        
        melt_color = self.colors.melt(id_vars = 'row_index')
        melt_color['var'] = melt_color.apply(lambda row : columns.index(row[1]) * self.multiplier + self.offset, axis = 1)
        self.colors.drop(columns = ['row_index'], inplace = True)

        # Plot Data
        x = melt['var']
        y = melt['row_index'][::-1]    #needs to be done in reverse order to maintain order in the dataframe
        
        
        s = melt.value * self.dotsize
        
        #check to see if more than 2 values are given (fprs). Otherwise get color based on binary significance
        if self.binary_sig:
            #get color for each datapoint based on significance
            melt_color['color'] = [self.colormap.get(l,'black') for l in melt_color.value]
        else:
            cmap = LinearSegmentedColormap.from_list("sig_cmap", list(zip([0,1], [self.colormap[0], self.colormap[1]])))
            norm = Normalize(vmin=0, vmax=2, clip=True)
            mapper = cm.ScalarMappable(norm=norm, cmap=cmap)
            #replace 0 with 0.01 to avoid log10 errors, transform the fprs with a log transform
            melt_color.replace(0, 0.01, inplace=True)
            melt_color.value = -np.log10(melt_color.value)
            #get color for each datapoint based on fpr value
            melt_color['color'] = [mapper.to_rgba(l) for l in melt_color.value]

            
        c = melt_color['color']
        scatter = ax.scatter(x, y, c=c, s=s)
        
        # Add Color Legend
        if color_legend:
            if self.binary_sig:
                #create the legend
                color_legend = []
                for color_key in self.colormap.keys():
                    color_legend.append(
                        Line2D([0], [0], marker='o', color='w', label=self.labelmap[color_key],
                                markerfacecolor= self.colormap[color_key], markersize=self.markersize),
                    )     
                legend1 = ax.legend(handles=color_legend, loc=f'upper {orientation}', bbox_to_anchor=(self.legend_distance,1), title = self.color_title)  
                ax.add_artist(legend1) 
            else:
                #choose which values to show in the legend
                legend_vals = [1, 0.5, 0.05, 0.01]
                legend_color = [mapper.to_rgba(-np.log10(val)) for val in legend_vals]
                #create the legend 
                color_legend = []
                for i in range(len(legend_vals)):
                    color_legend.append(Line2D([0], [0], marker='o', color='w', label=str(legend_vals[i]),
                                markerfacecolor= legend_color[i], markersize=self.markersize))
                legend1 = ax.legend(handles=color_legend, loc=f'upper {orientation}', bbox_to_anchor=(self.legend_distance,1), title = 'FPR')  
                ax.add_artist(legend1) 


        # Add Size Legend
        if size_legend:
            #check to see if max pval parameter was given: if so, use to create custom legend
            if max_size is not None:
                s_label = np.arange(max_size/self.size_number,max_size+1,max_size/self.size_number).astype(int)
                dsize = [s*self.dotsize for s in s_label]
                legend_elements = []
                for element, s in zip(s_label, dsize):
                    legend_elements.append(Line2D([0],[0], marker='o', color = 'w', markersize = s**0.5, markerfacecolor = self.size_color, label = element))
                legend2 = ax.legend(handles = legend_elements, loc = f'lower {orientation}', title = self.legend_title, bbox_to_anchor=(self.legend_distance,0))        
            else:
                kw = dict(prop="sizes", num=self.size_number, color=self.size_color, func=lambda s: s/self.dotsize) 
                legend2 = ax.legend(*scatter.legend_elements(**kw),
                        loc=f'lower {orientation}', title=self.legend_title, bbox_to_anchor=(self.legend_distance,0)) 
        
        # Add Additional Plotting Information
        ax.tick_params(axis = 'x', rotation = 90)
        ax.yaxis.set_ticks(np.arange(len(self.values)) * self.multiplier + self.offset)
        ax.xaxis.set_ticks(np.arange(len(columns)) * self.multiplier + self.offset)
        
        # set column labels in case values has changed
        self.set_column_labels(self.values, self.x_label_dict)
        ax.set_xticklabels(self.column_labels)
        ax.set_yticklabels(self.index_labels[::-1])
        #adjust x and y scale so that data is always equally spaced
        ax.set_ylim([0,len(self.values)*self.multiplier])
        ax.set_xlim([0,len(columns)*self.multiplier])
        
        if not self.xlabel:
            ax.axes.xaxis.set_visible(False)
        if not self.ylabel:
            ax.axes.yaxis.set_visible(False)
        return ax 
    
[docs]    def cluster(self, ax, method='single', metric='euclidean', orientation = 'top', color_threshold = -np.inf):
        """
        Performs hierarchical clustering on data and plots result to provided Axes. 
        result and significant dataframes are ordered according to clustering
        
        Parameters
        ---------
        ax : matplotlib Axes instance
            Axes to plot dendogram to
        
        method : str, optional
            The linkage algorithm to use.
        metric : str or function, optional
            The distance metric to use in the case that y is a collection of observation vectors; 
            ignored otherwise. See the pdist function for a list of valid distance metrics. A custom distance function can also be used.
        
        orientation : str, optional
            The direction to plot the dendrogram, which can be any of the following strings:
            'top': Plots the root at the top, and plot descendent links going downwards. (default).
            'bottom': Plots the root at the bottom, and plot descendent links going upwards.
            'left': Plots the root at the left, and plot descendent links going right.
            'right': Plots the root at the right, and plot descendent links going left.
        """
        if orientation in ['left', 'right']:
            row_linkage = linkage(self.values, method = method, metric = metric)
            den_row = dendrogram(row_linkage, 
                        ax = ax, 
                        orientation = orientation, 
                        labels = list(self.values.index), 
                        color_threshold = color_threshold, 
                        above_threshold_color = 'black', 
                        no_labels = True, 
                        show_leaf_counts = False) 
            self.values = self.values.iloc[den_row['leaves']].copy()
            self.colors = self.colors.iloc[den_row['leaves']].copy()
            self.set_index_labels(self.values, self.kinase_dict)

        
        elif orientation in ['top', 'bottom']:
            col_linkage = linkage(self.values.T, method=method, metric = metric)
            den_col = dendrogram(col_linkage, 
                            ax = ax, 
                            orientation = orientation, 
                            labels = list(self.values.columns), 
                            color_threshold = color_threshold, 
                            above_threshold_color = 'black', 
                            no_labels = True, 
                            show_leaf_counts = False)
            self.values = self.values.iloc[:, den_col['leaves']].copy()
            self.colors = self.colors.iloc[:,den_col['leaves']].copy()
            self.set_column_labels(self.values, self.x_label_dict)

        else:
            raise OrientationError()
            
        ax.tick_params(axis='both', which='both', length=0)
        
[docs]    def drop_kinases_with_no_significance(self):
        """
        Drop kinases from the values dataframe (inplace) when plotting if they are never observed as significant
        
        """

        kinase_list = self.significance[self.significance.sum(axis=1) ==0].index.values
        #check to make sure kinase_list only contains kinases currently in values dataframe
        kinase_list = [kin for kin in kinase_list if kin in self.index_labels]
        #remove kinases
        self.drop_kinases(kinase_list)


[docs]    def drop_kinases(self, kinase_list):
        """
        Given a list of kinases, drop these from the dot.values dataframe in all future plotting of this object. Removal 
        is in place

        Parameters
        ----------
        kinase_list: list
            list of kinase names to remove

        """ 
        #check to make sure kinase_list only contains kinases currently in values dataframe
        kinase_list = [kin for kin in kinase_list if kin in self.index_labels]
        
        self.values.drop(index=kinase_list, inplace=True)
        self.colors.drop(index = kinase_list, inplace=True)

        #update index_labels property as well
        for kin in kinase_list:
            if self.kinase_dict is None:
                self.index_labels.remove(kin)
            else:
                self.index_labels.remove(self.kinase_dict[kin])
        
        
[docs]    def context(self, ax, info, id_column, context_columns, dotsize = 200, markersize = 20, orientation = 'left', color_palette='colorblind', margin = 0.2, make_legend = True):
        """
        Context plot is generated and returned. The context plot contains the categorical data used for describing the data.
        
        Parameters
        ----------
        ax : maptlotlib axis
            where to map subtype information to
        info : pandas df
            Dataframe where context information is pulled from
        id_column: str
            Column used to map the subtype information to
        context_columns : list
            list of columns to pull context informaiton from
        dotsize : int, optional
            size of context dots
        markersize: int, optional
            size of legend markers
        orientation : str, optional
            orientation to plot context plots to - determines where legends are placed
            options : left, right, top, bottom
        color_palette : str, optional
            seaborn color palette to use  
        margin: float, optional
            margin  
        make_legend : bool, optional
            whether to create legend for context colors
        """
        
        orientation_values = {
            'left' : -1,
            'right' : 1,
            'bottom' : -1,
            'top' : 1
        }
        
        if orientation in [ 'left', 'right']:
            index = list(self.values.index)
        elif orientation in ['top', 'bottom']:
            index = list(self.values.columns)
        else: 
            raise OrientationError
        
        #record the number of different context types to include
        num_context = len(context_columns)
        melted = info[[id_column] + context_columns].melt(id_vars=id_column)
        #weird issue with melt function here, where for one datset it provides the context column names in 0 column rather than 'variable'. Rename for now.
        if 0 in melted.columns:
            melted.rename(columns = {0: 'variable'}, inplace = True)
        melted['var'] = melted.apply(lambda row : index.index(row[0]) * self.multiplier + self.offset, axis = 1)
        color_labels = melted['value'].unique()
        rgb_values = sns.color_palette(color_palette, len(color_labels))
        color_map = dict(zip(color_labels, rgb_values))
        
        if orientation in ['left', 'right']:
            ax.scatter(x = melted['variable'], y = melted['var'],c = melted['value'].map(color_map), s = dotsize)
            ax.tick_params(axis = 'x', rotation = 90)
            ax.axes.get_yaxis().set_visible(False)
            ax.margins(margin, 0.05)
        elif orientation in ['top', 'bottom']:
            ax.scatter(x = melted['var'], y = melted['variable'], c = melted['value'].map(color_map), s = dotsize)
            ax.axes.get_xaxis().set_visible(False)
            ax.margins(0.05, margin)
            ax.set_ylim([-0.5,num_context+0.5-1])
        
           
        total = len(melted['value'].unique()) + len(info.columns)-1
        running_total = 0
        
        # Add legends
        if make_legend:
            for col in context_columns:
                ids = info[col].unique()
                sig_legend = []
                for label in ids:
                    color = color_map[label]
                    sig_legend.append(
                        Line2D([0], [0], marker='o', color='w', label=label, markerfacecolor=color,markersize=markersize))
                    if orientation in ['left', 'right']:
                        leg = ax.legend(
                            handles=sig_legend, 
                            bbox_to_anchor=(orientation_values[orientation],1-running_total/total), 
                            title=col)
                    elif orientation in ['top', 'bottom']:
                        leg = ax.legend(
                            handles=sig_legend, 
                            bbox_to_anchor=(running_total/total, orientation_values[orientation]),
                            loc='lower left',
                            title=col)
                        
                ax.add_artist(leg)
    
                running_total += len(ids) + 1