Source code for dansy.enrichment_plotting_helpers

import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import matplotlib.lines as lines
from dansy.enrichment_helpers import *


def get_max_info_enriched_ngrams(res_df, condition_labels=None, q=None, p=None):
    '''
    Returns the top enriched n-grams that pass a quantile and/or p-value cutoff.
    N-grams that show similar p-value trends are collapsed into a single representative
    n-gram when a shorter n-gram is contained in a longer one. Longer n-grams that have
    more significant p-values than their shorter counterparts are retained.

    Parameters:
    -----------
    - res_df: pandas DataFrame
        Dataframe of all the n-gram enrichment p-values for each condition
    - condition_labels: list
        (Optional) List of strings that are labels for the different conditions.
        If not provided, defaults to Up and Down.
    - q: float
        (Optional) Quantile cutoff for p-values to return n-grams. Default value is 0.05.
    - p: float
        (Optional) p-value cutoff for n-grams to return. If provided together with q,
        this sets the upper bound of p-values.

    Returns:
    --------
    - maxinfo_filt_res: pandas DataFrame
        A filtered version of res_df containing only the n-grams that pass the provided cutoffs
    '''
    # Checking the p-value and quantile cutoffs to determine a threshold.
    if p is None and q is None:
        q = 0.05
        p_thres = np.quantile(res_df['p'], q)
    elif p is not None and q is not None:
        x = np.quantile(res_df['p'], q)
        p_thres = np.min([p, x])
    elif p is not None:
        p_thres = p
    else:
        p_thres = np.quantile(res_df['p'], q)

    # Initial filtering based on only the p-value threshold
    filt_res_cands = res_df[res_df['p'] <= p_thres]['ngram']
    filt_res = res_df[res_df['ngram'].isin(filt_res_cands)].copy()

    # Creating the condition labels
    if condition_labels is not None:
        filt_res['variable'] = filt_res['variable'].map({'Up': condition_labels[0], 'Down': condition_labels[1]})

    # Sorting the n-grams by length (longest first) to start the collapsing step
    ngram_list = sorted(set(filt_res['ngram'].tolist()), key=lambda x: len(x.split('|')), reverse=True)

    # Collapsing the n-grams based on their p-values and whether they show similar trends
    ngrams_2_collapse = collapse_to_max_info(ngram_list, filt_res)
    ngrams_kept = set(filt_res['ngram'].tolist()).difference(ngrams_2_collapse)

    # Filtering the results dataframe to only the retained n-grams
    maxinfo_filt_res = filt_res[filt_res['ngram'].isin(ngrams_kept)].copy()
    return maxinfo_filt_res
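
# Illustrative sketch (not part of the package API): a toy results dataframe run through
# get_max_info_enriched_ngrams. The column names ('ngram', 'variable', 'p', 'FPR <= 0.05')
# follow the results dataframe used throughout this module; the n-gram names and values are invented.
def _example_get_max_info_usage():
    toy_res = pd.DataFrame({
        'ngram': ['EGF-like domain', 'EGF-like domain|EGF-like domain', 'Kinase domain'],
        'variable': ['Up', 'Up', 'Down'],
        'p': [0.001, 0.002, 0.2],
        'FPR <= 0.05': [True, True, False],
    })
    # Keep n-grams passing whichever cutoff is stricter (the 50th-percentile p-value or p <= 0.01),
    # then collapse the longer, less significant 'EGF-like domain|EGF-like domain' into its parent.
    return get_max_info_enriched_ngrams(toy_res, condition_labels=['Treated', 'Control'], q=0.5, p=0.01)
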
def plot_enriched_ngrams(res, dansyOI, condition_labels=None, q=0.05, p=None, show_FPR=True, **kwargs):
    '''
    Plots the top X percent (default 5) of n-grams enriched between two conditions.
    For clarity, n-grams that contain similar information are collapsed into the
    shorter n-gram (i.e. if EGF-like domain and EGF-like domain|EGF-like domain both
    have similar enrichment values, they are represented only by EGF-like domain).

    Parameters:
    -----------
    res: pandas DataFrame
        A dataframe containing all results, including both the individual statistical
        enrichment and the false positive rate for all n-grams. (Note: this will likely
        be removed once this is integrated into the actual module.)
    dansyOI: deDANSy object
        The deDANSy object that contains the n-grams of interest
    condition_labels: list
        (Optional) The labels for both conditions, provided in the order of up-regulated
        and down-regulated. (Note: this will be removed once this is integrated into the
        actual module.)
    q: float
        (Optional) The quantile cutoff of values to plot. Default is 0.05.
    p: float
        (Optional) The p-value threshold limiting which n-grams are plotted. Default is
        0.05, but when combined with the quantile (q) the effective threshold can be lower.
    show_FPR: bool
        Flag for whether to show the FPR portion of the legend.
    kwargs: optional keywords
        - 'palette', 'edgecolor', 'linewidth', 'sizes' adjust the seaborn scatterplot aesthetics.
        - 'loc', 'bbox_to_anchor', 'handletextpad' adjust the matplotlib legend.
        - 'ax' specifies a matplotlib Axes to plot onto.

    Returns:
    --------
    seaborn/matplotlib plot
    '''
    max_res = get_max_info_enriched_ngrams(res, condition_labels, q, p)
    ngram_plot_names = {node: dansyOI.return_legible_ngram(node) for node in max_res['ngram'].tolist()}
    max_res['ngram'] = max_res['ngram'].map(ngram_plot_names)

    # Default values for the seaborn scatterplot; overwrite them with any matching kwargs.
    sns_opts = {'palette': ['deepskyblue', 'silver'], 'edgecolor': 'k', 'linewidth': 0.5, 'sizes': (1, 40)}
    for opt in sns_opts:
        if opt in kwargs:
            sns_opts[opt] = kwargs[opt]
            del kwargs[opt]  # Removing to ensure that seaborn does not error out.

    # Keyword arguments associated with the legend
    legend_opts = {'loc': 'lower left', 'bbox_to_anchor': (1, 1), 'handletextpad': 0.1}
    for opt in legend_opts:
        if opt in kwargs:
            legend_opts[opt] = kwargs[opt]
            del kwargs[opt]

    sns.scatterplot(max_res, x='variable', y='ngram', size='-log10(p)',
                    hue='FPR <= 0.05', hue_order=[True, False], **sns_opts, **kwargs)

    # If an axes is provided, plot and adjust the legend on that specific axes
    if 'ax' in kwargs:
        handles, labels = kwargs['ax'].get_legend_handles_labels()
        new_handles, new_labels = clean_ngram_legend(handles, labels, show_FPR)
        l = kwargs['ax'].legend(new_handles, new_labels, edgecolor='k', **legend_opts)
    else:
        handles, labels = plt.gca().get_legend_handles_labels()
        new_handles, new_labels = clean_ngram_legend(handles, labels, show_FPR)
        l = plt.legend(new_handles, new_labels, edgecolor='k', **legend_opts)

    # Small aesthetic changes
    l.get_frame().set_linewidth(0.5)
    for h in l.legend_handles:
        if not isinstance(h, lines.Line2D):
            h.set_edgecolor('k')
            h.set_linewidth(.25)
    plt.xlabel(None)
    plt.ylabel(None)
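
# Hedged usage sketch: `enrichment_res` stands in for a results dataframe from
# gather_enrichment_results and `my_dansy` for an already-built deDANSy object (anything
# exposing return_legible_ngram); both names are placeholders, not package objects.
def _example_plot_enriched_ngrams(enrichment_res, my_dansy):
    fig, ax = plt.subplots(figsize=(3, 4))
    plot_enriched_ngrams(enrichment_res, my_dansy,
                         condition_labels=['Treated', 'Control'],
                         q=0.05,
                         palette=['tomato', 'silver'],  # forwarded to the seaborn scatterplot
                         ax=ax)                         # forwarded so the plot lands on this Axes
    fig.tight_layout()
    return fig
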
def plot_enriched_ngrams_presorted(res, x_order='variable', dansyOI=None, ngram_ticks=None, show_FPR=True, **kwargs):
    '''
    Plots a presorted n-gram enrichment dataframe using a provided n-gram order and a
    provided order for the conditions (i.e. the x-axis).

    Parameters:
    -----------
    res: pandas DataFrame
        A presorted dataframe containing all results, including both the individual
        statistical enrichment and the false positive rate for all n-grams.
    x_order: str
        The column name to use for plotting on the x-axis. Default is 'variable',
        assuming the dataframe came from a prior n-gram enrichment that was sorted
        by a different method.
    dansyOI: deDANSy object
        The deDANSy object that contains the n-grams of interest
    ngram_ticks: dict
        Key-value pairs of the n-grams and their order
    show_FPR: bool
        Flag for whether to show the FPR portion of the legend.
    kwargs: optional keywords
        - 'palette', 'edgecolor', 'linewidth', 'sizes' adjust the seaborn scatterplot aesthetics.
        - 'loc', 'bbox_to_anchor', 'handletextpad' adjust the matplotlib legend.
        - 'ax' specifies a matplotlib Axes to plot onto.

    Returns:
    --------
    seaborn/matplotlib plot
    '''
    max_res = res.copy()

    # Default values for the seaborn scatterplot; overwrite them with any matching kwargs.
    sns_opts = {'palette': ['deepskyblue', 'silver'], 'edgecolor': 'k', 'linewidth': 0.5, 'sizes': (1, 40)}
    for opt in sns_opts:
        if opt in kwargs:
            sns_opts[opt] = kwargs[opt]
            del kwargs[opt]  # Removing to ensure that seaborn does not error out.

    # Keyword arguments associated with the legend
    legend_opts = {'loc': 'lower left', 'bbox_to_anchor': (1, 1), 'handletextpad': 0.1}
    for opt in legend_opts:
        if opt in kwargs:
            legend_opts[opt] = kwargs[opt]
            del kwargs[opt]

    sns.scatterplot(max_res, x=x_order, y='ngram_order', size='-log10(p)',
                    hue='FPR <= 0.05', hue_order=[True, False], **sns_opts, **kwargs)

    # Setting up the n-gram ticks if both a dansy object and an n-gram order dict were provided.
    if ngram_ticks is None and dansyOI is None:
        pass
    elif ngram_ticks is not None and dansyOI is not None:
        ngram_plot_names = [dansyOI.return_legible_ngram(node) for node in ngram_ticks]
        plt.yticks(ticks=[v for v in ngram_ticks.values()], labels=ngram_plot_names)
        plt.ylim(max(list(ngram_ticks.values())) + 0.5, -0.5)

    # If an axes is provided, plot and adjust the legend on that specific axes
    if 'ax' in kwargs:
        handles, labels = kwargs['ax'].get_legend_handles_labels()
        new_handles, new_labels = clean_ngram_legend(handles, labels, show_FPR)
        l = kwargs['ax'].legend(new_handles, new_labels, edgecolor='k', **legend_opts)
    else:
        handles, labels = plt.gca().get_legend_handles_labels()
        new_handles, new_labels = clean_ngram_legend(handles, labels, show_FPR)
        l = plt.legend(new_handles, new_labels, edgecolor='k', **legend_opts)

    # Small aesthetic changes
    l.get_frame().set_linewidth(0.5)
    for h in l.legend_handles:
        if not isinstance(h, lines.Line2D):
            h.set_edgecolor('k')
            h.set_linewidth(.25)
    plt.xlabel(None)
    plt.ylabel(None)


def clean_ngram_legend(handles, labels, show_FPR=True):
    '''
    Cleans up the n-gram enrichment legend to show only the relevant information,
    depending on whether the FPR is to be displayed.

    Parameters:
    -----------
    - handles: list
        List of matplotlib legend handles to adjust
    - labels: list
        List of matplotlib legend labels to adjust
    - show_FPR: bool
        Whether the FPR portion of the legend should be displayed

    Returns:
    --------
    - new_handles: list
        The new handles to pass to the legend
    - new_labels: list
        The new labels to pass to the legend
    '''
    labels[1] = r'FPR$\leq$0.05'
    labels[2] = 'FPR > 0.05'
    if show_FPR:
        # Dropping the FPR legend title since it is provided in the actual labels
        new_handles = handles[1:len(handles)]
        new_labels = labels[1:len(labels)]
    else:
        new_handles = [h for i, h in enumerate(handles) if i not in [0, 1, 2]]
        new_labels = [h for i, h in enumerate(labels) if i not in [0, 1, 2]]
    return new_handles, new_labels
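
# Hedged sketch of plot_enriched_ngrams_presorted: `sorted_res` stands in for a results
# dataframe that already carries an 'ngram_order' column, and `my_dansy` for a deDANSy object.
# The n-gram names in ngram_ticks are illustrative only.
def _example_plot_enriched_ngrams_presorted(sorted_res, my_dansy):
    # ngram_ticks maps each n-gram to its y-axis position (matching 'ngram_order' in sorted_res)
    ngram_ticks = {'EGF-like domain': 0, 'Kinase domain': 1}
    plot_enriched_ngrams_presorted(sorted_res, x_order='variable', dansyOI=my_dansy,
                                   ngram_ticks=ngram_ticks, show_FPR=True)
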
def collapse_to_max_info(ngram_list, res_df):
    '''
    Collapses the n-grams to those that represent the most discriminating information
    of interest. Longer n-grams are collapsed into shorter ones when their p-value
    trends are similar but the longer n-grams are slightly less significant. If a
    longer n-gram is more significant, it is not collapsed.

    Parameters:
    -----------
    - ngram_list: list
        List of n-grams to consider for collapsing
    - res_df: pandas DataFrame
        Dataframe containing the enrichment p-value results that will be collapsed
        to maximize the information being presented

    Returns:
    --------
    - ngrams_2_collapse: set
        The n-grams that will be collapsed from the provided list.
    '''
    # Defining potential collapsing n-gram families
    potential_collapse = {}
    for ngram in ngram_list:
        for inner_ngram in ngram_list:
            # Check for a parent-child relationship
            if inner_ngram != ngram and ngram in inner_ngram:
                # Add an empty list to the dictionary if the n-gram is not present
                if ngram not in potential_collapse:
                    potential_collapse[ngram] = []
                potential_collapse[ngram].append(inner_ngram)

    # For each family, check the FPR and p-values to decide whether the children should be collapsed
    ngrams_2_collapse = set()
    for ngram, children in potential_collapse.items():
        # Get the parent n-gram's values
        parent_p = res_df[res_df['ngram'] == ngram]['p'].tolist()
        parent_fpr = res_df[res_df['ngram'] == ngram]['FPR <= 0.05'].tolist()
        parent_cond = res_df[res_df['ngram'] == ngram]['variable'].tolist()

        # Check if the parent is enriched in only one condition
        if len(parent_p) == 1:
            for child in children:
                child_p = res_df[res_df['ngram'] == child]['p'].tolist()
                child_fpr = res_df[res_df['ngram'] == child]['FPR <= 0.05'].tolist()
                child_cond = res_df[res_df['ngram'] == child]['variable'].tolist()

                # Only collapse if the child has the same number of conditions and they match
                if len(child_p) == 1:
                    if child_cond == parent_cond:
                        # If the child is more significant and the FPR calls differ, keep it;
                        # if only the FPR calls differ, also keep it; otherwise collapse it
                        if child_p < parent_p and parent_fpr != child_fpr:
                            pass
                        elif parent_fpr != child_fpr:
                            pass
                        else:
                            ngrams_2_collapse.add(child)
        else:
            for child in children:
                child_p = res_df[res_df['ngram'] == child]['p'].tolist()
                child_fpr = res_df[res_df['ngram'] == child]['FPR <= 0.05'].tolist()
                child_cond = res_df[res_df['ngram'] == child]['variable'].tolist()
                if len(child_cond) == 2:
                    # Keep the child if its p-values show a similar trend while being more
                    # significant, or if its FPR calls differ from the parent's; otherwise collapse it
                    if any(c < p for c, p in zip(child_p, parent_p)) and any(p != c for c, p in zip(child_fpr, parent_fpr)):
                        pass
                    elif any(p != c for c, p in zip(child_fpr, parent_fpr)):
                        pass
                    else:
                        ngrams_2_collapse.add(child)

    return ngrams_2_collapse
def gather_enrichment_results(hyper_values, fpr_values):
    '''
    Gathers both the statistical results from the enrichment analysis and the FPR
    calculations to return a complete results dataframe.

    Parameters:
    -----------
    - hyper_values: dict
        Key-value pairs for both conditions, each containing a dictionary of n-grams
        and their statistical enrichment in that condition.
    - fpr_values: dict
        Key-value pairs of each n-gram for the different conditions in a comparison

    Returns:
    --------
    - res: pandas DataFrame
        Results dataframe that aggregates all the n-gram statistical results
    '''
    fpr_df = pd.DataFrame.from_dict(fpr_values)
    hyper_df = pd.DataFrame.from_dict(hyper_values)

    hyper_df = hyper_df.melt(ignore_index=False, value_name='p')
    hyper_df['ngram'] = hyper_df.index

    fpr_df = fpr_df.melt(ignore_index=False, value_name='FPR')
    fpr_df['ngram'] = fpr_df.index

    res = hyper_df.merge(fpr_df)
    res.dropna(subset=['p'], inplace=True)
    res['-log10(p)'] = -np.log10(res['p'])
    res['FPR <= 0.05'] = res['FPR'] <= 0.05
    return res
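
# A small, hypothetical illustration of the input shape gather_enrichment_results expects:
# outer keys are the comparison directions, inner keys are n-grams. The p-values and FPRs are
# invented purely to show the resulting dataframe columns ('variable', 'p', 'ngram', 'FPR',
# '-log10(p)', 'FPR <= 0.05').
def _example_gather_enrichment_results():
    hyper_values = {
        'Up':   {'EGF-like domain': 0.003, 'Kinase domain': 0.4},
        'Down': {'EGF-like domain': 0.9,   'Kinase domain': 0.01},
    }
    fpr_values = {
        'Up':   {'EGF-like domain': 0.0, 'Kinase domain': 0.3},
        'Down': {'EGF-like domain': 0.5, 'Kinase domain': 0.02},
    }
    return gather_enrichment_results(hyper_values, fpr_values)
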
def calc_ngram_fpr_vals(hyper_vals, rand_ngram_pvals):
    '''
    Calculates the FPR for enriched n-grams in both conditions.

    Parameters:
    -----------
    hyper_vals: dict
        Dict of dicts containing the enrichment results of each n-gram for both conditions.
    rand_ngram_pvals: dict
        Dict of dicts containing the equivalent enrichment results for n-grams from
        randomly chosen genes.

    Returns:
    --------
    fpr_dict: dict
        Dict of dicts for each condition containing the FPR value of each n-gram.
    '''
    # Calculating the FPR for each of the nodes found within the actual network
    fpr_dict = {k: {} for k in hyper_vals}
    for c_dir, i in hyper_vals.items():
        for node in i:
            actual_p = i[node]
            if node in rand_ngram_pvals[c_dir]:
                rand_p_vals = rand_ngram_pvals[c_dir][node]
                num_fp = sum([x < actual_p for x in rand_p_vals])
                fpr_dict[c_dir][node] = num_fp / len(rand_p_vals)
            else:
                fpr_dict[c_dir][node] = 0
    return fpr_dict
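
# Hypothetical sketch of calc_ngram_fpr_vals: for each n-gram, the FPR is the fraction of
# p-values from random gene sets that are more significant than the observed p-value.
# The numbers below are invented for illustration.
def _example_calc_ngram_fpr_vals():
    hyper_vals = {'Up': {'EGF-like domain': 0.01}}
    rand_ngram_pvals = {'Up': {'EGF-like domain': [0.5, 0.2, 0.005, 0.04]}}
    # 1 of the 4 random p-values (0.005) beats the observed 0.01, so the FPR is 0.25.
    return calc_ngram_fpr_vals(hyper_vals, rand_ngram_pvals)
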
def plot_functional_scores(res, show_FPR_handle=True, aspect=0.9, order=None):
    '''
    Creates the bubble plots for both the separation and distinction scores
    calculated by deDANSy.

    Parameters:
    -----------
    - res: pandas DataFrame
        The dataframe containing all scores and FPR values of the deDANSy analysis
    - show_FPR_handle: bool
        Whether the FPR portion of the legend should be displayed
    - aspect: float
        The aspect ratio of both plots
    - order: dict
        Key-value pairs of comparisons and their order

    Returns:
    --------
    - axs: list of matplotlib Axes
        The axes of each subplot generated
    '''
    # Starting with the Separation Scores subplot
    data_plot, comp_order = create_score_plot_data(res, 'Separation', order)
    _, axs = plt.subplots(1, 2)
    plt.subplot(1, 2, 1)
    sns.scatterplot(data_plot, x='Separation_Category_Order', y='Order',
                    size='Separation_Score', hue='Separation_Significance',
                    sizes=(1, 50), size_norm=(0, 5),
                    palette=['mediumorchid', 'silver'], hue_order=[True, False],
                    linewidth=0.5, edgecolor='k')

    # Adding in labels
    plt.title('Separation Score')
    plt.xlabel(None)
    plt.ylabel(None)

    # Cleaning up the ticks
    plt.xticks([0, 1], ['More', 'Less'], rotation=45, ha='right')
    plt.yticks([v for v in comp_order.values()], [k for k in comp_order.keys()])
    plt.xlim(-0.5, 1.5)

    # Cleaning up the legend
    handles, labels = plt.gca().get_legend_handles_labels()
    new_handles, new_labels = clean_up_legend(handles, labels, show_FPR_handle)
    l = plt.legend(new_handles, new_labels, bbox_to_anchor=(1, 1), edgecolor='k', handletextpad=0.1)
    l.get_frame().set_linewidth(0.5)

    # Setting some aesthetics for the handles
    for h in l.legend_handles:
        if not isinstance(h, lines.Line2D):
            h.set_edgecolor('k')
            h.set_linewidth(.5)

    # Add a grid when there are 3 or more comparisons to provide slight visual guidance
    if len(set(data_plot['Comparison'].tolist())) >= 3:
        plt.grid(visible=True, axis='y', linewidth=0.25, linestyle=':')
    plt.gca().set_aspect(aspect)

    # Now the Distinction Scores subplot
    data_plot, comp_order = create_score_plot_data(res, 'Distinction', order)
    plt.subplot(1, 2, 2)
    sns.scatterplot(data_plot, x='Distinction_Category_Order', y='Order',
                    size='Distinction_Score', hue='Distinction_Significance',
                    sizes=(1, 50), size_norm=(0, 5),
                    palette=['seagreen', 'silver'], hue_order=[True, False],
                    linewidth=0.5, edgecolor='k')

    # Tick clean up
    plt.xticks([0, 1], ['Stably Distinct', 'Unstable/Overlap'], rotation=45, ha='right')
    plt.yticks([v for v in comp_order.values()], [k for k in comp_order.keys()])
    plt.xlabel(None)
    plt.ylabel(None)
    plt.title('Distinction Scores', fontdict={'size': 6})
    plt.xlim([-0.5, 1.5])
    plt.gca().set_aspect(aspect)
    plt.tick_params('y', labelleft=False)

    # Legend clean up and formatting
    handles, labels = plt.gca().get_legend_handles_labels()
    new_handles, new_labels = clean_up_legend(handles, labels, show_FPR_handle)
    l = plt.legend(new_handles, new_labels, bbox_to_anchor=(1, 1), edgecolor='k', handletextpad=0.1)
    l.get_frame().set_linewidth(0.5)

    # Aesthetics of specific legend handles
    for h in l.legend_handles:
        if not isinstance(h, lines.Line2D):
            h.set_edgecolor('k')
            h.set_linewidth(.5)

    # Add a grid when there are 3 or more comparisons to provide slight visual guidance
    if len(set(data_plot['Comparison'].tolist())) >= 3:
        plt.grid(visible=True, axis='y', linewidth=0.25, linestyle=':')

    return axs
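
# Hedged usage sketch: `scores_df` is a placeholder for the scores dataframe produced by a
# deDANSy multi-comparison analysis (columns such as 'Comparison', 'Separation_Score',
# 'Separation_Category', 'Separation_Significance' and the Distinction equivalents).
# The comparison names in display_order are hypothetical.
def _example_plot_functional_scores(scores_df):
    display_order = {'WT vs KO': 0, 'WT vs Drug': 1}
    axs = plot_functional_scores(scores_df, show_FPR_handle=True, aspect=0.9, order=display_order)
    plt.savefig('functional_scores.pdf', bbox_inches='tight')
    return axs
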
def clean_up_legend(handles, labels, show_FPR=True):
    '''
    An internal function that cleans up the legend of the bubble plot to ensure clear
    communication of the results and scores. If desired, the FPR portion of the legend
    is omitted when it does not provide useful information.

    Parameters:
    -----------
    - handles: list
        List of handles for the unmodified legend of the matplotlib/seaborn plot
    - labels: list
        List of labels for the unmodified legend of the matplotlib/seaborn plot
    - show_FPR: bool
        Flag for whether the FPR portion of the legend should be displayed.

    Returns:
    --------
    - new_h: list
        List of handles for the new legend of the matplotlib/seaborn plot
    - new_l: list
        List of labels for the new legend of the matplotlib/seaborn plot
    '''
    if show_FPR:
        handles_2_rm = [0, 5, 7, 9]
    else:
        # The FPR legend details are not always needed, so remove them as well
        handles_2_rm = [0, 1, 2, 5, 7, 9]

    new_h = [h for i, h in enumerate(handles) if i not in handles_2_rm]
    new_l = [h for i, h in enumerate(labels) if i not in handles_2_rm]

    if show_FPR:
        new_l[0] = r'FPR$\leq$0.05'
        new_l[1] = 'FPR > 0.05'
        new_l[2] = 'Score'
    else:
        new_l[0] = 'Score'
    return new_h, new_l


def create_score_plot_data(data, metric, order=None):
    '''
    Creates the plot data for generating the final bubble plot for a multi-comparison
    result. This ensures a consistent, easy-to-read bubble size and creates the order
    in which the plot will be generated.

    Parameters:
    -----------
    - data: pandas DataFrame
        The scores dataframe generated by the deDANSy object
    - metric: str
        Either the Separation or Distinction score that will be used for plotting
    - order: dict
        (Optional) Key-value pairs that determine the order in which the multiple
        comparisons will be displayed, with the keys being the comparisons and the
        values their order.

    Returns:
    --------
    - plot_data: pandas DataFrame
        A modified dataframe which contains a new Order column and additional values
        to create consistent sizing
    - comp_map: dict
        The order dictionary that will be used for displaying the comparisons
    '''
    plot_data = data.copy()
    if order is None:
        comps = sorted(plot_data['Comparison'].dropna().tolist())
        comp_map = {v: i for i, v in enumerate(comps)}
    else:
        comp_map = order
    plot_data['Order'] = plot_data['Comparison'].map(comp_map)

    if metric == 'Separation':
        max_vals = np.ceil(plot_data['Separation_Score'].max())
        cat_order = {'More': 0, 'Less': 1}
        plot_data['Separation_Category_Order'] = plot_data['Separation_Category'].map(cat_order)
        # Append dummy rows that only carry a score value so the bubble sizes span a consistent range
        for v in np.linspace(0, max_vals, 5):
            idx = len(plot_data)
            plot_data.loc[idx] = None
            plot_data.loc[idx, 'Separation_Score'] = v
    else:
        # The IQR scores tend to be between 0-2, so take the closest integer and then
        # use 5 steps to force a consistent size range
        max_vals = np.ceil(plot_data['Distinction_Score'].max())
        cat_order = {'Stably Distinct': 0, 'Unstable/Overlapping': 1}
        plot_data['Distinction_Category_Order'] = plot_data['Distinction_Category'].map(cat_order)
        for v in np.linspace(0, max_vals, 5):
            idx = len(plot_data)
            plot_data.loc[idx] = None
            plot_data.loc[idx, 'Distinction_Score'] = v

    return plot_data, comp_map
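
# Illustrative sketch (not part of the package API): a minimal scores table run through
# create_score_plot_data to show the dummy rows appended for consistent bubble sizing.
# The comparison names and score values are invented.
def _example_create_score_plot_data():
    toy_scores = pd.DataFrame({
        'Comparison': ['WT vs KO', 'WT vs Drug'],
        'Separation_Score': [2.1, 0.4],
        'Separation_Category': ['More', 'Less'],
        'Separation_Significance': [True, False],
    })
    plot_data, comp_map = create_score_plot_data(toy_scores, 'Separation')
    # plot_data now has an 'Order' column mapping each comparison to a row position,
    # a 'Separation_Category_Order' column for the x-axis, and five score-only dummy
    # rows spanning 0..ceil(max score) so the legend sizes stay comparable across plots.
    return plot_data, comp_map
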