import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import matplotlib.lines as lines
from dansy.enrichment_helpers import *
def get_max_info_enriched_ngrams(res_df, condition_labels = None, q = None,p = None):
'''
    Returns the enriched n-grams that pass a quantile and/or p-value cutoff. N-grams with similar p-value trends are collapsed into a single representative n-gram when a shorter n-gram is contained within a longer one. Longer n-grams with more significant p-values than their shorter counterparts are retained.
Parameters:
-----------
- res_df: pandas DataFrame
Dataframe of all the n-gram enrichment p-values for each condition
- condition_labels: list (Optional)
        List of strings used as labels for the two conditions. If not provided, the labels default to 'Up' and 'Down'.
- q: float (Optional)
        Quantile cutoff on p-values used to select n-grams. If neither q nor p is provided, q defaults to 0.05.
    - p: float (Optional)
        p-value cutoff for n-grams to return. If provided together with q, the stricter (smaller) of the two thresholds is used.
Returns:
--------
- maxinfo_filt_res: pandas DataFrame
        A filtered version of res_df containing only the n-grams that pass the provided cutoffs
'''
# Checking the p-value and quantile cutoffs to determine a threshold.
if p is None and q is None:
q = 0.05
p_thres = np.quantile(res_df['p'],q)
elif p is not None and q is not None:
x = np.quantile(res_df['p'],q)
p_thres = np.min([p, x])
elif p is not None:
p_thres = p
else:
p_thres = np.quantile(res_df['p'],q)
# Initial filtering based on only the p-value threshold
filt_res_cands = res_df[res_df['p'] <= p_thres]['ngram']
filt_res = res_df[res_df['ngram'].isin(filt_res_cands)].copy()
# Creating the condition labels
    if condition_labels is not None:
filt_res['variable'] = filt_res['variable'].map({'Up':condition_labels[0],'Down':condition_labels[1]})
# Now sorting the n-grams based on their length to start the collapsing step
ngram_list = sorted(set(filt_res['ngram'].tolist()),key=lambda x:len(x.split('|')),reverse=True)
# Collapsing the n-grams based on their p-values and if they have similar trends or not.
ngrams_2_collapse = collapse_to_max_info(ngram_list,filt_res)
ngrams_kept = set(filt_res['ngram'].tolist()).difference(ngrams_2_collapse)
# Filtering the results dataframe to only the collapsed n-grams
maxinfo_filt_res = filt_res[filt_res['ngram'].isin(ngrams_kept)].copy()
return maxinfo_filt_res
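# Illustrative usage (a sketch, not part of the module): assuming `res` is the enrichment
# results dataframe produced by gather_enrichment_results(), the top 5% most significant,
# non-redundant n-grams could be pulled with:
#     top_ngrams = get_max_info_enriched_ngrams(res, condition_labels=['Treated', 'Control'], q=0.05)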
def plot_enriched_ngrams(res, dansyOI, condition_labels = None, q = 0.05,p = None, show_FPR=True ,**kwargs):
'''
    Plots the top X percent (default 5%) of n-grams enriched between two different conditions. For clarity, n-grams that contain similar information are collapsed into the shorter n-gram (e.g. if EGF-like domain and EGF-like domain|EGF-like domain both have similar enrichment values, they are represented only by EGF-like domain).
Parameters:
-----------
    res: pandas DataFrame
        A dataframe containing all the results, including both the individual statistical enrichment and the false positive rate (FPR) for all n-grams. (Note this will likely be removed once this is integrated into the actual module.)
dansyOI: deDANSy object
The deDANSy object that contains the n-grams of interest
condition_labels: list (Optional)
        The labels for both conditions; these should be provided in the order up-regulated, then down-regulated. (Note this will be removed once this is integrated into the actual module.)
q: float (Optional)
The quantile cutoff of values to plot. Default (0.05)
    p: float (Optional)
        The p-value threshold used to limit which n-grams are plotted. Optional; when combined with the quantile (q), the stricter of the two thresholds is applied.
show_FPR: bool
Flag whether to show the FPR legend portion.
kwargs: optional keywords
- 'palette','edgecolor', 'linewidth', 'sizes' that adjust the seaborn scatterplot aesthetics.
- 'loc', 'bbox_to_anchor', 'handletextpad' to adjust matplotlib legend information.
- 'ax' to specify a matplotlib Axes to plot onto
Returns:
--------
seaborn/matplotlib plot
'''
max_res = get_max_info_enriched_ngrams(res, condition_labels, q,p)
ngram_plot_names = {node:dansyOI.return_legible_ngram(node) for node in max_res['ngram'].tolist()}
max_res['ngram'] = max_res['ngram'].map(ngram_plot_names)
# Going through some of the default values set up for the seaborn scatterplot and checking for them in the kwargs to overwrite default values.
sns_opts = {'palette':['deepskyblue','silver'],'edgecolor':'k','linewidth':0.5,'sizes':(1,40)}
for opt in sns_opts:
if opt in kwargs:
sns_opts[opt] = kwargs[opt]
del kwargs[opt] # Removing to ensure that seaborn does not error out.
# Now getting some of the keyword arguments that are associated with the legend
legend_opts = {'loc':'lower left', 'bbox_to_anchor':(1,1), 'handletextpad':0.1}
for opt in legend_opts:
if opt in kwargs:
legend_opts[opt] = kwargs[opt]
del kwargs[opt]
sns.scatterplot(max_res, x='variable',y='ngram',
size='-log10(p)', hue = 'FPR <= 0.05',
hue_order=[True, False],
**sns_opts,**kwargs)
    # If an Axes is provided, draw and adjust the legend on that specific Axes
    if 'ax' in kwargs:
        handles, labels = kwargs['ax'].get_legend_handles_labels()
        new_handles, new_labels = clean_ngram_legend(handles, labels, show_FPR)
        l = kwargs['ax'].legend(new_handles, new_labels, edgecolor='k', **legend_opts)
    else:
        handles, labels = plt.gca().get_legend_handles_labels()
        new_handles, new_labels = clean_ngram_legend(handles, labels, show_FPR)
        l = plt.legend(new_handles, new_labels, edgecolor='k', **legend_opts)
# Small aesthetic changes
l.get_frame().set_linewidth(0.5)
for h in l.legend_handles:
if not isinstance(h, lines.Line2D):
h.set_edgecolor('k')
h.set_linewidth(.25)
plt.xlabel(None)
plt.ylabel(None)
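# Illustrative usage (a sketch, not part of the module): assuming `res` comes from
# gather_enrichment_results() and `dedansy_obj` is a deDANSy object, the enriched
# n-grams could be plotted onto a supplied Axes with:
#     fig, ax = plt.subplots(figsize=(3, 4))
#     plot_enriched_ngrams(res, dedansy_obj, condition_labels=['Treated', 'Control'], q=0.05, ax=ax)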
def plot_enriched_ngrams_presorted(res,x_order = 'variable', dansyOI = None,ngram_ticks = None,show_FPR=True, **kwargs):
'''
This plots a presorted n-gram enrichment dataframe based on a provided n-gram order and provided order for conditions (i.e. x-axis).
Parameters:
-----------
res: pandas DataFrame
        A presorted dataframe containing all the results, including both the individual statistical enrichment and the false positive rate (FPR) for all n-grams.
x_order: str
        The column name to use for the x-axis. Defaults to 'variable', assuming the dataframe comes from a prior n-gram enrichment that was sorted by another method.
dansyOI: deDANSy object
The deDANSy object that contains the n-grams of interest
ngram_ticks: dict
Key-value pairs of the n-grams and their order
show_FPR: bool
Flag whether to show the FPR legend portion.
kwargs: optional keywords
- 'palette','edgecolor', 'linewidth', 'sizes' that adjust the seaborn scatterplot aesthetics.
- 'loc', 'bbox_to_anchor', 'handletextpad' to adjust matplotlib legend information.
- 'ax' to specify a matplotlib Axes to plot onto
Returns:
--------
seaborn/matplotlib plot
'''
max_res = res.copy()
    # Going through the default values set up for the seaborn scatterplot and checking for them in the kwargs to overwrite the defaults.
sns_opts = {'palette':['deepskyblue','silver'],'edgecolor':'k','linewidth':0.5,'sizes':(1,40)}
for opt in sns_opts:
if opt in kwargs:
sns_opts[opt] = kwargs[opt]
del kwargs[opt] # Removing to ensure that seaborn does not error out.
# Now getting some of the keyword arguments that are associated with the legend
legend_opts = {'loc':'lower left', 'bbox_to_anchor':(1,1), 'handletextpad':0.1}
for opt in legend_opts:
if opt in kwargs:
legend_opts[opt] = kwargs[opt]
del kwargs[opt]
sns.scatterplot(max_res, x=x_order,y='ngram_order',
size='-log10(p)', hue = 'FPR <= 0.05',
hue_order=[True, False],
**sns_opts,**kwargs)
# Setting up the ticks associated with the ngrams if a dansy object and an n-gram order dict were provided.
    if ngram_ticks is None and dansyOI is None:
        pass
    elif ngram_ticks is not None and dansyOI is not None:
ngram_plot_names = [dansyOI.return_legible_ngram(node) for node in ngram_ticks]
        plt.yticks(ticks=list(ngram_ticks.values()), labels=ngram_plot_names)
        plt.ylim(max(ngram_ticks.values()) + 0.5, -0.5)
# If an axes is provided plot and adjust the legend to that specific axis
if 'ax' in kwargs:
handles, labels = kwargs['ax'].get_legend_handles_labels()
new_handles, new_labels = clean_ngram_legend(handles, labels,show_FPR)
l = kwargs['ax'].legend(new_handles, new_labels, edgecolor='k', **legend_opts)
else:
handles, labels = plt.gca().get_legend_handles_labels()
new_handles, new_labels = clean_ngram_legend(handles, labels,show_FPR)
l = plt.legend(new_handles, new_labels, edgecolor='k',**legend_opts)
# Small aesthetic changes
l.get_frame().set_linewidth(0.5)
for h in l.legend_handles:
if not isinstance(h, lines.Line2D):
h.set_edgecolor('k')
h.set_linewidth(.25)
plt.xlabel(None)
plt.ylabel(None)
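# Illustrative usage (a sketch, not part of the module): assuming `sorted_res` already carries
# an 'ngram_order' column and `ngram_ticks` maps each n-gram to its y position, the presorted
# variant could be drawn with:
#     plot_enriched_ngrams_presorted(sorted_res, x_order='variable', dansyOI=dedansy_obj, ngram_ticks=ngram_ticks)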
def clean_ngram_legend(handles, labels,show_FPR = True):
'''
Cleans up the n-gram enrichment legend to show only relevant information if the FPR is to be displayed or not.
Parameters:
-----------
- handles: list
List of matplotlib legend handles to adjust
- labels: list
        List of matplotlib legend labels to adjust
- show_FPR: bool
Whether the FPR portion of the legend should be displayed
Returns:
--------
- new_handles: list
The new handles to input into the legend
- new_labels: list
The new labels to input into the legend
'''
    labels[1] = r'FPR$\leq$0.05'
    labels[2] = 'FPR > 0.05'
if show_FPR:
# Dropping the FPR legend title since it is provided in the actual labels
        new_handles = handles[1:]
        new_labels = labels[1:]
    else:
        new_handles = [h for i, h in enumerate(handles) if i not in (0, 1, 2)]
        new_labels = [lab for i, lab in enumerate(labels) if i not in (0, 1, 2)]
return new_handles, new_labels
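# Note (added for clarity): clean_ngram_legend assumes the seaborn-generated legend is ordered
# as [hue title, the two FPR hue entries, size title, size entries]. A minimal sketch of use:
#     handles, labels = plt.gca().get_legend_handles_labels()
#     plt.legend(*clean_ngram_legend(handles, labels, show_FPR=True))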
def collapse_to_max_info(ngram_list, res_df):
'''
    This collapses the n-grams to those that carry the most discriminating information. Longer n-grams are collapsed into shorter ones if their p-value trends are similar but the longer n-grams are slightly less significant. If a longer n-gram is more significant, it is not collapsed.
Parameters:
-----------
- ngram_list: list
List of n-grams to consider for collapsing
- res_df: pandas DataFrame
Dataframe containing the enrichment p-value results that will be collapsed to maximize information being presented
Returns:
--------
    - ngrams_2_collapse: set
        The n-grams that will be collapsed (removed) from the input list.
'''
# Defining potential collapsing n-gram families
potential_collapse = {}
for ngram in ngram_list:
for inner_ngram in ngram_list:
# Check for parent-child relationship
if inner_ngram != ngram and ngram in inner_ngram:
# Add to the dictionary an empty list if the n-gram is not present
if ngram not in potential_collapse:
potential_collapse[ngram] = []
potential_collapse[ngram].append(inner_ngram)
# Now for each of these checking the FPR and p-values to see if they should be collapsed
ngrams_2_collapse = set()
for ngram, children in potential_collapse.items():
# Get the parent n-grams values
parent_p = res_df[res_df['ngram'] == ngram]['p'].tolist()
parent_fpr = res_df[res_df['ngram'] == ngram]['FPR <= 0.05'].tolist()
parent_cond = res_df[res_df['ngram'] == ngram]['variable'].tolist()
# Check if it is only within 1 condition
if len(parent_p) == 1:
for child in children:
child_p = res_df[res_df['ngram'] == child]['p'].tolist()
child_fpr = res_df[res_df['ngram'] == child]['FPR <= 0.05'].tolist()
child_cond = res_df[res_df['ngram'] == child]['variable'].tolist()
# Only collapse if they have the same number of conditions and match
if len(child_p) == 1:
if child_cond == parent_cond:
                        # If the child is more significant and the FPR flags differ then keep it, otherwise collapse it
if child_p < parent_p and parent_fpr != child_fpr:
pass
elif parent_fpr != child_fpr:
pass
else:
ngrams_2_collapse.add(child)
else:
for child in children:
child_p = res_df[res_df['ngram'] == child]['p'].tolist()
child_fpr = res_df[res_df['ngram'] == child]['FPR <= 0.05'].tolist()
child_cond = res_df[res_df['ngram'] == child]['variable'].tolist()
if len(child_cond) == 2:
                    # Keep the child if its significance and FPR pattern diverge from the parent's in any condition; otherwise collapse it
if any(c < p for c,p in zip(child_p,parent_p)) and any(p != c for c,p in zip(child_fpr,parent_fpr)):
pass
elif any(p != c for c,p in zip(child_fpr,parent_fpr)):
pass
else:
ngrams_2_collapse.add(child)
return ngrams_2_collapse
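# Illustrative usage (a sketch, not part of the module): given a filtered results dataframe
# `filt_res`, the redundant longer n-grams could be identified with:
#     ngrams = sorted(set(filt_res['ngram']), key=lambda x: len(x.split('|')), reverse=True)
#     redundant = collapse_to_max_info(ngrams, filt_res)
#     kept = set(filt_res['ngram']) - redundant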
def gather_enrichment_results(hyper_values, fpr_values):
'''
This gathers both the statistical results from the enrichment analysis and the FPR calculations to return a complete results dataframe.
Parameters:
-----------
- hyper_values: dict
        Key-value pairs for both conditions, each containing a dictionary of n-grams and their statistical enrichment in that condition.
- fpr_values: dict
        Key-value pairs for both conditions, each containing a dictionary of n-grams and their FPR values.
Returns:
--------
- res: pandas DataFrame
Results dataframe that has aggregated all the n-gram statistical results
'''
    fpr_df = pd.DataFrame.from_dict(fpr_values)
    hyper_df = pd.DataFrame.from_dict(hyper_values)
hyper_df = hyper_df.melt(ignore_index=False, value_name='p')
hyper_df['ngram'] = hyper_df.index
fpr_df = fpr_df.melt(ignore_index=False, value_name='FPR')
fpr_df['ngram'] = fpr_df.index
res = hyper_df.merge(fpr_df)
res.dropna(subset=['p'],inplace=True)
res['-log10(p)'] = -np.log10(res['p'])
res['FPR <= 0.05'] = res['FPR'] <= 0.05
return res
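# Illustrative usage (a sketch, not part of the module): the two inputs are nested dicts keyed
# by condition, e.g.
#     hyper_values = {'Up': {'EGF-like domain': 1e-4}, 'Down': {'Kinase domain': 0.02}}
#     fpr_values = {'Up': {'EGF-like domain': 0.0}, 'Down': {'Kinase domain': 0.1}}
#     res = gather_enrichment_results(hyper_values, fpr_values)
# which yields a long-format dataframe with 'variable', 'p', 'ngram', 'FPR', '-log10(p)'
# and 'FPR <= 0.05' columns.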
def calc_ngram_fpr_vals(hyper_vals, rand_ngram_pvals):
'''
Calculates the FPR for enriched n-grams in both conditions.
Parameters:
-----------
hyper_vals: dict
Dict of dict that contain the enrichment results of each n-gram for both conditions.
rand_ngram_pvals: dict
Dict of dicts that contains the equivalent enrichment results for n-grams from randomly chosen genes.
Returns:
--------
fpr_dict: dict
Dict of dict for each condition that contains the FPR values of each n-gram.
'''
# Now calculating the fpr for each of the nodes found within the actual network
fpr_dict = {k:{} for k in hyper_vals}
for c_dir,i in hyper_vals.items():
for node in i:
actual_p = i[node]
if node in rand_ngram_pvals[c_dir]:
rand_p_vals = rand_ngram_pvals[c_dir][node]
num_fp = sum([x < actual_p for x in rand_p_vals])
fpr_dict[c_dir][node] = num_fp/len(rand_p_vals)
else:
fpr_dict[c_dir][node] = 0
return fpr_dict
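# Note (added for clarity): the FPR computed above is the fraction of random-gene enrichment
# tests that produce a more significant p-value than the observed one,
#     FPR(ngram) = |{p_rand : p_rand < p_observed}| / n_random,
# and it defaults to 0 when an n-gram never appears in the random draws.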
def plot_functional_scores(res, show_FPR_handle=True, aspect = 0.9, order = None):
'''
This creates the bubble plots for both the separation and distinction scores calculated by deDANSy.
Parameters:
-----------
- res: pandas DataFrame
The dataframe containing all scores and FPR values of the deDANSy analysis
- show_FPR_handle: bool
Whether the FPR portion of the legend should be displayed
- aspect: float
The aspect ratio of both plots
- order: dict
Key-value pairs of comparisons and their order
Returns:
--------
    - axs: array of matplotlib Axes
        The Axes of each subplot generated
'''
# Starting with the Separation Scores subplot
data_plot, comp_order = create_score_plot_data(res, 'Separation', order)
_, axs = plt.subplots(1,2)
plt.subplot(1,2,1)
sns.scatterplot(data_plot, x='Separation_Category_Order', y = 'Order',
size='Separation_Score',
hue='Separation_Significance',
sizes = (1,50), size_norm = (0,5),
palette=['mediumorchid', 'silver'],
hue_order=[True, False],
linewidth = 0.5,edgecolor='k')
# Adding in labels
plt.title('Separation Score')
plt.xlabel(None)
plt.ylabel(None)
# Cleaning up the ticks
plt.xticks([0,1],['More', 'Less'], rotation=45, ha='right')
    plt.yticks(list(comp_order.values()), list(comp_order.keys()))
plt.xlim(-0.5,1.5)
# Cleaning up the legend
handles, labels = plt.gca().get_legend_handles_labels()
new_handles, new_labels = clean_up_legend(handles, labels, show_FPR_handle)
l = plt.legend(new_handles,new_labels,bbox_to_anchor=(1,1), edgecolor='k', handletextpad=0.1)
l.get_frame().set_linewidth(0.5)
# Setting some aesthetics for the handles
for h in l.legend_handles:
if not isinstance(h, lines.Line2D):
h.set_edgecolor('k')
h.set_linewidth(.5)
    # Add a light grid when there are at least 3 comparisons to give some visual guidance
if len(set(data_plot['Comparison'].tolist())) >= 3:
plt.grid(visible=True,axis='y', linewidth =0.25, linestyle= ':')
plt.gca().set_aspect(aspect)
# Now the Distinction Score
data_plot,comp_order = create_score_plot_data(res, 'Distinction', order)
plt.subplot(1,2,2)
sns.scatterplot(data_plot,x = 'Distinction_Category_Order',
y = 'Order', size='Distinction_Score',hue='Distinction_Significance',sizes = (1,50),size_norm=(0,5),
palette=['seagreen', 'silver'], hue_order=[True, False],
linewidth = 0.5,edgecolor='k')
# Tick clean up
plt.xticks([0,1],['Stably Distinct', 'Unstable/Overlap'], rotation=45, ha='right')
    plt.yticks(list(comp_order.values()), list(comp_order.keys()))
    plt.xlabel(None)
    plt.ylabel(None)
    plt.title('Distinction Score')
    plt.xlim([-0.5, 1.5])
    plt.gca().set_aspect(aspect)
    plt.tick_params('y', labelleft=False)
# Legend clean up and formatting
handles, labels = plt.gca().get_legend_handles_labels()
new_handles, new_labels = clean_up_legend(handles, labels, show_FPR_handle)
l = plt.legend(new_handles,new_labels,bbox_to_anchor=(1,1), edgecolor='k', handletextpad=0.1)
l.get_frame().set_linewidth(0.5)
# Aesthetics of specific legend handles
for h in l.legend_handles:
if not isinstance(h, lines.Line2D):
h.set_edgecolor('k')
h.set_linewidth(.5)
    # Add a light grid when there are at least 3 comparisons to give some visual guidance
if len(set(data_plot['Comparison'].tolist())) >=3:
plt.grid(visible=True,axis='y', linewidth =0.25, linestyle= ':')
return axs
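# Illustrative usage (a sketch, not part of the module): assuming `scores` is the deDANSy scores
# dataframe with the Separation/Distinction columns and a 'Comparison' column,
#     axs = plot_functional_scores(scores, show_FPR_handle=True, aspect=0.9)
# draws the two bubble-plot panels side by side.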
def clean_up_legend(handles, labels, show_FPR = True):
'''
An internal function that cleans up the legend of the bubble plot to ensure clear communication of results and scores. If desired the FPR portion of the legend will be omitted if it does not provide useful information.
Parameters:
-----------
- handles: list
List of handles for the unmodified legend of the matplotlib/seaborn plot
- labels: list
List of labels for the unmodified legend of the matplotlib/seaborn plot
- show_FPR: bool
Flag of whether the FPR portion of the legend should be displayed.
Returns:
--------
- new_h: list
List of handles for the new legend of the matplotlib/seaborn plot
- new_l: list
List of labels for the new legend of the matplotlib/seaborn plot
'''
if show_FPR:
handles_2_rm = [0,5,7,9]
else:
handles_2_rm = [0,1,2,5,7,9] #Will not always need the FPR legend details so will remove them as well
    new_h = [h for i, h in enumerate(handles) if i not in handles_2_rm]
    new_l = [lab for i, lab in enumerate(labels) if i not in handles_2_rm]
if show_FPR:
        new_l[0] = r'FPR$\leq$0.05'
new_l[1] = 'FPR > 0.05'
new_l[2] = 'Score'
else:
new_l[0] = 'Score'
return new_h, new_l
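# Note (added for clarity): the hardcoded indices removed in clean_up_legend appear to assume
# the default seaborn legend layout for these bubble plots (hue title, the two significance
# entries, size title, then the size entries), thinning out alternating size entries; they
# would need updating if the legend structure changes.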
def create_score_plot_data(data, metric, order = None):
'''
    This creates the plot data for generating the final bubble plot for a multi-comparison result. It ensures a consistent, easy-to-read bubble-size scale and defines the order in which the comparisons are plotted.
Parameters:
-----------
- data: pandas DataFrame
The scores dataframe generated by the deDANSy object
- metric: str
Either the Separation or Distinction score that will be used for plotting
- order: dict (Optional)
Key-value pairs that determine which order the multiple comparisons will be displayed in with the keys being the comparison and the values the order.
Returns:
--------
- plot_data: pandas DataFrame
A modified dataframe which contains a new Order column and additional values to create consistent sizing
- comp_map: dict
The order dictionary that will be used for displaying the comparisons
'''
plot_data = data.copy()
if order is None:
comps = sorted(plot_data['Comparison'].dropna().tolist())
comp_map = {v:i for i,v in enumerate(comps)}
else:
comp_map = order
plot_data['Order'] = plot_data['Comparison'].map(comp_map)
if metric == 'Separation':
max_vals = np.ceil(plot_data['Separation_Score'].max())
cat_order = {'More':0,'Less':1}
plot_data['Separation_Category_Order'] = plot_data['Separation_Category'].map(cat_order)
        # Append phantom rows that carry only a score so the bubble-size scale always spans 0 to the max
        for v in np.linspace(0, max_vals, 5):
            new_idx = len(plot_data)
            plot_data.loc[new_idx] = None
            plot_data.loc[new_idx, 'Separation_Score'] = v
else:
# The IQR scores tend to be between 0-2 so getting the closest integer and then taking 5 steps to force the size
max_vals = np.ceil(plot_data['Distinction_Score'].max())
cat_order = {'Stably Distinct':0,'Unstable/Overlapping':1}
plot_data['Distinction_Category_Order'] = plot_data['Distinction_Category'].map(cat_order)
        for v in np.linspace(0, max_vals, 5):
            new_idx = len(plot_data)
            plot_data.loc[new_idx] = None
            plot_data.loc[new_idx, 'Distinction_Score'] = v
return plot_data,comp_map
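# Illustrative usage (a sketch, not part of the module): create_score_plot_data is called
# internally by plot_functional_scores, e.g.
#     data_plot, comp_order = create_score_plot_data(scores, 'Separation', order=None)
# where `scores` is the deDANSy scores dataframe and the returned comp_order maps each
# comparison to its y position.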