# Source code for ptm_pose.analyze.summarize

import numpy as np
import pandas as pd

#plotting 
import matplotlib.pyplot as plt
import seaborn as sns



#custom stat functions
from ptm_pose import helpers


# [docs]
def combine_outputs(spliced_ptms, altered_flanks, report_removed_annotations = True,  include_stop_codon_introduction = False, remove_conflicting = True, **kwargs):
    """
    Given the spliced_ptms (differentially included) and altered_flanks (altered flanking sequences) dataframes obtained from project and flanking_sequences modules, combine the two into a single dataframe that categorizes each PTM by the impact on the PTM site

    Neither input dataframe is modified; the 'Impact' column is only added to internal copies.

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe with PTMs projected onto splicing events and with annotations appended from various databases. Must contain a 'dPSI' column, which is used to label each PTM as 'Included' (dPSI > 0) or 'Excluded' (anything else, including 0 and missing values).
    altered_flanks: pd.DataFrame
        Dataframe with PTMs associated with altered flanking sequences and with annotations appended from various databases. Must contain a 'Stop Codon Introduced' column.
    report_removed_annotations: bool
        Whether to print a warning listing annotation columns (column names containing ':') that are present in only one of the two dataframes and are therefore dropped from the combined output. Default is True.
    include_stop_codon_introduction: bool
        Whether to include PTMs that introduce stop codons in the altered flanks. Default is False.
    remove_conflicting: bool
        Whether to remove PTMs that are both included and excluded across different splicing events. Default is True.
    kwargs: dict
        Additional keyword arguments to pass to the function, will be passed to `helpers.filter_ptms` if filtering is desired. If no keyword arguments are given, no filtering is performed by this function.

    Returns
    -------
    combined: pd.DataFrame
        One row per unique combination of the shared columns, with an 'Impact' column holding the ';'-joined (alphabetically sorted) set of impacts observed for that combination
    """
    #filter spliced_ptms and altered_flanks dataframes to remove insignificant events or PTMs with low evidence
    if kwargs:
        filter_arguments = helpers.extract_filter_kwargs(**kwargs)
        #NOTE(review): other functions in this module call check_filter_kwargs(**filter_arguments); confirm which call form the helper expects
        helpers.check_filter_kwargs(filter_arguments)
        spliced_ptms = helpers.filter_ptms(spliced_ptms, **filter_arguments)
        filter_arguments['remove_novel'] = False  #keep novel PTMs in altered flanks, as these are not removed from isoform
        altered_flanks = helpers.filter_ptms(altered_flanks, **filter_arguments)

    #extract specific direction of splicing change; work on a copy so the caller's dataframe does not gain an 'Impact' column
    spliced_ptms = spliced_ptms.copy()
    spliced_ptms['Impact'] = spliced_ptms['dPSI'].apply(lambda x: 'Included' if x > 0 else 'Excluded')

    #restrict altered flanks to those that are changed and are not disrupted by stop codons
    #the boolean flag is kept in a separate Series so the caller's column is never overwritten
    stop_codon_introduced = altered_flanks['Stop Codon Introduced'].astype(bool)
    if include_stop_codon_introduction:
        altered_flanks = altered_flanks.copy()
        altered_flanks['Impact'] = stop_codon_introduced.apply(lambda x: 'Stop Codon Introduced' if x else 'Altered Flank')
    else:
        altered_flanks = altered_flanks[~stop_codon_introduced].copy()
        altered_flanks['Impact'] = 'Altered Flank'

    #identify annotations that are found in both datasets (order follows spliced_ptms so output columns are reproducible)
    annotation_columns_in_spliced_ptms = [col for col in spliced_ptms.columns if ':' in col]
    annotation_columns_in_altered_flanks = [col for col in altered_flanks.columns if ':' in col]
    spliced_annotation_set = set(annotation_columns_in_spliced_ptms)
    altered_annotation_set = set(annotation_columns_in_altered_flanks)
    annotation_columns = [col for col in annotation_columns_in_spliced_ptms if col in altered_annotation_set]
    if report_removed_annotations:
        annotation_columns_only_in_spliced = [col for col in annotation_columns_in_spliced_ptms if col not in altered_annotation_set]
        annotation_columns_only_in_altered = [col for col in annotation_columns_in_altered_flanks if col not in spliced_annotation_set]
        if len(annotation_columns_only_in_spliced) > 0:
            print(f'Warning: some annotations in spliced ptms dataframe not found in altered flanks dataframe: {", ".join(annotation_columns_only_in_spliced)}. These annotations will be ignored. To avoid this, make sure to add annotations to both dataframes, or annotate the combined dataframe.')
        if len(annotation_columns_only_in_altered) > 0:
            print(f'Warning: some annotations in altered flanks dataframe not found in spliced ptms dataframe: {", ".join(annotation_columns_only_in_altered)}. These annotations will be ignored. To avoid this, make sure to add annotations to both dataframes, or annotate the combined dataframe.')

    #check if dPSI or sig columns are in both dataframes
    sig_cols = [col for col in ['dPSI', 'Significance'] if col in spliced_ptms.columns and col in altered_flanks.columns]

    shared_columns = ['Impact', 'Gene', 'UniProtKB Accession', 'Isoform ID', 'Isoform Type', 'Residue', 'PTM Position in Isoform', 'Modification Class'] + sig_cols + annotation_columns
    combined = pd.concat([spliced_ptms[shared_columns], altered_flanks[shared_columns]])
    #collapse rows that are identical apart from 'Impact'; sorting makes the joined label independent of set iteration order
    grouping_columns = [col for col in combined.columns if col != 'Impact']
    combined = combined.groupby(grouping_columns, as_index = False, dropna = False)['Impact'].apply(lambda x: ';'.join(sorted(set(x))))

    #remove ptms that are both included and excluded across different events
    if remove_conflicting:
        combined = combined[~((combined['Impact'].str.contains('Included')) & (combined['Impact'].str.contains('Excluded')))]

    return combined



# [docs]
def get_modification_counts(ptms, **kwargs):
    """
    Given PTM data (either spliced ptms, altered flanks, or combined data), return the counts of each modification class

    Rows whose 'Modification Class' holds several classes separated by ';' are counted once for each class. The input dataframe is not modified.

    Parameters
    ----------
    ptms: pd.DataFrame
        Dataframe with PTMs projected onto splicing events or with altered flanking sequences. Must contain a 'Modification Class' column of strings.
    kwargs: dict
        Additional keyword arguments, passed to `helpers.filter_ptms` to filter the PTMs before counting. If none are given, no filtering is performed.

    Returns
    -------
    modification_counts: pd.Series
        Series with the counts of each modification class, indexed by modification class and sorted from smallest to largest count
    """
    #filter ptms based on kwargs if provided
    if kwargs:
        filter_arguments = helpers.extract_filter_kwargs(**kwargs)
        helpers.check_filter_kwargs(**filter_arguments)
        ptms = helpers.filter_ptms(ptms, **filter_arguments)

    #split/explode a separate Series rather than overwriting the column, so the caller's dataframe keeps its original strings
    mod_classes = ptms['Modification Class'].str.split(';').explode()
    modification_counts = mod_classes.to_frame().groupby('Modification Class').size()
    modification_counts = modification_counts.sort_values(ascending = True)
    return modification_counts



# [docs]
def get_modification_class_data(ptms, mod_class):
    """
    Given ptm dataframe and a specific modification class, return a dataframe with only the PTMs of that class

    A PTM is considered to be of the class if `mod_class` is one of the ';'-separated entries in its 'Modification Class' value, matching how `get_modification_counts` counts multi-class rows. Entries are compared exactly (no substring matching).

    Parameters
    ----------
    ptms : pd.DataFrame
        Dataframe with ptm information, such as the spliced_ptms or altered_flanks dataframe obtained during projection
    mod_class : str
        The modification class to filter by, e.g. 'Phosphorylation', 'Acetylation', etc.

    Returns
    -------
    ptms_of_interest : pd.DataFrame
        Copy of the rows of `ptms` that carry the requested modification class

    Raises
    ------
    ValueError
        If no rows carry the requested modification class
    """
    def has_mod_class(value):
        #non-string values (e.g. missing entries) never match
        return isinstance(value, str) and mod_class in value.split(';')

    #astype(bool) keeps the mask boolean even when the dataframe is empty
    is_mod_class = ptms['Modification Class'].apply(has_mod_class).astype(bool)
    ptms_of_interest = ptms[is_mod_class].copy()
    if ptms_of_interest.empty:
        raise ValueError(f"No PTMs found for modification class '{mod_class}'. Please check the input data or choose a different modification class.")

    return ptms_of_interest





# [docs]
def plot_modification_breakdown(spliced_ptms = None, altered_flanks = None, colors = sns.color_palette('colorblind'), ax = None, **kwargs):
    """
    Plot the number of PTMs that are differentially included or have altered flanking sequences, separated by PTM type

    When both dataframes are provided, a stacked horizontal bar chart is drawn (differentially included PTMs first, altered-flank PTMs stacked on top) and each bar is labelled with the combined count. Modification classes found in only one of the two dataframes are still shown. When only one dataframe is provided, a single set of bars is drawn. The input dataframes are not modified.

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe with PTMs that are differentially included
    altered_flanks: pd.DataFrame
        Dataframe with PTMs that have altered flanking sequences
    colors: list
        List of colors to use for the bar plot (first two will be used). Default is seaborn colorblind palette.
    ax: matplotlib.Axes
        Axis to plot on. If None, will create new figure. Default is None.
    kwargs: dict
        Additional keyword arguments to pass to the function, will be passed to `helpers.filter_ptms` if filtering is desired. If no keyword arguments are given, no filtering is performed by this function.

    Raises
    ------
    ValueError
        If neither spliced_ptms nor altered_flanks is provided
    """
    if spliced_ptms is None and altered_flanks is None:
        raise ValueError('Either spliced_ptms or altered_flanks must be provided to plot modification breakdown. Both may be provided.')

    if kwargs:
        filter_arguments = helpers.extract_filter_kwargs(**kwargs)
        helpers.check_filter_kwargs(**filter_arguments)
        if spliced_ptms is not None:
            spliced_ptms = helpers.filter_ptms(spliced_ptms, **filter_arguments)
        if altered_flanks is not None:
            altered_flanks = helpers.filter_ptms(altered_flanks, **filter_arguments)

    if ax is None:
        _, ax = plt.subplots(figsize = (4,4))

    def annotate_counts(counts):
        #write each count just past the end of its bar; bar i is assumed to sit at y-position i, which holds for a freshly created axis
        for i, num_ptm in enumerate(counts.values):
            ax.text(num_ptm, i, str(num_ptm), ha = 'left', va = 'center')

    #separate rows into unique PTM types (copies are passed so the counting step can never alter the caller's dataframes)
    if spliced_ptms is not None and altered_flanks is not None:
        differentially_included_counts = get_modification_counts(spliced_ptms.copy())
        altered_flanks_counts = get_modification_counts(altered_flanks.copy())

        #align both series on every modification class seen in either dataframe; classes only found in altered flanks
        #have zero differentially included PTMs, so they go first to keep the ascending order of the included counts
        altered_only_classes = [mod for mod in altered_flanks_counts.index if mod not in differentially_included_counts.index]
        mod_order = altered_only_classes + list(differentially_included_counts.index)
        differentially_included_counts = differentially_included_counts.reindex(mod_order, fill_value = 0)
        altered_flanks_counts = altered_flanks_counts.reindex(mod_order, fill_value = 0)

        ax.barh(differentially_included_counts.index, differentially_included_counts.values, color = colors[0], label = 'Differentially Included PTMs')
        ax.barh(altered_flanks_counts.index, altered_flanks_counts.values, left = differentially_included_counts.values, color = colors[1], label = 'PTMs with Altered Flank')
        ax.legend()

        #annotate with number of combined PTMs (indexes are already aligned, so the sum stays integer)
        total_count = differentially_included_counts + altered_flanks_counts
        annotate_counts(total_count)

        #leave room for the text labels; skipped when there is nothing to plot
        if not total_count.empty:
            ax.set_xlim([0, total_count.max()*1.1])
    else:
        #only one of the two dataframes was provided
        single_source = spliced_ptms if spliced_ptms is not None else altered_flanks
        modification_counts = get_modification_counts(single_source.copy())
        ax.barh(modification_counts.index, modification_counts.values, color = colors[0])

        #annotate with number of PTMs
        annotate_counts(modification_counts)

    ax.set_xlabel('Number of PTMs')