# Source code for ptm_pose.analyze.filter



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#ptm_pose imports
from ptm_pose import helpers



# [docs]
def plot_filter_impact(ptms, output_type = 'count', topn = 10, ax = None, **kwargs):
    """
    Given a dataframe of PTMs and a set of filter arguments to be passed to helpers.filter_ptms, plot a grouped bar chart comparing the number (or fraction) of PTMs in each modification class before ('Original') and after ('Filtered') filtering.

    Only the `topn` most frequent modification classes in the unfiltered data are shown. Each bar is annotated with its value (integer count, or percentage rounded to 2 decimal places).

    Parameters
    ----------
    ptms : pd.DataFrame
        Dataframe containing PTM data with a column 'Modification Class' that contains the type of modification (e.g. phosphorylation, acetylation, etc.)
    output_type : str, optional
        Type of output to plot, either 'count' or 'fraction'. The default is 'count'.
    topn : int, optional
        The number of top modification classes (ranked by their count before filtering) to plot. The default is 10.
    ax : matplotlib.axes.Axes, optional
        The axes to plot on. If None, a new figure and axes will be created. The default is None.
    **kwargs : keyword arguments
        Additional keyword arguments to be passed to the filter_ptms function (e.g. min_studies, min_compendia, etc.). These are passed through helpers.extract_filter_kwargs and helpers.check_filter_kwargs before being used.

    Raises
    ------
    ValueError
        If output_type is not 'count' or 'fraction'.
    """
    #validate output_type first so that a bad value fails before any filtering work is done
    if output_type == 'count':
        ylabel = 'Number of PTMs'
        labelpad = 10
    elif output_type == 'fraction':
        ylabel = 'Fraction of PTMs'
        labelpad = 0.02
    else:
        raise ValueError("output_type must be either 'count' or 'fraction'")

    filter_arguments = helpers.extract_filter_kwargs(**kwargs)
    helpers.check_filter_kwargs(filter_arguments)
    filtered_ptms = helpers.filter_ptms(ptms, **filter_arguments)

    #count PTMs per modification class before and after filtering
    original_mods = ptms['Modification Class'].value_counts()
    original_mods.name = 'Original'
    filtered_mods = filtered_ptms['Modification Class'].value_counts()
    filtered_mods.name = 'Filtered'

    #keep the top n classes ranked by their count BEFORE filtering, and restrict the filtered counts to those same classes
    original_mods = original_mods.sort_values(ascending = False).head(topn)
    filtered_mods = filtered_mods[filtered_mods.index.isin(original_mods.index)]

    if output_type == 'fraction':
        #convert counts to fractions
        #NOTE(review): denominators only include the displayed top n classes, so fractions sum to 1 across the plotted bars -- confirm this is the intended definition
        original_mods = original_mods / original_mods.sum()
        filtered_mods = filtered_mods / filtered_mods.sum()

    #align filtered values to the original ordering; classes entirely removed by the filter get a bar of height 0
    filtered_mods = filtered_mods.reindex(original_mods.index, fill_value = 0)
    plt_data = pd.concat([original_mods, filtered_mods], axis = 1)
    plt_data = plt_data.fillna(0)

    if ax is None:
        fig, ax = plt.subplots(figsize = (3,2))

    plt_data.plot(kind = 'bar', ax = ax)

    ax.set_ylabel(ylabel)

    #annotate tops of bars with numbers
    for p in ax.patches:
        height = p.get_height()
        if output_type == 'count':
            label = str(int(height))
        else:
            #for fractions, convert to percentage and round to 2 decimal places
            label = str(round(height * 100, 2)) + '%'
        ax.annotate(label, (p.get_x() * 1.005, height + labelpad), fontsize = 8, rotation = 90, annotation_clip=False)

    #remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


def assess_filter_range(ptms, min_value = 0, max_value = None, step = None, filter_type = 'min_studies', phospho_only_evidence_filter = True, ax = None, fontsize = 11):
    """
    Apply a single evidence filter across a range of thresholds and plot, for each threshold, the number of PTMs remaining (left y-axis, blue) and the fraction of remaining PTMs whose 'Modification Class' is 'Phosphorylation' (right y-axis, red).

    Parameters
    ----------
    ptms : pd.DataFrame
        Dataframe containing PTM data. Must contain a 'Modification Class' column. If max_value is None, it must also contain the evidence columns used to derive it: 'MS_LIT' and 'LT_LIT' (min_studies), 'Number of Compendia' (min_compendia), 'MS_LIT' and 'MS_CST' (min_MS), or 'LT_LIT' (min_LTP).
    min_value : int, optional
        Smallest threshold to test. The default is 0.
    max_value : int, optional
        Largest threshold to test. If None, the maximum observed in `ptms` for the chosen filter_type is used. The default is None.
    step : int, optional
        Spacing between tested thresholds. If None, roughly 10% of max_value is used, with a minimum of 1. The default is None.
    filter_type : str, optional
        Which filter to vary, one of 'min_studies', 'min_compendia', 'min_MS', or 'min_LTP'. These are passed to helpers.filter_ptms as min_studies, min_compendia, min_MS_observations, and min_LTP_studies, respectively. The default is 'min_studies'.
    phospho_only_evidence_filter : bool, optional
        Passed through unchanged to helpers.filter_ptms. The default is True.
    ax : matplotlib.axes.Axes, optional
        The axes to plot on. If None, a new figure and axes will be created. A twin y-axis is added for the phosphorylation fraction. The default is None.
    fontsize : int, optional
        Font size used for the axis labels. The default is 11.

    Raises
    ------
    ValueError
        If filter_type is not recognized, if max_value cannot be determined from `ptms` (e.g. no rows or only missing values), or if step is not positive.
    """
    #map each filter_type to the keyword argument that helpers.filter_ptms expects
    filter_kwarg_names = {'min_studies': 'min_studies', 'min_compendia': 'min_compendia', 'min_MS': 'min_MS_observations', 'min_LTP': 'min_LTP_studies'}

    #check filter_type is valid
    if filter_type not in filter_kwarg_names:
        raise ValueError("filter_type must be one of ['min_studies', 'min_compendia', 'min_MS', 'min_LTP']")

    #grab max value if not provided
    if max_value is None:
        if filter_type == 'min_studies':
            max_value = ptms[['MS_LIT', 'LT_LIT']].sum(axis = 1).max()
        elif filter_type == 'min_compendia':
            max_value = ptms['Number of Compendia'].max()
        elif filter_type == 'min_MS':
            max_value = ptms[['MS_LIT', 'MS_CST']].sum(axis = 1).max()
        else:
            max_value = ptms['LT_LIT'].max()

        #an empty or all-missing column yields NaN, which cannot define a range
        if pd.isna(max_value):
            raise ValueError("max_value could not be determined from ptms for filter_type '" + filter_type + "'; provide max_value explicitly")

    #if specific step value not provided, round to nearest 10% of max value
    if step is None:
        #round() returns 0 for max_value < 5, which is not a usable step for np.arange, so never go below 1
        step = max(1, round(max_value/10))
    elif step <= 0:
        raise ValueError("step must be a positive number")

    #filter PTMs using the indicated filter type method for value in range
    num_ptms = []
    frac_phospho = []
    filter_kwarg = filter_kwarg_names[filter_type]
    x = np.arange(min_value, int(max_value) + 1, step)
    for threshold in x:
        filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, phospho_only_evidence_filter = phospho_only_evidence_filter, **{filter_kwarg: threshold})

        #save number of PTMs and the fraction that are phosphorylated
        num_ptms.append(filtered_ptms.shape[0])
        if filtered_ptms.shape[0] == 0:
            #no PTMs left, so the fraction is undefined
            frac_phospho.append(np.nan)
            continue

        #value_counts ignores missing classes, so the denominator only counts PTMs with a modification class
        mod_counts = filtered_ptms['Modification Class'].value_counts()
        if 'Phosphorylation' in mod_counts.index:
            frac_phospho.append(mod_counts['Phosphorylation']/mod_counts.sum())
        else:
            frac_phospho.append(0)

    x_label_dict = {'min_studies': 'Minimum number of\nliterature reports', 'min_LTP': 'Minimum number of\nLow-throughput Studies', 'min_MS': 'Minimum number of\nMS Observations', 'min_compendia': 'Minimum number of\ncompendia'}

    if ax is None:
        fig, ax = plt.subplots(figsize = (3,3))

    #left axis: number of PTMs remaining at each threshold
    ax.plot(x, num_ptms, color = 'blue')
    ax.set_ylabel('Number of PTMs', color = 'blue', fontsize = fontsize)
    #change color of tick labels
    ax.tick_params(axis='y', labelcolor='blue')
    ax.set_xlabel(x_label_dict[filter_type], fontsize = fontsize)

    #right axis: fraction of remaining PTMs that are phosphorylation
    ax2 = ax.twinx()
    ax2.plot(x, frac_phospho, color = 'red')
    ax2.set_ylabel('Phosphorylation\nFraction', color = 'red', fontsize = fontsize)
    ax2.tick_params(axis='y', labelcolor='red')