Source code for ptm_pose.analyze.filter



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#ptm_pose imports
from ptm_pose import helpers


[docs] def plot_filter_impact(ptms, output_type = 'count', topn = 10, ax = None, **kwargs): """ Given a dataframe of PTMs and a set of filter arguments to be passed to helpers.filter_ptms, this function will plot the number or fraction of PTMs that are retained after filtering for each modification type Parameters ---------- ptms : pd.DataFrame Dataframe containing PTM data with a column 'Modification Class' that contains the type of modification (e.g. phosphorylation, acetylation, etc.) output_type : str, optional Type of output to plot, either 'count' or 'fraction'. The default is 'count'. topn : int, optional The number of top modification classes to plot. The default is 10. ax : matplotlib.axes.Axes, optional The axes to plot on. If None, a new figure and axes will be created. The default is None. **kwargs : keyword arguments Additional keyword arguments to be passed to the filter_ptms function (e.g. min_studies, min_compendia, etc.). These will be extracted and checked for validity. """ filter_arguments = helpers.extract_filter_kwargs(**kwargs) helpers.check_filter_kwargs(filter_arguments) filtered_ptms=helpers.filter_ptms(ptms, **filter_arguments) original_mods = ptms['Modification Class'].value_counts() original_mods.name = 'Original' filtered_mods = filtered_ptms['Modification Class'].value_counts() filtered_mods.name = 'Filtered' #sort by top n values after filtering original_mods = original_mods.sort_values(ascending = False).head(topn) filtered_mods = filtered_mods[filtered_mods.index.isin(original_mods.index)] #grab y-axis label and labelpad based on output_type if output_type == 'fraction': #convert counts to fractions original_mods = original_mods / original_mods.sum() filtered_mods = filtered_mods / filtered_mods.sum() ylabel = 'Fraction of PTMs' labelpad = 0.02 elif output_type == 'count': ylabel = 'Number of PTMs' labelpad = 10 else: raise ValueError("output_type must be either 'count' or 'fraction'") plt_data = pd.concat([original_mods, filtered_mods], axis = 1) plt_data = plt_data.fillna(0) if ax is None: fig, ax = plt.subplots(figsize = (3,2)) plt_data.plot(kind = 'bar', ax = ax) ax.set_ylabel(ylabel) #annotate tops of bars with numbers for p in ax.patches: if output_type == 'count': ax.annotate(str(int(p.get_height())), (p.get_x() * 1.005, p.get_height() + labelpad), fontsize = 8, rotation = 90, annotation_clip=False) elif output_type == 'fraction': #for fractions, convert to percentage and round to 2 decimal places ax.annotate(str(round(p.get_height() * 100, 2)) + '%', (p.get_x() * 1.005, p.get_height() + labelpad), fontsize = 8, rotation = 90, annotation_clip=False) #remove top and right spines ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False)
[docs] def assess_filter_range(ptms, min_value = 0, max_value = None, step = None, filter_type = 'min_studies', phospho_only_evidence_filter = False, ax = None, fontsize = 11): """ Given a dataframe of PTMs and a PTM evidence filter type, assess how adjusting the filter value impacts the number of PTMs and the fraction of those PTMs that are phosphorylated. This is done by plotting the number of PTMs and the fraction of phosphorylated PTMs as a function of the filter value. Parameters ---------- ptms : pd.DataFrame Dataframe containing PTM data, either the spliced_ptms or altered_flank data min_value : int, optional The minimum value for the filter. The default is 0. max_value : int, optional The maximum value for the filter. If None, the maximum value will be determined based on the filter type. The default is None. step : int, optional The step size for the filter. If None, the step size will be determined based on the maximum value. The default is None. filter_type : str, optional The type of filter to apply. Must be one of ['min_studies', 'min_compendia', 'min_MS', 'min_LTP']. The default is 'min_studies'. phospho_only_evidence_filter : bool, optional Whether to apply the phospho only evidence filter (only filter phosphorylatio sites). The default is False. ax : matplotlib.axes.Axes, optional The axes to plot on. If None, a new figure and axes will be created. The default is None. fontsize : int, optional The font size for the plot. The default is 11. """ num_ptms = [] frac_phospho = [] #check filter_type is valid if filter_type not in ['min_studies', 'min_compendia', 'min_MS', 'min_LTP']: raise ValueError("filter_type must be one of ['min_studies', 'min_compendia', 'min_MS', 'min_LTP']") #grab max value if not provided if max_value is None: if filter_type == 'min_studies': max_value = int(ptms[['MS_LIT', 'LT_LIT']].sum(axis =1).max()) elif filter_type == 'min_compendia': max_value = ptms['Number of Compendia'].max() elif filter_type == 'min_MS': max_value = ptms[['MS_LIT', 'MS_CST']].sum(axis = 1).max() elif filter_type == 'min_LTP': max_value = ptms['LT_LIT'].max() #if specific step value not provided, round to nearest 10% of max value if step is None and max_value >= 1: step = round(max_value/10) elif step is None and max_value < 1: step = max_value/10 #filter PTMs using the indicated filter type method for value in range x = np.arange(min_value, int(max_value) + 1, step) for i in x: #filter PTMs if filter_type == 'min_studies': filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, min_studies = i, phospho_only_evidence_filter = phospho_only_evidence_filter) elif filter_type == 'min_compendia': filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, min_compendia = i, phospho_only_evidence_filter=phospho_only_evidence_filter) elif filter_type == 'min_MS': filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, min_MS_observations = i, phospho_only_evidence_filter=phospho_only_evidence_filter) elif filter_type == 'min_LTP': filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, min_LTP_studies = i, phospho_only_evidence_filter=phospho_only_evidence_filter) #save number of PTMs and the fraction that are phosphorylated num_ptms.append(filtered_ptms.shape[0]) #fraction of PTMs that are phosphorylation sites if filtered_ptms.shape[0] > 0 and 'Phosphorylation' in filtered_ptms['Modification Class'].unique(): filtered_mods = filtered_ptms['Modification Class'].value_counts() phospho_fraction = filtered_mods['Phosphorylation']/filtered_mods.sum() frac_phospho.append(phospho_fraction) elif filtered_ptms.shape[0] == 0: frac_phospho.append(np.nan) else: frac_phospho.append(0) x_label_dict = {'min_studies': 'Minimum number of\nliterature reports', 'min_LTP': 'Minimum number of\nLow-throughput Studies', 'min_MS': 'Minimum number of\nMS Observations', 'min_compendia': 'Minimum number of\ncompendia'} if ax is None: fig, ax = plt.subplots(figsize = (3,3)) ax.plot(x, num_ptms, color = 'blue') ax.set_ylabel('Number of PTMs', color = 'blue', fontsize = fontsize) #change color of tick labels ax.tick_params(axis='y', labelcolor='blue') ax.set_xlabel(x_label_dict[filter_type], fontsize = fontsize) ax2 = ax.twinx() ax2.plot(x, frac_phospho, color = 'red') ax2.set_ylabel('Phosphorylation\nFraction', color = 'red', fontsize = fontsize) ax2.tick_params(axis='y', labelcolor='red')