import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#ptm_pose imports
from ptm_pose import helpers
[docs]
def plot_filter_impact(ptms, output_type = 'count', topn = 10, ax = None, **kwargs):
"""
Given a dataframe of PTMs and a set of filter arguments to be passed to helpers.filter_ptms, this function will plot the number or fraction of PTMs that are retained after filtering for each modification type
Parameters
----------
ptms : pd.DataFrame
Dataframe containing PTM data with a column 'Modification Class' that contains the type of modification (e.g. phosphorylation, acetylation, etc.)
output_type : str, optional
Type of output to plot, either 'count' or 'fraction'. The default is 'count'.
topn : int, optional
The number of top modification classes to plot. The default is 10.
ax : matplotlib.axes.Axes, optional
The axes to plot on. If None, a new figure and axes will be created. The default is None.
**kwargs : keyword arguments
Additional keyword arguments to be passed to the filter_ptms function (e.g. min_studies, min_compendia, etc.). These will be extracted and checked for validity.
"""
filter_arguments = helpers.extract_filter_kwargs(**kwargs)
helpers.check_filter_kwargs(filter_arguments)
filtered_ptms=helpers.filter_ptms(ptms, **filter_arguments)
original_mods = ptms['Modification Class'].value_counts()
original_mods.name = 'Original'
filtered_mods = filtered_ptms['Modification Class'].value_counts()
filtered_mods.name = 'Filtered'
#sort by top n values after filtering
original_mods = original_mods.sort_values(ascending = False).head(topn)
filtered_mods = filtered_mods[filtered_mods.index.isin(original_mods.index)]
#grab y-axis label and labelpad based on output_type
if output_type == 'fraction':
#convert counts to fractions
original_mods = original_mods / original_mods.sum()
filtered_mods = filtered_mods / filtered_mods.sum()
ylabel = 'Fraction of PTMs'
labelpad = 0.02
elif output_type == 'count':
ylabel = 'Number of PTMs'
labelpad = 10
else:
raise ValueError("output_type must be either 'count' or 'fraction'")
plt_data = pd.concat([original_mods, filtered_mods], axis = 1)
plt_data = plt_data.fillna(0)
if ax is None:
fig, ax = plt.subplots(figsize = (3,2))
plt_data.plot(kind = 'bar', ax = ax)
ax.set_ylabel(ylabel)
#annotate tops of bars with numbers
for p in ax.patches:
if output_type == 'count':
ax.annotate(str(int(p.get_height())), (p.get_x() * 1.005, p.get_height() + labelpad), fontsize = 8, rotation = 90, annotation_clip=False)
elif output_type == 'fraction':
#for fractions, convert to percentage and round to 2 decimal places
ax.annotate(str(round(p.get_height() * 100, 2)) + '%', (p.get_x() * 1.005, p.get_height() + labelpad), fontsize = 8, rotation = 90, annotation_clip=False)
#remove top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
[docs]
def assess_filter_range(ptms, min_value = 0, max_value = None, step = None, filter_type = 'min_studies', phospho_only_evidence_filter = False, ax = None, fontsize = 11):
"""
Given a dataframe of PTMs and a PTM evidence filter type, assess how adjusting the filter value impacts the number of PTMs and the fraction of those PTMs that are phosphorylated. This is done by plotting the number of PTMs and the fraction of phosphorylated PTMs as a function of the filter value.
Parameters
----------
ptms : pd.DataFrame
Dataframe containing PTM data, either the spliced_ptms or altered_flank data
min_value : int, optional
The minimum value for the filter. The default is 0.
max_value : int, optional
The maximum value for the filter. If None, the maximum value will be determined based on the filter type. The default is None.
step : int, optional
The step size for the filter. If None, the step size will be determined based on the maximum value. The default is None.
filter_type : str, optional
The type of filter to apply. Must be one of ['min_studies', 'min_compendia', 'min_MS', 'min_LTP']. The default is 'min_studies'.
phospho_only_evidence_filter : bool, optional
Whether to apply the phospho only evidence filter (only filter phosphorylatio sites). The default is False.
ax : matplotlib.axes.Axes, optional
The axes to plot on. If None, a new figure and axes will be created. The default is None.
fontsize : int, optional
The font size for the plot. The default is 11.
"""
num_ptms = []
frac_phospho = []
#check filter_type is valid
if filter_type not in ['min_studies', 'min_compendia', 'min_MS', 'min_LTP']:
raise ValueError("filter_type must be one of ['min_studies', 'min_compendia', 'min_MS', 'min_LTP']")
#grab max value if not provided
if max_value is None:
if filter_type == 'min_studies':
max_value = int(ptms[['MS_LIT', 'LT_LIT']].sum(axis =1).max())
elif filter_type == 'min_compendia':
max_value = ptms['Number of Compendia'].max()
elif filter_type == 'min_MS':
max_value = ptms[['MS_LIT', 'MS_CST']].sum(axis = 1).max()
elif filter_type == 'min_LTP':
max_value = ptms['LT_LIT'].max()
#if specific step value not provided, round to nearest 10% of max value
if step is None and max_value >= 1:
step = round(max_value/10)
elif step is None and max_value < 1:
step = max_value/10
#filter PTMs using the indicated filter type method for value in range
x = np.arange(min_value, int(max_value) + 1, step)
for i in x:
#filter PTMs
if filter_type == 'min_studies':
filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, min_studies = i, phospho_only_evidence_filter = phospho_only_evidence_filter)
elif filter_type == 'min_compendia':
filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, min_compendia = i, phospho_only_evidence_filter=phospho_only_evidence_filter)
elif filter_type == 'min_MS':
filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, min_MS_observations = i, phospho_only_evidence_filter=phospho_only_evidence_filter)
elif filter_type == 'min_LTP':
filtered_ptms = helpers.filter_ptms(ptms, report_removed = False, min_LTP_studies = i, phospho_only_evidence_filter=phospho_only_evidence_filter)
#save number of PTMs and the fraction that are phosphorylated
num_ptms.append(filtered_ptms.shape[0])
#fraction of PTMs that are phosphorylation sites
if filtered_ptms.shape[0] > 0 and 'Phosphorylation' in filtered_ptms['Modification Class'].unique():
filtered_mods = filtered_ptms['Modification Class'].value_counts()
phospho_fraction = filtered_mods['Phosphorylation']/filtered_mods.sum()
frac_phospho.append(phospho_fraction)
elif filtered_ptms.shape[0] == 0:
frac_phospho.append(np.nan)
else:
frac_phospho.append(0)
x_label_dict = {'min_studies': 'Minimum number of\nliterature reports', 'min_LTP': 'Minimum number of\nLow-throughput Studies', 'min_MS': 'Minimum number of\nMS Observations', 'min_compendia': 'Minimum number of\ncompendia'}
if ax is None:
fig, ax = plt.subplots(figsize = (3,3))
ax.plot(x, num_ptms, color = 'blue')
ax.set_ylabel('Number of PTMs', color = 'blue', fontsize = fontsize)
#change color of tick labels
ax.tick_params(axis='y', labelcolor='blue')
ax.set_xlabel(x_label_dict[filter_type], fontsize = fontsize)
ax2 = ax.twinx()
ax2.plot(x, frac_phospho, color = 'red')
ax2.set_ylabel('Phosphorylation\nFraction', color = 'red', fontsize = fontsize)
ax2.tick_params(axis='y', labelcolor='red')