Source code for ptm_pose.analyze.enzyme

import numpy as np
import pandas as pd
import pickle

import os
from tqdm import tqdm

#plotting functions
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import seaborn as sns

#custom stat functions and other helper functions
from ptm_pose import stat_utils, pose_config, helpers, annotate
import scipy.stats as stats



#optional analysis packages
try:
    import kinase_library as kl
    kinase_library = True
except ImportError:
    kinase_library = False

package_dir = os.path.dirname(os.path.abspath(__file__))


[docs] def compare_KL_for_sequence(inclusion_seq, exclusion_seq, dpsi = None, comparison_type = 'percentile'): """ Given two sequences, compare the kinase library scores, percentiles, or ranks for each sequence. Optionally, provide a dPSI value to calculate the relative change in preference for each kinase. Parameters ---------- inclusion_seq : str sequence to score for inclusion preference, with modification lowercased exclusion_seq : str sequence to score for exclusion preference, with modification lowercased dpsi : float dPSI value for the PTM event, which will be used to calculate the relative change in preference for each kinase (score difference * dPSI). Default is None. comparison_type : str type of comparison to perform. Can be 'percentile', 'score', or 'rank'. Default is 'percentile'. """ #get percentiles for inclusion sequence s_inc = kl.Substrate(inclusion_seq) inn_perc = s_inc.percentile(sort_by = 'name') #get percentiles for exclusion sequence s_exc = kl.Substrate(exclusion_seq) if comparison_type == 'percentile': inc_results = s_inc.percentile(sort_by = 'name') ex_results = s_exc.percentile(sort_by = 'name') elif comparison_type == 'score': inc_results = s_inc.score() ex_results = s_exc.score() elif comparison_type == 'rank': inc_results = s_inc.rank() ex_results = s_exc.rank() results = pd.concat([inc_results, ex_results], axis = 1) results.columns = [f'Inclusion {comparison_type}', f'Exclusion {comparison_type}'] results['Difference'] = results[f'Inclusion {comparison_type}'] - results[f'Exclusion {comparison_type}'] results['Absolute Difference'] = results['Difference'].abs() results = results.sort_values(by = 'Absolute Difference', ascending = False) if dpsi is not None: results['dPSI'] = dpsi results['Relative Change in Preference'] = results['Difference'] * dpsi return results
[docs] def get_all_KL_scores(seq_data, seq_col, kin_type = ['ser_thr', 'tyrosine'], score_type = 'percentiles'): """ Given a dataset with flanking sequences, score each flanking sequence Parameters ---------- seq_data : pandas dataframe processed dataframe containing flanking sequences to score seq_col : str column in seq_data containing the flanking sequences kin_type : list list of kinase types to score. Can be 'ser_thr' or 'ST' for serine/threonine kinases, or 'tyrosine' or 'Y' for tyrosine kinases. Default is ['ser_thr', 'tyrosine']. score_type : str type of score to calculate. Can be 'percentile', 'score', or 'rank'. Default is 'percentile'. Returns ------- merged_data : dict dictionary containing the merged dataframes for each kinase type with KinaseLibrary scores """ phospho_predict = kl.PhosphoProteomics(seq_data, seq_col = seq_col) if isinstance(kin_type, list): if len(kin_type) == 1: phospho_predict.predict(kin_type = kin_type) merged_data = phospho_predict.merge_data_scores(kin_type = kin_type, score_type = score_type) else: merged_data = {} for ktype in kin_type: if ktype in ['ser_thr', 'tyrosine', 'Y', 'ST']: if ktype == 'Y': ktype == 'tyrosine' elif ktype == 'ST': ktype = 'ser_thr' phospho_predict.predict(kin_type = ktype) merged_data[ktype] = phospho_predict.merge_data_scores(kin_type = ktype, score_type = score_type) else: print(f"{ktype} not recognized. Must be 'ser_thr' or 'ST' for serine/threonine, or 'tyrosine' or 'Y' for tyrosine") elif isinstance(kin_type, str): phospho_predict.predict(metric = score_type, kin_type = kin_type) merged_data = phospho_predict.merge_data_scores(kin_type = kin_type, score_type = score_type) return merged_data
class KL_flank_analysis: def __init__(self, altered_flanks, metric = 'percentile', alpha = 0.05, min_dpsi = 0.2, **kwargs): """ Class for comparing kinase preferences for inclusion and exclusion sequences from altered flanking sequences, using motif scoring from Kinase Library. Parameters ---------- altered_flanks : pandas dataframe dataframe containing altered flanking sequences (output during projection), including columns for 'Region ID', 'Gene', 'Residue', 'PTM Position in Isoform', 'Inclusion Flanking Sequence', 'Exclusion Flanking Sequence', 'Modification Class', and 'dPSI'. metric : str metric to use for scoring. Can be 'percentile', 'score', or 'rank'. Default is 'percentile'. alpha : float significance threshold for p-value. Default is 0.05. min_dpsi : float effect size threshold for dPSI. Default is 0.2. **kwargs : dict additional keyword arguments to pass to the filter_ptms function """ #check to make sure kinase library is installed if not kinase_library: print('Kinase library package not installed. To use this functionality, please install the kinase library package by running `pip install kinase-library`') return None #reduce to phosphorylation sites altered_flanks = altered_flanks[altered_flanks['Modification Class'] == 'Phosphorylation'].copy() #restrict to only significant PTMs filter_arguments = helpers.extract_filter_kwargs(alpha = alpha, min_dpsi = min_dpsi, modification_class = 'Phosphorylation', **kwargs) altered_flanks = helpers.filter_ptms(altered_flanks, **filter_arguments) #restrict to only necessary columns and remove duplicate entries (from canonical and alternative isoforms) optional_cols = ['dPSI'] cols = ['Region ID', 'Gene', 'Residue', 'PTM Position in Isoform', 'Inclusion Flanking Sequence', 'Exclusion Flanking Sequence'] + [c for c in optional_cols if c in altered_flanks.columns] altered_flanks = altered_flanks.sort_values(by = 'Isoform Type', ascending = False) altered_flanks = altered_flanks[cols].drop_duplicates(keep = 'first') altered_flanks.reset_index(drop = True, inplace = True) #store result self.altered_flanks = altered_flanks.copy() self.metric = metric if 'dPSI' in altered_flanks.columns: self.optional_cols = ['dPSI'] def analyze_single_ptm(self, gene, loc): """ Score a single PTM for its flanking sequence in the inclusion and exclusion isoforms using Kinase-Library """ ptm_data = self.altered_flanks[(self.altered_flanks['Gene'] == gene) & (self.altered_flanks['PTM Position in Isoform'] == loc)].copy() #remove duplicate entries with same splice event and .drop_duplicates(subset = ['Inclusion Flanking Sequence', 'Exclusion Flanking Sequence', 'Region ID']) if ptm_data.shape[0] == 0: raise ValueError(f"No flanking sequence data found for gene {gene} at position {loc}") elif ptm_data.shape[0] == 1: # If there's only one row, squeeze it to remove the extra dimension ptm_data = ptm_data.squeeze() dpsi = ptm_data['dPSI'] if 'dPSI' in ptm_data else None results = compare_KL_for_sequence(ptm_data['Inclusion Flanking Sequence'], ptm_data['Exclusion Flanking Sequence'], dpsi = dpsi) return results elif ptm_data.shape[0] > 1: # if multiple entries, perform analysis on all of them print(f"Multiple splice events impacting the flanking sequence of {ptm_data['Residue'].values[0]}{loc} in {gene} found. Performing analysis on all of them.") results = {} for i, row in ptm_data.iterrows(): dpsi = row['dPSI'] if 'dPSI' in row else None results[row['Region ID']] = compare_KL_for_sequence(row['Inclusion Flanking Sequence'], row['Exclusion Flanking Sequence'], dpsi = dpsi) return results def score_all_ptms(self): """ Score the flanking sequences of PTMs for both the inclusion and exclusion isoforms using Kinase-Library """ inclusion_sequences = self.altered_flanks[[col for col in self.altered_flanks.columns if 'Exclusion' not in col]].copy() exclusion_sequences = self.altered_flanks[[col for col in self.altered_flanks.columns if 'Inclusion' not in col]].copy() print('Score sequences for inclusion isoforms') inclusion_percentile = get_all_KL_scores(inclusion_sequences, 'Inclusion Flanking Sequence') print('\nScoring sequences for exclusion isoforms') exclusion_percentile = get_all_KL_scores(exclusion_sequences, 'Exclusion Flanking Sequence') self.inclusion_percentile = inclusion_percentile self.exclusion_percentile = exclusion_percentile def melt_score_df(self, kin_type = 'ser_thr', flank_type = 'Inclusion'): """ Melt the percentile dataframe to long format for easier plotting """ if not hasattr(self, 'inclusion_percentile') or not hasattr(self, 'exclusion_percentile'): self.score_all_ptms() #check to make sure kin_type is valid if kin_type not in ['ser_thr', 'tyrosine']: raise ValueError(f"kin_type must be either 'ser_thr' or 'ST' for serine/threonine, or 'tyrosine'/'Y' for tyrosine, but got {kin_type}") elif kin_type == 'Y': kin_type = 'tyrosine' elif kin_type == 'ST': kin_type = 'ser_thr' #grab percentile data seq_col = f'{flank_type} Flanking Sequence' if flank_type == 'Inclusion': percentile_df = self.inclusion_percentile[kin_type].copy() else: percentile_df = self.exclusion_percentile[kin_type].copy() #remove unneeded columns percentile_df = percentile_df.drop(columns = ['phos_res', 'Sequence']) #melt data percentile_df = percentile_df.melt(id_vars = ['Region ID', 'Gene', 'Residue', 'PTM Position in Isoform', seq_col] + self.optional_cols, var_name = 'Kinase', value_name = self.metric.capitalize()) return percentile_df def combine_score_dfs(self): """ Combine the inclusion and exclusion score dataframes into a single dataframe that calculates differences in predicted preference between inclusion and exclusion isoforms """ if not hasattr(self, 'inclusion_percentile') or not hasattr(self, 'exclusion_percentile'): self.score_all_ptms() melted_inclusion = pd.concat( [self.melt_percentile_df(flank_type = 'Inclusion', kin_type = 'ser_thr'), self.melt_percentile_df(flank_type = 'Inclusion', kin_type = 'tyrosine')] ) melted_inclusion = melted_inclusion.rename(columns = {'Percentile': f'Inclusion {self.score_type}'}) melted_exclusion = pd.concat( [self.melt_percentile_df(flank_type = 'Exclusion', kin_type = 'ser_thr'), self.melt_percentile_df(flank_type = 'Inclusion', kin_type = 'tyrosine')] ) melted_exclusion = melted_exclusion.rename(columns = {'Percentile': f'Exclusion {self.score_type}'}) combined = melted_inclusion.merge(melted_exclusion, on = ['Region ID', 'Gene', 'Residue', 'PTM Position in Isoform', 'Kinase'] + self.optional_cols, how = 'inner') #calculate the difference combined['Difference'] = combined[f'Inclusion {self.score_type}'] - combined[f'Exclusion {self.score_type}'] combined['Absolute Difference'] = combined['Difference'].abs() combined = combined.sort_values(by = 'Absolute Difference', ascending = False) if 'dPSI' in combined.columns: combined['Relative Change in Preference'] = combined['Difference'] * combined['dPSI'] self.summary_df = combined def analyze_all_ptms(self): """ Perform the full analysis on all PTMs, including scoring and melting the dataframes """ self.score_all_ptms() self.combine_score_dfs() def get_kinases_with_largest_changes(self, kinase_type = 'ST', top_n = 5, difference_type = 'relative', agg = 'median'): """ Get the top n kinases with the largest changes in preference for inclusion or exclusion isoforms. Difference can be calculated as the normal difference in preference, the absolute difference in preference, or the relative change in preference (normalized by magnitude of splicing change, or dPSI). Parameters ---------- kinase_type : str type of kinase to restrict analysis to. Can be 'ST' or 'ser_thr' for serine/threonine kinases, or 'Y' or 'tyrosine' for tyrosine kinases. Default is 'ST'. top_n : int number of top kinases to return. Default is 5. difference_type : str type of difference to calculate. Can be 'normal' (difference in preference), 'absolute' (absolute difference in preference), or 'relative' (relative change in preference normalized by magnitude of splicing change, or dPSI). Default is 'relative'. agg : str aggregation function to use across all interactions for calculating top kinases. Default is 'median'. """ #grab kinase of interest if kinase_type == 'ST' or kinase_type == 'ser_thr': kinase_df = self.summary_df[self.summary_df['Residue'].str.contains('S|T')].copy() elif kinase_type == 'Y' or kinase_type == 'tyrosine': kinase_df = self.summary_df[self.summary_df['Residue'].str.contains('Y')].copy() if difference_type == 'normal': comp_col = 'Absolute Difference' elif difference_type == 'absolute': comp_col == 'Absolute Difference' elif difference_type == 'relative': comp_col = 'Relative Change in Preference' kinase_df['Absolute Relative'] = kinase_df['Relative Change in Preference'].abs() comp_col = 'Absolute Relative' else: raise ValueError('difference_type must be either "normal", "absolute", or "relative"') #check to make sure provided kinase type is valid if kinase_type not in ['ST','Y','ser_thr','tyrosine']: raise ValueError('kinase_type must be either "ST" or "ser_thr" for serine/threonine kinases or "Y" or "tyrosine" for tyrosine kinases') top_kinases = list(kinase_df.groupby('Kinase')[comp_col].agg(agg).sort_values(ascending = False).head(top_n).index) kinase_df = kinase_df[kinase_df['Kinase'].isin(top_kinases)] return kinase_df def plot_top_kinases(self, top_n = 5, kinase_type = 'ST', difference_type = 'relative', agg = 'median', metric_type = 'percentile', ax = None): """ Plot a swarm plot showing the top n kinases with the largest changes in preference for inclusion or exclusion isoforms" """ top_kinase_df = self.get_kinases_with_largest_changes(kinase_type = kinase_type, top_n = top_n, difference_type = difference_type, agg = agg) if ax is None: fig, ax = plt.subplots(figsize = (5, 4)) #plot swarm plot y_data = 'Relative Change in Preference' if difference_type == 'relative' else 'Absolute Difference' if difference_type == 'absolute' else 'Difference' sns.swarmplot(data = top_kinase_df, x = 'Kinase', y = y_data, dodge = True, ax = ax) ax.axhline(0, color = 'black', linestyle = '--', linewidth = 0.5) if difference_type == 'relative': ax.set_ylabel('Relative Change\nin Preference\n(Change*dPSI)') elif difference_type == 'absolute': ax.set_ylabel(f'Absolute Difference\nin {metric_type.capitalize()}') elif difference_type == 'normal': ax.set_ylabel(f'Difference\nin {metric_type.capitalize()}') def plot_top_changes(self, gene = None, loc = None, top_n = 10, difference_type = 'relative', metric_type = 'percentile', ax = None): """ Plot the top n changes in preference after an altered flanking sequence event, either for a all events, a specific gene, or a specific PTM. Top changes can be calculated by percentile change, absolute percentile change, or relative change in preference (percentile change in affinity*dPSI) Parameters ---------- gene : str gene to restrict analysis to. Default is None, which will analyze all PTMs. loc : int location of PTM to restrict analysis to. Default is None, which will analyze all PTMs for the given gene or for all genes (if gene = None). top_n : int number of top changes to plot. Default is 10. difference_type : str type of difference to plot. Can be 'normal' (difference in preference), 'absolute' (absolute difference in preference), or 'relative' (relative change in preference normalized by magnitude of splicing change, or dPSI). Default is 'relative'. metric_type : str type of metric to plot. Default is 'percentile'. ax : matplotlib axis axis to plot on. Default is None, which will create a new figure and axis. """ if not hasattr(self, 'summary_df'): self.analyze_all_ptms() if gene is None and loc is None: ptm = self.summary_df.copy() ptm['Kinase'] = ptm['Kinase'] + '->' + ptm['Gene'] + ' ' + ptm['Residue'] + ptm['PTM Position in Isoform'].astype(str) elif gene is not None: if loc is None: ptm = self.summary_df[self.summary_df['Gene'] == gene].copy() ptm['Kinase'] = ptm['Kinase'] + '->' + ptm['Gene'] + ' ' + ptm['Residue'] + ptm['PTM Position in Isoform'].astype(str) else: ptm = self.summary_df[(self.summary_df['Gene'] == gene) & (self.summary_df['PTM Position in Isoform'] == loc)].copy() if difference_type == 'normal': ptm = ptm.sort_values(by = 'Absolute Difference', ascending = False) diff_col = 'Difference' y_label = f'Difference\nin {metric_type.capitalize()}' elif difference_type == 'absolute': ptm = ptm.sort_values(by = 'Absolute Difference', ascending = False) diff_col = 'Absolute Difference' y_label = f'Absolute Difference\nin {metric_type.capitalize()}' elif difference_type == 'relative': ptm['Relative Absolute'] = ptm['Relative Change in Preference'].abs() ptm = ptm.sort_values(by = 'Relative Absolute', ascending = False) diff_col = 'Relative Change in Preference' y_label = 'Relative Change\nin Preference\n(Change*dPSI)' else: raise ValueError('difference_type must be either "normal", "absolute", or "relative"') plt_data = ptm.head(top_n).sort_values(by = diff_col, ascending = True) #color bars by sign colors = ['coral' if x > 0 else 'lightblue' for x in plt_data[diff_col]] if ax is None: fig, ax = plt.subplots(figsize = (3, 4)) ax.barh(plt_data['Kinase'], plt_data[diff_col], color = colors, edgecolor = 'black', height=1) ax.set_xlabel(y_label) if gene is not None and loc is not None: ax.set_title(f'{gene} {loc} - Top {top_n} Changes') elif gene is not None: ax.set_title(f'{gene} - Top {top_n} Changes') else: ax.set_title(f'Top Changes') #construct legend if difference_type != 'absolute': handles = [plt.Rectangle((0,0),1,1, facecolor='coral', edgecolor='black'), plt.Rectangle((0,0),1,1, facecolor='lightblue', edgecolor='black')] labels = ['Increase in Preference\nupon Perturbation', 'Decrease in Preference \nupon Perturbation'] if difference_type == 'relative' else ['Prefers Inclusion Isoform', 'Prefers Exclusion Isoform'] ax.legend(handles, labels, loc=(1.05,0.5), fontsize=8) class KSEA: def __init__(self, ptms, database = 'OmniPath Writer', modification = None, min_dpsi = None, alpha = None, **kwargs): """ Adapted version of the Kinase Substrate Enrichment Algorithm (KSEA) from Casado et al. 2013. This version is designed to work with the PTM-POSE pipeline and uses the OmniPath database as default for enzyme-substrate information. Parameters ---------- ptms : pandas dataframe dataframe containing PTM information, including columns for 'PTM', 'Modification Class', 'p-value', and 'dPSI'. This dataframe should be the output from the PTM-POSE pipeline. alpha : float significance threshold for p-value. Default is 0.05. dpsi : float effect size threshold for dPSI. Default is 0.2. database : str database to use for enzyme-substrate information. Default is 'OmniPath Writer'. Other options include 'OmniPath Eraser', 'PhosphoSitePlus', or 'DEPOD'. modification : str modification type to restrict analysis to. Default is None, which includes all modifications. If provided, should be one of the modification classes in the 'Modification Class' column of the ptms dataframe. """ #filter ptms if desired if min_dpsi is None: min_dpsi = 0 if alpha is None: alpha = 1.1 #filter ptms ptms = helpers.filter_ptms(ptms, modification_class = modification, min_dpsi = min_dpsi, alpha = alpha, **kwargs) #if omnipath database is used, make sure it was specified to either look at reader or writer enzymes if 'OmniPath' in database and database not in ['OmniPath Writer', 'OmniPath Eraser']: raise ValueError('OmniPath database must be either "OmniPath Writer" or "OmniPath Eraser"') elif database == 'OmniPath Writer': self.annot_col = 'OmniPath:Writer Enzyme' #use writer enzyme annotations elif database == 'OmniPath Eraser': self.annot_col = 'OmniPath:Eraser Enzyme' elif database not in ['PhosphoSitePlus', 'DEPOD', 'RegPhos', 'OmniPath Writer', 'OmniPath Eraser']: raise ValueError('database must be either "OmniPath Writer", "OmniPath Eraser", "PhosphoSitePlus", "DEPOD", or "RegPhos"') else: self.annot_col = f'{database}:Enzyme' #use enzyme annotations for other databases #load annotations if database == 'OmniPath Writer': self.annotations = annotate.process_database_annotations(database = 'OmniPath', annot_type = 'Writer Enzyme') elif database == 'OmniPath Eraser': self.annotations = annotate.process_database_annotations(database = 'OmniPath', annot_type = 'Eraser Enzyme') else: self.annotations = annotate.process_database_annotations(database = database, annot_type = 'Enzyme') self.enzymes = list(self.annotations.keys()) #add labels to the ptms dataframe ptms = helpers.add_ptm_column(ptms) ptms['Label'] = ptms['PTM'] + '-' + ptms['Modification Class'] self.ptms = ptms def runKSEA_singleenzyme(self, enzyme): """ Given a single enzyme, calculate the zscore for the enzyme based on the dPSI values of the PTMs in the dataframe. Parameters ---------- enzyme : str enzyme to calculate zscore for Returns ------- z : float zscore for the enzyme m : int number of identified substrates for the enzyme in the dataset """ annot = self.annotations[enzyme] #check the possible residues and modifications for the current enzyme (for establishing background) residues = set([x.split('_')[1][0] for x in annot]) mod = set([x.split('-')[1] for x in annot]) #filter the ptms dataframe to only include those that match the current enzyme tmp_ptms = self.ptms[(self.ptms['Residue'].isin(residues)) & (self.ptms['Modification Class'].isin(mod))] if tmp_ptms.empty: return np.nan, np.nan #get number of sites associated with enzyme m = tmp_ptms[tmp_ptms['Label'].isin(annot)].shape[0] if m == 0: return np.nan, np.nan #calculate the mean dPSI for all residues p_mean = tmp_ptms['dPSI'].mean() #calculate the mean dPSI for all residues modified by the current enzyme s_mean = tmp_ptms[tmp_ptms['Label'].isin(annot)]['dPSI'].mean() #calculate the stdev of dPSI for all residues p_std = tmp_ptms['dPSI'].std() #calculate the zscore for the current enzyme z = ((s_mean - p_mean) * (m**0.5)) / (p_std ) return z, m def runKSEA(self): """ Perform KSEA-style analysis for each kinase with substrates identified in experiment using dPSI values as quantitative measure. Returns ------- all_results: pandas dataframe melted dataframe including all KSEA results, including zscore, number of identified substrates (m), and false discovery rate zscore: pandas dataframe dataframe which contains zscores from all data columns, updated with new zscores calculated for data_col pvals: pandas dataframe dataframe which contains p-values from all data columns, updated with new p-values calculated for data_col FDR: pandas dataframe dataframe which contains false discover rate from all data columns, updated with new false discovery rate calculated for data_col m: pandas dataframe dataframe which contains number of identified substrates for each kianse from all data columns, updated with new m calculated for data_col """ results = pd.DataFrame(np.nan, columns = ['z', 'p', 'FDR', 'm'], index = self.enzymes) #iterate through each possible enzyme for enzyme in self.annotations: #get the zscore and number of substrates for the current enzyme z, m = self.runKSEA_singleenzyme(enzyme) results.loc[enzyme, 'm'] = m results.loc[enzyme, 'z'] = z #calculate p-values and FDR for the results results = results.dropna(how = 'all') results['p'] = stats.norm.cdf(-abs(results['z'].values)) results = results.sort_values(by = 'p', ascending = True) results['FDR'] = stat_utils.adjustP(results['p'].values, method = 'BH') self.results = results def plot_results(self, show_substrate_count = True, ax = None): """ Create a horizontal barplot of KSEA results, coloring based on significance and annotating with the number of substrates (if desired) Parameters ---------- show_substrate_count : bool whether to show the number of substrates on the right side of the plot. Default is True. ax : matplotlib axis axis to plot on. Default is None, which will create a new figure and axis. """ #create horizontal barplot of KSEA results, coloring based on significance plt_data = self.results.copy() #sort results by zscore plt_data = plt_data.sort_values('z') #plot results if ax is None: fig_height = len(plt_data)*0.2 fig, ax = plt.subplots(figsize = (3.5, fig_height)) colors = ['grey' if x > 0.05 else 'red' for x in plt_data['FDR']] ax.barh(plt_data.index, plt_data['z'], color = colors, edgecolor = 'black') ax.axvline(0, color = 'black') #set xlim to be equal on either side of 0 xlim = round(plt_data['z'].abs().max()) ax.set_xlim(-xlim, xlim) ax.set_xlabel('z-score') #add legend for significance legend_elements = [Patch(facecolor='grey', edgecolor='black', label='FDR > 0.05'), Patch(facecolor='red', edgecolor='black', label=r'$FDR \leq 0.05$')] ax.legend(handles = legend_elements) #annotate with number of substrates on the right side of plot if show_substrate_count: ax.text(xlim+0.1, len(plt_data)+1, 'm', va = 'center', ha = 'center', fontweight = 'bold') for i, v in enumerate(plt_data['m']): ax.text(xlim+0.1, i, str(int(v)), va = 'center', ha = 'center') def save_results(self, fname = 'ksea_results', odir = ''): """ Saves zscores, FDR, and m for all data columns in seperate .tsv files """ self.results.to_csv(f'{odir}{fname}.tsv', sep = '\t') class kstar_enrichment: def __init__(self, spliced_ptms, network_dir, background_ptms = None, impact_type = ['All', 'Included', 'Excluded'], phospho_type = 'Y', **kwargs): """ Given spliced ptm or PTMs with altered flanks and a single kstar network, get enrichment for each kinase in the network using a hypergeometric. Assumes the data has already been reduced to the modification of interest (phosphotyrosine or phoshoserine/threonine) Parameters ---------- network_dir : dict dictionary of networks with kinase-substrate information ptms : pandas dataframe differentially included ptms background_ptms: pd.DataFrame PTMs to consider as the background for enrichment purposes, which should overlap with the spliced ptms information provided (an example might be all identified events, whether or not they are significant). If not provided, will use all ptms in the phosphoproteome. impact_type: list of str or str type of impacts to perform enrichment analysis for. Can be 'All' (all significantly differentially included sites), 'Included' (sites with dPSI > 0), and/or 'Excluded' (sites with dPSI < 0). Default is to perform enrichment on all three groups. phospho_type : str type of phosphorylation event to extract. Can either by phosphotyrosine ('Y') or phosphoserine/threonine ('ST'). Default is 'Y'. """ #change phospho_type to list if not already if isinstance(phospho_type, list): pass elif phospho_type in ['Y', 'ST']: phospho_type = [phospho_type] elif phospho_type == 'All': phospho_type = ['Y', 'ST'] else: raise ValueError('phospho_type must be either Y, ST, or All') if isinstance(impact_type, list): #if impact type includes inclusion and/or exclusion, check if dPSI is present and assign impact type accordingly if 'Included' in impact_type or 'Excluded' in impact_type: if 'dPSI' in spliced_ptms.columns: spliced_ptms['Impact'] = spliced_ptms['dPSI'].apply(lambda x: 'Included' if x > 0 else 'Excluded') else: raise ValueError('spliced_ptms must contain dPSI column to determine impact type, please either provide dPSI or restrict impact_type to "All"') self.impact_type = impact_type elif isinstance(impact_type, str): if impact_type == 'Included' or impact_type == 'Excluded': if 'dPSI' not in spliced_ptms.columns: raise ValueError('spliced_ptms must contain dPSI column to determine impact type, please either provide dPSI or restrict impact_type to "All"') else: spliced_ptms['Impact'] = spliced_ptms['dPSI'].apply(lambda x: 'Included' if x > 0 else 'Excluded') self.impact_type = [impact_type] elif impact_type == 'All': self.impact_type = ['All'] else: raise ValueError('impact_type must be at least one of either "All", "Included", or "Excluded"') self.significant_ptms = {} self.background_ptms = {} self.networks = {} for ptype in phospho_type: if ptype == 'Y': print('\nProcessing differentially included phosphotyrosine data') elif ptype == 'ST': print('\nProcessing differentially included phosphoserine/threonine data') #process ptms to only include specific phosphorylation data needed self.significant_ptms[ptype] = self.process_ptms(spliced_ptms, phospho_type = ptype, **kwargs) #load background, either from provided data or from all phosphoproteome data if ptype == 'Y': print('\nProcessing background phosphotyrosine data') elif ptype == 'ST': print('\nProcessing background phosphoserine/threonine data') if background_ptms is not None: self.background_ptms[ptype] = self.process_ptms(background_ptms, phospho_type=ptype, min_dpsi = 0, alpha = 1.1, **kwargs) else: self.background_ptms[ptype] = self.process_ptms(pose_config.ptm_coordinates.copy(), phospho_type = ptype, min_dpsi = 0, alpha = 1.1, **kwargs) #load kstar networks self.networks[ptype] = self.load_networks(network_dir, phospho_type = ptype) #save info self.phospho_type = phospho_type self.median_enrichment = None def load_networks(self, network_dir, phospho_type = 'Y'): """ Given network directory, load all kstar networks for specific phosphorylation type """ #check if file exists and whether a pickle has been generated: if not, load each network file individually if not os.path.exists(network_dir): raise ValueError('Network directory not found') elif os.path.exists(f"{network_dir}/*.p"): networks = pickle.load(open(f"{network_dir}/network_{phospho_type}.p", "rb" ) ) else: network_directory = network_dir + f'/{phospho_type}/INDIVIDUAL_NETWORKS/' networks = {} for file in os.listdir(network_directory): if file.endswith('.tsv'): #get the value of the network number file_noext = file.strip(".tsv").split('_') key_name = 'nkin'+str(file_noext[1]) #print("Debug: key name is %s"%(key_name)) networks[key_name] = pd.read_csv(f"{network_directory}{file}", sep='\t') return networks def process_ptms(self, ptms, phospho_type = 'Y', **kwargs): """ Given ptm information, restrict data to include only the phosphorylation type of interest and add a PTM column for matching information from KSTAR Parameters ---------- ptms: pd.DataFrame ptm information containing modification type and ptm locatin information, such as the output from projection or altered flanking sequence analysis phospho_type: str type of phosphorylation event to extract. Can either by phosphotyrosine ('Y') or phosphoserine/threonine ('ST') Returns ptms: pd.DataFrame trimmed dataframe containing only modifications of interest and new 'PTM' column """ #restrict to ptms to phosphorylation type of interest ptms = ptms[(ptms['Modification Class'] == 'Phosphorylation')].copy() if phospho_type == 'Y': ptms = ptms[ptms['Residue'] == 'Y'].copy() elif phospho_type == 'ST': ptms = ptms[ptms['Residue'].isin(['S', 'T'])].copy() else: raise ValueError('phospho_type must be either Y or ST') ptms = helpers.filter_ptms(ptms, **kwargs) #construct PTM column that matches KSTAR information ptms['PTM'] = ptms['UniProtKB Accession'] + '_' + ptms['Residue'] + ptms['PTM Position in Isoform'].astype(int).astype(str) return ptms def get_enrichment_single_network(self, network_key, impact_type = 'All', phospho_type = 'Y'): """ in progress """ network = self.networks[phospho_type][network_key] network['PTM'] = network['KSTAR_ACCESSION'] + '_' + network['KSTAR_SITE'] #add network information to all significant data if impact_type == 'All': sig_ptms = self.significant_ptms[phospho_type][['PTM']].drop_duplicates() elif impact_type == 'Included': sig_ptms = self.significant_ptms[phospho_type][self.significant_ptms[phospho_type]['Impact'] == 'Included'][['PTM']].drop_duplicates() elif impact_type == 'Excluded': sig_ptms = self.significant_ptms[phospho_type][self.significant_ptms[phospho_type]['Impact'] == 'Excluded'][['PTM']].drop_duplicates() else: raise ValueError('impact_type must be either "All", "Included", or "Excluded"') sig_ptms_kstar = sig_ptms.merge(network[['KSTAR_KINASE','PTM']], on = 'PTM') #repeat for background data bg_ptms = self.background_ptms[phospho_type][['PTM']].drop_duplicates() bg_ptms_kstar = bg_ptms.merge(network[['KSTAR_KINASE','PTM']], on = 'PTM') results = pd.DataFrame(np.nan, index = sig_ptms_kstar['KSTAR_KINASE'].unique(), columns = ['k','n','M','N','p']) for kinase in sig_ptms_kstar['KSTAR_KINASE'].unique(): #get numbers for a hypergeometric test to look for enrichment of kinase substrates k = sig_ptms_kstar.loc[sig_ptms_kstar['KSTAR_KINASE'] == kinase, 'PTM'].nunique() n = bg_ptms_kstar.loc[bg_ptms_kstar['KSTAR_KINASE'] == kinase, 'PTM'].nunique() M = bg_ptms['PTM'].nunique() N = sig_ptms_kstar['PTM'].nunique() #run hypergeometric test results.loc[kinase,'p'] = stat_utils.hypergeom(M,n,N,k) results.loc[kinase, 'M'] = M results.loc[kinase, 'N'] = N results.loc[kinase, 'k'] = k results.loc[kinase, 'n'] = n return results def get_enrichment_all_networks(self, impact_type = 'All', phospho_type = 'Y'): """ Given prostate data and a dictionary of kstar networks, get enrichment for each kinase in each network in the prostate data. Assumes the prostate data has already been reduced to the modification of interest (phosphotyrosine or phoshoserine/threonine) Parameters ---------- networks : dict dictionary of kstar networks prostate : pandas dataframe all PTMs identified in tCGA prostate data, regardless of significance (reduced to only include mods of interest) sig_prostate : pandas dataframe significant PTMs identified in tCGA prostate data, p < 0.05 and effect size > 0.25 (reduced to only include mods of interest) """ results = {} for network in self.networks[phospho_type]: results[network] = self.get_enrichment_single_network(network_key=network, impact_type=impact_type, phospho_type=phospho_type) return results def extract_enrichment(self, results): """ Given a dictionary of results from get_enrichment_all_networks, extract the p-values for each network and kinase, and then calculate the median p-value across all networks for each kinase Parameters ---------- results : dict dictionary of results from get_enrichment_all_networks """ enrichment = pd.DataFrame(index = results['nkin0'].index, columns = results.keys()) for network in results: enrichment[network] = results[network]['p'] enrichment['median'] = enrichment.median(axis = 1) return enrichment def run_kstar_enrichment(self): """ Run full kstar analysis to generate substrate enrichment across each of the 50 KSTAR networks and calculate the median p-value for each kinase across all networks """ enrichment = {} median = {} for ptype in self.phospho_type: enrichment[ptype] = {} median_impacts = {} for impact in tqdm(self.impact_type, desc = f'Running enrichment for {ptype} data'): results = self.get_enrichment_all_networks(phospho_type=ptype, impact_type = impact) enrichment[ptype][impact] = self.extract_enrichment(results) median_impacts[impact] = enrichment[ptype][impact]['median'] median[ptype] = pd.DataFrame(median_impacts) self.enrichment_all = enrichment self.median_enrichment = median def return_enriched_kinases(self, impact_type = 'All', alpha = 0.05): """ Return kinases with a median p-value less than the provided alpha value (substrates are enriched among the significant PTMs) Parameters ---------- alpha : float significance threshold to use to subset kinases. Default is 0.05. """ if self.median_enrichment is None: self.run_kstar_enrichment() sig_enrichment = {} for ptype in self.phospho_type: tmp_data = self.median_enrichment[ptype][impact_type].copy() sig_enrichment[ptype] = tmp_data[tmp_data < alpha].index.values return sig_enrichment def dotplot(self, ptype = 'Y', impact_types = ['All', 'Included', 'Excluded'], kinase_axis = 'x', ax = None, facecolor = 'white', title = '', size_legend = False, color_legend = True, max_size = None, sig_kinases_only = True, alpha = 0.05, dotsize = 5, colormap={0: '#6b838f', 1: sns.color_palette('colorblind')[1]}, labelmap = {0: 'FPR > %0.2f'%(0.05), 1:'FPR <= %0.2f'%(0.05)}, legend_title = 'p-value', size_number = 5, size_color = 'gray', color_title = 'Significant', markersize = 10, legend_distance = 1.0, figsize = (5,5)): """ Generates the dotplot plot, where size is determined by values dataframe and color is determined by significant dataframe. This is a stripped down version of the code used in KSTAR to generate the dotplot for the kinase activities Parameters ----------- ax : matplotlib Axes instance, optional axes dotplot will be plotted on. If None then new plot generated """ multiplier = 10 offset = 5 if ax is None: fig, ax = plt.subplots(figsize=figsize) ax.set_facecolor(facecolor) ax.set_title(title) plt_data = self.median_enrichment[ptype][impact_types] if isinstance(plt_data, pd.Series): plt_data = pd.DataFrame(plt_data) if sig_kinases_only: plt_data = plt_data[(plt_data < alpha).any(axis = 1)] if plt_data.empty: raise ValueError('No significant kinases found with p-value < %0.2f'%(alpha)) if kinase_axis == 'x': plt_data = plt_data.T elif kinase_axis == 'y': plt_data = plt_data.copy() else: raise ValueError('kinase_axis must be either "x" or "y"') # Transform Data columns = list(plt_data.columns) values = -np.log10(plt_data).copy() colors = ((plt_data <= alpha) *1).copy() values['row_index'] = np.arange(len(values)) * multiplier + offset colors['row_index'] = np.arange(len(colors)) * multiplier + offset melt = values.melt(id_vars = 'row_index') values.drop(columns = ['row_index'], inplace = True) melt['var'] = melt.apply(lambda row : columns.index(row.iloc[1]) * multiplier + offset, axis = 1) melt_color = colors.melt(id_vars = 'row_index') melt_color['var'] = melt_color.apply(lambda row : columns.index(row.iloc[1]) * multiplier + offset, axis = 1) colors.drop(columns = ['row_index'], inplace = True) # Plot Data x = melt['var'] y = melt['row_index'][::-1] #needs to be done in reverse order to maintain order in the dataframe s = melt.value * dotsize #check to see if more than 2 values are given (fprs). Otherwise get color based on binary significance #get color for each datapoint based on significance melt_color['color'] = [colormap.get(l,'black') for l in melt_color.value] c = melt_color['color'] scatter = ax.scatter(x, y, c=c, s=s) # Add Color Legend if color_legend: #create the legend color_legend = [] for color_key in colormap.keys(): color_legend.append( Line2D([0], [0], marker='o', color='w', label=labelmap[color_key], markerfacecolor= colormap[color_key], markersize=markersize), ) legend1 = ax.legend(handles=color_legend, loc=f'upper right', bbox_to_anchor=(legend_distance,1), title = color_title) legend1.set_clip_on(False) ax.add_artist(legend1) # Add Size Legend if size_legend: #check to see if max pval parameter was given: if so, use to create custom legend if max_size is not None: s_label = np.arange(max_size/size_number,max_size+1,max_size/size_number).astype(int) dsize = [s*dotsize for s in s_label] legend_elements = [] for element, s in zip(s_label, dsize): legend_elements.append(Line2D([0],[0], marker='o', color = 'w', markersize = s**0.5, markerfacecolor = size_color, label = element)) legend2 = ax.legend(handles = legend_elements, loc = f'lower right', title = legend_title, bbox_to_anchor=(legend_distance,0)) else: kw = dict(prop="sizes", num=size_number, color=size_color, func=lambda s: s/dotsize) legend2 = ax.legend(*scatter.legend_elements(**kw), loc=f'lower right', title=legend_title, bbox_to_anchor=(legend_distance,0)) ax.add_artist(legend2) # Add Additional Plotting Information ax.tick_params(axis = 'x', rotation = 90) ax.yaxis.set_ticks(np.arange(len(values)) * multiplier + offset) ax.xaxis.set_ticks(np.arange(len(columns)) * multiplier + offset) ax.set_xticklabels(plt_data.columns) ax.set_yticklabels(plt_data.index[::-1]) #reverse order to match the dataframe #adjust x and y scale so that data is always equally spaced ax.set_ylim([0,len(values)*multiplier]) ax.set_xlim([0,len(columns)*multiplier]) return ax