Source code for ptm_pose.analyze.annotations

import numpy as np
import pandas as pd

import os
import time

#plotting 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

#custom stat functions
from ptm_pose import stat_utils, pose_config, annotate, helpers
from ptm_pose.analyze import summarize

try:
    import gseapy as gp
except ImportError:
    gp = None


package_dir = os.path.dirname(os.path.abspath(__file__))





[docs] def get_available_annotations(ptms): """ Given a PTM dataframe, indicate the annotations that are available for analysis and indicate whether they have already been appended to the PTM dataset Parameters ---------- ptms : pd.DataFrame contains PTM information and may have annotations already appended, such as spliced_ptms and altered_flanks dataframes generated during projection Returns ------- available_annots : pd.DataFrame DataFrame indicating the available annotation types and their sources, as well as whether they have been appended to the PTM data. """ available_gmt = annotate.get_available_gmt_annotations(format = 'dataframe') available_gmt['Appended to PTM data?'] = 'No' #check to see if any annotations have been added to the spliced_ptms dataframe annot_cols = [col for col in ptms.columns if ':' in col] database_list = [] annot_type_list = [] for a in annot_cols: database_list.append(a.split(':')[0]) annot_type_list.append(a.split(':')[1]) available_in_df = pd.DataFrame({'Database':database_list, 'Annotation Type':annot_type_list, 'Appended to PTM data?': 'Yes'}) #combine the two dataframes and remove duplicates available_annots = pd.concat([available_in_df, available_gmt]).drop_duplicates(keep = 'first', subset = ['Database', 'Annotation Type']).reset_index(drop = True) available_annots.sort_values(by = 'Annotation Type') return available_annots
[docs] def get_ptm_annotations(ptms, annot_type = 'Function', database = 'PhosphoSitePlus', collapse_on_similar = False, min_dpsi = 0.1, alpha = 0.05, **kwargs): """ Given spliced ptm information obtained from project and annotate modules, grab PTMs in spliced ptms associated with specific PTM modules Parameters ---------- spliced_ptms: pd.DataFrame PTMs projected onto splicing events and with annotations appended from various databases annot_type: str Type of annotation to pull from spliced_ptms dataframe. Available information depends on the selected database. Default is 'Function'. database: str database from which PTMs are pulled. Options include 'PhosphoSitePlus', 'ELM', or 'PTMInt'. ELM and PTMInt data will automatically be downloaded, but due to download restrictions, PhosphoSitePlus data must be manually downloaded and annotated in the spliced_ptms data using functions from the annotate module. Default is 'PhosphoSitePlus'. collapse_on_similar : bool Whether to collapse similar annotations (for example, "cell growth, increased" and "cell growth, decreased") into a single category. Default is False. min_dpsi: float Minimum change in PSI required to return a PTM as associated with the annotation. Default is 0.1. This can be used to filter out PTMs that are not significantly spliced. alpha : float Significance threshold to use to filter PTMs based on their significance. Default is 0.05. This can be used to filter out PTMs that are not significantly spliced. kwargs: additional keyword arguments Additional keyword arguments to pass to the `filter_ptms()` function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the `filter_ptms()` function for more options. Returns ------- annotations : pd.DataFrame Individual PTM information of PTMs that have been associated with the requested annotation type. annotation_counts : pd.Series or pd.DataFrame Number of PTMs associated with each annotation of the requested annotation type. If dPSI col is provided or impact col is present, will output annotation counts for each type of impact ('Included', 'Excluded', 'Altered Flank') separately. """ if database == 'PTMsigDB': if annot_type == 'Perturbation': raise ValueError('PTMsigDB has multiple perturbation datasets. Please specify which dataset to use. Options are: "Perturbation-DIA","Perturbation-DIA2", "Perturbation-PRM".') elif annot_type == 'Pathway': raise ValueError('PTMsigDB has multiple pathway datasets. Please specify which dataset to use. Options are: "Pathway-WikiPathway", "Pathway-NetPath", "Pathway-BI"') else: annot_col = f'{database}:{annot_type}' else: annot_col = f'{database}:{annot_type}' #check to see if requested annotation information is annotated to the spliced_ptms dataframe, if not, append information if annot_col not in ptms.columns: ptms = annotate.append_from_gmt(ptms, database = database, annot_type = annot_type).copy() ptms = ptms.dropna(subset = annot_col) #check to make sure requested annotation is available filter_arguments = helpers.extract_filter_kwargs(min_dpsi = min_dpsi, alpha = alpha, **kwargs) helpers.check_filter_kwargs(filter_arguments) ptms_of_interest = helpers.filter_ptms(ptms, **filter_arguments) #extract relevant annotation and remove PTMs without an annotation if 'Impact' not in ptms_of_interest.columns and 'dPSI' in ptms_of_interest.columns: ptms_of_interest['Impact'] = ptms_of_interest['dPSI'].apply(lambda x: 'Included' if x > 0 else 'Excluded') optional_cols = [col for col in ptms_of_interest.columns if col in ['Impact', 'dPSI', 'Significance']] annotations = ptms_of_interest[['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class'] + [annot_col] + optional_cols].copy() annotations = annotations.dropna(subset = annot_col).drop_duplicates() if annotations.empty: print("No PTMs with associated annotation") return None, None #combine repeat entries for same PTM (with multiple impacts) annotations = annotations.groupby(['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Isoform'], as_index = False).agg(lambda x: ';'.join(set([str(i) for i in x if i == i]))) #separate distinct modification annotations in unique rows annotations_exploded = annotations.copy() annotations_exploded[annot_col] = annotations_exploded[annot_col].apply(lambda x: x.split(';') if isinstance(x, str) else np.nan) annotations_exploded = annotations_exploded.explode(annot_col) annotations_exploded[annot_col] = annotations_exploded[annot_col].apply(lambda x: x.strip() if isinstance(x, str) else np.nan) #if desired collapse similar annotations (for example, same function but increasing or decreasing) if collapse_on_similar: annotations_exploded[annot_col] = annotate.collapse_annotations(annotations_exploded[annot_col].values, database = database, annot_type = annot_type) annotations_exploded.drop_duplicates(inplace = True) annotations = annotations_exploded.groupby([col for col in annotations_exploded.columns if col != annot_col], as_index = False, dropna = False)[annot_col].apply(lambda x: ';'.join(set(x))) #get the number of annotations found annotation_counts = annotations_exploded.drop_duplicates(subset = ['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Isoform'] + [annot_col])[annot_col].value_counts() #additional_counts sub_counts = [] impact_type = [] if 'Impact' in annotations_exploded.columns: for imp in ['Included', 'Excluded', 'Altered Flank']: tmp_annotations = annotations_exploded[annotations_exploded['Impact'] == imp].copy() if not tmp_annotations.empty: tmp_annotations = tmp_annotations.drop_duplicates(subset = ['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Isoform'] + [annot_col]) sub_counts.append(tmp_annotations[annot_col].value_counts()) impact_type.append(imp) annotation_counts = pd.concat([annotation_counts] + sub_counts, axis = 1) annotation_counts.columns = ['All Impacted'] + impact_type annotation_counts = annotation_counts.replace(np.nan, 0) #combine repeat entries for same PTM (with multiple impacts) annotations = annotations.groupby(['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Isoform'], as_index = False).agg(lambda x: ';'.join(set([str(i) for i in x if i == i]))) if database == 'PTMcode': #convert to readable gene name annotation_counts.index = [pose_config.uniprot_to_genename[i].split(' ')[0] if i in pose_config.uniprot_to_genename.keys() else i for i in annotation_counts.index] return annotations, annotation_counts
[docs] def get_background_annotation_counts(database = 'PhosphoSitePlus', annot_type = 'Function', **kwargs): """ Given a database and annotation type, retrieve the counts of PTMs associated with the requested annotation type across all PTMs in the ptm_coordinates dataframe used for projection Parameters ---------- database : str Source of annotation. Default is PhosphoSitePlus annot_type : str Type of annotation that can be found in indicated database. Default is 'Function'. Other options include 'Process', 'Disease', 'Enzyme', 'Interactions', etc. kwargs: additional keyword arguments Additional keyword arguments to pass to the `filter_ptms()` function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the `filter_ptms()` function for more options. """ if kwargs: filter_arguments = helpers.extract_filter_kwargs(report_removed = False, **kwargs) helpers.check_filter_kwargs(filter_arguments) background_ptms = helpers.filter_ptms(pose_config.ptm_coordinates, **filter_arguments) background_ptms = background_ptms.rename(columns = {'Gene name':'Gene'}) _, background_annotation_count = get_ptm_annotations(background_ptms, annot_type = annot_type, database = database, report_removed = False, **kwargs) background_size = background_ptms.drop_duplicates(subset = ['UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class']).shape[0] else: annotation_dict = annotate.process_database_annotations(database = database, annot_type=annot_type, key_type = 'annotation') background_annotation_count = pd.Series(annotation_dict).apply(len) background_size = pose_config.ptm_coordinates.drop_duplicates(subset = ['UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class']).shape[0] return background_annotation_count, background_size
[docs] def get_enrichment_inputs(ptms, annot_type = 'Function', database = 'PhosphoSitePlus', background_type = 'all', collapse_on_similar = False, mod_class = None, alpha = 0.05, min_dpsi = 0.1, **kwargs): """ Given the spliced ptms, altered_flanks, or combined PTMs dataframe, identify the number of PTMs corresponding to specific annotations in the foreground (PTMs impacted by splicing) and the background (all PTMs in the proteome or all PTMs in dataset not impacted by splicing). This information can be used to calculate the enrichment of specific annotations among PTMs impacted by splicing. Several options are provided for constructing the background data: all (based on entire proteome in the ptm_coordinates dataframe) or significance (foreground PTMs are extracted from provided spliced PTMs based on significance and minimum delta PSI) Parameters ---------- ptms: pd.DataFrame Dataframe with PTMs projected onto splicing events and with annotations appended from various databases. This can be either the spliced_ptms, altered_flanks, or combined dataframe. annot_type : str type of annotation to pull the annotations from. Default is 'Function'. database : str source of annotations. Default is 'PhosphoSitePlus'. background_type : str Type of background to construct. Options are either 'all' (all PTMs in proteome) or 'significance' (only PTMs in dataset). Note that significance option assumes that PTMs have not already been filtered for significance. collapse_on_similar : bool Whether to collapse similar annotations (for example, "cell growth, increased" and "cell growth, decreased") into a single category. Default is False. mod_class : str Type of modification to perform enrichment for min_dpsi: float Minimum change in PSI required to return a PTM as associated with the annotation. Default is 0.1. This can be used to filter out PTMs that are not significantly spliced. alpha : float Significance threshold to use to filter PTMs based on their significance. Default is 0.05. This can be used to filter out PTMs that are not significantly spliced. kwargs: additional keyword arguments Additional keyword arguments to pass to the `filter_ptms()` function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the `filter_ptms()` function for more options. """ if background_type == 'all': background_annotation_count, background_size = get_background_annotation_counts(database = database, annot_type = annot_type, **kwargs) background_size = pose_config.ptm_coordinates.drop_duplicates(subset = ['UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class']).shape[0] #filter ptms filter_arguments = helpers.extract_filter_kwargs(min_dpsi = min_dpsi, alpha = alpha, modification_class = mod_class, report_removed = False, **kwargs) helpers.check_filter_kwargs(filter_arguments) ptms = helpers.filter_ptms(ptms, **filter_arguments) elif background_type == 'significance': if 'Significance' not in ptms.columns or 'dPSI' not in ptms.columns: raise ValueError('Significance and dPSI columns must be present in spliced_ptms dataframe to construct a background based on significance (these columns must be provided during projection).') #filter ptms if any kwargs are provided if kwargs: filter_arguments = helpers.extract_filter_kwargs(min_dpsi = 0, alpha = 1, modification_class = mod_class, report_removed = False, **kwargs) helpers.check_filter_kwargs(filter_arguments) ptms = helpers.filter_ptms(ptms, **filter_arguments) background = ptms.copy() #restrict sample to significantly spliced ptms ptms = ptms[(ptms['Significance'] <= alpha) & (ptms['dPSI'].abs() >= min_dpsi)].copy() #check to make sure there are significant PTMs in the data and that there is a difference in the number of significant and background PTMs if ptms.shape[0] == 0: raise ValueError('No significantly spliced PTMs found in the data') elif ptms.shape[0] == background.shape[0]: raise ValueError(f'The foreground and background PTM sets are the same size when considering significance. Please check data or use "all" for the background_type parameter.') else: #remove impact from background columns if present cols_to_remove = [col for col in ['dPSI', 'Significance', 'Impact'] if col in background.columns] if len(cols_to_remove) > 0: background = background.drop(columns = cols_to_remove) #get background counts background_size = background.drop_duplicates(subset = ['UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class']).shape[0] #get background counts _, background_annotation_count = get_ptm_annotations(background, annot_type = annot_type, database = database, collapse_on_similar = collapse_on_similar) else: raise ValueError('Invalid background type. Must be all (default) or significance') #get foreground counts and size foreground_size = ptms.shape[0] annotation_details, foreground_annotation_count = get_ptm_annotations(ptms, annot_type = annot_type, database = database, collapse_on_similar = collapse_on_similar, min_dpsi = min_dpsi, alpha = alpha, report_removed = False, **kwargs) #process annotation details into usable format if annotation_details is None: print('No PTMs with requested annotation type, so could not perform enrichment analysis') return np.repeat(None, 5) else: annot_col = f'{database}:{annot_type}' annotation_details[annot_col] = annotation_details[annot_col].str.split(';') annotation_details = annotation_details.explode(annot_col) annotation_details[annot_col] = annotation_details[annot_col].str.strip() annotation_details['PTM'] = annotation_details['Gene'] + '_' + annotation_details['Residue'] + annotation_details['PTM Position in Isoform'].astype(int).astype(str) annotation_details = annotation_details.groupby(annot_col)['PTM'].agg(';'.join) return foreground_annotation_count, foreground_size, background_annotation_count, background_size, annotation_details
[docs] def annotation_enrichment(ptms, database = 'PhosphoSitePlus', annot_type = 'Function', background_type = 'all', collapse_on_similar = False, mod_class = None, alpha = 0.05, min_dpsi = 0.1, **kwargs):# """ Given spliced ptm information (differential inclusion, altered flanking sequences, or both), calculate the enrichment of specific annotations in the dataset using a hypergeometric test. Background data can be provided/constructed in a few ways: 1. Use annotations from the entire phosphoproteome (background_type = 'all') 2. Use the alpha and min_dpsi parameter to construct a foreground that only includes significantly spliced PTMs, and use the entire provided spliced_ptms dataframe as the background. This will allow you to compare the enrichment of specific annotations in the significantly spliced PTMs compared to the entire dataset. Will do this automatically if alpha or min_dpsi is provided. Parameters ---------- ptms: pd.DataFrame Dataframe with PTMs projected onto splicing events and with annotations appended from various databases database: str database from which PTMs are pulled. Options include 'PhosphoSitePlus', 'ELM', 'PTMInt', 'PTMcode', 'DEPOD', 'RegPhos', 'PTMsigDB'. Default is 'PhosphoSitePlus'. annot_type: str Type of annotation to pull from spliced_ptms dataframe. Available information depends on the selected database. Default is 'Function'. background_type: str how to construct the background data. Options include 'pregenerated' (default) and 'significance'. If 'significance' is selected, the alpha and min_dpsi parameters must be provided. Otherwise, will use whole proteome in the ptm_coordinates dataframe as the background. collapse_on_similar: bool Whether to collapse similar annotations (for example, increasing and decreasing functions) into a single category. Default is False. mod_class: str modification class to subset, if any alpha: float significance threshold to use to subset foreground PTMs. Default is None. min_dpsi: float minimum delta PSI value to use to subset foreground PTMs. Default is None. kwargs: additional keyword arguments Additional keyword arguments to pass to the `filter_ptms()` function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the `filter_ptms()` function for more options. """ foreground_annotation_count, foreground_size, background_annotations, background_size, annotation_details = get_enrichment_inputs(ptms, background_type = background_type, annot_type = annot_type, database = database, collapse_on_similar = collapse_on_similar, mod_class = mod_class, alpha = alpha, min_dpsi = min_dpsi) if foreground_annotation_count is not None: #iterate through all annotations and calculate enrichment with a hypergeometric test results = pd.DataFrame(columns = ['Fraction Impacted', 'p-value'], index = foreground_annotation_count.index) for i, n in background_annotations.items(): #number of PTMs in the foreground with the annotation if i in foreground_annotation_count.index.values: if isinstance(foreground_annotation_count, pd.Series): k = foreground_annotation_count[i] elif foreground_annotation_count.shape[1] == 1: k = foreground_annotation_count.loc[i, 'count'] elif foreground_annotation_count.shape[1] > 1: k = foreground_annotation_count.loc[i, 'All Impacted'] p = stat_utils.getEnrichment(background_size, n, foreground_size, k, fishers = False) results.loc[i, 'Fraction Impacted'] = f"{k}/{n}" results.loc[i, 'p-value'] = p results = results.sort_values('p-value') results['Adjusted p-value'] = stat_utils.adjustP(results['p-value'].values) results = pd.concat([results, annotation_details], axis = 1) else: results = None return results
[docs] def plot_available_annotations(ptms, only_annotations_in_data = False, show_all_ptm_count = False, ax = None, **kwargs): """ Given a dataframe with ptm annotations added, show the number of PTMs associated with each annotation type Parameters ---------- ptms: pd.DataFrame Dataframe with PTMs and annotations added only_annotations_in_data : bool Only plot annotations that are already appended to the dataset show_all_ptm_count: bool Whether to show the total number of PTMs in the dataset. Default is True. ax: matplotlib.Axes Axis to plot on. If None, will create new figure. Default is None. """ if kwargs: filter_arguments = helpers.extract_filter_kwargs(report_removed = False, **kwargs) helpers.check_filter_kwargs(filter_arguments) ptms = helpers.filter_ptms(ptms, **filter_arguments) else: ptms = ptms.copy() if ax is None: fig, ax = plt.subplots(figsize = (5,5)) if show_all_ptm_count: num_ptms = [ptms.drop_duplicates(['UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class']).shape[0]] num_ptms_filters = ['All PTMs'] filter_source = ['None'] else: num_ptms = [] num_ptms_filters = [] filter_source = [] available_annotations = get_available_annotations(ptms) available_annotations = available_annotations.sort_values(by = ['Database', 'Annotation Type']) if only_annotations_in_data: available_annotations = available_annotations[available_annotations['Appended to PTM data?'] == 'Yes'] for _, row in available_annotations.iterrows(): annot_col = f"{row['Database']}:{row['Annotation Type']}" if annot_col not in ptms.columns: ptms = annotate.append_from_gmt(ptms, database = row['Database'], annot_type = row['Annotation Type']) num_ptms.append(ptms.dropna(subset = annot_col).drop_duplicates(subset = ['UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class']).shape[0]) num_ptms_filters.append(f"{row['Annotation Type']}({row['Database']})") filter_source.append(row['Database']) plt_data = pd.DataFrame({'PTM Count': num_ptms, 'Annotation Type': num_ptms_filters, 'Source': filter_source}) plt_data = plt_data.sort_values(by = 'PTM Count', ascending = True) #remove annotations with 0 PTMs plt_data = plt_data[plt_data['PTM Count'] > 0] #plot bar plot #color bars based on datasource databases = available_annotations['Database'].unique() #construct a color palette for the databases, with "None" as gray palette = {db: col for db, col in zip(databases, sns.color_palette("rainbow", len(databases)))} palette['None'] = 'gray' #set "None" to gray for all PTMs (whether or not associated with annotation) colors = [] for source in plt_data['Source']: colors.append(palette[source]) ax.barh(plt_data['Annotation Type'], plt_data['PTM Count'], color = colors) ax.set_xlabel('Number of PTMs with annotation') #annotate with number of PTMs for i, num_ptm in enumerate(plt_data['PTM Count']): ax.text(num_ptm, i, str(num_ptm), ha = 'left', va = 'center') #create legend handles = [plt.Rectangle((0,0),1,1, color = color) for color in palette.values() if color != 'gray'] labels = [source for source in palette.keys() if source != 'None'] ax.legend(handles, labels, title = 'Annotation Source') plt.show()
[docs] def plot_annotation_counts(spliced_ptms = None, altered_flanks = None, database = 'PhosphoSitePlus', annot_type = 'Function', collapse_on_similar = True, colors = None, fontsize = 10, top_terms = 5, legend = False, legend_loc = (1.05,0.5), title = None, title_type = 'database', ax = None, **kwargs): """ Given a dataframe with PTM annotations added, plot the top annotations associated with the PTMs Parameters ---------- spliced_ptms: pd.DataFrame Dataframe with differentially included PTMs altered_flanks: pd.DataFrame Dataframe with PTMs associated with altered flanking sequences database: str Database to use for annotations. Default is 'PhosphoSitePlus'. annot_type: str Type of annotation to plot. Default is 'Function'. collapse_on_similar: bool Whether to collapse similar annotations into a single category. Default is True. colors: list List of colors to use for the bar plot. Default is None. top_terms: int Number of top terms to plot. Default is 5. legend: bool Whether to show the legend. Default is True. legend_loc: tuple Location of the legend. Default is None, which will place the legend in the upper right corner. ax: matplotlib.Axes Axis to plot on. If None, will create new figure. Default is None. title_type: str Type of title to use for the plot. Default is 'database'. Options include 'database' and 'detailed'. title: str Title to use for the plot. Default is None. fontsize: int Font size for the plot. Default is 10. legend_loc: tuple Location of the legend. Default is None, which will place the legend to the right of the plot. **kwargs: additional keyword arguments Additional keyword arguments, which will be fed into the `filter_ptms()` function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the `filter_ptms()` function for more options. """ #check what was provided as input, combine if both are provided if spliced_ptms is None and altered_flanks is None: raise ValueError('Must provide either spliced_ptms or altered_flanks dataframe to plot annotation counts') elif spliced_ptms is not None and altered_flanks is not None: ptms = summarize.combine_outputs(spliced_ptms, altered_flanks, report_removed = False, report_removed_annotations = False, **kwargs) elif spliced_ptms is not None: ptms = spliced_ptms.copy() else: ptms = altered_flanks.copy() #get number of PTMs associated with each annotation _, annotation_counts = get_ptm_annotations(ptms, annot_type = annot_type, database = database, collapse_on_similar = collapse_on_similar, report_removed = False, kwargs = kwargs) if annotation_counts is None: return None if ax is None: fig, ax = plt.subplots(figsize = (2,3)) if colors is None: colors = ['lightgrey', 'gray', 'white'] #grab counts for plot if isinstance(annotation_counts, pd.Series): #if only one impact annotation_counts = annotation_counts.head(top_terms).sort_values(ascending = True) if isinstance(colors, list) or isinstance(colors, np.ndarray): colors = colors[0] ax.barh(annotation_counts.index, annotation_counts.values, color = colors, edgecolor = 'black') legend = False else: # if multiple impact types (Included, Excluded, Altered Flank) legend_labels = ['All Impacted'] annotation_counts = annotation_counts.head(top_terms).sort_values(by = 'All Impacted', ascending = True) left = 0 if 'Excluded' in annotation_counts.columns: ax.barh(annotation_counts['Excluded'].index, annotation_counts['Excluded'].values, height = 1, edgecolor = 'black', color = colors[0]) legend_labels.append('Excluded') left = annotation_counts['Excluded'].values if 'Included' in annotation_counts.columns: ax.barh(annotation_counts['Included'].index, annotation_counts['Included'].values, left = left, height = 1, color = colors[1], edgecolor = 'black') legend_labels.append('Included') left = left + annotation_counts['Included'].values if 'Altered Flank' in annotation_counts.columns: ax.barh(annotation_counts['Altered Flank'].index, annotation_counts['Altered Flank'].values, left = left, height = 1, color = colors[2], edgecolor = 'black') legend_labels.append('Altered Flank') left = left + annotation_counts['Altered Flank'].values #ax.set_xticks([0,50,100,150]) ax.set_ylabel('', fontsize = fontsize) ax.set_xlabel('Number of PTMs', fontsize = fontsize) if title is not None: ax.set_title(title, fontsize = fontsize, weight = 'bold') elif title_type == 'detailed': ax.set_title(f'Top {top_terms} {database} {annot_type} Annotations', fontsize = fontsize, weight = 'bold') elif title_type == 'database': ax.set_title(f'{database}', fontsize = fontsize, weight = 'bold') x_label_dict = {'Function':'Number of PTMs\nassociated with Function', 'Process':'Number of PTMs\nassociated with Process', 'Disease':'Number of PTMs\nassociated with Disease', 'Kinase':'Number of Phosphosites\ntargeted by Kinase', 'Interactions': 'Number of PTMs\nthat regulate interaction\n with protein','Motif Match':'Number of PTMs\nfound within a\nmotif instance', 'Intraprotein': 'Number of PTMs\nthat are important\for intraprotein\n interactions','Phosphatase':'Number of Phosphosites\ntargeted by Phosphatase', 'Perturbation (DIA2)': "Number of PTMs\nAffected by Perturbation\n(Measured by DIA)", 'Perturbation (PRM)': 'Number of PTMs\nAffected by Perturbation\n(Measured by PRM)', 'NetPath':'Number of PTMs/Genes\nassociated with NetPath', 'Perturbation':'Number of PTMs\nAffected by Perturbation'} ax.set_xlabel(x_label_dict[annot_type], fontsize = fontsize) #make a custom legend if legend: handles = [] for i, impact in enumerate(['Included', 'Excluded', 'Altered Flank']): if impact in legend_labels: handles.append(mpatches.Patch(facecolor = colors[i], edgecolor = 'black', label = impact)) ax.legend(handles = handles, ncol = 1, fontsize = fontsize - 2, title = 'Type of Impact', title_fontsize = fontsize - 1, loc = legend_loc)
#return ax
[docs] def gene_set_enrichment(spliced_ptms = None, altered_flanks = None, sig_col = 'Significance', dpsi_col = 'dPSI', alpha = 0.05, min_dpsi = None, gene_sets = ['GO_Biological_Process_2023', 'Reactome_2022'], background = None, return_sig_only = True, max_retries = 5, delay = 10, **kwargs): """ Given spliced_ptms and/or altered_flanks dataframes (or the dataframes combined from combine_outputs()), perform gene set enrichment analysis using the enrichr API Parameters ---------- spliced_ptms: pd.DataFrame Dataframe with differentially included PTMs projected onto splicing events and with annotations appended from various databases. Default is None (will not be considered in analysis). If combined dataframe is provided, this dataframe will be ignored. altered_flanks: pd.DataFrame Dataframe with PTMs associated with altered flanking sequences and with annotations appended from various databases. Default is None (will not be considered). If combined dataframe is provided, this dataframe will be ignored. combined: pd.DataFrame Combined dataframe with spliced_ptms and altered_flanks dataframes. Default is None. If provided, spliced_ptms and altered_flanks dataframes will be ignored. gene_sets: list List of gene sets to use in enrichment analysis. Default is ['KEGG_2021_Human', 'GO_Biological_Process_2023', 'GO_Cellular_Component_2023', 'GO_Molecular_Function_2023','Reactome_2022']. Look at gseapy and enrichr documentation for other available gene sets background: list List of genes to use as background in enrichment analysis. Default is None (all genes in the gene set database will be used). return_sig_only: bool Whether to return only significantly enriched gene sets. Default is True. max_retries: int Number of times to retry downloading gene set enrichment data from enrichr API. Default is 5. delay: int Number of seconds to wait between retries. Default is 10. **kwargs: additional keyword arguments Additional keyword arguments to pass to the combine_outputs() function from the summarize module. These will be used to filter the spliced_ptms and altered_flanks dataframes before performing gene set enrichment analysis. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the `combine_outputs()` function for more options. Returns ------- results: pd.DataFrame Dataframe with gene set enrichment results from enrichr API """ if gp is None: raise ImportError('gseapy package is required to perform gene set enrichment analysis. Please install it using "pip install gseapy"') if background is not None: raise ValueError('Background data not supported at this time, but will be added in future versions. Please set background = None, which will use all genes in the gene set database as the background.') #grab the genes associated with impacted PTMs if spliced_ptms is not None and altered_flanks is not None: #gene information (total and spliced genes) combined = summarize.combine_outputs(spliced_ptms, altered_flanks, **kwargs) foreground = combined.copy() type = 'Differentially Included + Altered Flanking Sequences' #isolate the type of impact on the gene combined_on_gene = combined.groupby('Gene')['Impact'].apply(lambda x: ';'.join(set(x))) included = combined_on_gene.str.contains('Included') excluded = combined_on_gene.str.contains('Excluded') differential = included | excluded altered_flank = combined_on_gene.str.contains('Altered Flank') altered_flank_only = altered_flank & ~differential differential_only = differential & ~altered_flank both = differential & altered_flank altered_flank_only = combined_on_gene[altered_flank_only].index.tolist() differential_only = combined_on_gene[differential_only].index.tolist() both = combined_on_gene[both].index.tolist() elif spliced_ptms is not None: foreground = spliced_ptms.copy() combined = spliced_ptms.copy() type = 'Differentially Included' #isolate the type of impact on the gene altered_flank_only = [] differential_only = spliced_ptms['Gene'].unique().tolist() both = [] elif altered_flanks is not None: foreground = altered_flanks.copy() combined = altered_flanks.copy() type = 'Altered Flanking Sequences' #isolate the type of impact on the gene altered_flank_only = altered_flanks['Gene'].unique().tolist() differential_only = [] both = [] else: raise ValueError('No dataframes provided. Please provide spliced_ptms, altered_flanks, or the combined dataframe.') #restrict to significant ptms, if available if sig_col in foreground.columns and (min_dpsi is not None and dpsi_col in foreground.columns): foreground = foreground[foreground[sig_col] <= alpha].copy() foreground = foreground[foreground[dpsi_col].abs() >= min_dpsi] elif sig_col in foreground.columns: if min_dpsi is not None: print('`min_dpsi` provided but `dpsi_col` not found in dataframe. Ignoring `min_dpsi` parameter. Please indicate correct column name for dPSI values and rerun if desired.') foreground = foreground[foreground[sig_col] <= alpha].copy() elif min_dpsi is not None and 'dPSI' in combined.columns: print('`sig_col` not found in dataframe. Ignoring `alpha` parameter. Please indicate correct column name for Significance and rerun if desired.') foreground = combined[combined['dPSI'].abs() >= min_dpsi].copy() else: print('Significance column not found and min_dpsi not provided. All PTMs in dataframe will be considered as the foreground') foreground = foreground['Gene'].unique().tolist() #perform gene set enrichment analysis and save data for i in range(max_retries): try: enr = gp.enrichr(foreground, gene_sets = gene_sets) break except Exception as e: enrichr_error = e time.sleep(delay) else: raise Exception('Failed to run enrichr analysis after ' + str(max_retries) + ' attempts. Please try again later. Error given by EnrichR: ' + str(enrichr_error)) results = enr.results.copy() results['Type'] = type #indicate the genes in each gene set associated with each type of impact results['Genes with Differentially Included PTMs only'] = results['Genes'].apply(lambda x: ';'.join(set(x.split(';')) & (set(differential_only)))) results['Genes with PTM with Altered Flanking Sequence only'] = results['Genes'].apply(lambda x: ';'.join(set(x.split(';')) & (set(altered_flank_only)))) results['Genes with Both'] = results['Genes'].apply(lambda x: ';'.join(set(x.split(';')) & (set(both)))) if return_sig_only: return results[results['Adjusted P-value'] <= 0.05] else: return results
[docs] def draw_pie(dist, xpos, ypos, size,colors,edgecolor =None, type = 'donut', ax=None): """ Draws pies individually, as if points on a scatter plot. This function was taken from this stack overflow post: https://stackoverflow.com/questions/56337732/how-to-plot-scatter-pie-chart-using-matplotlib Parameters ---------- dist: list list of values to be represented as pie slices for a single point xpos: float x position of pie chart in the scatter plot ypos: float y position of pie chart in the scatter plot size: float size of pie chart colors: list list of colors to use for pie slices ax: matplotlib.Axes axis to plot on, if None, will create new figure """ if ax is None: fig, ax = plt.subplots(figsize=(10,8)) #remove slices with 0 size colors = [c for c, d in zip(colors, dist) if d != 0] dist = [d for d in dist if d != 0] # for incremental pie slices cumsum = np.cumsum(dist) cumsum = cumsum/ cumsum[-1] pie = [0] + cumsum.tolist() num_colors = len(dist) for i, r1, r2 in zip(range(num_colors), pie[:-1], pie[1:]): angles = np.linspace(2 * np.pi * r1, 2 * np.pi * r2) x = [0] + np.cos(angles).tolist() y = [0] + np.sin(angles).tolist() xy = np.column_stack([x, y]) ax.scatter([xpos], [ypos], marker=xy, s=size, facecolor= colors[i], edgecolors=edgecolor, linewidth = 0.3) if type == 'donut': # add white circle in the middle donut_edgecolors = 'w' if edgecolor is None else edgecolor ax.scatter([xpos], [ypos], s=size/5, facecolor='w', edgecolors=donut_edgecolors, linewidth = 0.3) return ax
[docs] def plot_EnrichR_pies(enrichr_results, top_terms = None, terms_to_plot = None, colors = None, edgecolor = None, row_height = 0.3, type = 'circle', ax = None): """ Given PTM-specific EnrichR results, plot EnrichR score for the provided terms, with each self point represented as a pie chart indicating the fraction of genes in the group with PTMs Parameters ---------- ptm_results: pd.selfFrame selfFrame containing PTM-specific results from EnrichR analysis num_to_plot: int number of terms to plot, if None, will plot all terms. Ignored if specific terms are provided in terms to plot list terms_to_plot: list list of terms to plot colors: list list of colors to use for pie slices. Default is None, which will use seaborn colorblind palette edgecolor: str color to use for edge of pie slices. Default is None, which will use the same color as the pie slice row_height: float height of each row in the plot. Default is 0.3. type: str type of pie chart to plot. Default is 'circle'. Options include 'circle' and 'donut' (hole in center). ax: matplotlib.Axes axis to plot on, if None, will create new figure """ if colors is None: colors = sns.color_palette('colorblind', n_colors = 3) plt_data = enrichr_results.copy() plt_data['Number with Differential Inclusion Only'] = plt_data['Genes with Differentially Included PTMs only'].apply(lambda x: len(x.split(';'))) plt_data['Number with Altered Flank Only'] = plt_data['Genes with Differentially Included PTMs only'].apply(lambda x: len(x.split(';'))) plt_data['Number with Both'] = plt_data['Genes with Both'].apply(lambda x: len(x.split(';')) if x != '' else 0) if terms_to_plot is None: plt_data = plt_data.sort_values(by = 'Combined Score') if top_terms is not None: plt_data = plt_data.iloc[-top_terms:] if top_terms < plt_data.shape[0] else plt_data else: plt_data = plt_data[plt_data['Term'].isin(terms_to_plot)].sort_values(by = 'Combined Score') if plt_data.shape[0] == 0: print('No significant terms found in EnrichR results. Please check the terms_to_plot list and try again.') return #remove gene ontology specific terms plt_data['Term'] = plt_data['Term'].apply(lambda x: x.split(' R-HSA')[0] +' (R)' if 'R-HSA' in x else x.split('(GO')[0]+' (GO)') #construct multiple piecharts for each term in 'Term' column, where location along x-axis is dictated by combined score and piechart is dictated by 'Fraction With PTMs' plt_data = plt_data.reset_index(drop = True) #set up figure if ax is None: figure_length = plt_data.shape[0]*row_height fig, ax = plt.subplots(figsize = (2, figure_length)) #get non-inf max score and replace inf values with max score maxscore = np.nanmax(plt_data['Combined Score'][plt_data['Combined Score'] != np.inf]) plt_data['Combined Score'] = plt_data['Combined Score'].replace([-np.inf, np.inf], maxscore) ax.set_xlim([maxscore*-0.05, maxscore*1.1]) mult = 4 ax.set_yticks(list(range(0,plt_data.shape[0]*mult,mult))) ax.set_yticklabels(plt_data['Term'].values) ax.set_ylim([-(mult/2), plt_data.shape[0]*mult-(mult/2)]) type = 'circle' event_type = plt_data['Type'].values[0] for i, row in plt_data.iterrows(): if event_type == 'Differentially Included + Altered Flanking Sequences': draw_pie([row['Number with Differential Inclusion Only'], row['Number with Altered Flank Only'], row['Number with Both']],xpos = row['Combined Score'], ypos = i*mult, colors = colors, edgecolor=edgecolor,ax = ax, type = type, size = 70) else: draw_pie([1],xpos = row['Combined Score'], ypos = i*mult, colors = colors, edgecolor=edgecolor,ax = ax, type = type, size = 70) ax.axhline(i*mult+(mult/2), c= 'k', lw = 0.5) ax.axhline(i*mult-(mult/2), c = 'k', lw = 0.5) #ax.tick_params(labelsize = ) #make a custom legend if event_type == 'Differentially Included + Altered Flanking Sequences': handles = [mpatches.Patch(color = colors[2], label = 'Contains Both Events'), mpatches.Patch(color = colors[1], label = 'PTMs with Altered Flanking Sequence'), mpatches.Patch(color = colors[0], label = 'Differentially Included PTMs')] ax.legend(handles = handles, loc = 'upper center', borderaxespad = 0, bbox_to_anchor = (0.5, 1 + (1/figure_length)), ncol = 1, fontsize = 9) ax.set_xlabel('EnrichR Combined\nScore', fontsize = 11)