Source code for ptm_pose.analyze.flanking_sequences


import numpy as np
import pandas as pd

from ptm_pose import helpers

#plotting 
import matplotlib.pyplot as plt
import seaborn as sns

#analysis packages
from Bio.Align import PairwiseAligner
import re



def compare_flanking_sequences(altered_flanks, flank_size = 5):
    sequence_identity_list = []
    altered_positions_list = []
    residue_change_list = []
    flank_side_list = []
    for i, row in altered_flanks.iterrows():
        #if there is sequence info for both and does not introduce stop codons, compare sequence identity
        if not row['Stop Codon Introduced'] and row['Inclusion Flanking Sequence'] == row['Inclusion Flanking Sequence'] and row['Exclusion Flanking Sequence'] == row['Exclusion Flanking Sequence']:
            #compare sequence identity
            sequence_identity = getSequenceIdentity(row['Inclusion Flanking Sequence'], row['Exclusion Flanking Sequence'])
            #identify where flanking sequence changes
            altered_positions, residue_change, flank_side = findAlteredPositions(row['Inclusion Flanking Sequence'], row['Exclusion Flanking Sequence'], flank_size = flank_size)
        else:
            sequence_identity = np.nan
            altered_positions = np.nan
            residue_change = np.nan
            flank_side = np.nan



        #add to lists
        sequence_identity_list.append(sequence_identity)
        altered_positions_list.append(altered_positions)
        residue_change_list.append(residue_change)
        flank_side_list.append(flank_side)

    altered_flanks['Sequence Identity'] = sequence_identity_list
    altered_flanks['Altered Positions'] = altered_positions_list
    altered_flanks['Residue Change'] = residue_change_list
    altered_flanks['Altered Flank Side'] = flank_side_list
    return altered_flanks



[docs] def compare_inclusion_motifs(flanking_sequences, elm_classes = None): """ Given a DataFrame containing flanking sequences with changes and a DataFrame containing ELM class information, identify motifs that are found in the inclusion and exclusion events, identifying motifs unique to each case. This does not take into account the position of the motif in the sequence or additional information that might validate any potential interaction (i.e. structural information that would indicate whether the motif is accessible or not). ELM class information can be downloaded from the download page of elm (http://elm.eu.org/elms/elms_index.tsv). Parameters ---------- flanking_sequences: pandas.DataFrame DataFrame containing flanking sequences with changes, obtained from get_flanking_changes_from_splice_data() elm_classes: pandas.DataFrame DataFrame containing ELM class information (ELMIdentifier, Regex, etc.), downloaded directly from ELM (http://elm.eu.org/elms/elms_index.tsv). Recommended to download this file and input it manually, but will download from ELM otherwise Returns ------- flanking_sequences: pandas.DataFrame DataFrame containing flanking sequences with changes and motifs found in the inclusion and exclusion events """ if elm_classes is None: elm_classes = pd.read_csv('http://elm.eu.org/elms/elms_index.tsv', sep = '\t', header = 5) only_in_inclusion = [] only_in_exclusion = [] for _, row in flanking_sequences.iterrows(): #check if there is a stop codon introduced and both flanking sequences are present if not row['Stop Codon Introduced'] and row['Inclusion Flanking Sequence'] == row['Inclusion Flanking Sequence'] and row['Exclusion Flanking Sequence'] == row['Exclusion Flanking Sequence']: #get elm motifs that match inclusion or Exclusion Flanking Sequences inclusion_matches = find_motifs(row['Inclusion Flanking Sequence'], elm_classes) exclusion_matches = find_motifs(row['Exclusion Flanking Sequence'], elm_classes) #get motifs that are unique to each case only_in_inclusion.append(';'.join(set(inclusion_matches) - set(exclusion_matches))) only_in_exclusion.append(';'.join(set(exclusion_matches) - set(inclusion_matches))) else: only_in_inclusion.append(np.nan) only_in_exclusion.append(np.nan) #save data flanking_sequences["Motif only in Inclusion"] = only_in_inclusion flanking_sequences["Motif only in Exclusion"] = only_in_exclusion return flanking_sequences
def identify_change_to_specific_motif(altered_flanks, elm_motif_name, elm_classes = None, modification_class = None, residues = None, dPSI_col = None, **kwargs): #filter ptms filter_arguments = helpers.extract_filter_kwargs(**kwargs) helpers.check_filter_kwargs(filter_arguments) altered_flanks = helpers.filter_ptms(altered_flanks, **filter_arguments) #compare flanking sequences if 'Altered Positions' not in altered_flanks.columns: altered_flanks = compare_flanking_sequences(altered_flanks) #grab elm motifs that match inclusion or Exclusion Flanking Sequences if 'Motif only in Inclusion' not in altered_flanks.columns: altered_flanks = compare_inclusion_motifs(altered_flanks, elm_classes = elm_classes) #grab only needed info motif_data = altered_flanks.dropna(subset = ['Inclusion Flanking Sequence', 'Exclusion Flanking Sequence'], how = 'all').copy() cols_to_keep = ['Region ID', 'Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class', 'Inclusion Flanking Sequence', 'Exclusion Flanking Sequence', 'Motif only in Inclusion', 'Motif only in Exclusion', 'Altered Positions', 'Residue Change'] if dPSI_col is not None: cols_to_keep.append(dPSI_col) #go through motif data and identify motifs matching elm motif of interest motif_data = motif_data[cols_to_keep] for i, row in motif_data.iterrows(): if row['Motif only in Inclusion'] == row['Motif only in Inclusion']: if elm_motif_name in row['Motif only in Inclusion']: motif_data.loc[i, 'Motif only in Inclusion'] = ';'.join([motif for motif in row['Motif only in Inclusion'].split(';') if elm_motif_name in motif]) else: motif_data.loc[i, 'Motif only in Inclusion'] = np.nan if row['Motif only in Exclusion'] == row['Motif only in Exclusion']: if elm_motif_name in row['Motif only in Exclusion']: motif_data.loc[i, 'Motif only in Exclusion'] = ';'.join([motif for motif in row['Motif only in Exclusion'].split(';') if elm_motif_name in motif]) else: motif_data.loc[i, 'Motif only in Exclusion'] = np.nan #restrict to events that are specific modification types or residues (for example, SH2 domain motifs should be phosphotyrosine) motif_data = motif_data.dropna(subset = ['Motif only in Inclusion', 'Motif only in Exclusion'], how = 'all') if modification_class is not None: motif_data = motif_data[motif_data['Modification Class'].str.contains(modification_class)] if residues is not None and isinstance(residues, str): motif_data = motif_data[motif_data['Residue'] == residues] elif residues is not None and isinstance(residues, list): motif_data = motif_data[motif_data['Residue'].isin(residues)] elif residues is not None: raise ValueError('residues parameter must be a string or list of strings') return motif_data
[docs] def findAlteredPositions(seq1, seq2, flank_size = 5): """ Given two sequences, identify the location of positions that have changed Parameters ---------- seq1, seq2: str sequences to compare (order does not matter) flank_size: int size of the flanking sequences (default is 5). This is used to make sure the provided sequences are the correct length Returns ------- altered_positions: list list of positions that have changed residue_change: list list of residues that have changed associated with that position flank_side: str indicates which side of the flanking sequence the change has occurred (N-term, C-term, or Both) """ desired_seq_size = flank_size*2+1 altered_positions = [] residue_change = [] flank_side = [] seq_size = len(seq1) flank_size = (seq_size -1)/2 if seq_size == len(seq2) and seq_size == desired_seq_size: for i in range(seq_size): if seq1[i] != seq2[i]: altered_positions.append(i-(flank_size)) residue_change.append(f'{seq1[i]}->{seq2[i]}') #check to see which side flanking sequence altered_positions = np.array(altered_positions) n_term = any(altered_positions < 0) c_term = any(altered_positions > 0) if n_term and c_term: flank_side = 'Both' elif n_term: flank_side = 'N-term only' elif c_term: flank_side = 'C-term only' else: flank_side = 'Unclear' return altered_positions, residue_change, flank_side else: return np.nan, np.nan, np.nan
[docs] def getSequenceIdentity(seq1, seq2): """ Given two flanking sequences, calculate the sequence identity between them using Biopython and parameters definded by Pillman et al. BMC Bioinformatics 2011 Parameters ---------- seq1, seq2: str flanking sequence Returns ------- normalized_score: float normalized score of sequence similarity between flanking sequences (calculated similarity/max possible similarity) """ #make pairwise aligner object aligner = PairwiseAligner() #set parameters, with match score of 10 and mismatch score of -2 aligner.mode = 'global' aligner.match_score = 10 aligner.mismatch_score = -2 #calculate sequence alignment score between two sequences actual_similarity = aligner.align(seq1, seq2)[0].score #calculate sequence alignment score between the same sequence control_similarity = aligner.align(seq1, seq1)[0].score #normalize score normalized_score = actual_similarity/control_similarity return normalized_score
[docs] def find_motifs(seq, elm_classes): """ Given a sequence and a dataframe containinn ELM class information, identify motifs that can be found in the provided sequence using the RegEx expression provided by ELM (PTMs not considered). This does not take into account the position of the motif in the sequence or additional information that might validate any potential interaction (i.e. structural information that would indicate whether the motif is accessible or not). ELM class information can be downloaded from the download page of elm (http://elm.eu.org/elms/elms_index.tsv). Parameters ---------- seq: str sequence to search for motifs elm_classes: pandas.DataFrame DataFrame containing ELM class information (ELMIdentifier, Regex, etc.), downloaded directly from ELM (http://elm.eu.org/elms/elms_index.tsv) """ matches = [] for j, elm_row in elm_classes.iterrows(): reg_ex = elm_row['Regex'] if re.search(reg_ex, seq) is not None: matches.append(elm_row['ELMIdentifier']) return matches
[docs] def plot_sequence_differences(inclusion_seq, exclusion_seq, dpsi = None, flank_size = 5, figsize = (3,1)): """ Given the flanking sequences for a PTM resulting from a specific splice event, plot the differences between the two sequences, coloring the changing residues in red. If dPSI is also provided, will add an arrow to the plot indicating the direction of change Parameters ---------- inclusion_seq: str Sequence of the inclusion isoform (with spliced region included) exclusion_seq: str Sequence of the exclusion isoform (with spliced region excluded) dpsi: float Change in PSI for the specific splice event. Default is None, which will not add an arrow to the plot. flank_size: int Size of flanking region to plot. Default is 5. This must be less than half the length of the shortest sequence. figsize: tuple Size of the figure. Default is (3,1). """ #convert sequence into list of residues inclusion_seq = list(inclusion_seq) exclusion_seq = list(exclusion_seq) if len(inclusion_seq) < flank_size*2 + 1 or len(exclusion_seq) < flank_size*2 + 1: min_seq_length = min(len(inclusion_seq), len(exclusion_seq)) raise ValueError(f'Longest allowable flank size for provided sequences is {(min_seq_length -1)/2}, or {min_seq_length} residues. Please use smaller flank size or rerun flanking sequence analysis to get longer flank sequences') #if either provided seq is longer than flank size, trim to flank size if len(inclusion_seq) > flank_size*2 + 1: lowercase_pos = [i for i, x in enumerate(inclusion_seq) if x.islower()] inclusion_seq = inclusion_seq[int(lowercase_pos[0])-flank_size:int(lowercase_pos[0])+flank_size+1] if len(exclusion_seq) > flank_size*2 + 1: lowercase_pos = [i for i, x in enumerate(exclusion_seq) if x.islower()] exclusion_seq = exclusion_seq[int(lowercase_pos[0])-flank_size:int(lowercase_pos[0])+flank_size+1] #convert sequences to dataframe for plotting plt_data = pd.DataFrame({'Inclusion': inclusion_seq, 'Exclusion': exclusion_seq}, index = list(range(-flank_size, flank_size+1))) #note where inclusion and exclusion rows differ binary_map = plt_data['Inclusion'] != plt_data['Exclusion'] binary_map = pd.concat([binary_map, binary_map], axis =1).astype(int).T binary_map.loc[0] = 0 plt_data = plt_data.T fig, ax = plt.subplots(figsize = figsize) sns.heatmap(binary_map, cmap = 'coolwarm', cbar = False, ax = ax, annot = plt_data, fmt = '', annot_kws = {'size': 10}, vmax = 1.1, vmin =-0.1, linewidths = 0.6, ec = 'black', linecolor = 'black') ax.set_yticklabels(['Inclusion', 'Exclusion'], rotation = 0) #if dpsi is provided, add arrow to plot showing direction of change end_of_map = ax.get_xticks()[-1]+1 if dpsi is not None: start_of_arrow = 1.8 if dpsi < 0 else 0.2 end_of_arrow = 0.2 if dpsi < 0 else 1.8 ax.annotate(f'', xy = (end_of_map, start_of_arrow), xytext = (end_of_map, end_of_arrow), arrowprops=dict(arrowstyle="->", lw = 1, color = 'black'), annotation_clip=False)
[docs] def plot_location_of_altered_flanking_residues(altered_flanks, figsize = (4,3), modification_class = None, residue = None): """ Plot the number of PTMs with altered residues as specific positions relative to the PTM site. This includes the specific position of the residue (-5 to +5 from PTM site) and the specific side of the PTM site that is altered (N-term or C-term) Parameters ---------- altered_flanks: pd.DataFrame Dataframe with altered flanking sequences, and annotated information added with analyze.compare_flanking_sequences figsize: tuple Size of the figure. Default is (4,3). modification_class: str Specific modification class to plot. Default is None, which will plot all modification classes. residue: str Specific residue to plot. Default is None, which will plot all residues. """ fig, ax = plt.subplots(nrows = 2, figsize = figsize, height_ratios = [0.5,1]) fig.subplots_adjust(hspace = 1) if modification_class is not None: altered_flanks = altered_flanks[altered_flanks['Modification Class'].str.contains(modification_class)].copy() if residue is not None: altered_flanks = altered_flanks[altered_flanks['Residue'] == residue].copy() #### plot of side of modification that flank is altered terminus = altered_flanks.groupby('Altered Flank Side').size() terminus = terminus[['N-term only', 'C-term only']] #focus on cases where only one side is altered for ease of plotting ax[0].bar(terminus.index, terminus.values, color = 'gray') ax[0].set_xlabel('Location of Altered Region', fontsize = 9) ax[0].set_xticklabels(['N-term\nonly', 'C-term\nonly']) ax[0].set_ylabel('# of PTMs', fontsize = 9) #### plot specific positions of altered residues relative to PTM position_breakdown = altered_flanks.explode(['Altered Positions', 'Residue Change']).copy()[['Gene', 'Residue', 'PTM Position in Isoform','Altered Positions', 'Residue Change']] position_breakdown = position_breakdown.groupby('Altered Positions').size() ax[1].bar(position_breakdown.index, position_breakdown.values, color = 'gray') ax[1].set_xlim([-5.5,5.5]) ax[1].set_xlabel('Position Relative to PTM', fontsize = 9) ax[1].set_ylabel('# of Changed\nResidues', fontsize = 9) ax[1].set_xticks(np.arange(-5,6,1))
[docs] def plot_alterations_matrix(altered_flanks, modification_class = None, residue = None, title = '', ax = None): """ Given the altered flanking sequences dataframe, plot a matrix showing the positions of altered residues for specific proteins, as well as the specific change Parameters ---------- altered_flanks: pd.DataFrame Dataframe with altered flanking sequences, and annotated information added with analyze.compare_flanking_sequences modification_class: str Specific modification class to plot. Default is None, which will plot all modification classes. residue: str Specific residue to plot. Default is None, which will plot all residues. title: str Title of the plot. Default is '' (no title). ax: matplotlib.Axes Axis to plot on. If None, will create new figure. Default is None. """ #extract altered flanking sequences and make sure there is altered position data position_breakdown = altered_flanks.copy() if 'Altered Positions' not in position_breakdown.columns: position_breakdown = fs.compare_flanking_sequences(position_breakdown) position_breakdown = position_breakdown.dropna(subset = ['Altered Positions', 'Residue Change']) #restrit to desired PTM types and residues if modification_class is not None: position_breakdown = position_breakdown[position_breakdown['Modification Class'].str.contains(modification_class)].copy() if residue is not None: position_breakdown = position_breakdown[position_breakdown['Residue'] == residue].copy() #add ptm column to position breakdown position_breakdown['PTM'] = position_breakdown['Gene'] + '_' + position_breakdown['Residue'] + position_breakdown['PTM Position in Isoform'].astype(str) #separate altered residue into individual rows position_breakdown = position_breakdown.explode(['Altered Positions', 'Residue Change']).copy()[['Gene', 'PTM','Altered Positions', 'Residue Change']] #convert altered positions to integers and remove duplicates position_breakdown['Altered Positions'] = position_breakdown['Altered Positions'].astype(int) position_breakdown = position_breakdown.drop_duplicates() #position_breakdown = position_breakdown.drop_duplicates(subset = ["PTM", "Altered_Positions"], keep = False) position_breakdown['PTM'] position_matrix = position_breakdown.pivot(columns = 'Altered Positions', index = 'PTM', values= 'Residue Change') for i, pos in zip(range(11),range(-5, 6)): if pos not in position_matrix.columns: position_matrix.insert(i, pos, np.nan) #replace strings with 1 and nans with 0 position_values = position_matrix.map(lambda x: 1 if x == x else np.nan).sort_values(by = 5) position_matrix = position_matrix.loc[position_values.index] #plot heatmap with black edges around cells and no colorbar, annotate with strings in position matrix if ax is None: fig, ax = plt.subplots(figsize = (4,3)) sns.heatmap(position_values, cmap = 'Greens', vmin = 0, vmax = 2, ax = ax, cbar = False, linewidths = 0.5, linecolor = 'black', yticklabels=True ) ax.set_facecolor("lightgrey") #annotate with strings in the position matrix for i in range(position_values.shape[0]): for j in range(position_values.shape[1]): if position_values.iloc[i,j] == 1: ax.text(j+0.5, i+0.5, position_matrix.iloc[i,j], ha = 'center', va = 'center', fontsize = 6) #adjust figure parameters ax.set_xticklabels(['-5','-4','-3','-2','-1','PTM','1','2','3','4','5'], fontsize = 8) ax.set_xlabel('') ax.tick_params(axis = 'y', labelsize = 8) ax.set_title(title, fontsize = 9)