Source code for ptm_pose.analyze.flanking_sequences


import numpy as np
import pandas as pd

from ptm_pose import helpers

#plotting 
import matplotlib.pyplot as plt
import seaborn as sns

#analysis packages
from Bio.Align import PairwiseAligner
import re



def compare_flanking_sequences(altered_flanks, flank_size = 5):
    sequence_identity_list = []
    altered_positions_list = []
    residue_change_list = []
    flank_side_list = []
    for i, row in altered_flanks.iterrows():
        #if there is sequence info for both and does not introduce stop codons, compare sequence identity
        if not row['Stop Codon Introduced'] and row['Inclusion Flanking Sequence'] == row['Inclusion Flanking Sequence'] and row['Exclusion Flanking Sequence'] == row['Exclusion Flanking Sequence']:
            #compare sequence identity
            sequence_identity = getSequenceIdentity(row['Inclusion Flanking Sequence'], row['Exclusion Flanking Sequence'])
            #identify where flanking sequence changes
            altered_positions, residue_change, flank_side = findAlteredPositions(row['Inclusion Flanking Sequence'], row['Exclusion Flanking Sequence'], flank_size = flank_size)
        else:
            sequence_identity = np.nan
            altered_positions = np.nan
            residue_change = np.nan
            flank_side = np.nan



        #add to lists
        sequence_identity_list.append(sequence_identity)
        altered_positions_list.append(altered_positions)
        residue_change_list.append(residue_change)
        flank_side_list.append(flank_side)

    altered_flanks['Sequence Identity'] = sequence_identity_list
    altered_flanks['Altered Positions'] = altered_positions_list
    altered_flanks['Residue Change'] = residue_change_list
    altered_flanks['Altered Flank Side'] = flank_side_list
    return altered_flanks




[docs]
def compare_inclusion_motifs(flanking_sequences, elm_classes = None):
    """
    Given a DataFrame containing flanking sequences with changes and a DataFrame containing ELM class information, identify motifs that are found in the inclusion and exclusion events, identifying motifs unique to each case. This does not take into account the position of the motif in the sequence or additional information that might validate any potential interaction (i.e. structural information that would indicate whether the motif is accessible or not). ELM class information can be downloaded from the download page of elm (http://elm.eu.org/elms/elms_index.tsv).

    Parameters
    ----------
    flanking_sequences: pandas.DataFrame
        DataFrame containing flanking sequences with changes, obtained from get_flanking_changes_from_splice_data()
    elm_classes: pandas.DataFrame
        DataFrame containing ELM class information (ELMIdentifier, Regex, etc.), downloaded directly from ELM (http://elm.eu.org/elms/elms_index.tsv). Recommended to download this file and input it manually, but will download from ELM otherwise

    Returns
    -------
    flanking_sequences: pandas.DataFrame
        DataFrame containing flanking sequences with changes and motifs found in the inclusion and exclusion events

    """
    if elm_classes is None:
        elm_classes = pd.read_csv('http://elm.eu.org/elms/elms_index.tsv', sep = '\t', header = 5)

        

    only_in_inclusion = []
    only_in_exclusion = []

    for _, row in flanking_sequences.iterrows():
        #check if there is a stop codon introduced and both flanking sequences are present
        if not row['Stop Codon Introduced'] and row['Inclusion Flanking Sequence'] == row['Inclusion Flanking Sequence'] and row['Exclusion Flanking Sequence'] == row['Exclusion Flanking Sequence']:
            #get elm motifs that match inclusion or Exclusion Flanking Sequences
            inclusion_matches = find_motifs(row['Inclusion Flanking Sequence'], elm_classes)
            exclusion_matches = find_motifs(row['Exclusion Flanking Sequence'], elm_classes)

            #get motifs that are unique to each case
            only_in_inclusion.append(';'.join(set(inclusion_matches) - set(exclusion_matches)))
            only_in_exclusion.append(';'.join(set(exclusion_matches) - set(inclusion_matches)))
        else:
            only_in_inclusion.append(np.nan)
            only_in_exclusion.append(np.nan)

    #save data
    flanking_sequences["Motif only in Inclusion"] = only_in_inclusion
    flanking_sequences["Motif only in Exclusion"] = only_in_exclusion
    return flanking_sequences


def identify_change_to_specific_motif(altered_flanks, elm_motif_name, elm_classes = None, modification_class = None, residues = None, dPSI_col = None, **kwargs):

    #filter ptms 
    filter_arguments = helpers.extract_filter_kwargs(**kwargs)
    helpers.check_filter_kwargs(filter_arguments)
    altered_flanks = helpers.filter_ptms(altered_flanks, **filter_arguments)

    #compare flanking sequences
    if 'Altered Positions' not in altered_flanks.columns:
        altered_flanks = compare_flanking_sequences(altered_flanks)
    
    #grab elm motifs that match inclusion or Exclusion Flanking Sequences
    if 'Motif only in Inclusion' not in altered_flanks.columns:
        altered_flanks = compare_inclusion_motifs(altered_flanks, elm_classes = elm_classes)

    #grab only needed info
    motif_data = altered_flanks.dropna(subset = ['Inclusion Flanking Sequence', 'Exclusion Flanking Sequence'], how = 'all').copy()
    cols_to_keep = ['Region ID', 'Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Isoform', 'Modification Class', 'Inclusion Flanking Sequence', 'Exclusion Flanking Sequence', 'Motif only in Inclusion', 'Motif only in Exclusion', 'Altered Positions', 'Residue Change']
    if dPSI_col is not None:
        cols_to_keep.append(dPSI_col)

    #go through motif data and identify motifs matching elm motif of interest
    motif_data = motif_data[cols_to_keep]
    for i, row in motif_data.iterrows():
        if row['Motif only in Inclusion'] == row['Motif only in Inclusion']:
            if elm_motif_name in row['Motif only in Inclusion']:
                motif_data.loc[i, 'Motif only in Inclusion'] = ';'.join([motif for motif in row['Motif only in Inclusion'].split(';') if elm_motif_name in motif])
            else:
                motif_data.loc[i, 'Motif only in Inclusion'] = np.nan

        if row['Motif only in Exclusion'] == row['Motif only in Exclusion']:
            if elm_motif_name in row['Motif only in Exclusion']:
                motif_data.loc[i, 'Motif only in Exclusion'] = ';'.join([motif for motif in row['Motif only in Exclusion'].split(';') if elm_motif_name in motif])
            else:
                motif_data.loc[i, 'Motif only in Exclusion'] = np.nan

    #restrict to events that are specific modification types or residues (for example, SH2 domain motifs should be phosphotyrosine)
    motif_data = motif_data.dropna(subset = ['Motif only in Inclusion', 'Motif only in Exclusion'], how = 'all')
    if modification_class is not None:
        motif_data = motif_data[motif_data['Modification Class'].str.contains(modification_class)]

    if residues is not None and isinstance(residues, str):
        motif_data = motif_data[motif_data['Residue'] == residues]
    elif residues is not None and isinstance(residues, list):
        motif_data = motif_data[motif_data['Residue'].isin(residues)]
    elif residues is not None:
        raise ValueError('residues parameter must be a string or list of strings')
    
    return motif_data

    




[docs]
def findAlteredPositions(seq1, seq2, flank_size = 5):
    """
    Given two sequences, identify the location of positions that have changed

    Parameters
    ----------
    seq1, seq2: str
        sequences to compare (order does not matter)
    flank_size: int
        size of the flanking sequences (default is 5). This is used to make sure the provided sequences are the correct length
    
    Returns
    -------
    altered_positions: list
        list of positions that have changed
    residue_change: list
        list of residues that have changed associated with that position
    flank_side: str
        indicates which side of the flanking sequence the change has occurred (N-term, C-term, or Both)
    """
    desired_seq_size = flank_size*2+1
    altered_positions = []
    residue_change = []
    flank_side = []
    seq_size = len(seq1)
    flank_size = (seq_size -1)/2
    if seq_size == len(seq2) and seq_size == desired_seq_size:
        for i in range(seq_size):
            if seq1[i] != seq2[i]:
                altered_positions.append(i-(flank_size))
                residue_change.append(f'{seq1[i]}->{seq2[i]}')
        #check to see which side flanking sequence
        altered_positions = np.array(altered_positions)
        n_term = any(altered_positions < 0)
        c_term = any(altered_positions > 0)
        if n_term and c_term:
            flank_side = 'Both'
        elif n_term:
            flank_side = 'N-term only'
        elif c_term:
            flank_side = 'C-term only'
        else:
            flank_side = 'Unclear'
        return altered_positions, residue_change, flank_side
    else:
        return np.nan, np.nan, np.nan

        

[docs]
def getSequenceIdentity(seq1, seq2):
    """
    Given two flanking sequences, calculate the sequence identity between them using Biopython and parameters definded by Pillman et al. BMC Bioinformatics 2011

    Parameters
    ----------
    seq1, seq2: str
        flanking sequence 

    Returns
    -------
    normalized_score: float
        normalized score of sequence similarity between flanking sequences (calculated similarity/max possible similarity)
    """
    #make pairwise aligner object
    aligner = PairwiseAligner()
    #set parameters, with match score of 10 and mismatch score of -2
    aligner.mode = 'global'
    aligner.match_score = 10
    aligner.mismatch_score = -2
    #calculate sequence alignment score between two sequences
    actual_similarity = aligner.align(seq1, seq2)[0].score
    #calculate sequence alignment score between the same sequence
    control_similarity = aligner.align(seq1, seq1)[0].score
    #normalize score
    normalized_score = actual_similarity/control_similarity
    return normalized_score



[docs]
def find_motifs(seq, elm_classes):
    """
    Given a sequence and a dataframe containinn ELM class information, identify motifs that can be found in the provided sequence using the RegEx expression provided by ELM (PTMs not considered). This does not take into account the position of the motif in the sequence or additional information that might validate any potential interaction (i.e. structural information that would indicate whether the motif is accessible or not). ELM class information can be downloaded from the download page of elm (http://elm.eu.org/elms/elms_index.tsv).

    Parameters
    ----------
    seq: str
        sequence to search for motifs
    elm_classes: pandas.DataFrame
        DataFrame containing ELM class information (ELMIdentifier, Regex, etc.), downloaded directly from ELM (http://elm.eu.org/elms/elms_index.tsv)
    """
    matches = []
    for j, elm_row in elm_classes.iterrows():
        reg_ex = elm_row['Regex']
        if re.search(reg_ex, seq) is not None:
            matches.append(elm_row['ELMIdentifier'])

    return matches



[docs]
def plot_sequence_differences(inclusion_seq, exclusion_seq, dpsi = None, flank_size = 5, figsize = (3,1)):
    """
    Given the flanking sequences for a PTM resulting from a specific splice event, plot the differences between the two sequences, coloring the changing residues in red. If dPSI is also provided, will add an arrow to the plot indicating the direction of change

    Parameters
    ----------
    inclusion_seq: str
        Sequence of the inclusion isoform (with spliced region included)
    exclusion_seq: str
        Sequence of the exclusion isoform (with spliced region excluded)
    dpsi: float
        Change in PSI for the specific splice event. Default is None, which will not add an arrow to the plot.
    flank_size: int
        Size of flanking region to plot. Default is 5. This must be less than half the length of the shortest sequence.
    figsize: tuple
        Size of the figure. Default is (3,1).
    """
    #convert sequence into list of residues
    inclusion_seq = list(inclusion_seq)
    exclusion_seq = list(exclusion_seq)

    if len(inclusion_seq) < flank_size*2 + 1 or len(exclusion_seq) < flank_size*2 + 1:
        min_seq_length = min(len(inclusion_seq), len(exclusion_seq))
        raise ValueError(f'Longest allowable flank size for provided sequences is {(min_seq_length -1)/2}, or {min_seq_length} residues. Please use smaller flank size or rerun flanking sequence analysis to get longer flank sequences')
    
    #if either provided seq is longer than flank size, trim to flank size
    if len(inclusion_seq) > flank_size*2 + 1:
        lowercase_pos = [i for i, x in enumerate(inclusion_seq) if x.islower()]
        inclusion_seq = inclusion_seq[int(lowercase_pos[0])-flank_size:int(lowercase_pos[0])+flank_size+1]
    if len(exclusion_seq) > flank_size*2 + 1:
        lowercase_pos = [i for i, x in enumerate(exclusion_seq) if x.islower()]
        exclusion_seq = exclusion_seq[int(lowercase_pos[0])-flank_size:int(lowercase_pos[0])+flank_size+1]

    #convert sequences to dataframe for plotting
    plt_data = pd.DataFrame({'Inclusion': inclusion_seq, 'Exclusion': exclusion_seq}, index = list(range(-flank_size, flank_size+1)))

    #note where inclusion and exclusion rows differ
    binary_map = plt_data['Inclusion'] != plt_data['Exclusion']
    binary_map = pd.concat([binary_map, binary_map], axis =1).astype(int).T
    binary_map.loc[0] = 0
    plt_data = plt_data.T

    fig, ax = plt.subplots(figsize = figsize)

    sns.heatmap(binary_map, cmap = 'coolwarm', cbar = False, ax = ax, annot = plt_data, fmt = '', annot_kws = {'size': 10}, vmax = 1.1, vmin =-0.1, linewidths = 0.6, ec = 'black', linecolor = 'black')
    ax.set_yticklabels(['Inclusion', 'Exclusion'], rotation = 0)

    #if dpsi is provided, add arrow to plot showing direction of change
    end_of_map = ax.get_xticks()[-1]+1
    if dpsi is not None:
        start_of_arrow = 1.8 if dpsi < 0 else 0.2
        end_of_arrow = 0.2 if dpsi < 0 else 1.8
        ax.annotate(f'', xy = (end_of_map, start_of_arrow), xytext = (end_of_map, end_of_arrow), arrowprops=dict(arrowstyle="->", lw = 1, color = 'black'), annotation_clip=False)



[docs]
def plot_location_of_altered_flanking_residues(altered_flanks, figsize = (4,3), modification_class = None, residue = None):
    """
    Plot the number of PTMs with altered residues as specific positions relative to the PTM site. This includes the specific position of the residue (-5 to +5 from PTM site) and the specific side of the PTM site that is altered (N-term or C-term)

    Parameters
    ----------
    altered_flanks: pd.DataFrame
        Dataframe with altered flanking sequences, and annotated information added with analyze.compare_flanking_sequences
    figsize: tuple
        Size of the figure. Default is (4,3).
    modification_class: str
        Specific modification class to plot. Default is None, which will plot all modification classes.
    residue: str
        Specific residue to plot. Default is None, which will plot all residues.

    """
    
    fig, ax = plt.subplots(nrows = 2, figsize = figsize, height_ratios = [0.5,1])
    fig.subplots_adjust(hspace = 1)

    if modification_class is not None:
        altered_flanks = altered_flanks[altered_flanks['Modification Class'].str.contains(modification_class)].copy()
    
    if residue is not None:
        altered_flanks = altered_flanks[altered_flanks['Residue'] == residue].copy()

    #### plot of side of modification that flank is altered
    terminus = altered_flanks.groupby('Altered Flank Side').size()
    terminus = terminus[['N-term only', 'C-term only']] #focus on cases where only one side is altered for ease of plotting
    ax[0].bar(terminus.index, terminus.values, color = 'gray')
    ax[0].set_xlabel('Location of Altered Region', fontsize = 9)
    ax[0].set_xticklabels(['N-term\nonly', 'C-term\nonly'])
    ax[0].set_ylabel('# of PTMs', fontsize = 9)

    #### plot specific positions of altered residues relative to PTM
    position_breakdown = altered_flanks.explode(['Altered Positions', 'Residue Change']).copy()[['Gene', 'Residue', 'PTM Position in Isoform','Altered Positions', 'Residue Change']]
    position_breakdown = position_breakdown.groupby('Altered Positions').size()
    ax[1].bar(position_breakdown.index, position_breakdown.values, color = 'gray')
    ax[1].set_xlim([-5.5,5.5])
    ax[1].set_xlabel('Position Relative to PTM', fontsize = 9)
    ax[1].set_ylabel('# of Changed\nResidues', fontsize = 9)
    ax[1].set_xticks(np.arange(-5,6,1))




[docs]
def plot_alterations_matrix(altered_flanks, modification_class = None, residue = None, title = '', ax = None):
    """
    Given the altered flanking sequences dataframe, plot a matrix showing the positions of altered residues for specific proteins, as well as the specific change

    Parameters
    ----------
    altered_flanks: pd.DataFrame
        Dataframe with altered flanking sequences, and annotated information added with analyze.compare_flanking_sequences

    modification_class: str
        Specific modification class to plot. Default is None, which will plot all modification classes.

    residue: str
        Specific residue to plot. Default is None, which will plot all residues.
    title: str
        Title of the plot. Default is '' (no title).
    ax: matplotlib.Axes
        Axis to plot on. If None, will create new figure. Default is None.
    """
    #extract altered flanking sequences and make sure there is altered position data
    position_breakdown = altered_flanks.copy()
    if 'Altered Positions' not in position_breakdown.columns:
        position_breakdown = fs.compare_flanking_sequences(position_breakdown)

    position_breakdown = position_breakdown.dropna(subset = ['Altered Positions', 'Residue Change'])

    #restrit to desired PTM types and residues
    if modification_class is not None:
        position_breakdown = position_breakdown[position_breakdown['Modification Class'].str.contains(modification_class)].copy()
    if residue is not None:
        position_breakdown = position_breakdown[position_breakdown['Residue'] == residue].copy()

    #add ptm column to position breakdown
    position_breakdown['PTM'] = position_breakdown['Gene'] + '_' + position_breakdown['Residue'] + position_breakdown['PTM Position in Isoform'].astype(str)

    #separate altered residue into individual rows
    position_breakdown = position_breakdown.explode(['Altered Positions', 'Residue Change']).copy()[['Gene', 'PTM','Altered Positions', 'Residue Change']]

    #convert altered positions to integers and remove duplicates
    position_breakdown['Altered Positions'] = position_breakdown['Altered Positions'].astype(int)
    position_breakdown = position_breakdown.drop_duplicates()

    #position_breakdown = position_breakdown.drop_duplicates(subset = ["PTM", "Altered_Positions"], keep = False)
    position_breakdown['PTM']
    position_matrix = position_breakdown.pivot(columns = 'Altered Positions', index = 'PTM', values= 'Residue Change')
    for i, pos in zip(range(11),range(-5, 6)):
        if pos not in position_matrix.columns:
            position_matrix.insert(i, pos, np.nan)


    #replace strings with 1 and nans with 0
    position_values = position_matrix.map(lambda x: 1 if x == x else np.nan).sort_values(by = 5)
    position_matrix = position_matrix.loc[position_values.index]
    #plot heatmap with black edges around cells and no colorbar, annotate with strings in position matrix

    if ax is None:
        fig, ax = plt.subplots(figsize = (4,3))
    sns.heatmap(position_values, cmap = 'Greens', vmin = 0, vmax = 2, ax = ax, cbar = False, linewidths = 0.5, linecolor = 'black', yticklabels=True )
    ax.set_facecolor("lightgrey")

    #annotate with strings in the position matrix
    for i in range(position_values.shape[0]):
        for j in range(position_values.shape[1]):
            if position_values.iloc[i,j] == 1:
                ax.text(j+0.5, i+0.5, position_matrix.iloc[i,j], ha = 'center', va = 'center', fontsize = 6)

    #adjust figure parameters
    ax.set_xticklabels(['-5','-4','-3','-2','-1','PTM','1','2','3','4','5'], fontsize = 8)
    ax.set_xlabel('')
    ax.tick_params(axis = 'y', labelsize = 8)
    ax.set_title(title, fontsize = 9)