Source code for ptm_pose.flanking_sequences

#biopython packages
from Bio.Data import CodonTable

#standard packages
import numpy as np
import pandas as pd
import re

import tqdm
import warnings

#PTM pose functions
from ptm_pose import database_interfacing as di
from ptm_pose import project
from ptm_pose import pose_config



# Get the standard codon table
codon_table = CodonTable.unambiguous_dna_by_name["Standard"]


[docs]def translate_flanking_sequence(seq, flank_size = 7, full_flanking_seq = True, lowercase_mod = True, first_flank_length = None, stop_codon_symbol = '*', unknown_codon_symbol = 'X'):
    """
    Given a DNA sequence, translate the sequence into an amino acid sequence. If the sequence is not the correct length, the function will attempt to extract the flanking sequence with spaces to account for missing parts if full_flanking_seq is not True. If the sequence is still not the correct length, the function will raise an error. Any unrecognized codons that are found in the sequence and are not in the standard codon table, including stop codons, will be translated as 'X' (unknown) or '*' (stop codon).

    Parameters
    ----------
    seq : str
        DNA sequence to translate
    flank_size : int, optional
        Number of amino acids to include flanking the PTM, by default 7
    full_flanking_seq : bool, optional
        Whether to require the flanking sequence to be the correct length, by default True
    lowercase_mod : bool, optional
        Whether to lowercase the amino acid associated with the PTM, by default True
    first_flank_length : int, optional
        Length of the flanking sequence in front of the PTM, by default None. If full_flanking_seq is False and sequence is not the correct length, this is required.
    stop_codon_symbol : str, optional
        Symbol to use for stop codons, by default '*'
    unknown_codon_symbol : str, optional
        Symbol to use for unknown codons, by default 'X'

    Returns
    -------
    str
        Amino acid sequence of the flanking sequence if translation was successful, otherwise np.nan
    """
    aa_seq = ''
    if len(seq) == flank_size*2*3+3:
        for i in range(0, len(seq), 3):
            if seq[i:i+3] in codon_table.forward_table.keys():
                aa = codon_table.forward_table[seq[i:i+3]]
            elif seq[i:i+3] in codon_table.stop_codons:
                aa = stop_codon_symbol
            else:
                aa = unknown_codon_symbol

            if i/3 == flank_size and lowercase_mod:
                aa = aa.lower()
            aa_seq += aa
    elif len(seq) % 3 == 0 and not full_flanking_seq:
        for i in range(0, len(seq), 3):
            if seq[i:i+3] in codon_table.forward_table.keys():
                aa = codon_table.forward_table[seq[i:i+3]]
            elif seq[i:i+3] in codon_table.stop_codons:
                aa = '*'
            else:
                aa = 'X'

            if lowercase_mod and i/3 == first_flank_length:
                aa = aa.lower()
            aa_seq += aa
    elif len(seq) % 3 == 0 and full_flanking_seq:
        raise ValueError('Provided sequence length does not match indicated flank size. Fix sequence or set full_flanking_seq = False, which requires indicating the length of the flanking sequence in front of the PTM.')
    elif len(seq) % 3 != 0:
        raise ValueError('Provided sequence is not a multiple of 3')
    else:
        raise ValueError('Unknown error with flanking sequence')
    return aa_seq

[docs]def get_ptm_locs_in_spliced_sequences(ptm_loc_in_flank, first_flank_seq, spliced_seq, second_flank_seq, strand, which_flank = 'First', order_by = 'Coordinates'):
    """
    Given the location of a PTM in a flanking sequence, extract the location of the PTM in the Inclusion Flanking Sequence and the Exclusion Flanking Sequence associated with a given splice event. Inclusion Flanking Sequence will include the skipped exon region, retained intron, or longer alternative splice site depending on event type. The PTM location should be associated with where the PTM is located relative to spliced region (before = 'First', after = 'Second').

    Parameters
    ----------
    ptm_loc_in_flank : int
        Location of the PTM in the flanking sequence it is found (either first or second)
    first_flank_seq : str
        Flanking exon sequence before the spliced region
    spliced_seq : str
        Spliced region sequence
    second_flank_seq : str
        Flanking exon sequence after the spliced region
    which_flank : str, optional
        Which flank the PTM is associated with, by default 'First'
    order_by : str, optional
        Whether the first, spliced and second regions are defined by their genomic coordinates (first has smallest coordinate, spliced next, then second), or if they are defined by their translation (first the first when translated, etc.)

    Returns
    -------
    tuple
        Tuple containing the PTM location in the Inclusion Flanking Sequence and the Exclusion Flanking Sequence
    """
    if order_by == 'Translation':
        if which_flank == 'First':
            inclusion_ptm_loc, exclusion_ptm_loc = ptm_loc_in_flank, ptm_loc_in_flank
        elif which_flank == 'Second':
            inclusion_ptm_loc = ptm_loc_in_flank+len(spliced_seq)+len(first_flank_seq)
            exclusion_ptm_loc = ptm_loc_in_flank+len(first_flank_seq)

    elif order_by == 'Coordinates':
        #grab codon associated with ptm in sequence
        if (which_flank == 'First' and strand == 1) or (which_flank == 'Second' and strand == -1):
            inclusion_ptm_loc, exclusion_ptm_loc = ptm_loc_in_flank, ptm_loc_in_flank
        elif (strand == -1 and which_flank == 'First'):
            inclusion_ptm_loc =  ptm_loc_in_flank+len(spliced_seq)+len(second_flank_seq)
            exclusion_ptm_loc =  ptm_loc_in_flank+len(second_flank_seq)
        elif (strand == 1 and which_flank == 'Second'):
            inclusion_ptm_loc =  ptm_loc_in_flank+len(spliced_seq)+len(first_flank_seq)
            exclusion_ptm_loc =  ptm_loc_in_flank+len(first_flank_seq)
    else:
        raise ValueError('Unknown order_by value, must be either Coordinates (first, spliced and second regions are determined by genomic coordinates) or Translation (first, spliced and second regions are determined by translation')

    return int(inclusion_ptm_loc), int(exclusion_ptm_loc)
    

[docs]def get_flanking_sequence(ptm_loc, seq, ptm_residue, flank_size = 5, lowercase_mod = True, full_flanking_seq = False):
    """
    Given a PTM location in a sequence of DNA, extract the flanking sequence around the PTM location and translate into the amino acid sequence. If the sequence is not the correct length, the function will attempt to extract the flanking sequence with spaces to account for missing parts if full_flanking_seq is not True. If the sequence is still not the correct length, the function will raise an error. Any unrecognized codons that are found in the sequence and are not in the standard codon table, including stop codons, will be translated as 'X' (unknown) or '*' (stop codon).

    Parameters
    ----------
    ptm_loc : int
        Location of the first base pair associated with PTM in the DNA sequence
    seq : str
        DNA sequence containing the PTM
    ptm_residue : str
        Amino acid residue associated with the PTM
    flank_size : int, optional
        Number of amino acids to include flanking the PTM, by default 5
    lowercase_mod : bool, optional
        Whether to lowercase the amino acid associated with the PTM, by default True
    full_flanking_seq : bool, optional
        Whether to require the flanking sequence to be the correct length, by default False

    Returns
    -------
    str
        Amino acid sequence of the flanking sequence around the PTM if translation was successful, otherwise np.nan
    """
    ptm_codon = seq[ptm_loc:ptm_loc+3]
    #check if ptm codon codes for amino acid and then extract flanking sequence
    if ptm_codon in codon_table.forward_table.keys():
        if codon_table.forward_table[ptm_codon] == ptm_residue:
            if len(seq) != 3*(flank_size*2+1):
                if full_flanking_seq:
                    raise ValueError('Flanking sequence is not the correct length, please fix or set full_flanking_seq to False')
                else:
                    #check where issue is, at start or end of sequence
                    enough_at_start = ptm_loc >= flank_size*3
                    enough_at_end = len(seq) - ptm_loc >= flank_size*3+3
                    #extract length with amino acids and add cushion for missing parts
                    front_length = flank_size*3 if enough_at_start else ptm_loc
                    start_cushion = (flank_size*3 - ptm_loc)*' ' if not enough_at_start else ''
                    end_length = flank_size*3 + 3 if enough_at_end else len(seq) - ptm_loc
                    end_cushion = (flank_size*3 - (len(seq) - ptm_loc))*' ' if not enough_at_end else ''
                    #reconstruct sequence with spaces to account for missing ends
                    flanking_seq_bp = start_cushion +  seq[ptm_loc-front_length:ptm_loc+end_length] + end_cushion
            else:
                flanking_seq_bp = seq[ptm_loc-(flank_size*3):ptm_loc+(flank_size*3)+3]
            flanking_seq_aa = translate_flanking_sequence(flanking_seq_bp, flank_size = flank_size, lowercase_mod=lowercase_mod, full_flanking_seq = full_flanking_seq)
        else:
            flanking_seq_aa = np.nan
    else:
        flanking_seq_aa = np.nan
    
    return flanking_seq_aa

[docs]def extract_region_from_splicegraph(splicegraph, region_id):
    """
    Given a region id and the splicegraph from SpliceSeq, extract the chromosome, strand, and start and stop locations of that exon. Start and stop are forced to be in ascending order, which is not necessarily true from the splice graph (i.e. start > stop for negative strand exons). This is done to make the region extraction consistent with the rest of the codebase.

    Parameters
    ----------
    spliceseq : pandas.DataFrame
        SpliceSeq splicegraph dataframe, with region_id as index
    region_id : str
        Region ID to extract information from, in the format of 'GeneName_ExonNumber'

    Returns
    -------
    list
        List containing the chromosome, strand (1 for forward, -1 for negative), start, and stop locations of the region
    """
    region_info = splicegraph.loc[region_id]

    #check to see how many regions correspond to id, if multiple, default to first entry
    if isinstance(region_info, pd.DataFrame):
        region_info = region_info.iloc[0]
        print(f'Warning: {region_id} has multiple entries in splicegraph. Defaulting to first entry.')
    
    strand = project.convert_strand_symbol(region_info['Strand'])
    if strand == 1:
        return [region_info['Chromosome'], strand,region_info['Chr_Start'], region_info['Chr_Stop']]
    else:
        return [region_info['Chromosome'], strand,region_info['Chr_Stop'], region_info['Chr_Start']]

    
[docs]def get_spliceseq_event_regions(gene_name, from_exon, spliced_exons, to_exon, splicegraph):
    """
    Given all exons associated with a splicegraph event, obtain the coordinates associated with the flanking exons and the spliced region. The spliced region is defined as the exons that are associated with psi values, while flanking regions include the "from" and "to" exons that indicate the adjacent, unspliced exons.

    Parameters
    ----------
    gene_name : str
        Gene name associated with the splice event
    from_exon : int
        Exon number associated with the first flanking exon
    spliced_exons : str
        Exon numbers associated with the spliced region, separated by colons for each unique exon
    to_exon : int
        Exon number associated with the second flanking exon
    splicegraph : pandas.DataFrame
        DataFrame containing information about individual exons and their coordinates

    Returns
    -------
    tuple
        Tuple containing the genomic coordinates of the first flanking region, spliced regions, and second flanking region
    """
    first_exon_region = extract_region_from_splicegraph(splicegraph, region_id = gene_name+'_'+str(from_exon))
    spliced_regions = [extract_region_from_splicegraph(splicegraph, gene_name+'_'+exon) if '.' in exon else extract_region_from_splicegraph(splicegraph, gene_name+'_'+exon+'.0') for exon in spliced_exons.split(':')]
    second_exon_region = extract_region_from_splicegraph(splicegraph, region_id = gene_name+'_'+str(to_exon))
    return first_exon_region, spliced_regions, second_exon_region





[docs]def get_flanking_changes(ptm_coordinates, chromosome, strand, first_flank_region, spliced_region, second_flank_region, gene = None, dPSI = None, sig = None, event_id = None, flank_size = 5, coordinate_type = 'hg38', lowercase_mod = True, order_by = 'Coordinates'):
    """
    Currently has been tested with MATS splicing events.

    Given flanking and spliced regions associated with a splice event, identify PTMs that have potential to have an altered flanking sequence depending on whether spliced region is included or excluded (if PTM is close to splice boundary). For these PTMs, extract the flanking sequences associated with the inclusion and exclusion cases and translate into amino acid sequences. If the PTM is not associated with a codon that codes for the expected amino acid, the PTM will be excluded from the results. 

    Parameters
    ----------
    ptm_coordinates : pandas.DataFrame
        DataFrame containing PTM coordinate information for identify PTMs in the flanking regions
    chromosome : str
        Chromosome associated with the splice event
    strand : int
        Strand associated with the splice event (1 for forward, -1 for negative)
    first_flank_region : list
        List containing the start and stop locations of the first flanking region (first is currently defined based on location the genome not coding sequence)
    spliced_region : list
        List containing the start and stop locations of the spliced region
    second_flank_region : list
        List containing the start and stop locations of the second flanking region (second is currently defined based on location the genome not coding sequence)
    event_id : str, optional
        Event ID associated with the splice event, by default None
    flank_size : int, optional
        Number of amino acids to include flanking the PTM, by default 7
    coordinate_type : str, optional
        Coordinate system used for the regions, by default 'hg38'. Other options is hg19.
    lowercase_mod : bool, optional
        Whether to lowercase the amino acid associated with the PTM in returned flanking sequences, by default True
    order_by : str, optional
        Whether the first, spliced and second regions are defined by their genomic coordinates (first has smallest coordinate, spliced next, then second), or if they are defined by their translation (first the first when translated, etc.)
    

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the PTMs associated with the flanking regions and the amino acid sequences of the flanking regions in the inclusion and exclusion cases
    """
    strand = project.convert_strand_symbol(strand)
    #check first flank for ptms
    ptms_in_region_first_flank = project.find_ptms_in_region(ptm_coordinates, chromosome, strand, first_flank_region[0], first_flank_region[1], gene = gene, coordinate_type = coordinate_type)
    if not ptms_in_region_first_flank.empty:
        ptms_in_region_first_flank = ptms_in_region_first_flank[ptms_in_region_first_flank['Proximity to Region End (bp)'] < flank_size*3]
        ptms_in_region_first_flank['Region'] = 'First'
    #check second flank for ptms
    ptms_in_region_second_flank = project.find_ptms_in_region(ptm_coordinates, chromosome, strand, second_flank_region[0], second_flank_region[1], gene = gene, coordinate_type = coordinate_type)
    if not ptms_in_region_second_flank.empty:
        ptms_in_region_second_flank = ptms_in_region_second_flank[ptms_in_region_second_flank['Proximity to Region Start (bp)'] < flank_size*3]
        ptms_in_region_second_flank['Region'] = 'Second'

    #combine
    ptms_in_region = pd.concat([ptms_in_region_first_flank, ptms_in_region_second_flank])


    if ptms_in_region.empty:
        return pd.DataFrame()
    else:

        #add chromosome/strand info to region info for ensembl query
        first_flank_region_query = [chromosome, strand] + first_flank_region
        spliced_region_query = [chromosome, strand] + spliced_region
        second_flank_region_query = [chromosome, strand] + second_flank_region
        regions_list = [first_flank_region_query, spliced_region_query, second_flank_region_query]

        #get dna sequences associated with regions
        first_flank_seq, spliced_seq, second_flank_seq = di.get_region_sequences_from_list(regions_list, coordinate_type = coordinate_type)

        #combine sequences for inclusion and exclusion cases
        if strand == 1:
            inclusion_seq = first_flank_seq + spliced_seq + second_flank_seq
            exclusion_seq = first_flank_seq + second_flank_seq
        else:
            inclusion_seq = second_flank_seq + spliced_seq + first_flank_seq
            exclusion_seq = second_flank_seq + first_flank_seq

        #go through all ptms in region within range of splice boundary and grab flanking sequences
        translate_success_list = []
        inclusion_seq_list = []
        exclusion_seq_list = []
        flank_region_list = []
        for i, ptm in ptms_in_region.iterrows():
            ptm_loc = ptm[f'Gene Location ({coordinate_type})']
            flank_region = ptm['Region']
            flank_region_loc = ptm['Region']
            flank_region = first_flank_region if flank_region_loc == 'First' else second_flank_region
            #grab ptm loc based on which strand ptm is on
            if strand == 1:
                relative_ptm_loc = int(ptm_loc - flank_region[0])
            else:
                relative_ptm_loc = int(flank_region[1] - ptm_loc)


            #grab where ptm is located in both the inclusion and exclusion event
            inclusion_ptm_loc, exclusion_ptm_loc = get_ptm_locs_in_spliced_sequences(relative_ptm_loc, first_flank_seq, spliced_seq, second_flank_seq,strand = strand, which_flank = flank_region_loc, order_by = order_by)

            #grab codon associated with ptm in sequence 
            ptm_codon_inclusion = inclusion_seq[inclusion_ptm_loc:inclusion_ptm_loc+3]
            ptm_codon_exclusion = exclusion_seq[exclusion_ptm_loc:exclusion_ptm_loc+3]


            #check if ptm codon codes for amino acid and then extract flanking sequence
            correct_seq = False
            if ptm_codon_inclusion in codon_table.forward_table.keys() and ptm_codon_exclusion in codon_table.forward_table.keys():
                if codon_table.forward_table[ptm_codon_inclusion] == ptm['Residue'] and codon_table.forward_table[ptm_codon_exclusion] == ptm['Residue']  and exclusion_ptm_loc-(flank_size*3) >= 0 and len(exclusion_seq) >= exclusion_ptm_loc+(flank_size*3)+3:
                    inclusion_flanking_seq = inclusion_seq[inclusion_ptm_loc-(flank_size*3):inclusion_ptm_loc+(flank_size*3)+3]
                    exclusion_flanking_seq = exclusion_seq[exclusion_ptm_loc-(flank_size*3):exclusion_ptm_loc+(flank_size*3)+3]
                    correct_seq = True


            #check to make sure ptm matches expected residue
            if correct_seq:
                translate_success_list.append(True)

                #translate flanking sequences
                inclusion_aa = translate_flanking_sequence(inclusion_flanking_seq, flank_size = flank_size, lowercase_mod=lowercase_mod)
                exclusion_aa = translate_flanking_sequence(exclusion_flanking_seq, flank_size = flank_size, lowercase_mod=lowercase_mod)

                #append to lists
                inclusion_seq_list.append(inclusion_aa)
                exclusion_seq_list.append(exclusion_aa)
                flank_region_list.append(flank_region_loc)
            else:
                translate_success_list.append(False)
                inclusion_seq_list.append(np.nan)
                exclusion_seq_list.append(np.nan)
                flank_region_list.append(flank_region_loc)

        #grab useful info from ptm dataframe
        if gene is not None:
            ptms_in_region = ptms_in_region[['Source of PTM', 'Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class']].reset_index(drop = True)
        else:
            ptms_in_region = ptms_in_region[['Source of PTM', 'UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class']].reset_index(drop = True)
        #add flanking sequence information to ptm dataframe
        ptms_in_region['Inclusion Flanking Sequence'] = inclusion_seq_list
        ptms_in_region['Exclusion Flanking Sequence'] = exclusion_seq_list
        ptms_in_region['Region'] = flank_region_list
        ptms_in_region['Translation Success'] = translate_success_list

        if event_id is not None:
            ptms_in_region.insert(0, 'Event ID', event_id)
        if dPSI is not None:
            ptms_in_region['dPSI'] = dPSI
        if sig is not None:
            ptms_in_region['Significance'] = sig

        return ptms_in_region


[docs]def get_flanking_changes_from_splice_data(splice_data, ptm_coordinates = None, chromosome_col = None, strand_col = None, first_flank_start_col = None, first_flank_end_col = None, spliced_region_start_col = None, spliced_region_end_col = None, second_flank_start_col = None, second_flank_end_col = None, dPSI_col = None,  sig_col = None, event_id_col = None, gene_col = None, flank_size = 5, coordinate_type = 'hg38', lowercase_mod = True):
    """
    Given a DataFrame containing information about splice events, extract the flanking sequences associated with the PTMs in the flanking regions if there is potential for this to be altered. The DataFrame should contain columns for the chromosome, strand, start and stop locations of the first flanking region, spliced region, and second flanking region. The DataFrame should also contain a column for the event ID associated with the splice event. If the DataFrame does not contain the necessary columns, the function will raise an error.

    Parameters
    ----------
    splice_data : pandas.DataFrame
        DataFrame containing information about splice events
    ptm_coordinates : pandas.DataFrame
        DataFrame containing PTM coordinate information for identify PTMs in the flanking regions
    chromosome_col : str, optional
        Column name indicating chromosome, by default None
    strand_col : str, optional
        Column name indicating strand, by default None
    first_flank_start_col : str, optional
        Column name indicating start location of the first flanking region, by default None
    first_flank_end_col : str, optional
        Column name indicating end location of the first flanking region, by default None
    spliced_region_start_col : str, optional
        Column name indicating start location of the spliced region, by default None
    spliced_region_end_col : str, optional
        Column name indicating end location of the spliced region, by default None
    second_flank_start_col : str, optional
        Column name indicating start location of the second flanking region, by default None
    second_flank_end_col : str, optional
        Column name indicating end location of the second flanking region, by default None
    event_id_col : str, optional
        Column name indicating event ID, by default None
    flank_size : int, optional
        Number of amino acids to include flanking the PTM, by default 7
    coordinate_type : str, optional
        Coordinate system used for the regions, by default 'hg38'. Other options is hg19.
    lowercase_mod : bool, optional
        Whether to lowercase the amino acid associated with the PTM in returned flanking sequences, by default True
    
    Returns
    -------
    list
        List containing DataFrames with the PTMs associated with the flanking regions and the amino acid sequences of the flanking regions in the inclusion and exclusion cases
    """
    #load ptm data from config if not provided
    if ptm_coordinates is None and pose_config.ptm_coordinates is not None:
        ptm_coordinates = pose_config.ptm_coordinates
    elif ptm_coordinates is None:
        raise ValueError('ptm_coordinates dataframe not provided and not found in the resource files. Please provide the ptm_coordinates dataframe with config.download_ptm_coordinates() or download the file manually. To avoid needing to download this file each time, run pose_config.download_ptm_coordinates(save = True) to save the file locally within the package directory (will take ~63MB of storage space)')

    #check to make sure all required columns are provided
    if chromosome_col is None and strand_col is None and first_flank_start_col is None and first_flank_end_col is None and spliced_region_start_col is None and spliced_region_end_col is None and second_flank_start_col is None and second_flank_end_col is None:
        raise ValueError('Please provide column names for chromosome, strand, first flank start, first flank end, spliced region start, spliced region end, second flank start, and second flank end.')

    #if chromosome is labeled with 'chr', remove
    if splice_data[chromosome_col].str.contains('chr').any():
        splice_data['chr'] = splice_data['chr'].str.strip('chr')
    

    results = []
    for i, event in tqdm.tqdm(splice_data.iterrows(), total = splice_data.shape[0], desc = 'Finding flanking sequences for PTMs nearby splice junctions'):
        if event_id_col is None:
            event_id = i
        else:
            event_id = event[event_id_col]

        #get gene info
        chromosome = event[chromosome_col]
        strand = event[strand_col]
        gene = event[gene_col] if gene_col is not None else None
        dPSI = event[dPSI_col] if dPSI_col is not None else None
        sig = event[sig_col] if sig_col is not None else None

        #extract region inof
        first_flank_region = [event[first_flank_start_col], event[first_flank_end_col]]
        spliced_region = [event[spliced_region_start_col], event[spliced_region_end_col]]
        second_flank_region = [event[second_flank_start_col], event[second_flank_end_col]]

        #get flanking changes
        ptm_flanks = get_flanking_changes(ptm_coordinates, chromosome, strand, first_flank_region, spliced_region, second_flank_region, gene = gene, sig = sig, dPSI = dPSI, event_id = event_id, flank_size = flank_size, coordinate_type = coordinate_type, lowercase_mod=lowercase_mod)

        #append to results
        results.append(ptm_flanks)

    results = pd.concat(results)
    #combine and remove any failed translation attempts
    if not results.empty:
        results = results[results['Translation Success']]

        #do some quick comparison of flanking sequences
        if not results.empty:
            #find flanking sequences that have changed and only keep those
            results['Matched'] = results['Inclusion Flanking Sequence'] == results['Exclusion Flanking Sequence']
            results = results[~results['Matched']]
            results = results.drop(columns=['Matched'])
            results['Stop Codon Introduced'] = (results['Inclusion Flanking Sequence'].str.contains(r'\*')) | (results['Exclusion Flanking Sequence'].str.contains(r'\*')) 

        print(f'{results.shape[0]} PTMs found with potential for altered flanking sequences.')
    else:
        print('No PTMs found with potential for altered flanking sequences.')
    return results


[docs]def get_spliceseq_flank_loc(ptm, strand, from_region_coords, to_region_coords, coordinate_type = 'hg19'):
    """
    Given ptm information for identifying flanking sequences from splicegraph information, extract the relative location of the ptm in the flanking region (where it is located in translation of the flanking region).

    Parameters
    ----------
    ptm : pandas.Series
        Series containing PTM information
    strand : int
        Strand associated with the splice event (1 for forward, -1 for negative)
    from_region_coords : list
        List containing the chromosome, strand, start, and stop locations of the first flanking region
    to_region_coords : list
        List containing the chromosome, strand, start, and stop locations of the second flanking region
    
    Returns
    -------
    int
        Relative location of the PTM in the flanking region
    """
    if strand == 1 and ptm['Which Flank'] == 'First':
        return ptm[f'Gene Location ({coordinate_type})']  - from_region_coords[-2]
    elif strand == 1 and ptm['Which Flank'] == 'Second':
        return ptm[f'Gene Location ({coordinate_type})']  - to_region_coords[-2]
    elif strand == -1 and ptm['Which Flank'] == 'First':
        return from_region_coords[-1] - ptm[f'Gene Location ({coordinate_type})']
    else:
        return to_region_coords[-1] - ptm[f'Gene Location ({coordinate_type})']

[docs]def get_ptms_in_splicegraph_flank(gene_name, chromosome, strand, flank_region_start, flank_region_end, coordinate_type = 'hg19', which_flank = 'First', flank_size = 5):
    """

    """
    #check for ptms in first flank region
    flank_ptms = project.find_ptms_in_region(ptm_coordinates = pose_config.ptm_coordinates, chromosome = chromosome, strand = strand, start = flank_region_start, end = flank_region_end, coordinate_type = coordinate_type, gene = gene_name)
    if not flank_ptms.empty and which_flank == 'First': #if ptms found region, grab those close enough to splice boundary to have impacted flanking sequence
        flank_ptms = flank_ptms[flank_ptms['Proximity to Region End (bp)'] < flank_size*3]
        flank_ptms['Which Flank'] = 'First'
    elif not flank_ptms.empty and which_flank == 'Second': #if ptms found region, grab those close enough to splice boundary to have impacted flanking sequence
        flank_ptms = flank_ptms[flank_ptms['Proximity to Region Start (bp)'] < flank_size*3]
        flank_ptms['Which Flank'] = 'Second'
    
    return flank_ptms

def get_flank_changes_from_splicegraph_single_event(event_row, splicegraph, event_id_col = None, dPSI_col = None, sig_col = None, extra_cols = None, flank_size = 5, coordinate_type = 'hg19'):
    region_id = event_row[event_id_col] if event_id_col is not None else None
    dPSI = event_row[dPSI_col] if dPSI_col is not None else None
    sig = event_row[sig_col] if sig_col is not None else None

    #get region info
    from_region_coords, spliced_region_coords, to_region_coords = get_spliceseq_event_regions(gene_name = event_row['symbol'], from_exon = event_row['from_exon'], spliced_exons = event_row['exons'], to_exon = event_row['to_exon'], splicegraph = splicegraph)
    chromosome = from_region_coords[0]
    strand = from_region_coords[1]

    from_flank_ptms = get_ptms_in_splicegraph_flank(event_row['symbol'], chromosome, strand, from_region_coords[-2], from_region_coords[-1], coordinate_type = coordinate_type, which_flank = 'First', flank_size = flank_size)
    to_flank_ptms = get_ptms_in_splicegraph_flank(event_row['symbol'], chromosome, strand, to_region_coords[-2], to_region_coords[-1], coordinate_type = coordinate_type, which_flank = 'Second', flank_size = flank_size)
    ptms_of_interest = pd.concat([from_flank_ptms, to_flank_ptms]).reset_index()


    #if any ptms found for event that could have altered flanking sequences extract sequence information
    if not ptms_of_interest.empty:
        #add additional context from splice data, if indicated
        if event_id_col is not None:
            ptms_of_interest['Region ID'] = region_id
            
        if dPSI_col is not None:
            ptms_of_interest['dPSI'] = dPSI
        
        if sig_col is not None:
            ptms_of_interest['Significance'] = sig
        
        if extra_cols is not None:
            for col in extra_cols:
                ptms_of_interest[col] = event_row[col]


        region_list = [from_region_coords] + spliced_region_coords + [to_region_coords]
        seqs = di.get_region_sequences_from_list(region_list, coordinate_type = 'hg19')
        from_sequence = seqs[0]
        to_sequence = seqs[-1] 
        spliced_sequence = ''.join(seqs[1:-1]) #combine all sequences from spliced region (may be multiple exons)

        inclusion_sequence = seqs[0] + ''.join(seqs[1:-1]) + seqs[-1] #combine sequences if spliced region is included
        exclusion_sequence = seqs[0] + seqs[-1] #combine sequences if spliced region is excluded

        #initialize columns for flanking sequences
        ptms_of_interest['Inclusion Flanking Sequence'] = ''
        ptms_of_interest['Exclusion Flanking Sequence'] = ''
        for i, ptm in ptms_of_interest.iterrows():
            ptm_loc_in_flank = get_spliceseq_flank_loc(ptm, strand, from_region_coords, to_region_coords)
            #grab where ptm is located in both the inclusion and exclusion event
            inclusion_ptm_loc, exclusion_ptm_loc = get_ptm_locs_in_spliced_sequences(ptm_loc_in_flank, from_sequence, spliced_sequence, to_sequence, strand = strand, which_flank = ptm['Which Flank'], order_by = 'Translation')

            #extract expected flanking sequence based on location in sequence
            inclusion_flank = get_flanking_sequence(inclusion_ptm_loc, inclusion_sequence, ptm_residue = ptm['Residue'], flank_size = flank_size, full_flanking_seq = False)
            exclusion_flank = get_flanking_sequence(exclusion_ptm_loc, exclusion_sequence, ptm_residue = ptm['Residue'], flank_size = flank_size, full_flanking_seq = False)

            #add to dataframe
            ptms_of_interest.loc[i, 'Inclusion Flanking Sequence'] = inclusion_flank
            ptms_of_interest.loc[i, 'Exclusion Flanking Sequence'] = exclusion_flank

        #trim the expected flanking sequence
        #ptms_of_interest['Expected Flanking Sequence'] = ptms_of_interest['Expected Flanking Sequence'].apply(lambda x: x[int((len(x)-1)/2-flank_size):int((len(x)-1)/2+flank_size+1)] if x == x else np.nan)
        #find flanking sequences that have changed and only keep those
        ptms_of_interest['Matched'] = ptms_of_interest['Inclusion Flanking Sequence'] == ptms_of_interest['Exclusion Flanking Sequence']
        ptms_of_interest = ptms_of_interest[~ptms_of_interest['Matched']]
        ptms_of_interest = ptms_of_interest.drop(columns=['Matched'])
        ptms_of_interest['Stop Codon Introduced'] = (ptms_of_interest['Inclusion Flanking Sequence'].str.contains(r'\*')) | (ptms_of_interest['Exclusion Flanking Sequence'].str.contains(r'\*')) 

    
    return ptms_of_interest

[docs]def get_flanking_changes_from_splicegraph(psi_data, splicegraph, ptm_coordinates = None, dPSI_col = None, sig_col = None, event_id_col = None, extra_cols = None, gene_col = 'symbol', flank_size = 5, coordinate_type = 'hg19'):
    """
    Given a DataFrame containing information about splice events  obtained from SpliceSeq and the corresponding splicegraph, extract the flanking sequences of PTMs that are nearby the splice boundary (potential for flanking sequence to be altered). Coordinate information of individual exons should be found in splicegraph. You can also provide columns with specific psi or significance information. Extra cols not in these categories can be provided with extra_cols parameter.

    Parameters
    ----------
    psi_data : pandas.DataFrame
        DataFrame containing information about splice events obtained from SpliceSeq
    splicegraph : pandas.DataFrame
        DataFrame containing information about individual exons and their coordinates
    ptm_coordinates : pandas.DataFrame
        DataFrame containing PTM coordinate information for identify PTMs in the flanking regions
    dPSI_col : str, optional
        Column name indicating delta PSI value, by default None
    sig_col : str, optional
        Column name indicating significance of the event, by default None
    event_id_col : str, optional
        Column name indicating event ID, by default None
    extra_cols : list, optional
        List of column names for additional information to add to the results, by default None
    gene_col : str, optional
        Column name indicating gene symbol of spliced gene, by default 'symbol'
    flank_size : int, optional
        Number of amino acids to include flanking the PTM, by default 5
    coordinate_type : str, optional
        Coordinate system used for the regions, by default 'hg19'. Other options is hg38.

    Returns
    -------
    altered_flanks : pandas.DataFrame
        DataFrame containing the PTMs associated with the flanking regions that are altered, and the flanking sequences that arise depending on whether the flanking sequence is included or not
    """
    #load ptm data from config if not provided
    if ptm_coordinates is None and pose_config.ptm_coordinates is not None:
        ptm_coordinates = pose_config.ptm_coordinates.copy()
    elif ptm_coordinates is None:
        raise ValueError('ptm_coordinates dataframe not provided and not found in the resource files. Please provide the ptm_coordinates dataframe with config.download_ptm_coordinates() or download the file manually. To avoid needing to download this file each time, run pose_config.download_ptm_coordinates(save = True) to save the file locally within the package directory (will take ~63MB of storage space)')

    #load spliceseq
    splicegraph['Region ID'] = splicegraph['Symbol'] + '_' + splicegraph['Exon'].astype(str)
    splicegraph.index = splicegraph['Region ID'].values

    data_for_flanks = psi_data.drop_duplicates().copy() 

    #extract relevant columns
    relevant_columns = ['as_id', 'splice_type', 'symbol', 'from_exon', 'exons', 'to_exon']
    if event_id_col is not None:
        relevant_columns.append(event_id_col)
    if dPSI_col is not None:
        relevant_columns.append(dPSI_col)
    if sig_col is not None:
        relevant_columns.append(sig_col)
    if extra_cols is not None:
        relevant_columns.extend(extra_cols)

    data_for_flanks = data_for_flanks[relevant_columns].drop_duplicates()
    data_for_flanks = data_for_flanks.dropna(subset = ['from_exon', 'to_exon'])
    data_for_flanks['from_region_id'] = data_for_flanks[gene_col]+'_'+data_for_flanks['from_exon'].astype(str)
    data_for_flanks['to_region_id'] = data_for_flanks['symbol']+'_'+data_for_flanks['to_exon'].astype(str)

    #get coordinates for the different regions
    altered_flanks = []
    for i, row in tqdm.tqdm(data_for_flanks.iterrows(), total = data_for_flanks.shape[0], desc = 'Finding flanking changes for splicegraph events'):
        single_event_altered_flanks = get_flank_changes_from_splicegraph_single_event(row, splicegraph, event_id_col = event_id_col, dPSI_col = dPSI_col, sig_col = sig_col, extra_cols = extra_cols, flank_size = flank_size, coordinate_type = coordinate_type)

        altered_flanks.append(single_event_altered_flanks)

    altered_flanks = pd.concat(altered_flanks)    
    return altered_flanks