Source code for ptm_pose.splicing_tools.SpliceSeq

from ptm_pose import helpers, project, flanking_sequences, nease_runner, pose_config
from ptm_pose import database_interfacing as di
from ptm_pose.splicing_tools.base import GenericDataset
import pandas as pd
from tqdm import tqdm


def extract_region_from_splicegraph(splicegraph, region_id):
    """
    Given a region id and the splicegraph from SpliceSeq, extract the chromosome, strand, and start and stop locations of that exon. Start and stop are forced to be in ascending order, which is not necessarily true from the splice graph (i.e. start > stop for negative strand exons). This is done to make the region extraction consistent with the rest of the codebase.

    Parameters
    ----------
    spliceseq : pandas.DataFrame
        SpliceSeq splicegraph dataframe, with region_id as index
    region_id : str
        Region ID to extract information from, in the format of 'GeneName_ExonNumber'

    Returns
    -------
    list
        List containing the chromosome, strand (1 for forward, -1 for negative), start, and stop locations of the region
    """
    region_info = splicegraph.loc[region_id]

    #check to see how many regions correspond to id, if multiple, default to first entry
    if isinstance(region_info, pd.DataFrame):
        region_info = region_info.iloc[0]
        print(f'Warning: {region_id} has multiple entries in splicegraph. Defaulting to first entry.')
    
    strand = project.convert_strand_symbol(region_info['Strand'])
    if strand == 1:
        return [region_info['Chromosome'], strand,region_info['Chr_Start'], region_info['Chr_Stop']]
    else:
        return [region_info['Chromosome'], strand,region_info['Chr_Stop'], region_info['Chr_Start']]

    
def get_spliceseq_event_regions(gene_name, from_exon, spliced_exons, to_exon, splicegraph):
    """
    Given all exons associated with a splicegraph event, obtain the coordinates associated with the flanking exons and the spliced region. The spliced region is defined as the exons that are associated with psi values, while flanking regions include the "from" and "to" exons that indicate the adjacent, unspliced exons.

    Parameters
    ----------
    gene_name : str
        Gene name associated with the splice event
    from_exon : int
        Exon number associated with the first flanking exon
    spliced_exons : str
        Exon numbers associated with the spliced region, separated by colons for each unique exon
    to_exon : int
        Exon number associated with the second flanking exon
    splicegraph : pandas.DataFrame
        DataFrame containing information about individual exons and their coordinates

    Returns
    -------
    tuple
        Tuple containing the genomic coordinates of the first flanking region, spliced regions, and second flanking region
    """
    first_exon_region = extract_region_from_splicegraph(splicegraph, region_id = gene_name+'_'+str(from_exon))
    spliced_regions = [extract_region_from_splicegraph(splicegraph, gene_name+'_'+exon) if '.' in exon else extract_region_from_splicegraph(splicegraph, gene_name+'_'+exon+'.0') for exon in spliced_exons.split(':')]
    second_exon_region = extract_region_from_splicegraph(splicegraph, region_id = gene_name+'_'+str(to_exon))
    return first_exon_region, spliced_regions, second_exon_region


def get_spliceseq_flank_loc(ptm, strand, from_region_coords, to_region_coords, coordinate_type = 'hg19'):
    """
    Given ptm information for identifying flanking sequences from splicegraph information, extract the relative location of the ptm in the flanking region (where it is located in translation of the flanking region).

    Parameters
    ----------
    ptm : pandas.Series
        Series containing PTM information
    strand : int
        Strand associated with the splice event (1 for forward, -1 for negative)
    from_region_coords : list
        List containing the chromosome, strand, start, and stop locations of the first flanking region
    to_region_coords : list
        List containing the chromosome, strand, start, and stop locations of the second flanking region
    
    Returns
    -------
    int
        Relative location of the PTM in the flanking region
    """
    if strand == 1 and ptm['Which Flank'] == 'First':
        return ptm[f'Gene Location ({coordinate_type})']  - from_region_coords[-2]
    elif strand == 1 and ptm['Which Flank'] == 'Second':
        return ptm[f'Gene Location ({coordinate_type})']  - to_region_coords[-2]
    elif strand == -1 and ptm['Which Flank'] == 'First':
        return from_region_coords[-1] - ptm[f'Gene Location ({coordinate_type})']
    else:
        return to_region_coords[-1] - ptm[f'Gene Location ({coordinate_type})']

def get_ptms_in_splicegraph_flank(gene_name, chromosome, strand, flank_region_start, flank_region_end, coordinate_type = 'hg19', which_flank = 'First', flank_size = 5):
    """

    """
    #check for ptms in first flank region
    flank_ptms = project.find_ptms_in_region(ptm_coordinates = pose_config.ptm_coordinates, chromosome = chromosome, strand = strand, start = flank_region_start, end = flank_region_end, coordinate_type = coordinate_type, gene = gene_name)
    if not flank_ptms.empty and which_flank == 'First': #if ptms found region, grab those close enough to splice boundary to have impacted flanking sequence
        flank_ptms = flank_ptms[flank_ptms['Proximity to Region End (bp)'] < flank_size*3]
        flank_ptms['Which Flank'] = 'First'
    elif not flank_ptms.empty and which_flank == 'Second': #if ptms found region, grab those close enough to splice boundary to have impacted flanking sequence
        flank_ptms = flank_ptms[flank_ptms['Proximity to Region Start (bp)'] < flank_size*3]
        flank_ptms['Which Flank'] = 'Second'
    
    return flank_ptms






[docs]
class SpliceSeq_Dataset(GenericDataset):
    def __init__(self, data, splicegraph, min_dpsi = 0, alpha = 0.05, dpsi_col = None, sig_col = None, coordinate_type = 'hg38'):
        print('Removing ME events from analysis')
        data = data.copy()
        data = data[data['splice_type'] != 'ME'].copy()

        #remove redundant columns, if any, that will be added from splicegraph
        overlapping_columns = set(data.columns).intersection({'Chromosome', 'Strand', 'Chr_Start', 'Chr_Stop'})
        if len(overlapping_columns) > 0:
            #drop columns that will be added from splicegraph
            data = data.drop(columns=overlapping_columns)

        #split exons into individual exons
        data['Individual exon'] = data['exons'].apply(lambda x: x.split(':'))
        data = data.explode('Individual exon').drop_duplicates()
        data['Individual exon'] = data['Individual exon'].astype(float)

        #add gene location information to psi data from spliceseq
        splicegraph['Region ID'] = splicegraph['Symbol'] + '_' + splicegraph['Exon'].astype(str)
        data = data.merge(splicegraph.copy(), left_on = ['symbol', 'Individual exon'], right_on = ['Symbol', 'Exon'], how = 'left')
        data = data.rename(columns = {'Chr_Start': 'event_start', 'Chr_Stop': 'event_end'})

        #initialize base class with spliceseq information
        super().__init__(splice_data=data, min_dpsi=min_dpsi, alpha=alpha, dpsi_col=dpsi_col, sig_col=sig_col, coordinate_type=coordinate_type, chromosome_col = 'Chromosome', strand_col = 'Strand', event_id_col = 'Region ID', gene_col = 'symbol', start_coordinate_system = '1-based')

        #save spliceseq specific data
        self.splicegraph = splicegraph
        self.original_data = data

    def get_flank_changes_from_splicegraph_single_event(self, event_row, extra_cols = None, flank_size = 5, coordinate_type = 'hg19'):
        region_id = event_row[self.event_id_col] if self.event_id_col is not None else None
        dPSI = event_row[self.dpsi_col] if self.dpsi_col is not None else None
        sig = event_row[self.sig_col] if self.sig_col is not None else None

        #get region info
        from_region_coords, spliced_region_coords, to_region_coords = get_spliceseq_event_regions(gene_name = event_row['symbol'], from_exon = event_row['from_exon'], spliced_exons = event_row['exons'], to_exon = event_row['to_exon'], splicegraph = self.splicegraph)
        chromosome = from_region_coords[0]
        strand = from_region_coords[1]

        from_flank_ptms = get_ptms_in_splicegraph_flank(event_row['symbol'], chromosome, strand, from_region_coords[-2], from_region_coords[-1], coordinate_type = coordinate_type, which_flank = 'First', flank_size = flank_size)
        to_flank_ptms = get_ptms_in_splicegraph_flank(event_row['symbol'], chromosome, strand, to_region_coords[-2], to_region_coords[-1], coordinate_type = coordinate_type, which_flank = 'Second', flank_size = flank_size)
        ptms_of_interest = pd.concat([from_flank_ptms, to_flank_ptms]).reset_index()


        #if any ptms found for event that could have altered flanking sequences extract sequence information
        if not ptms_of_interest.empty:
            #add additional context from splice data, if indicated
            if self.event_id_col is not None:
                ptms_of_interest['Region ID'] = region_id
                
            if self.dpsi_col is not None:
                ptms_of_interest['dPSI'] = dPSI
            
            if self.sig_col is not None:
                ptms_of_interest['Significance'] = sig
            
            if extra_cols is not None:
                for col in extra_cols:
                    ptms_of_interest[col] = event_row[col]


            region_list = [from_region_coords] + spliced_region_coords + [to_region_coords]
            seqs = di.get_region_sequences_from_list(region_list, coordinate_type = 'hg19')
            from_sequence = seqs[0]
            to_sequence = seqs[-1] 
            spliced_sequence = ''.join(seqs[1:-1]) #combine all sequences from spliced region (may be multiple exons)

            inclusion_sequence = seqs[0] + ''.join(seqs[1:-1]) + seqs[-1] #combine sequences if spliced region is included
            exclusion_sequence = seqs[0] + seqs[-1] #combine sequences if spliced region is excluded

            #initialize columns for flanking sequences
            ptms_of_interest['Inclusion Flanking Sequence'] = ''
            ptms_of_interest['Exclusion Flanking Sequence'] = ''
            for i, ptm in ptms_of_interest.iterrows():
                ptm_loc_in_flank = get_spliceseq_flank_loc(ptm, strand, from_region_coords, to_region_coords)
                #grab where ptm is located in both the inclusion and exclusion event
                inclusion_ptm_loc, exclusion_ptm_loc = flanking_sequences.get_ptm_locs_in_spliced_sequences(ptm_loc_in_flank, from_sequence, spliced_sequence, to_sequence, strand = strand, which_flank = ptm['Which Flank'], order_by = 'Translation')

                #extract expected flanking sequence based on location in sequence
                inclusion_flank = flanking_sequences.get_flanking_sequence(inclusion_ptm_loc, inclusion_sequence, ptm_residue = ptm['Residue'], flank_size = flank_size, full_flanking_seq = False)
                exclusion_flank = flanking_sequences.get_flanking_sequence(exclusion_ptm_loc, exclusion_sequence, ptm_residue = ptm['Residue'], flank_size = flank_size, full_flanking_seq = False)

                #add to dataframe
                ptms_of_interest.loc[i, 'Inclusion Flanking Sequence'] = inclusion_flank
                ptms_of_interest.loc[i, 'Exclusion Flanking Sequence'] = exclusion_flank

            #trim the expected flanking sequence
            #ptms_of_interest['Expected Flanking Sequence'] = ptms_of_interest['Expected Flanking Sequence'].apply(lambda x: x[int((len(x)-1)/2-flank_size):int((len(x)-1)/2+flank_size+1)] if x == x else np.nan)
            #find flanking sequences that have changed and only keep those
            ptms_of_interest['Matched'] = ptms_of_interest['Inclusion Flanking Sequence'] == ptms_of_interest['Exclusion Flanking Sequence']
            ptms_of_interest = ptms_of_interest[~ptms_of_interest['Matched']]
            ptms_of_interest = ptms_of_interest.drop(columns=['Matched'])
            ptms_of_interest['Stop Codon Introduced'] = (ptms_of_interest['Inclusion Flanking Sequence'].str.contains(r'\*')) | (ptms_of_interest['Exclusion Flanking Sequence'].str.contains(r'\*')) 

        
        return ptms_of_interest


[docs]
    def get_flanking_changes_from_splicegraph(self, extra_cols = None, flank_size = 5, **kwargs):
        """
        Given a DataFrame containing information about splice events  obtained from SpliceSeq and the corresponding splicegraph, extract the flanking sequences of PTMs that are nearby the splice boundary (potential for flanking sequence to be altered). Coordinate information of individual exons should be found in splicegraph. You can also provide columns with specific psi or significance information. Extra cols not in these categories can be provided with extra_cols parameter.

        Parameters
        ----------
        psi_data : pandas.DataFrame
            DataFrame containing information about splice events obtained from SpliceSeq
        splicegraph : pandas.DataFrame
            DataFrame containing information about individual exons and their coordinates
        ptm_coordinates : pandas.DataFrame
            DataFrame containing PTM coordinate information for identify PTMs in the flanking regions
        dPSI_col : str, optional
            Column name indicating delta PSI value, by default None
        sig_col : str, optional
            Column name indicating significance of the event, by default None
        event_id_col : str, optional
            Column name indicating event ID, by default None
        extra_cols : list, optional
            List of column names for additional information to add to the results, by default None
        gene_col : str, optional
            Column name indicating gene symbol of spliced gene, by default 'symbol'
        flank_size : int, optional
            Number of amino acids to include flanking the PTM, by default 5
        coordinate_type : str, optional
            Coordinate system used for the regions, by default 'hg19'. Other options is hg38.
        **kwargs: additional keyword arguments
            Additional keyword arguments, which will be fed into the `filter_ptms()` function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the `filter_ptms()` function for more options.

        Returns
        -------
        altered_flanks : pandas.DataFrame
            DataFrame containing the PTMs associated with the flanking regions that are altered, and the flanking sequences that arise depending on whether the flanking sequence is included or not
        """
        #check for any keyword arguments to use for filtering
        if kwargs:
            filter_arguments = helpers.extract_filter_arguments(**kwargs)
            #check any excess unused keyword arguments, report them
            helpers.check_filter_kwargs(**filter_arguments)
            #filter ptm coordinates file to include only ptms with desired evidence
            ptm_coordinates = helpers.filter_ptms_by_evidence(pose_config.ptm_coordinates, **filter_arguments)
        else:
            ptm_coordinates = pose_config.ptm_coordinates.copy()

        #load spliceseq
        self.splicegraph.index = self.splicegraph['Region ID'].values

        data_for_flanks = self.original_data.drop_duplicates().copy() 

        #extract relevant columns
        relevant_columns = ['as_id', 'splice_type', 'symbol', 'from_exon', 'exons', 'to_exon']
        if self.event_id_col is not None:
            relevant_columns.append(self.event_id_col)
        if self.dpsi_col is not None:
            relevant_columns.append(self.dpsi_col)
        if self.sig_col is not None:
            relevant_columns.append(self.sig_col)
        if extra_cols is not None:
            relevant_columns.extend(extra_cols)
        
        #get region ids for flanking regions to grab from splicegraph
        data_for_flanks = data_for_flanks[relevant_columns].drop_duplicates()
        data_for_flanks = data_for_flanks.dropna(subset = ['from_exon', 'to_exon'])
        data_for_flanks['from_region_id'] = data_for_flanks[self.gene_col]+'_'+data_for_flanks['from_exon'].astype(str)
        data_for_flanks['to_region_id'] = data_for_flanks['symbol']+'_'+data_for_flanks['to_exon'].astype(str)

        #get coordinates for the different regions
        altered_flanks = []
        for i, row in tqdm(data_for_flanks.iterrows(), total = data_for_flanks.shape[0], desc = 'Finding flanking changes for splicegraph events'):
            single_event_altered_flanks = self.get_flank_changes_from_splicegraph_single_event(row,  extra_cols = extra_cols, flank_size = flank_size)

            altered_flanks.append(single_event_altered_flanks)

        altered_flanks = pd.concat(altered_flanks)    
        self.altered_flanks = altered_flanks


    def run_pose(self, identify_altered_flanks = True, extra_cols = None, flank_size = 5, PROCESSES = 1, **kwargs):
        #check for any keyword arguments to use for filtering
        self.project_ptms_generic(extra_cols = extra_cols, PROCESSES = PROCESSES, **kwargs)
        if identify_altered_flanks:
            self.get_flanking_changes_from_splicegraph(extra_cols = extra_cols, flank_size = flank_size,**kwargs)

    def run_nease(self):
        self.run_nease_generic()