Source code for kstar.mapping


import pandas as pd
from collections import defaultdict
import re
import os
import numpy as np
from kstar import config, helpers



[docs]class ExperimentMapper:
    """
    Given an experiment object and reference sequences, map the phosphorylation sites to the common reference.
    Inputs

    Parameters
    ----------
    name : str
        Name of experiment. Used for logging
    experiment: pandas dataframe
        Pandas dataframe of an experiment that has a reference accession, a peptide column and/or a site column. 
        The peptide column should be upper case, with lower case indicating the site of phosphorylation - this is preferred
        The site column should be in the format S/T/Y<pos>, e.g. Y15 or S345
    columns: dict
        Dictionary with mappings of the experiment dataframe column names for the required names 'accession_id', 'peptide', or 'site'. 
        One of 'peptide' or 'site' is required. 
    logger: Logger object
        used for logging when peptides cannot be matched and when a site location changes
    sequences: dict
        Dictionary of sequences. Key : accession. Value : protein sequence. 
        Default is imported from kstar.config
    compendia: pd.DataFrame
        Human phosphoproteome compendia, mapped to KinPred and annotated with number of compendia. 
        Default is imported from kstar.config
    window : int
        The length of amino acids to the N- and C-terminal sides of the central phosphoprotein to map a site to.
        Default is 7.
    data_columns: list, or empty
        The list of data columns to use. If this is empty, logger will look for anything that starts with statement data: and those values
        Default is None.

    
    Attributes
    ----------
    experiment: pandas dataframe
        mapped experiment, which for each peptide, no contains the mapped accession, site, peptide, number of compendia, compendia type
    sequences: dict
        Dictionary of sequences passed into the class
    compendia: pandas dataframe
        compendia dataframe passed into the class
    data_columns: list
        indicates which columns will be used as data
    

    """ 
    #for documentation purposes convert non required parameters to kwargs
    #def __init__(self, experiment, columns, logger, *kwargs): 
    def __init__(self, experiment, columns, logger, sequences=config.HUMAN_REF_SEQUENCES, compendia=config.HUMAN_REF_COMPENDIA, window = 7, data_columns = None): 
        self.experiment = experiment
        self.sequences = sequences
        self.compendia = compendia
        def set_accession_id(accession):
            acc = accession.split('-')
            if len(acc) > 1:
                acc = acc[:-1]
            return '-'.join(acc)

        if 'accession_id' not in columns.keys():
            raise ValueError('ExperimentMapper requires accession_id as a dictionary key')
        else:
            self.experiment[config.KSTAR_ACCESSION] = self.experiment[columns['accession_id']].apply(set_accession_id) 

        if 'peptide' not in columns.keys() and 'site' not in columns.keys():
            raise ValueError('ExperimentMapper requires either site or peptide as keys in dictionary')

        self.experiment[config.KSTAR_PEPTIDE] = self.experiment[columns['peptide']] if 'peptide' in columns.keys() else None
        self.experiment[config.KSTAR_SITE] = self.experiment[columns['site']] if 'site' in columns.keys() else None

        self.logger = logger
        self.set_data_columns(data_columns)
        self.align_sites(window)

        compendia = self.compendia[[config.KSTAR_ACCESSION, config.KSTAR_SITE, 'KSTAR_NUM_COMPENDIA', 'KSTAR_NUM_COMPENDIA_CLASS']]
        self.experiment = pd.merge(self.experiment, compendia, how = 'inner', on = [config.KSTAR_ACCESSION, config.KSTAR_SITE] )
        #after merge, NUM_COMPENDIA/CLASS as become a float, likely due to mismatches, so assume those evidences are 0
        self.experiment['KSTAR_NUM_COMPENDIA'] = self.experiment['KSTAR_NUM_COMPENDIA'].fillna(0.0).astype(int)
        self.experiment['KSTAR_NUM_COMPENDIA_CLASS'] = self.experiment['KSTAR_NUM_COMPENDIA_CLASS'].fillna(0.0).astype(int)

[docs]    def set_data_columns(self, data_columns):
        """
        Identifies which columns in the experiment should be used as data columns. If data_columns is provided,
        then 'data:' is added to the front and experiment dataframe is renamed. Otherwise, function will look for columns
        with 'data:' in front and this to the data_columns attribute.
        """
        self.data_columns = []
        if data_columns is None:
            for col in self.experiment.columns:
                if col.startswith('data:'):
                    self.data_columns.append(col)
        else:
            data_rename = defaultdict()
            for col in data_columns:
                if not col.startswith('data:'):
                    data_rename[col] = f"data:{col}"
                else: 
                    data_rename[col] = col
            self.experiment.rename(columns = data_rename, inplace = True)
            self.data_columns = list(data_rename.values())

[docs]    def get_experiment(self):
        """
        Return the mapped experiment dataframe
        """
        return self.experiment


[docs]    def get_sequence(self, accession):
        """
        Gets the sequence that matches the given accession
        """
        if accession in self.sequences.keys():
            return self.sequences[accession]
        return None

    
[docs]    def align_sites(self, window = 7):
        """
        Map the peptide/sites to the common sequence reference and remove and report errors for sites that do not align as expected.
        expMapper.align_sites(window=7). Operates on the experiment dataframe of class.

        Parameters
        ----------
        window: int
            The length of amino acids to the N- and C-terminal sides of the central phosphoprotein to map a site to.

        """

        self.experiment = expand_peptide(self.experiment, config.KSTAR_PEPTIDE)

        for index, row in self.experiment.iterrows():
            sequence = self.get_sequence(row[config.KSTAR_ACCESSION])
            if sequence is not None:
                # If peptide provided then find site
                if row[config.KSTAR_PEPTIDE] is not None:
                    
                    site =  peptide_site_number(peptide = row[config.KSTAR_PEPTIDE], 
                                                site = row[config.KSTAR_SITE],
                                                sequence = sequence)
                    if site is None:
                        self.logger.warning(f"SITE NOT FOUND : {row[config.KSTAR_ACCESSION]}\t{row[config.KSTAR_PEPTIDE]}")
                        self.experiment.loc[index, config.KSTAR_SITE] = None
                        self.experiment.loc[index, config.KSTAR_PEPTIDE] = None
                        continue
                    elif site != row[config.KSTAR_SITE]:
                        self.logger.info(f"SITE CHANGED : {row[config.KSTAR_ACCESSION]} {row[config.KSTAR_SITE]} -> {site}")
                        self.experiment.loc[index, config.KSTAR_SITE] = site
                    
                    peptide = get_aligned_peptide(site = site,
                                                  sequence = sequence, 
                                                  window = window)
                    if peptide is not None:
                        self.experiment.loc[index, config.KSTAR_PEPTIDE] = peptide
                        
                # Peptide not provided but site is provided - build peptide
                elif row[config.KSTAR_SITE] is not None:
                    peptide = get_aligned_peptide(site = row[config.KSTAR_SITE],
                                                  sequence = sequence, 
                                                  window = window)
                    if peptide is not None:
                        self.experiment.loc[index, config.KSTAR_PEPTIDE] = peptide
                    else:
                        self.logger.warning(f"PEPTIDE MISMATCH : {row[config.KSTAR_ACCESSION]} {row[config.KSTAR_SITE]}")
                        self.experiment.loc[index, config.KSTAR_SITE] = None
                        self.experiment.loc[index, config.KSTAR_PEPTIDE] = None
                else:
                    self.logger.warning(f"SITE NOT FOUND : {self.experiment[config.KSTAR_ACCESSION]}\t{self.experiment[config.KSTAR_SITE]}")
            
            # Sequence not found in compendia
            else:
                self.logger.warning(f"SEQUENCE NOT FOUND : {row[config.KSTAR_ACCESSION]}")
                self.experiment.loc[index, config.KSTAR_SITE] = None
                self.experiment.loc[index, config.KSTAR_PEPTIDE] = None
        self.experiment.dropna(axis = 'rows', subset = [config.KSTAR_ACCESSION, config.KSTAR_SITE, config.KSTAR_PEPTIDE], inplace = True)
        self.experiment.drop_duplicates(inplace=True)


def expand_peptide(df, peptide_column):
    """
    Expands a dataframe based on the peptide column such that each 
    peptide contains one modification type. 
    Example
        Accession  Peptide  Info
        P0001      aBCDeF   Good
        P0002      TUvWXy   Bad

        Becomes
        Accession  Peptide  Info
        P0001      aBCDEF   Good
        P0001      ABCDeF   Good
        P0002      TUvWXY   Bad
        P0002      TUVWXy   Bad
    
    Parameters
    --------
    df : pandas df
        Experiment pandas dataframe
    peptide_column: str
        peptide column to expand
    
    Returns
    ---------
    df : pandas DataFrame
        expanded dataframe

    """
    df_list = []
    for index, row in df.iterrows():
        if row[peptide_column] is not None:
            mods = find_modified_sites(row[peptide_column])
            for mod in mods:
                tmp = row.copy()
                tmp[peptide_column] = list(tmp[peptide_column].upper())
                tmp[peptide_column][mod[1]] = tmp[peptide_column][mod[1]].lower()
                tmp[peptide_column] = ''.join(tmp[peptide_column])
                df_list.append(tmp)
        else:
            df_list.append(row)
    df = pd.DataFrame(df_list).reset_index(drop=True)
    return df
        
def find_modified_sites(peptide, modification_types = None):
        """
        Finds all modification sites, indicated by a lower letter in the peptide column 
        and returns as a list of lists 
        Parameters
        ----------
        row : pandas row
        peptide_column : column that contains peptide with mod site
            mod site identified by being lower case

        Returns : [[modtype, relative_location]]
        1st element is the modification type
        2nd element is the relative modification location
        """

        if modification_types is not None:
            mod_types = ''.join(modification_types)
            mod_types = '[' + mod_types + ']'
            return [[m.group() , m.start()] for m in re.finditer(mod_types, peptide)]
        return [[m.group() , m.start()] for m in re.finditer('[a-z]', peptide)]

def find_peptide_locations(peptide, sequence):
    """
    Finds all start locations of a peptide in a given sequence. 
    Sequence starts at location 1.
    Example: Sequence: ABCDEFGABCDHIJK, Peptide: BCD
    would return [2, 9], as peptide is found at location 2 and 9
    """
    return [m.start() + 1 for m in re.finditer(peptide, sequence)]

def peptide_site_number(peptide, sequence, site = None, modification_types = None):
    """
    Finds the site number by finding the modification site in sequence and the peptide location
    in the sequence. A modification site is defined as where in a peptide a lower case letter appears.
    If a site is provided with alphabetical character the alphabetical character is
    the modification type checked for in the peptide.
    If a site contains numbers the closest match of the petide in the sequence to the site number is used as the site
    If no modification sites are found or peptide is not found in sequence None is returned
    
    Example: sequence is ZZZZABCDEFGZZZABCDEFGHIZZ, peptide is AbCdEfG
    If no site is provided and no modification types are provided then B6 is returned
    If no site is provided and modification_types is ['d','f'] or 'df' then D8 is returned
    If site is 30 and modification_types is ['d','f'] then F20 is returned
    If site is D23 then D18 is returned, modification_type is ignored
    If site is X50 then X50 is returned as modification_type x is not found in peptide
    Parameters
    ----------

    peptide :str
        peptide string where modification site is first lower-case letter found
    sequence : str
        sequence string where all characters are uppercase
    site : str
        site number to check against. can either have letters or just numbers
    modification_types: list
        list of modification types to check against while checking peptide
        ex: ['s', 't', 'y'] would ingore all modifications except for s,t,y
        not used if site contains alphabetical characters
    
    Returns
    ---------
    site : str
        site of peptide in sequence dataframe in format S123
        closest to original site if provided, otherwise first location found
        if no modification sites are found, the sequence cannot be found, or the peptide
            cannot be found in the sequence then the original site is returned
    """
    if pd.isnull(peptide):
        return None

    peptide = ''.join(filter(str.isalpha, peptide)) # keep only alphabetical letters
    all_mod_sites = find_modified_sites(peptide, None) # get all mod sites
        
    if isinstance(site, str):
        modification_types = site[0].lower()
    
    mod_sites = []
    if modification_types is not None:
        for mod_site in all_mod_sites:
            if mod_site[0] in modification_types:
                mod_sites.append(mod_site)
    else:
        mod_sites = all_mod_sites
    
    # if no mod sites are found return None
    if len(mod_sites) == 0:
        return None

    peptide_locations = find_peptide_locations(peptide.upper(), sequence) 


    relative_location = mod_sites[0][1] # only looking at first mod site location
    site_type = mod_sites[0][0].upper() # first mod type (STY)
    
    # get all locations where peptide appears in sequence
    peptide_locations = find_peptide_locations(peptide.upper(), sequence) 
    if len(peptide_locations)>0:
        if site is not None:
            # get closest peptide location to provided site
            site_num = int(re.sub("[^0-9]", "", str(site))) #remove all letters and make int
            closeness = np.inf
            closest_site = np.inf
            for loc in peptide_locations:
                true_location = loc + relative_location
                if abs(true_location - site_num) < closeness:
                    closeness = abs(true_location - site_num)
                    closest_site = true_location
            return site_type + str(closest_site)
        else:
            # no site info provided : return first instance
            return site_type + str(peptide_locations[0] + relative_location)  
    return None

def peptide_site_number_df(peptide, accession, df_sequence, site = None, modification_types = None):
    """
    Finds the site number by finding the modification site in sequence and the peptide location
    in the sequence from the given dataframe.
    If the sequence cannot be found then the original site is returned.
    See peptide_site_number(peptide,sequence,site,modification_types) 
    for details of how site is found once sequence is found
    
    Parameters
    ----------

    peptide :str
        peptide string where modification site is first lower-case letter found
    accession : str
        UniProt accession number 
    df_sequence : pandas DataFrame
        sequence dataframe pulled from UniProt with columns Entry, Sequence
    site : str
        site number to check against. can either have letters or just numbers
    modification_types: list
        list of modification types to check against while checking peptide
        ex: ['s', 't', 'y'] would ingore all modifications except for s,t,y
    
    Returns
    ---------
    site : str
        site of peptide in sequence dataframe in format S123
        closest to row[site_column] if provided, otherwise first location found
        if no modification sites are found, the sequence cannot be found, or the peptide
            cannot be found in the sequence then the original site is returned
    """
    sequence = df_sequence[df_sequence['Entry'] == accession]
    if len(sequence) > 0: # get first sequence for accession
        sequence = sequence.iloc[0]['Sequence']
        return peptide_site_number(peptide, sequence, site, modification_types)
    return site

def is_valid_site(site, sequence):
    """
    Checks whether the site provided is a valid site in the given sequence
    Valid sites are sites where site has same modification as sequence location
    
    Parameters
    ----------
    site: str
        site AA#, e.g. Y12
    sequence : str
        sequence to check
    
    Returns
    --------
    is_valid : bool
        True if site is valid. False otherwise
    """
    site_type  = ''.join(filter(str.isalpha, site)).upper()
    site_number = int( ''.join( filter(lambda x: x.isdigit(), site) ) )
    if site_number > 0 and site_number <= len(sequence):
        return site_type == sequence[site_number - 1]
    return False

def get_aligned_peptide(site, sequence, window):
    """
    Returns aligned peptide where modificaiton site is lowercase. 
    Aligned peptide contains +/- window AA surrounding modification site. 
    No buffer characters are added.
    If the site is not a valid location in the sequence then None is returned
    
    Parameters
    ----------
    site : str
        modification site. AA#. e.g. Y12
    sequence : str
        sequence to use for generating aligned peptide
    window : int
        number of amino acids to pull surrounding modification site
    
    Returns
    ----------
    peptide : str
        peptide of AA surrounding mod site. Mod site is lowercase. 
        None is returned if not a valid site
            e.g. ABCDeFGHI where site=E5, window=4
            e.g. AbCDEF    where site=B2, window=4 
    """
    if is_valid_site(site, sequence): 
        site_number = int( ''.join( filter(lambda x: x.isdigit(), site) ) ) - 1
        start = max(0, site_number - window)
        stop = min(len(sequence), site_number + window + 1)
        sequence = list(sequence)
        sequence[site_number] = sequence[site_number].lower()
        peptide = sequence[start:stop]
        peptide = ''.join(peptide)
        return peptide
    return None

def run_mapping(experiment, odir, name, map_columns, window=7, data_columns=None):
    if not os.path.exists(f"{odir}/MAPPED_DATA"): 
        os.mkdir(f"{odir}/MAPPED_DATA")   
    mapping_log = helpers.get_logger(f"mapping_{name}", f"{odir}/MAPPED_DATA/mapping_{name}.log")
    exp_mapper = ExperimentMapper(
        experiment = experiment,
        sequences = config.HUMAN_REF_SEQUENCES, 
        columns = map_columns, 
        logger = mapping_log, 
        compendia = config.HUMAN_REF_COMPENDIA, 
        window = window, 
        data_columns = data_columns)
     
    experiment = exp_mapper.experiment 
    experiment.to_csv(f"{odir}/MAPPED_DATA/{name}_mapped.tsv", sep = '\t', index = False)
    
    return experiment