import pandas as pd
from collections import defaultdict
import re
import os
import numpy as np
from kstar import config, helpers
[docs]class ExperimentMapper:
"""
Given an experiment object and reference sequences, map the phosphorylation sites to the common reference.
Inputs
Parameters
----------
name : str
Name of experiment. Used for logging
experiment: pandas dataframe
Pandas dataframe of an experiment that has a reference accession, a peptide column and/or a site column.
The peptide column should be upper case, with lower case indicating the site of phosphorylation - this is preferred
The site column should be in the format S/T/Y<pos>, e.g. Y15 or S345
columns: dict
Dictionary with mappings of the experiment dataframe column names for the required names 'accession_id', 'peptide', or 'site'.
One of 'peptide' or 'site' is required.
logger: Logger object
used for logging when peptides cannot be matched and when a site location changes
sequences: dict
Dictionary of sequences. Key : accession. Value : protein sequence.
Default is imported from kstar.config
compendia: pd.DataFrame
Human phosphoproteome compendia, mapped to KinPred and annotated with number of compendia.
Default is imported from kstar.config
window : int
The length of amino acids to the N- and C-terminal sides of the central phosphoprotein to map a site to.
Default is 7.
data_columns: list, or empty
The list of data columns to use. If this is empty, logger will look for anything that starts with statement data: and those values
Default is None.
Attributes
----------
experiment: pandas dataframe
mapped experiment, which for each peptide, no contains the mapped accession, site, peptide, number of compendia, compendia type
sequences: dict
Dictionary of sequences passed into the class
compendia: pandas dataframe
compendia dataframe passed into the class
data_columns: list
indicates which columns will be used as data
"""
#for documentation purposes convert non required parameters to kwargs
#def __init__(self, experiment, columns, logger, *kwargs):
def __init__(self, experiment, columns, logger, sequences=config.HUMAN_REF_SEQUENCES, compendia=config.HUMAN_REF_COMPENDIA, window = 7, data_columns = None):
self.experiment = experiment
self.sequences = sequences
self.compendia = compendia
def set_accession_id(accession):
acc = accession.split('-')
if len(acc) > 1:
acc = acc[:-1]
return '-'.join(acc)
if 'accession_id' not in columns.keys():
raise ValueError('ExperimentMapper requires accession_id as a dictionary key')
else:
self.experiment[config.KSTAR_ACCESSION] = self.experiment[columns['accession_id']].apply(set_accession_id)
if 'peptide' not in columns.keys() and 'site' not in columns.keys():
raise ValueError('ExperimentMapper requires either site or peptide as keys in dictionary')
self.experiment[config.KSTAR_PEPTIDE] = self.experiment[columns['peptide']] if 'peptide' in columns.keys() else None
self.experiment[config.KSTAR_SITE] = self.experiment[columns['site']] if 'site' in columns.keys() else None
self.logger = logger
self.set_data_columns(data_columns)
self.align_sites(window)
compendia = self.compendia[[config.KSTAR_ACCESSION, config.KSTAR_SITE, 'KSTAR_NUM_COMPENDIA', 'KSTAR_NUM_COMPENDIA_CLASS']]
self.experiment = pd.merge(self.experiment, compendia, how = 'inner', on = [config.KSTAR_ACCESSION, config.KSTAR_SITE] )
#after merge, NUM_COMPENDIA/CLASS as become a float, likely due to mismatches, so assume those evidences are 0
self.experiment['KSTAR_NUM_COMPENDIA'] = self.experiment['KSTAR_NUM_COMPENDIA'].fillna(0.0).astype(int)
self.experiment['KSTAR_NUM_COMPENDIA_CLASS'] = self.experiment['KSTAR_NUM_COMPENDIA_CLASS'].fillna(0.0).astype(int)
[docs] def set_data_columns(self, data_columns):
"""
Identifies which columns in the experiment should be used as data columns. If data_columns is provided,
then 'data:' is added to the front and experiment dataframe is renamed. Otherwise, function will look for columns
with 'data:' in front and this to the data_columns attribute.
"""
self.data_columns = []
if data_columns is None:
for col in self.experiment.columns:
if col.startswith('data:'):
self.data_columns.append(col)
else:
data_rename = defaultdict()
for col in data_columns:
if not col.startswith('data:'):
data_rename[col] = f"data:{col}"
else:
data_rename[col] = col
self.experiment.rename(columns = data_rename, inplace = True)
self.data_columns = list(data_rename.values())
[docs] def get_experiment(self):
"""
Return the mapped experiment dataframe
"""
return self.experiment
[docs] def get_sequence(self, accession):
"""
Gets the sequence that matches the given accession
"""
if accession in self.sequences.keys():
return self.sequences[accession]
return None
[docs] def align_sites(self, window = 7):
"""
Map the peptide/sites to the common sequence reference and remove and report errors for sites that do not align as expected.
expMapper.align_sites(window=7). Operates on the experiment dataframe of class.
Parameters
----------
window: int
The length of amino acids to the N- and C-terminal sides of the central phosphoprotein to map a site to.
"""
self.experiment = expand_peptide(self.experiment, config.KSTAR_PEPTIDE)
for index, row in self.experiment.iterrows():
sequence = self.get_sequence(row[config.KSTAR_ACCESSION])
if sequence is not None:
# If peptide provided then find site
if row[config.KSTAR_PEPTIDE] is not None:
site = peptide_site_number(peptide = row[config.KSTAR_PEPTIDE],
site = row[config.KSTAR_SITE],
sequence = sequence)
if site is None:
self.logger.warning(f"SITE NOT FOUND : {row[config.KSTAR_ACCESSION]}\t{row[config.KSTAR_PEPTIDE]}")
self.experiment.loc[index, config.KSTAR_SITE] = None
self.experiment.loc[index, config.KSTAR_PEPTIDE] = None
continue
elif site != row[config.KSTAR_SITE]:
self.logger.info(f"SITE CHANGED : {row[config.KSTAR_ACCESSION]} {row[config.KSTAR_SITE]} -> {site}")
self.experiment.loc[index, config.KSTAR_SITE] = site
peptide = get_aligned_peptide(site = site,
sequence = sequence,
window = window)
if peptide is not None:
self.experiment.loc[index, config.KSTAR_PEPTIDE] = peptide
# Peptide not provided but site is provided - build peptide
elif row[config.KSTAR_SITE] is not None:
peptide = get_aligned_peptide(site = row[config.KSTAR_SITE],
sequence = sequence,
window = window)
if peptide is not None:
self.experiment.loc[index, config.KSTAR_PEPTIDE] = peptide
else:
self.logger.warning(f"PEPTIDE MISMATCH : {row[config.KSTAR_ACCESSION]} {row[config.KSTAR_SITE]}")
self.experiment.loc[index, config.KSTAR_SITE] = None
self.experiment.loc[index, config.KSTAR_PEPTIDE] = None
else:
self.logger.warning(f"SITE NOT FOUND : {self.experiment[config.KSTAR_ACCESSION]}\t{self.experiment[config.KSTAR_SITE]}")
# Sequence not found in compendia
else:
self.logger.warning(f"SEQUENCE NOT FOUND : {row[config.KSTAR_ACCESSION]}")
self.experiment.loc[index, config.KSTAR_SITE] = None
self.experiment.loc[index, config.KSTAR_PEPTIDE] = None
self.experiment.dropna(axis = 'rows', subset = [config.KSTAR_ACCESSION, config.KSTAR_SITE, config.KSTAR_PEPTIDE], inplace = True)
self.experiment.drop_duplicates(inplace=True)
def expand_peptide(df, peptide_column):
"""
Expands a dataframe based on the peptide column such that each
peptide contains one modification type.
Example
Accession Peptide Info
P0001 aBCDeF Good
P0002 TUvWXy Bad
Becomes
Accession Peptide Info
P0001 aBCDEF Good
P0001 ABCDeF Good
P0002 TUvWXY Bad
P0002 TUVWXy Bad
Parameters
--------
df : pandas df
Experiment pandas dataframe
peptide_column: str
peptide column to expand
Returns
---------
df : pandas DataFrame
expanded dataframe
"""
df_list = []
for index, row in df.iterrows():
if row[peptide_column] is not None:
mods = find_modified_sites(row[peptide_column])
for mod in mods:
tmp = row.copy()
tmp[peptide_column] = list(tmp[peptide_column].upper())
tmp[peptide_column][mod[1]] = tmp[peptide_column][mod[1]].lower()
tmp[peptide_column] = ''.join(tmp[peptide_column])
df_list.append(tmp)
else:
df_list.append(row)
df = pd.DataFrame(df_list).reset_index(drop=True)
return df
def find_modified_sites(peptide, modification_types = None):
"""
Finds all modification sites, indicated by a lower letter in the peptide column
and returns as a list of lists
Parameters
----------
row : pandas row
peptide_column : column that contains peptide with mod site
mod site identified by being lower case
Returns : [[modtype, relative_location]]
1st element is the modification type
2nd element is the relative modification location
"""
if modification_types is not None:
mod_types = ''.join(modification_types)
mod_types = '[' + mod_types + ']'
return [[m.group() , m.start()] for m in re.finditer(mod_types, peptide)]
return [[m.group() , m.start()] for m in re.finditer('[a-z]', peptide)]
def find_peptide_locations(peptide, sequence):
"""
Finds all start locations of a peptide in a given sequence.
Sequence starts at location 1.
Example: Sequence: ABCDEFGABCDHIJK, Peptide: BCD
would return [2, 9], as peptide is found at location 2 and 9
"""
return [m.start() + 1 for m in re.finditer(peptide, sequence)]
def peptide_site_number(peptide, sequence, site = None, modification_types = None):
"""
Finds the site number by finding the modification site in sequence and the peptide location
in the sequence. A modification site is defined as where in a peptide a lower case letter appears.
If a site is provided with alphabetical character the alphabetical character is
the modification type checked for in the peptide.
If a site contains numbers the closest match of the petide in the sequence to the site number is used as the site
If no modification sites are found or peptide is not found in sequence None is returned
Example: sequence is ZZZZABCDEFGZZZABCDEFGHIZZ, peptide is AbCdEfG
If no site is provided and no modification types are provided then B6 is returned
If no site is provided and modification_types is ['d','f'] or 'df' then D8 is returned
If site is 30 and modification_types is ['d','f'] then F20 is returned
If site is D23 then D18 is returned, modification_type is ignored
If site is X50 then X50 is returned as modification_type x is not found in peptide
Parameters
----------
peptide :str
peptide string where modification site is first lower-case letter found
sequence : str
sequence string where all characters are uppercase
site : str
site number to check against. can either have letters or just numbers
modification_types: list
list of modification types to check against while checking peptide
ex: ['s', 't', 'y'] would ingore all modifications except for s,t,y
not used if site contains alphabetical characters
Returns
---------
site : str
site of peptide in sequence dataframe in format S123
closest to original site if provided, otherwise first location found
if no modification sites are found, the sequence cannot be found, or the peptide
cannot be found in the sequence then the original site is returned
"""
if pd.isnull(peptide):
return None
peptide = ''.join(filter(str.isalpha, peptide)) # keep only alphabetical letters
all_mod_sites = find_modified_sites(peptide, None) # get all mod sites
if isinstance(site, str):
modification_types = site[0].lower()
mod_sites = []
if modification_types is not None:
for mod_site in all_mod_sites:
if mod_site[0] in modification_types:
mod_sites.append(mod_site)
else:
mod_sites = all_mod_sites
# if no mod sites are found return None
if len(mod_sites) == 0:
return None
peptide_locations = find_peptide_locations(peptide.upper(), sequence)
relative_location = mod_sites[0][1] # only looking at first mod site location
site_type = mod_sites[0][0].upper() # first mod type (STY)
# get all locations where peptide appears in sequence
peptide_locations = find_peptide_locations(peptide.upper(), sequence)
if len(peptide_locations)>0:
if site is not None:
# get closest peptide location to provided site
site_num = int(re.sub("[^0-9]", "", str(site))) #remove all letters and make int
closeness = np.inf
closest_site = np.inf
for loc in peptide_locations:
true_location = loc + relative_location
if abs(true_location - site_num) < closeness:
closeness = abs(true_location - site_num)
closest_site = true_location
return site_type + str(closest_site)
else:
# no site info provided : return first instance
return site_type + str(peptide_locations[0] + relative_location)
return None
def peptide_site_number_df(peptide, accession, df_sequence, site = None, modification_types = None):
"""
Finds the site number by finding the modification site in sequence and the peptide location
in the sequence from the given dataframe.
If the sequence cannot be found then the original site is returned.
See peptide_site_number(peptide,sequence,site,modification_types)
for details of how site is found once sequence is found
Parameters
----------
peptide :str
peptide string where modification site is first lower-case letter found
accession : str
UniProt accession number
df_sequence : pandas DataFrame
sequence dataframe pulled from UniProt with columns Entry, Sequence
site : str
site number to check against. can either have letters or just numbers
modification_types: list
list of modification types to check against while checking peptide
ex: ['s', 't', 'y'] would ingore all modifications except for s,t,y
Returns
---------
site : str
site of peptide in sequence dataframe in format S123
closest to row[site_column] if provided, otherwise first location found
if no modification sites are found, the sequence cannot be found, or the peptide
cannot be found in the sequence then the original site is returned
"""
sequence = df_sequence[df_sequence['Entry'] == accession]
if len(sequence) > 0: # get first sequence for accession
sequence = sequence.iloc[0]['Sequence']
return peptide_site_number(peptide, sequence, site, modification_types)
return site
def is_valid_site(site, sequence):
"""
Checks whether the site provided is a valid site in the given sequence
Valid sites are sites where site has same modification as sequence location
Parameters
----------
site: str
site AA#, e.g. Y12
sequence : str
sequence to check
Returns
--------
is_valid : bool
True if site is valid. False otherwise
"""
site_type = ''.join(filter(str.isalpha, site)).upper()
site_number = int( ''.join( filter(lambda x: x.isdigit(), site) ) )
if site_number > 0 and site_number <= len(sequence):
return site_type == sequence[site_number - 1]
return False
def get_aligned_peptide(site, sequence, window):
"""
Returns aligned peptide where modificaiton site is lowercase.
Aligned peptide contains +/- window AA surrounding modification site.
No buffer characters are added.
If the site is not a valid location in the sequence then None is returned
Parameters
----------
site : str
modification site. AA#. e.g. Y12
sequence : str
sequence to use for generating aligned peptide
window : int
number of amino acids to pull surrounding modification site
Returns
----------
peptide : str
peptide of AA surrounding mod site. Mod site is lowercase.
None is returned if not a valid site
e.g. ABCDeFGHI where site=E5, window=4
e.g. AbCDEF where site=B2, window=4
"""
if is_valid_site(site, sequence):
site_number = int( ''.join( filter(lambda x: x.isdigit(), site) ) ) - 1
start = max(0, site_number - window)
stop = min(len(sequence), site_number + window + 1)
sequence = list(sequence)
sequence[site_number] = sequence[site_number].lower()
peptide = sequence[start:stop]
peptide = ''.join(peptide)
return peptide
return None
def run_mapping(experiment, odir, name, map_columns, window=7, data_columns=None):
if not os.path.exists(f"{odir}/MAPPED_DATA"):
os.mkdir(f"{odir}/MAPPED_DATA")
mapping_log = helpers.get_logger(f"mapping_{name}", f"{odir}/MAPPED_DATA/mapping_{name}.log")
exp_mapper = ExperimentMapper(
experiment = experiment,
sequences = config.HUMAN_REF_SEQUENCES,
columns = map_columns,
logger = mapping_log,
compendia = config.HUMAN_REF_COMPENDIA,
window = window,
data_columns = data_columns)
experiment = exp_mapper.experiment
experiment.to_csv(f"{odir}/MAPPED_DATA/{name}_mapped.tsv", sep = '\t', index = False)
return experiment