Source code for ptm_pose.annotate

import pandas as pd
import numpy as np
import re
import os

from ptm_pose import pose_config, helpers


# Shorthand modification codes found in PhosphoSitePlus 'MOD_RSD' values (the part after the '-'),
# mapped to the modification class names used in this module
mod_shorthand_dict = {
    'p': 'Phosphorylation',
    'ca': 'Caspase Cleavage',
    'hy': 'Hydroxylation',
    'sn': 'S-Nitrosylation',
    'ng': 'Glycosylation',
    'ub': 'Ubiquitination',
    'pa': "Palmitoylation",
    'ne': 'Neddylation',
    'sc': 'Succinylation',
    'sm': 'Sumoylation',
    'ga': 'Glycosylation',
    'gl': 'Glycosylation',
    'ac': 'Acetylation',
    'me': 'Methylation',
    'm1': 'Methylation',
    'm2': 'Dimethylation',
    'm3': 'Trimethylation',
}

# One-letter amino acid codes mapped to lowercase residue names
residue_dict = {
    'P': 'proline',
    'Y': 'tyrosine',
    'S': 'serine',
    'T': 'threonine',
    'H': 'histidine',
    'D': 'aspartic acid',
    'I': 'isoleucine',
    'K': 'lysine',
    'R': 'arginine',
    'G': 'glycine',
    'N': 'asparagine',
    'M': 'methionine',
}

# For each annotation source, maps an annotation type to the dataframe column holding that annotation
annotation_col_dict = {
    'PhosphoSitePlus': {
        'Function': 'PSP:ON_FUNCTION',
        'Process': 'PSP:ON_PROCESS',
        'Interactions': 'PSP:ON_PROT_INTERACT',
        'Disease': 'PSP:Disease_Association',
        'Kinase': 'PSP:Kinase',
        'Perturbation': 'PTMsigDB:PSP-PERT',
    },
    'ELM': {
        'Interactions': 'ELM:Interactions',
        'Motif Match': 'ELM:Motif Matches',
    },
    'PTMcode': {
        'Intraprotein': 'PTMcode:Intraprotein_Interactions',
        'Interactions': 'PTMcode:Interprotein_Interactions',
    },
    # column name must match the one written by add_PTMInt_data ('PTMInt:Interaction', singular)
    'PTMInt': {
        'Interactions': 'PTMInt:Interaction',
    },
    'RegPhos': {
        'Kinase': 'RegPhos:Kinase',
    },
    'DEPOD': {
        'Phosphatase': 'DEPOD:Phosphatase',
    },
    'PTMsigDB': {
        'WikiPathway': 'PTMsigDB:PATH-WP',
        'NetPath': 'PTMsigDB:PATH-NP',
        'mSigDB': 'PTMsigDB:PATH-BI',
        # NOTE(review): 'Pertubation' is misspelled, but the key is kept as-is since callers may look it up by this exact string
        'Pertubation (DIA2)': 'PTMsigDB:PERT-P100-DIA2',
        'Perturbation (DIA)': 'PTMsigDB:PERT-P100-DIA',
        'Perturbation (PRM)': 'PTMsigDB:PERT-P100-PRM',
        'Kinase': 'PTMsigDB:Kinase-iKiP',
    },
}



def add_custom_annotation(spliced_ptms, annotation_data, source_name, annotation_type, annotation_col, accession_col = 'UniProtKB Accession', residue_col = 'Residue', position_col = 'PTM Position in Canonical Isoform'):
    """
    Add custom annotation data to spliced_ptms or altered flanking sequence dataframes

    Parameters
    ----------
    spliced_ptms: pandas.DataFrame
        Dataframe of PTMs to annotate. Must contain the columns 'UniProtKB Accession', 'Residue', and 'PTM Position in Canonical Isoform'
    annotation_data: pandas.DataFrame
        Dataframe containing the annotation data to be added to the spliced_ptms dataframe. Must contain columns for UniProtKB Accession, Residue, PTM Position in Canonical Isoform, and the annotation data to be added
    source_name: str
        Name of the source of the annotation data, will be used to label the columns in the spliced_ptms dataframe
    annotation_type: str
        Type of annotation data being added, will be used to label the columns in the spliced_ptms dataframe
    annotation_col: str
        Column name in the annotation data that contains the annotation data to be added to the spliced_ptms dataframe
    accession_col: str
        Column name in the annotation data that contains the UniProtKB accession
    residue_col: str
        Column name in the annotation data that contains the modified residue
    position_col: str
        Column name in the annotation data that contains the PTM position


    Returns
    -------
    spliced_ptms: pandas.DataFrame
        Contains the PTMs identified across the different splice events with an additional column for the custom annotation data, labeled '<source_name>:<annotation_type>'

    Raises
    ------
    ValueError
        If annotation_col is not a string, or if the annotation or PTM columns cannot be found in annotation_data
    RuntimeError
        If the merge changes the number of rows (duplicate PTMs in either dataframe)
    """
    #check if annotation data contains the annotation col
    if not isinstance(annotation_col, str):
        raise ValueError('annotation_col must be a string indicating column with annotation data to be added to the spliced_ptms dataframe')
    if annotation_col not in annotation_data.columns:
        raise ValueError(f'Could not find column indicated to contain {annotation_col} in annotation data. Please either change the name of your annotation data column with this information or indicate the correct column name with the annotation_col parameter')

    #make annotation col name based on source and annotation type
    annotation_col_name = source_name + ':' + annotation_type
    annotation_data = annotation_data.rename(columns = {annotation_col: annotation_col_name})

    #check to make sure annotation data has the necessary columns
    if not all(x in annotation_data.columns for x in [accession_col, residue_col, position_col]):
        raise ValueError(f'Could not find columns containing ptm information: {accession_col}, {residue_col}, and {position_col}. Please either change the name of your annotation data columns containing this information or indicate the correct column names with the accession_col, residue_col, and position_col parameters')

    #if splice data already has the annotation columns, remove them
    if annotation_col_name in spliced_ptms.columns:
        spliced_ptms = spliced_ptms.drop(columns = [annotation_col_name])

    #add to splice data
    ptm_cols = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform']
    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(annotation_data, how = 'left', left_on = ptm_cols, right_on = [accession_col, residue_col, position_col])
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataframe size has changed, check for duplicates in spliced ptms or annotation dataframe')

    #report the number of PTMs identified. The annotation column was renamed above, so the renamed
    #column must be used here; position is part of the grouping so sites on the same residue type are not collapsed
    num_ptms_with_custom_data = spliced_ptms.dropna(subset = [annotation_col_name]).groupby(ptm_cols).size().shape[0]
    print(f"{source_name} {annotation_type} data added: {num_ptms_with_custom_data} PTMs in dataset found with {source_name} {annotation_type} information")

    return spliced_ptms

def add_PSP_regulatory_site_data(spliced_ptms, file = 'Regulatory_sites.gz', report_success = True):
    """
    Add functional information from PhosphoSitePlus (Regulatory_sites.gz) to spliced_ptms dataframe from project_ptms_onto_splice_events() function

    Parameters
    ----------
    spliced_ptms: pandas.DataFrame
        Dataframe of PTMs to annotate. Must contain the columns 'UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', and 'Modification Class'. The input dataframe is not modified
    file: str
        Path to the PhosphoSitePlus Regulatory_sites.gz file. Should be downloaded from PhosphoSitePlus in the zipped format
    report_success: bool
        If True, print the number of PTMs found with function, process, and protein interaction annotations

    Returns
    -------
    spliced_ptms: pandas.DataFrame
        Contains the PTMs identified across the different splice events with additional columns 'PSP:ON_PROCESS', 'PSP:ON_PROT_INTERACT', 'PSP:ON_OTHER_INTERACT', and 'PSP:ON_FUNCTION'. Rows listing several modification classes separated by ';' are split into one row per modification class

    Raises
    ------
    RuntimeError
        If the merge changes the number of rows
    """
    #check to make sure file exists
    check_file(file, expected_extension='.gz')

    ptm_cols = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class']
    raw_annotation_cols = ['ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT', 'ON_FUNCTION']
    psp_cols = ['PSP:' + col for col in raw_annotation_cols]

    #read in the regulatory site data
    regulatory_site_data = pd.read_csv(file, sep = '\t', header = 2, on_bad_lines='skip',compression = 'gzip')
    regulatory_site_data = regulatory_site_data.rename(columns = {'ACC_ID':'UniProtKB Accession'})

    #restrict to human data before parsing, so only human entries need a known modification code.
    #copy so that the new columns are written to an independent dataframe rather than a slice
    regulatory_site_data = regulatory_site_data[regulatory_site_data['ORGANISM'] == 'human'].copy()

    #MOD_RSD is parsed as '<residue><position>-<modification code>'
    regulatory_site_data['Residue'] = regulatory_site_data['MOD_RSD'].apply(lambda x: x.split('-')[0][0])
    regulatory_site_data['PTM Position in Canonical Isoform'] = regulatory_site_data['MOD_RSD'].apply(lambda x: int(x.split('-')[0][1:]))
    regulatory_site_data['Modification Class'] = regulatory_site_data['MOD_RSD'].apply(lambda x: mod_shorthand_dict[x.split('-')[1]])

    regulatory_site_data = regulatory_site_data[ptm_cols + raw_annotation_cols].drop_duplicates()

    #combine multiple entries for the same PTM into a single row, skipping missing values (y == y is False for NaN)
    regulatory_site_data = regulatory_site_data.groupby(ptm_cols).agg(lambda x: '; '.join([y for y in x if y == y])).reset_index()
    regulatory_site_data = regulatory_site_data.replace('', np.nan)

    #add 'PSP:' in front of each annotation column
    regulatory_site_data = regulatory_site_data.rename(columns = dict(zip(raw_annotation_cols, psp_cols)))

    #if splice data already has any of the annotation columns, remove them so the merge does not create suffixed duplicates
    stale_cols = [col for col in psp_cols if col in spliced_ptms.columns]
    if stale_cols:
        spliced_ptms = spliced_ptms.drop(columns = stale_cols)

    #explode dataframe on modifications. assign() works on a copy, so the caller's dataframe is left untouched
    if spliced_ptms['Modification Class'].str.contains(';').any():
        spliced_ptms = spliced_ptms.assign(**{'Modification Class': spliced_ptms['Modification Class'].str.split(';')})
        spliced_ptms = spliced_ptms.explode('Modification Class').reset_index(drop = True)

    #merge with spliced_ptm info
    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(regulatory_site_data, how = 'left', on = ptm_cols)
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataset size changed upon merge, please make sure there are no duplicates in spliced ptms data')

    #report the number of ptms with each type of regulatory annotation
    if report_success:
        num_ptms_with_known_function = spliced_ptms.dropna(subset = ['PSP:ON_FUNCTION']).groupby(ptm_cols).size().shape[0]
        num_ptms_with_known_process = spliced_ptms.dropna(subset = ['PSP:ON_PROCESS']).groupby(ptm_cols).size().shape[0]
        num_ptms_with_known_interaction = spliced_ptms.dropna(subset = ['PSP:ON_PROT_INTERACT']).groupby(ptm_cols).size().shape[0]
        print(f"PhosphoSitePlus regulatory_site information added:\n\t ->{num_ptms_with_known_function} PTMs in dataset found associated with a molecular function \n\t ->{num_ptms_with_known_process} PTMs in dataset found associated with a biological process\n\t ->{num_ptms_with_known_interaction} PTMs in dataset found associated with a protein interaction")
    return spliced_ptms

def add_PSP_kinase_substrate_data(spliced_ptms, file = 'Kinase_Substrate_Dataset.gz', report_success = True):
    """
    Add kinase substrate data from PhosphoSitePlus (Kinase_Substrate_Dataset.gz) to spliced_ptms dataframe from project_ptms_onto_splice_events() function

    Parameters
    ----------
    spliced_ptms: pandas.DataFrame
        Dataframe of PTMs to annotate. Must contain the columns 'UniProtKB Accession', 'Residue', and 'PTM Position in Canonical Isoform' ('Modification Class' is also needed when report_success is True)
    file: str
        Path to the PhosphoSitePlus Kinase_Substrate_Dataset.gz file. Should be downloaded from PhosphoSitePlus in the zipped format
    report_success: bool
        If True, print the number of sites found with at least one kinase

    Returns
    -------
    spliced_ptms: pandas.DataFrame
        Contains the PTMs identified across the different splice events with an additional column ('PSP:Kinase') listing the kinase gene names recorded for that site, separated by ';'

    Raises
    ------
    RuntimeError
        If the merge changes the number of rows
    """
    #check to make sure provided file exists
    check_file(file, expected_extension='.gz')

    #load data
    ks_dataset = pd.read_csv(file, sep = '\t', header = 2, on_bad_lines='skip',compression = 'gzip', encoding = "cp1252")

    #restrict to interactions where both kinase and substrate are human
    is_human_pair = (ks_dataset['KIN_ORGANISM'] == 'human') & (ks_dataset['SUB_ORGANISM'] == 'human')
    ks_dataset = ks_dataset[is_human_pair]

    #entries without a kinase gene name cannot be joined into the kinase list (str.join fails on NaN), so drop them
    ks_dataset = ks_dataset.dropna(subset = ['GENE'])

    #collapse to one row per substrate site, listing all kinases
    ks_dataset = ks_dataset.groupby(['SUB_ACC_ID', 'SUB_MOD_RSD'])['GENE'].agg(';'.join).reset_index()
    ks_dataset.columns = ['UniProtKB Accession', 'Residue', 'PSP:Kinase']

    #separate residue and position (SUB_MOD_RSD is parsed as '<residue><position>')
    ks_dataset['PTM Position in Canonical Isoform'] = ks_dataset['Residue'].apply(lambda x: int(x[1:]))
    ks_dataset['Residue'] = ks_dataset['Residue'].apply(lambda x: x[0])

    #if splice data already has the annotation columns, remove them
    if 'PSP:Kinase' in spliced_ptms.columns:
        spliced_ptms = spliced_ptms.drop(columns = ['PSP:Kinase'])

    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(ks_dataset, how = 'left', on = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform'])
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataset size changed upon merge, please make sure there are no duplicates in spliced ptms data')

    #report the number of ptms with kinase substrate information
    if report_success:
        num_ptms_with_KS = spliced_ptms.dropna(subset = ['PSP:Kinase']).groupby(['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class']).size().shape[0]
        print(f"PhosphoSitePlus kinase-substrate interactions added: {num_ptms_with_KS} phosphorylation sites in dataset found associated with a kinase in PhosphoSitePlus")
    return spliced_ptms

def add_PSP_disease_association(spliced_ptms, file = 'Disease-associated_sites.gz', report_success = True):
    """
    Process disease association data from PhosphoSitePlus (Disease-associated_sites.gz), and add to spliced_ptms dataframe from project_ptms_onto_splice_events() function

    Parameters
    ----------
    spliced_ptms: pandas.DataFrame
        Dataframe of PTMs to annotate. Must contain the columns 'UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', and 'Modification Class'. The input dataframe is not modified
    file: str
        Path to the PhosphoSitePlus Disease-associated_sites.gz file. Should be downloaded from PhosphoSitePlus in the zipped format
    report_success: bool
        If True, print the number of PTM sites found with a disease association

    Returns
    -------
    spliced_ptms: pandas.DataFrame
        Contains the PTMs identified across the different splice events with an additional column ('PSP:Disease_Association'). Each association is formatted as 'DISEASE->ALTERATION' (or just 'DISEASE' when no alteration is recorded), with multiple associations separated by ';'. Rows listing several modification classes separated by ';' are split into one row per modification class

    Raises
    ------
    RuntimeError
        If the merge changes the number of rows
    """
    #check to make sure provided file exists
    check_file(file, expected_extension='.gz')

    ptm_cols = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class']

    #load data
    disease_associated_sites = pd.read_csv(file, sep = '\t', header = 2, on_bad_lines='skip',compression = 'gzip')
    disease_associated_sites = disease_associated_sites[disease_associated_sites['ORGANISM'] == 'human']

    #removes sites without a specific disease annotation. Copy so that new columns are written to an independent dataframe rather than a slice
    disease_associated_sites = disease_associated_sites.dropna(subset = ['DISEASE']).copy()

    #MOD_RSD is parsed as '<residue><position>-<modification code>'
    disease_associated_sites['Residue'] = disease_associated_sites['MOD_RSD'].apply(lambda x: x.split('-')[0][0])
    disease_associated_sites['PTM Position in Canonical Isoform'] = disease_associated_sites['MOD_RSD'].apply(lambda x: int(x.split('-')[0][1:]))
    #add modification type
    disease_associated_sites['Modification Class'] = disease_associated_sites['MOD_RSD'].apply(lambda x: mod_shorthand_dict[x.split('-')[1]])
    #NOTE: residue-specific relabeling of 'Phospho' and 'O-Glycosylation' classes was removed here: mod_shorthand_dict
    #never produces either value, so those steps could not change any row

    #combine disease and alteration (x == x is False for NaN, i.e. when no alteration is recorded)
    disease_associated_sites['ALTERATION'] = disease_associated_sites.apply(lambda x: x['DISEASE']+'->'+x['ALTERATION'] if x['ALTERATION'] == x['ALTERATION'] else x['DISEASE'], axis = 1)

    #grab only necessary columns and rename
    disease_associated_sites = disease_associated_sites[['ACC_ID', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class', 'ALTERATION']]
    disease_associated_sites = disease_associated_sites.rename(columns = {'ACC_ID': 'UniProtKB Accession', 'ALTERATION': 'PSP:Disease_Association'})

    #aggregate multiple disease associations
    disease_associated_sites = disease_associated_sites.groupby(ptm_cols).agg(';'.join).reset_index()

    #if splice data already has the annotation columns, remove them
    if 'PSP:Disease_Association' in spliced_ptms.columns:
        spliced_ptms = spliced_ptms.drop(columns = ['PSP:Disease_Association'])

    #explode dataframe on modifications. assign() works on a copy, so the caller's dataframe is left untouched
    if spliced_ptms['Modification Class'].str.contains(';').any():
        spliced_ptms = spliced_ptms.assign(**{'Modification Class': spliced_ptms['Modification Class'].str.split(';')})
        spliced_ptms = spliced_ptms.explode('Modification Class').reset_index(drop = True)

    #merge with spliced_ptm info
    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(disease_associated_sites, how = 'left', on = ptm_cols)
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataset size changed upon merge, please make sure there are no duplicates in spliced ptms data')

    #report the number of ptms with a disease association
    if report_success:
        num_ptms_with_disease = spliced_ptms.dropna(subset = ['PSP:Disease_Association']).groupby(ptm_cols).size().shape[0]
        print(f"PhosphoSitePlus disease associations added: {num_ptms_with_disease} PTM sites in dataset found associated with a disease in PhosphoSitePlus")

    return spliced_ptms


def add_ELM_interactions(spliced_ptms, file = None, report_success =True):
    """
    Given a spliced ptms dataframe from the project module, add ELM interaction data to the dataframe

    For each PTM, human-human ELM interactions are searched for an entry where the PTM's protein is the motif-containing protein and the PTM position lies within the motif ('SLiM'). If none is found, the same search is done with the PTM's protein as the domain-containing protein ('Domain').

    Parameters
    ----------
    spliced_ptms: pandas.DataFrame
        Dataframe of PTMs to annotate. Must contain the columns 'UniProtKB Accession', 'Residue', and 'PTM Position in Canonical Isoform'. The new columns are added to this dataframe in place
    file: str or None
        Path to a .tsv file with the ELM interactions table. If None, the table is downloaded from the ELM website
    report_success: bool
        If True, print the number of PTMs found within an ELM interaction instance

    Returns
    -------
    spliced_ptms: pandas.DataFrame
        Input dataframe with three added columns: 'ELM:Interactions' (interacting proteins, ';' separated), 'ELM:Location of PTM for Interaction' ('SLiM' or 'Domain'), and 'ELM:Motifs Associated with Interactions' (ELM identifiers, ';' separated). Values are NaN when no interaction is found
    """
    #load data
    if file is None:
        elm_interactions = pd.read_csv('http://elm.eu.org/interactions/as_tsv', sep = '\t', header = 0)
    else:
        check_file(file, expected_extension='.tsv')
        elm_interactions = pd.read_csv(file, sep = '\t', header = 0)

    elm_interactions = elm_interactions[(elm_interactions['taxonomyElm'] == '9606(Homo sapiens)') & (elm_interactions['taxonomyDomain'] == '9606(Homo sapiens)')]

    #split the table by protein once, so each PTM only scans its own protein's entries instead of the whole table.
    #rows within each group keep their original order, so joined strings are unchanged
    by_motif_protein = {acc: entries for acc, entries in elm_interactions.groupby('interactorElm', sort = False)}
    by_domain_protein = {acc: entries for acc, entries in elm_interactions.groupby('interactorDomain', sort = False)}

    def _entries_spanning(entries_by_protein, accession, ptm_loc, start_col, stop_col):
        #return the entries for `accession` whose [start, stop] region contains ptm_loc, or None if there are none
        entries = entries_by_protein.get(accession)
        if entries is None:
            return None
        spanning = entries[(ptm_loc >= entries[start_col]) & (ptm_loc <= entries[stop_col])]
        return spanning if spanning.shape[0] > 0 else None

    elm_list = []
    elm_type = []
    elm_interactor = []
    for _, row in spliced_ptms.iterrows():
        position = row['PTM Position in Canonical Isoform']

        #if data does not have position information (NaN or 'none'), record as missing and move to the next
        if position != position or position == 'none':
            elm_list.append(np.nan)
            elm_type.append(np.nan)
            elm_interactor.append(np.nan)
            continue
        ptm_loc = int(position)
        accession = row['UniProtKB Accession']

        #check whether the ptm falls within a linear motif of its protein. If so, record and continue to the next
        #(assumes a single ptm won't be found as both a SLiM and domain)
        motif_hits = _entries_spanning(by_motif_protein, accession, ptm_loc, 'StartElm', 'StopElm')
        if motif_hits is not None:
            elm_list.append(';'.join(motif_hits['Elm'].values))
            elm_type.append('SLiM')
            elm_interactor.append(';'.join(motif_hits['interactorDomain'].values))
            continue

        #otherwise check whether the ptm falls within a motif-binding domain of its protein
        domain_hits = _entries_spanning(by_domain_protein, accession, ptm_loc, 'StartDomain', 'StopDomain')
        if domain_hits is not None:
            elm_list.append(';'.join(domain_hits['Elm'].values))
            elm_type.append('Domain')
            elm_interactor.append(';'.join(domain_hits['interactorElm'].values))
            continue

        #if no interactions were found, record as np.nan
        elm_list.append(np.nan)
        elm_type.append(np.nan)
        elm_interactor.append(np.nan)

    spliced_ptms['ELM:Interactions'] = elm_interactor
    spliced_ptms['ELM:Location of PTM for Interaction'] = elm_type
    spliced_ptms['ELM:Motifs Associated with Interactions'] = elm_list

    #report the number of ptms with ELM interaction data
    if report_success:
        num_ptms_with_ELM_instance = spliced_ptms.dropna(subset = ['ELM:Interactions']).groupby(['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform']).size().shape[0]
        print(f"ELM interaction instances added: {num_ptms_with_ELM_instance} PTMs in dataset found associated with at least one known ELM instance")
    return spliced_ptms


def add_ELM_matched_motifs(spliced_ptms, flank_size = 7, file = None, report_success = True):
    """
    Given a spliced ptms dataframe, find the ELM class motifs (regular expressions) that match the flanking sequence of each PTM

    Parameters
    ----------
    spliced_ptms: pandas.DataFrame
        Dataframe of PTMs to annotate. Must contain the columns 'UniProtKB Accession', 'Residue', and 'PTM Position in Canonical Isoform'. The new column is added to this dataframe in place
    flank_size: int
        Number of residues on either side of the PTM to search for motifs. Must be 10 or less
    file: str or None
        Path to a .tsv file with the ELM classes table. If None, the table is downloaded from the ELM website
    report_success: bool
        If True, print the number of PTMs with at least one matched motif

    Returns
    -------
    spliced_ptms: pandas.DataFrame
        Input dataframe with an added 'ELM:Motif Matches' column containing the matching ELM identifiers separated by ';'. The value is an empty string if no motif matched, and NaN if the PTM or its flanking sequence could not be found in the PTM coordinates data

    Raises
    ------
    ValueError
        If flank_size is greater than 10
    """
    #validate up front rather than partway through the loop (and before any download)
    if flank_size > 10:
        raise ValueError('Flanking size must be equal to or less than 10')

    if file is None:
        elm_classes = pd.read_csv('http://elm.eu.org/elms/elms_index.tsv', sep = '\t', header = 5)
    else:
        check_file(file, expected_extension='.tsv')
        elm_classes = pd.read_csv(file, sep = '\t', header = 5)

    #compile each motif once instead of re-searching by pattern string for every PTM
    motif_patterns = [(identifier, re.compile(regex)) for identifier, regex in zip(elm_classes['ELMIdentifier'], elm_classes['Regex'])]

    #build lookup from PTM label (e.g. accession_S10) to flanking sequence. Position is converted to a string
    #per value, so that missing positions elsewhere in the column cannot turn labels into 'S10.0'
    ptm_coordinates = pose_config.ptm_coordinates
    position_labels = ptm_coordinates['PTM Position in Canonical Isoform'].apply(lambda x: str(int(float(x))) if x == x else 'nan')
    ptm_labels = ptm_coordinates['UniProtKB Accession'] + '_' + ptm_coordinates['Residue'] + position_labels
    flanking_seq_by_ptm = {}
    for label, flanking_seq in zip(ptm_labels, ptm_coordinates['Expected Flanking Sequence']):
        #keep the first entry when a label appears more than once
        flanking_seq_by_ptm.setdefault(label, flanking_seq)

    match_list = []
    for _, row in spliced_ptms.iterrows():
        #construct the label for this ptm
        loc = int(row["PTM Position in Canonical Isoform"]) if row['PTM Position in Canonical Isoform'] == row['PTM Position in Canonical Isoform'] else np.nan
        ptm = row['UniProtKB Accession'] + '_' + row['Residue'] + str(loc)

        #make sure ptm is known and its flanking sequence is present
        ptm_flanking_seq = flanking_seq_by_ptm.get(ptm)
        if not isinstance(ptm_flanking_seq, str):
            match_list.append(np.nan)
            continue

        #default flanking sequence is 10, if requested flanking sequence is smaller, then trim
        #NOTE(review): if the stored sequence has the PTM at index 10, this slice keeps flank_size residues
        #on the left but only flank_size - 1 on the right - confirm the intended window
        if flank_size < 10:
            ptm_flanking_seq = ptm_flanking_seq[10-flank_size:10+flank_size]

        matches = [identifier for identifier, pattern in motif_patterns if pattern.search(ptm_flanking_seq) is not None]
        match_list.append(';'.join(matches))

    spliced_ptms['ELM:Motif Matches'] = match_list

    #report the number of ptms with motif data. PTMs whose sequence was searched but matched nothing hold an
    #empty string, so they must be excluded explicitly
    if report_success:
        has_match = spliced_ptms['ELM:Motif Matches'].notna() & (spliced_ptms['ELM:Motif Matches'] != '')
        num_ptms_with_matched_motif = spliced_ptms[has_match].groupby(['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform']).size().shape[0]
        print(f"ELM Class motif matches found: {num_ptms_with_matched_motif} PTMs in dataset found with at least one matched motif")
    return spliced_ptms

def add_PTMInt_data(spliced_ptms, file = None, report_success = True):
    """
    Given spliced_ptms data from project module, add PTMInt interaction data, which will include the protein that is being interacted with and whether the PTM enhances or inhibits binding. This will be added as a new column labeled 'PTMInt:Interaction' and each entry will be formatted like 'Protein->Effect'. If multiple interactions, they will be separated by a semicolon

    Parameters
    ----------
    spliced_ptms: pandas.DataFrame
        Dataframe of PTMs to annotate. Must contain the columns 'UniProtKB Accession', 'Residue', and 'PTM Position in Canonical Isoform'
    file: str or None
        Path to a .csv file with the PTMInt experimental evidence table. If None, the table is downloaded from the PTMInt website
    report_success: bool
        If True, print the number of PTMs found with PTMInt interaction information

    Returns
    -------
    spliced_ptms: pandas.DataFrame
        Contains the PTMs identified across the different splice events with an additional 'PTMInt:Interaction' column

    Raises
    ------
    RuntimeError
        If the merge changes the number of rows
    """
    ptm_cols = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform']

    #load file
    if file is None:
        PTMint = pd.read_csv('https://ptmint.sjtu.edu.cn/data/PTM%20experimental%20evidence.csv')
    else:
        check_file(file, expected_extension='.csv')
        PTMint = pd.read_csv(file)

    PTMint = PTMint.rename(columns={'Uniprot':'UniProtKB Accession', 'AA':'Residue', 'Site':'PTM Position in Canonical Isoform'})
    PTMint['PTMInt:Interaction'] = PTMint['Int_gene']+'->'+PTMint['Effect']

    #entries missing the interacting gene or effect give a NaN label, which cannot be joined below, so drop them
    PTMint = PTMint[ptm_cols + ['PTMInt:Interaction']].dropna(subset = ['PTMInt:Interaction'])

    #aggregate PTMint data on the same PTMs
    PTMint = PTMint.groupby(ptm_cols, as_index = False).agg(';'.join)

    #if splice data already has the annotation columns, remove them
    if 'PTMInt:Interaction' in spliced_ptms.columns:
        spliced_ptms = spliced_ptms.drop(columns = ['PTMInt:Interaction'])

    #add to splice data
    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(PTMint[ptm_cols + ['PTMInt:Interaction']], on = ptm_cols, how = 'left')
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataframe size has changed, check for duplicates in spliced ptms dataframe')

    #report the number of PTMs identified (position is part of the grouping so sites on the same residue type are not collapsed)
    if report_success:
        num_ptms_with_PTMInt_data = spliced_ptms.dropna(subset = ['PTMInt:Interaction']).groupby(ptm_cols).size().shape[0]
        print(f"PTMInt data added: {num_ptms_with_PTMInt_data} PTMs in dataset found with PTMInt interaction information")

    return spliced_ptms

#def add_PTMcode_intraprotein(spliced_ptms, fname = None, report_success = True):
#    #load ptmcode info
#    if fname is None:
#        ptmcode = pd.read_csv('https://ptmcode.embl.de/data/PTMcode2_associations_within_proteins.txt.gz', sep = '\t', header = 2, compression='gzip')
#    else:
#        check_file(fname, expected_extension = '.gz')
#        ptmcode = pd.read_csv(fname, sep = '\t', header = 2, compression = 'gzip')
#    
#    #grab humn data
#    ptmcode = ptmcode[ptmcode['Species'] == 'Homo sapiens']
#
#    #add gene name to data
#    translator = pd.DataFrame(pose_config.uniprot_to_genename, index = ['Gene']).T
#    translator['Gene'] = translator['Gene'].apply(lambda x: x.split(' '))
#    translator = translator.explode('Gene')
#    translator = translator.reset_index()
#    translator.columns = ['UniProtKB/Swiss-Prot ID', 'Gene name']
#
#    #add uniprot ID information
#    ptmcode = ptmcode.merge(translator.dropna().drop_duplicates(), left_on = '## Protein', right_on = 'Gene name', how = 'left')
#
#    #convert modification names to match annotation data
#    convert_dict = {'Adp ribosylation': 'ADP Ribosylation', 'Glutamine deamidation':'Deamidation'}
#    new_mod_names = []
#    failed_mod = []
#    mod_list = ptmcode['PTM1'].unique()
#    for mod in mod_list:
#        mod = mod.capitalize()
#        if 'glycosylation' in mod: #if glycosylation, group into one gorup
#            new_mod_names.append('Glycosylation')
#        elif mod in pose_config.modification_conversion['Modification Class'].values: #if already in modification class data, keep
#            new_mod_names.append(mod)
#        elif mod in convert_dict.keys():
#            new_mod_names.append(convert_dict[mod])
#        else:
#            try:
#                new_mod = pose_config.modification_conversion[pose_config.modification_conversion['Modification'] == mod].values[0][0]
#                new_mod_names.append(new_mod)
#            except:
#                failed_mod.append(mod)
#                new_mod_names.append(mod)
#    conversion_df = pd.DataFrame({'PTM1':mod_list, 'Modification Class':new_mod_names})
#
#    #add new modification labels to data
#    ptmcode = ptmcode.merge(conversion_df, on = 'PTM1', how = 'left')
#    
#    #groupby by PTM1 and rename to match column names in annotation data
#    ptmcode = ptmcode[['UniProtKB/Swiss-Prot ID', 'Modification Class', 'Residue1', 'Residue2']].dropna(subset = 'UniProtKB/Swiss-Prot ID')
#    ptmcode = ptmcode.groupby(['UniProtKB/Swiss-Prot ID', 'Modification Class', 'Residue1'])['Residue2'].agg(';'.join).reset_index()
#    ptmcode = ptmcode.rename(columns = {'UniProtKB/Swiss-Prot ID':'UniProtKB Accession', 'Residue1':'Residue', 'Residue2':'PTMcode:Intraprotein_Interactions'})
#    
#    #separate residue information into separate columns, one for amino acid and one for position
#    ptmcode['PTM Position in Canonical Isoform'] = ptmcode['Residue'].apply(lambda x: int(x[1:]))
#    ptmcode['Residue'] = ptmcode['Residue'].apply(lambda x: x[0])
#
#        #if splice data already has the annotation columns, remove them
#    if 'PTMcode:Intraprotein_Interactions' in spliced_ptms.columns:
#        spliced_ptms = spliced_ptms.drop(columns = ['PTMcode:Intraprotein_Interactions'])
#
#    #explode dataframe on modifications
#    if spliced_ptms['Modification Class'].str.contains(';').any():
#        spliced_ptms['Modification Class'] = spliced_ptms['Modification Class'].str.split(';')
#        spliced_ptms = spliced_ptms.explode('Modification Class').reset_index(drop = True)
#
#    #add to splice data
#    original_data_size = spliced_ptms.shape[0]
#    spliced_ptms = spliced_ptms.merge(ptmcode, how = 'left', on = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class'])
#    if spliced_ptms.shape[0] != original_data_size:
#        raise RuntimeError('Dataframe size has changed, check for duplicates in spliced ptms dataframe')
#    
#    #report the number of PTMs identified
#    if report_success:
#        num_ptms_with_PTMcode_data = spliced_ptms.dropna(subset = 'PTMcode:Intraprotein_Interactions').groupby(['UniProtKB Accession', 'Residue']).size().shape[0]
#        print(f"PTMcode intraprotein interactions added: {num_ptms_with_PTMcode_data} PTMs in dataset found with PTMcode intraprotein interaction information")
#
#    return spliced_ptms

def extract_ids_PTMcode(df, col = '## Protein1'):
    """
    Look up an identifier for each protein listed in a column of PTMcode data

    The values in `col` are matched both against the 'Gene stable ID' column of pose_config.translator and against the gene names in pose_config.uniprot_to_genename. Where both lookups succeed, the gene name match is used.

    Parameters
    ----------
    df: pandas.DataFrame
        PTMcode data containing the column indicated by `col`
    col: str
        Column in df holding the protein label to convert

    Returns
    -------
    numpy.ndarray
        Identifier found for each row of the merged data, NaN where neither lookup matched
    """
    #lookup table from gene name to id, built from the id -> gene name(s) mapping
    gene_lookup = pd.DataFrame(pose_config.uniprot_to_genename, index = ['Gene']).T
    #an entry can list several space-separated gene names: give each its own row (x == x is False for NaN)
    gene_lookup['Gene'] = gene_lookup['Gene'].apply(lambda names: names.split(' ') if names == names else np.nan)
    gene_lookup = gene_lookup.explode('Gene').reset_index()
    gene_lookup.columns = ['UniProtKB/Swiss-Prot ID', 'Gene name']
    #gene names that map to more than one id are ambiguous, remove them entirely
    gene_lookup = gene_lookup.drop_duplicates(subset = 'Gene name', keep = False)

    #lookup table from gene stable id to the translator's index values
    stable_id_lookup = pose_config.translator[['Gene stable ID']].reset_index().dropna().drop_duplicates()

    #protein label is provided as either a gene stable id or a gene name, so try both
    matched = df.merge(stable_id_lookup, left_on = col, right_on = 'Gene stable ID', how = 'left')
    matched = matched.rename(columns = {'index': 'From_ID'})
    matched = matched.merge(gene_lookup, left_on = col, right_on = 'Gene name', how = 'left')
    matched = matched.rename(columns = {'UniProtKB/Swiss-Prot ID': 'From_Name'})

    #prefer the gene name match, fall back to the gene stable id match
    return matched['From_Name'].combine_first(matched['From_ID']).values

def add_PTMcode_interprotein(spliced_ptms, fname = None, report_success = True, file = None):
    """
    Annotate PTMs with interprotein interactions reported by PTMcode, stored in a new 'PTMcode:Interprotein_Interactions' column ('<interacting UniProt ID>_<interacting residue>' entries separated by ';')

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe containing PTM data. Not modified in place
    fname: str
        Path to the gzipped PTMcode 'associations between proteins' file. If None, the file is downloaded from the PTMcode website
    report_success: bool
        If True, print the number of PTMs that received an annotation
    file: str
        Alias for fname (keyword used by the other annotation functions). Only used when fname is None

    Returns
    -------
    spliced_ptms: pd.DataFrame
        Copy of the input with one row per modification class and the PTMcode annotation column added

    Raises
    ------
    RuntimeError
        If the merge changes the number of rows (duplicate PTMs in the annotation or input data)
    """
    site_cols = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform']

    if fname is None:
        fname = file

    if fname is None:
        ptmcode = pd.read_csv('https://ptmcode.embl.de/data/PTMcode2_associations_between_proteins.txt.gz', sep = '\t', header = 2, compression = 'gzip')
    else:
        check_file(fname, expected_extension = '.gz')
        ptmcode = pd.read_csv(fname, sep = '\t', header = 2, compression='gzip')

    #grab human interactions
    ptmcode = ptmcode[ptmcode['Species'] == 'Homo sapiens']
    #ignore intraprotein interactions (copy so that new columns are not written to a view)
    ptmcode = ptmcode[ptmcode['## Protein1'] != ptmcode['Protein2']].copy()

    #get uniprot id for primary protein and interacting protein
    ptmcode['UniProtKB Accession'] = extract_ids_PTMcode(ptmcode, '## Protein1')
    ptmcode['Interacting Protein'] = extract_ids_PTMcode(ptmcode, 'Protein2')

    ptmcode = ptmcode.dropna(subset = ['UniProtKB Accession', 'Interacting Protein'])
    #remove duplicate proteins (some entries have different ids but are actually the same protein)
    ptmcode = ptmcode[ptmcode['UniProtKB Accession'] != ptmcode['Interacting Protein']].copy()

    #aggregate interactions
    ptmcode['Interacting Residue'] = ptmcode['Interacting Protein'] + '_' + ptmcode['Residue2']

    #convert modification names
    convert_dict = {'Adp ribosylation': 'ADP Ribosylation', 'Glutamine deamidation':'Deamidation'}
    modification_conversion = pose_config.modification_conversion
    known_classes = modification_conversion['Modification Class'].values
    mod_list = ptmcode['PTM1'].unique()
    new_mod_names = []
    for mod in mod_list:
        mod = mod.capitalize()
        #compare in lower case so that a leading 'Glycosylation' is also recognized
        if 'glycosylation' in mod.lower():
            new_mod_names.append('Glycosylation')
        elif mod in known_classes:
            new_mod_names.append(mod)
        elif mod in convert_dict:
            new_mod_names.append(convert_dict[mod])
        else:
            try:
                new_mod = modification_conversion[modification_conversion['Modification'] == mod].values[0][0]
            except (IndexError, KeyError):
                #no conversion available, keep the PTMcode name as is
                new_mod = mod
            new_mod_names.append(new_mod)
    conversion_df = pd.DataFrame({'PTM1':mod_list, 'Modification Class':new_mod_names})

    ptmcode = ptmcode.merge(conversion_df, on = 'PTM1', how = 'left')

    #combine all interactions of the same PTM into a single row
    ptmcode = ptmcode.rename(columns = {'Residue1':'Residue'})
    ptmcode = ptmcode.groupby(['UniProtKB Accession', 'Residue', 'Modification Class'])['Interacting Residue'].agg(';'.join).reset_index()
    ptmcode = ptmcode.rename(columns = {'Interacting Residue':'PTMcode:Interprotein_Interactions'})

    #separate residue information into separate columns, one for amino acid and one for position
    ptmcode['PTM Position in Canonical Isoform'] = ptmcode['Residue'].apply(lambda x: float(x[1:]))
    ptmcode['Residue'] = ptmcode['Residue'].apply(lambda x: x[0])

    #if splice data already has the annotation columns, remove them
    if 'PTMcode:Interprotein_Interactions' in spliced_ptms.columns:
        spliced_ptms = spliced_ptms.drop(columns = ['PTMcode:Interprotein_Interactions'])

    #explode dataframe on modifications (assign returns a new dataframe, leaving the caller's data untouched)
    if spliced_ptms['Modification Class'].str.contains(';').any():
        spliced_ptms = spliced_ptms.assign(**{'Modification Class': spliced_ptms['Modification Class'].str.split(';')})
        spliced_ptms = spliced_ptms.explode('Modification Class').reset_index(drop = True)

    #add to splice data
    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(ptmcode, how = 'left', on = site_cols + ['Modification Class'])
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataframe size has changed, check for duplicates in spliced ptms dataframe')

    #report the number of PTMs identified (a PTM is defined by protein, residue and position)
    if report_success:
        num_ptms_with_PTMcode_data = spliced_ptms.dropna(subset = ['PTMcode:Interprotein_Interactions']).groupby(site_cols).size().shape[0]
        print(f"PTMcode interprotein interactions added: {num_ptms_with_PTMcode_data} PTMs in dataset found with PTMcode interprotein interaction information")

    return spliced_ptms

def extract_positions_from_DEPOD(x):
    """
    Given string object consisting of multiple modifications in the form of 'Residue-Position' separated by ', ', extract the residue and position. Ignore any excess details in the string.

    Parameters
    ----------
    x: str
        Dephosphosite description, e.g. 'Ser-15, Thr161 [ref]'. Anything from the first '[' onwards is ignored

    Returns
    -------
    new_x: list
        One entry per ', '-separated item: the one letter residue code followed by the text found after the residue name (e.g. 'S15'), or np.nan if the item does not mention Ser, Thr, Tyr or His
    """
    #three letter names in order of priority, paired with their one letter code
    residue_codes = (('Ser', 'S'), ('Thr', 'T'), ('Tyr', 'Y'), ('His', 'H'))

    new_x = []
    for item in x.split('[')[0].split(', '):
        #determine type of modification (first residue name, in priority order, found in the item)
        for name, res in residue_codes:
            #if the name occurs multiple times, grab the last occurrence
            loc = item.rfind(name)
            if loc != -1:
                break
        else:
            new_x.append(np.nan)
            continue

        #position follows the residue name, either directly or after a '-'.
        #only look at the text after the residue name, so hyphens earlier in the item (e.g. 'phospho-Ser-15') are not mistaken for the separator
        tail = item[loc + len(name):]
        if '-' in tail:
            tail = tail.split('-')[1]
        new_x.append(res + tail.strip())

    return new_x

def add_DEPOD_phosphatase_data(spliced_ptms, report_success = True):
    """
    Annotate phosphorylation sites with the phosphatases reported to target them in DEPOD, stored in a new 'DEPOD:Phosphatase' column (phosphatase names separated by ';'). Data is downloaded from the DEPOD website

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe containing PTM data. Not modified in place
    report_success: bool
        If True, print the number of PTMs that received an annotation

    Returns
    -------
    spliced_ptms: pd.DataFrame
        Copy of the input with one row per modification class and the DEPOD annotation column added

    Raises
    ------
    RuntimeError
        If the merge changes the number of rows (duplicate PTMs in the annotation or input data)
    """
    site_cols = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform']

    #download data
    depod1 = pd.read_excel('https://depod.bioss.uni-freiburg.de/download/PPase_protSubtrates_201903.xls', sheet_name='PSprots')
    depod2 = pd.read_excel('https://depod.bioss.uni-freiburg.de/download/PPase_protSubtrates_newPairs_201903.xls', sheet_name = 'newPSprots')
    depod = pd.concat([depod1, depod2])

    #remove any rows with missing site information
    depod = depod.dropna(subset = ['Dephosphosites'])

    #remove excess annotations that make parsing difficult (keep only the text before the first occurrence of each marker)
    for marker in ['[', '(', ';', 'in']:
        depod['Dephosphosites'] = depod['Dephosphosites'].apply(lambda x, marker = marker: x.split(marker)[0])
    #NOTE(review): no 'in' can remain after the split above, so this replace should not match anything; kept unchanged
    depod['Dephosphosites'] = depod['Dephosphosites'].str.replace('in ref.', '')

    #separate individual sites
    depod['Dephosphosites'] = depod['Dephosphosites'].str.split(',')
    depod = depod.explode('Dephosphosites')
    depod = depod[(~depod['Dephosphosites'].str.contains('Isoform')) & (~depod['Dephosphosites'].str.contains('isoform'))]

    #process dephosphosite strings to extract residue and position and explode so that each phosphosite is its own row
    depod['Dephosphosites'] = depod['Dephosphosites'].apply(extract_positions_from_DEPOD)
    depod = depod.explode('Dephosphosites')

    #separate multiple substrate accessions into their own rows (many of these link back to the same ID, but will keep just in case)
    depod['Substrate accession numbers'] = depod['Substrate accession numbers'].str.split(' ')
    depod = depod.explode('Substrate accession numbers')
    depod = depod.dropna(subset = ['Substrate accession numbers'])

    #extract only needed information and add phosphorylation as modification type (x == x is False only for NaN)
    depod['Residue'] = depod['Dephosphosites'].apply(lambda x: x[0] if x == x else np.nan)
    depod['PTM Position in Canonical Isoform'] = depod['Dephosphosites'].apply(lambda x: int(x[1:]) if x == x else np.nan)
    depod = depod.rename({'Substrate accession numbers': 'UniProtKB Accession', 'Phosphatase entry names':'DEPOD:Phosphatase'}, axis = 1)
    depod = depod[['DEPOD:Phosphatase'] + site_cols].copy()
    depod['Modification Class'] = 'Phosphorylation'

    #combine on the same PTM
    depod = depod.drop_duplicates()
    depod = depod.groupby(site_cols + ['Modification Class'], as_index = False)['DEPOD:Phosphatase'].agg(';'.join)

    #if splice data already has the annotation columns, remove them
    if 'DEPOD:Phosphatase' in spliced_ptms.columns:
        spliced_ptms = spliced_ptms.drop(columns = ['DEPOD:Phosphatase'])

    #explode dataframe on modifications (assign returns a new dataframe, leaving the caller's data untouched)
    if spliced_ptms['Modification Class'].str.contains(';').any():
        spliced_ptms = spliced_ptms.assign(**{'Modification Class': spliced_ptms['Modification Class'].str.split(';')})
        spliced_ptms = spliced_ptms.explode('Modification Class').reset_index(drop = True)

    #add to splice data
    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(depod, how = 'left', on = site_cols + ['Modification Class'])
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataframe size has changed, check for duplicates in spliced ptms dataframe')

    #report the number of PTMs identified (a PTM is defined by protein, residue and position)
    if report_success:
        num_ptms_with_depod_data = spliced_ptms.dropna(subset = ['DEPOD:Phosphatase']).groupby(site_cols).size().shape[0]
        print(f"DEPOD Phosphatase substrates added: {num_ptms_with_depod_data} PTMs in dataset found with Phosphatase substrate information")

    return spliced_ptms

def add_RegPhos_data(spliced_ptms, file = None, report_success = True):
    """
    Annotate phosphorylation sites with the catalytic kinases reported in RegPhos, stored in a new 'RegPhos:Kinase' column (kinase names separated by ';')

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe containing PTM data. Not modified in place
    file: str
        Path to the RegPhos human phosphorylation text file. If None, the file is downloaded from the RegPhos server
    report_success: bool
        If True, print the number of PTMs that received an annotation

    Returns
    -------
    spliced_ptms: pd.DataFrame
        Copy of the input with one row per modification class and the RegPhos annotation column added

    Raises
    ------
    RuntimeError
        If the merge changes the number of rows (duplicate PTMs in the annotation or input data)
    """
    site_cols = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform']

    if file is None:
        regphos = pd.read_csv('http://140.138.144.141/~RegPhos/download/RegPhos_Phos_human.txt', sep = '\t', dtype = {'position':int, 'description':str,'catalytic kinase':str, 'reference':'str'})
    else:
        check_file(file, expected_extension = '.txt')
        regphos = pd.read_csv(file, sep = '\t')

    #only sites with a known kinase are informative
    regphos = regphos.dropna(subset = ['catalytic kinase'])
    regphos = regphos.rename(columns = {'code': 'Residue', 'position':'PTM Position in Canonical Isoform', 'AC': 'UniProtKB Accession', 'catalytic kinase': 'RegPhos:Kinase'})
    regphos['Modification Class'] = 'Phosphorylation'
    regphos = regphos[site_cols + ['Modification Class', 'RegPhos:Kinase']].dropna()
    #the same kinase-site pair can appear on several rows, only list each kinase once per site
    regphos = regphos.drop_duplicates()
    regphos = regphos.groupby(site_cols + ['Modification Class']).agg(';'.join).reset_index()

    #if splice data already has the annotation columns, remove them
    if 'RegPhos:Kinase' in spliced_ptms.columns:
        spliced_ptms = spliced_ptms.drop(columns = ['RegPhos:Kinase'])

    #explode dataframe on modifications (assign returns a new dataframe, leaving the caller's data untouched)
    if spliced_ptms['Modification Class'].str.contains(';').any():
        spliced_ptms = spliced_ptms.assign(**{'Modification Class': spliced_ptms['Modification Class'].str.split(';')})
        spliced_ptms = spliced_ptms.explode('Modification Class').reset_index(drop = True)

    #add to splice data
    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(regphos, how = 'left', on = site_cols + ['Modification Class'])
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataframe size has changed, check for duplicates in spliced ptms dataframe')

    #report the number of PTMs identified
    if report_success:
        num_ptms_with_regphos_data = spliced_ptms.dropna(subset = ['RegPhos:Kinase']).groupby(site_cols).size().shape[0]
        print(f"RegPhos kinase-substrate data added: {num_ptms_with_regphos_data} PTMs in dataset found with kinase-substrate information")

    return spliced_ptms


def add_PTMsigDB_data(spliced_ptms, file = None, report_success = True):
    """
    Annotate PTMs with signatures from PTMsigDB. Each PTMsigDB category becomes its own 'PTMsigDB:<category>' column containing '<signature>-><direction>' entries separated by ';'

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe containing PTM data. Not modified in place
    file: str
        Path to the PTMsigDB excel file (must contain a 'human' sheet). There is no automatic download, the path is passed to check_file and read_excel as given
    report_success: bool
        If True, print the number of PTMs annotated for each category

    Returns
    -------
    spliced_ptms: pd.DataFrame
        Copy of the input with one row per modification class and the PTMsigDB annotation columns added

    Raises
    ------
    RuntimeError
        If the merge changes the number of rows (duplicate PTMs in the annotation or input data)
    """
    ptm_cols = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class']

    check_file(file, expected_extension = '.xlsx')
    ptmsigdb = pd.read_excel(file, sheet_name = 'human')

    #site.uniprot is split as '<accession>;<residue><position>-<modification code>'
    site_info = ptmsigdb['site.uniprot'].str.split(';')
    ptmsigdb['UniProtKB Accession'] = site_info.str[0]
    ptmsigdb['Residue'] = site_info.str[1].str[0]
    ptmsigdb['PTM Position in Canonical Isoform'] = ptmsigdb['site.uniprot'].apply(lambda x: int(x.split(';')[1].split('-')[0][1:]))

    #filter out excess information in some of the site.ptm column, then convert to modification class details
    ptmsigdb['site.ptm'] = ptmsigdb['site.ptm'].apply(lambda x: x.split(';')[1].split('-')[1] if ';' in x else x)
    ptmsigdb['Modification Class'] = ptmsigdb['site.ptm'].map(mod_shorthand_dict)

    #combine signature and direction for annotation column
    ptmsigdb['Signature'] = ptmsigdb['signature'] +'->'+ ptmsigdb['site.direction']

    #drop unneeded columns (copy so that the edits below are not made on a view)
    ptmsigdb = ptmsigdb[ptm_cols + ['Signature', 'category']].copy()
    #signature names are prefixed with their category, remove the prefix as the category becomes the column name
    ptmsigdb['Signature'] = ptmsigdb.apply(lambda x: x['Signature'].replace(x['category'] + '_', ''), axis = 1)
    ptmsigdb['category'] = 'PTMsigDB:' + ptmsigdb['category']
    ptmsigdb = ptmsigdb.drop_duplicates()

    #convert to pivot table with each category being a separate column
    ptmsigdb = ptmsigdb.pivot_table(index = ptm_cols, columns = 'category', values = 'Signature', aggfunc=';'.join).reset_index()

    #remove psp data if it is already in spliced ptms (ignore if the category is not part of the provided file)
    if 'PSP:Kinase' in spliced_ptms.columns:
        ptmsigdb = ptmsigdb.drop(columns = 'PTMsigDB:KINASE-PSP', errors = 'ignore')

    if 'PSP:Disease_Association' in spliced_ptms.columns:
        ptmsigdb = ptmsigdb.drop(columns = 'PTMsigDB:DISEASE-PSP', errors = 'ignore')

    #if splice data already has PTMsigDB annotation columns, remove them so the merge does not create suffixed duplicates
    cols_in_data = [col for col in spliced_ptms.columns if 'PTMsigDB' in col]
    if cols_in_data:
        spliced_ptms = spliced_ptms.drop(columns = cols_in_data)

    #explode dataframe on modifications (assign returns a new dataframe, leaving the caller's data untouched)
    if spliced_ptms['Modification Class'].str.contains(';').any():
        spliced_ptms = spliced_ptms.assign(**{'Modification Class': spliced_ptms['Modification Class'].str.split(';')})
        spliced_ptms = spliced_ptms.explode('Modification Class').reset_index(drop = True)

    #merge with spliced_ptm info
    original_data_size = spliced_ptms.shape[0]
    spliced_ptms = spliced_ptms.merge(ptmsigdb, how = 'left', on = ptm_cols)
    if spliced_ptms.shape[0] != original_data_size:
        raise RuntimeError('Dataset size changed upon merge, please make sure there are no duplicates in spliced ptms data')

    #report the number of ptms annotated in each category
    if report_success:
        def count_annotated(col):
            #categories missing from the provided file (or removed above) are reported as 0 rather than raising an error
            if col not in spliced_ptms.columns:
                return 0
            return spliced_ptms.dropna(subset = [col]).groupby(ptm_cols).size().shape[0]

        num_ptms_with_ikip = count_annotated('PTMsigDB:KINASE-iKiP')
        num_ptms_with_path_np = count_annotated('PTMsigDB:PATH-NP')
        num_ptms_with_path_wp = count_annotated('PTMsigDB:PATH-WP')
        num_ptms_with_dia_pert = count_annotated('PTMsigDB:PERT-P100-DIA')
        num_ptms_with_dia2_pert = count_annotated('PTMsigDB:PERT-P100-DIA2')
        num_ptms_with_prm_pert = count_annotated('PTMsigDB:PERT-P100-PRM')
        num_ptms_with_psp_pert = count_annotated('PTMsigDB:PERT-PSP')
        print(f"PTMsigDB added:\n\t ->{num_ptms_with_ikip} PTMs associated with kinases in iKiP\n\t ->{num_ptms_with_path_wp} PTMs associated with molecular pathway signatures from WikiPathways\n\t ->{num_ptms_with_path_np} PTMs associated with molecular pathway signatures from NetPath\n\t ->{num_ptms_with_psp_pert} PTMs with PhosphoSitePlus perturbations\n\t ->{num_ptms_with_dia_pert} with perturbations in LINCS P100 DIA dataset \n\t ->{num_ptms_with_dia2_pert} with perturbations in LINCS P100 DIA2 dataset\n\t ->{num_ptms_with_prm_pert} with perturbations in LINCS P100 PRM dataset")
    return spliced_ptms



######### Functions for combining annotations from multiple sources ########

def convert_PSP_label_to_UniProt(label):
    """
    Translate the name PhosphoSitePlus uses for an interacting protein into a UniProtKB accession. PhosphoSitePlus does not always record interactors by gene name, so several variations of the label are tried before falling back to the PhosphoSitePlus-specific name dictionary.

    Parameters
    ----------
    label: str
        Label for interacting protein from PhosphoSitePlus

    Returns
    -------
    Entry found for the label in the gene name or PhosphoSitePlus name dictionaries of pose_config, or np.nan if the label could not be matched
    """
    #build the gene name -> uniprot id dictionary on first use by flipping the uniprot id -> gene name dictionary
    if not hasattr(pose_config, 'genename_to_uniprot'):
        pose_config.genename_to_uniprot = pose_config.flip_uniprot_dict(pose_config.uniprot_to_genename)

    gene_lookup = pose_config.genename_to_uniprot

    #variations of the label to try as a gene name, in order of priority:
    #as given, upper case, first word in upper case, and upper case without hyphens
    variations = (
        lambda name: name,
        lambda name: name.upper(),
        lambda name: name.split(' ')[0].upper(),
        lambda name: name.replace('-', '').upper(),
    )
    for make_key in variations:
        key = make_key(label)
        if key in gene_lookup:
            return gene_lookup[key]

    #not a gene name, check the PhosphoSitePlus specific conversion dictionary
    if label in pose_config.psp_name_dict:
        return pose_config.psp_name_dict[label]

    #otherwise the label could not be converted
    return np.nan

def extract_interaction_details(interaction, column = "PSP:ON_PROT_INTERACT"):
    """
    Split a single interaction entry from an annotation column into the type of interaction and the interacting protein.

    Parameters
    ----------
    interaction: str
        Single interaction entry. Expected format depends on the column: 'protein(TYPE)' for PhosphoSitePlus, 'protein->Inhibit' or 'protein->Enhance' for PTMInt, 'protein_residue' for PTMcode, and the protein itself for all other columns
    column: str
        Name of the annotation column the entry was taken from

    Returns
    -------
    interaction_type: str
        Influence of the PTM on the interaction. Taken from the entry itself for PhosphoSitePlus, otherwise one of DISRUPTS, INDUCES, REGULATES or UNCLEAR
    protein: str
        Interacting protein as labeled in the source database

    Raises
    ------
    KeyError
        If the column is not a recognized interaction column
    """
    #interaction type assumed for databases that do not report one per entry
    interaction_types = {'PTMcode:Interprotein_Interactions':'INDUCES', 'PSP:Kinase':'REGULATES', 'DEPOD:Phosphatase':'REGULATES', 'RegPhos:Kinase':'REGULATES', 'Combined:Kinase':'REGULATES', 'ELM:Interactions':'UNCLEAR'}
    if column == 'PSP:ON_PROT_INTERACT':
        interaction_type = interaction.split('(')[1].split(')')[0]
        protein = interaction.split('(')[0].strip(' ')
    elif column in ('PTMInt:Interaction', 'PTMInt:Interactions'):
        #both spellings of the PTMInt column name are used within this module, accept either
        ptmint_type_conversion = {'Inhibit':'DISRUPTS', 'Enhance':"INDUCES"}
        interaction_type = ptmint_type_conversion[interaction.split('->')[1]]
        protein = interaction.split('->')[0]
    elif column == 'PTMcode:Interprotein_Interactions':
        interaction_type = 'INDUCES'
        #entries are 'protein_residue', only the protein is needed
        protein = interaction.split('_')[0]
    else:
        interaction_type = interaction_types[column]
        protein = interaction

    return interaction_type, protein

def unify_interaction_data(spliced_ptms, interaction_col, name_dict = None):
    """
    Given spliced ptm data and a column containing interaction data, extract the interacting protein, type of interaction, and convert to UniProtKB accession. This will be added as a new column labeled 'Interacting ID'

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe containing PTM data
    interaction_col: str
        column containing interaction information from a specific database
    name_dict: dict
        dictionary to convert names within given database to UniProt IDs. For cases when name is not necessarily one of the gene names listed in UniProt. If None, no additional conversions are used

    Returns
    -------
    interact: pd.DataFrame
        Contains PTMs and their interacting proteins, the type of influence the PTM has on the interaction (DISRUPTS, INDUCES, or REGULATES)
    """
    if name_dict is None:
        name_dict = {}

    if not hasattr(pose_config, 'genename_to_uniprot'):
        #using uniprot to gene name dict, construct dict to go the other direction (gene name to uniprot id)
        pose_config.genename_to_uniprot = pose_config.flip_uniprot_dict(pose_config.uniprot_to_genename)

    #extract annotated PTMs (copy so that the edits below are not made on a view of the input)
    data_cols = [col for col in spliced_ptms.columns if col in ['Significance', 'dPSI']]
    ptm_cols = ['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class', interaction_col]
    interact = spliced_ptms.dropna(subset = [interaction_col])[ptm_cols + data_cols].copy()
    if interact.empty:
        print(f"No PTMs associated with {interaction_col}")
        return interact

    #separate cases in which single PTM has multiple interactions
    interact[interaction_col] = interact[interaction_col].apply(lambda x: x.split(';'))
    interact = interact.explode(interaction_col)

    #extract protein and type of interaction
    details = [extract_interaction_details(entry, interaction_col) for entry in interact[interaction_col]]
    interact['Type'] = [interaction_type for interaction_type, _ in details]
    interact['Interacting Protein'] = [protein for _, protein in details]

    #convert interacting protein to uniprot id for databases that are not reported in uniprot ids
    if interaction_col not in ['PTMcode:Interprotein_Interactions', 'ELM:Interactions']:
        gene_lookup = pose_config.genename_to_uniprot

        def to_uniprot(gene):
            #try the name as given, in upper case, its first word in upper case, and in upper case without hyphens
            for key in (gene, gene.upper(), gene.split(' ')[0].upper(), gene.replace('-', '').upper()):
                if key in gene_lookup:
                    return gene_lookup[key]
            #not a gene name, check the database specific conversion dictionary (NaN if the name cannot be converted)
            return name_dict.get(gene, np.nan)

        #save information, removing interactions for which no id could be found
        interact['Interacting ID'] = [to_uniprot(gene) for gene in interact['Interacting Protein']]
        interact = interact.dropna(subset = ['Interacting ID'])

        #check if there multiple in one row (skipped when nothing could be converted, as the .str accessor fails on an all-NaN column)
        if not interact.empty and interact['Interacting ID'].str.contains(';').any():
            interact['Interacting ID'] = interact['Interacting ID'].apply(lambda x: x.split(';'))
            interact = interact.explode('Interacting ID')
    else:
        interact['Interacting ID'] = interact['Interacting Protein']

    #only keep the first id if multiple are listed separated by spaces (x == x is False only for NaN)
    interact['Interacting ID'] = interact['Interacting ID'].apply(lambda x: x.split(' ')[0] if x == x else np.nan)
    interact = interact.explode('Interacting ID')
    interact = interact.dropna(subset = ['Interacting ID'])
    interact = interact.drop(columns = interaction_col).drop_duplicates()

    return interact

def add_annotation(spliced_ptms, database = 'PhosphoSitePlus', annotation_type = 'Function', file = None, check_existing = False):
    """
    Given a desired database and annotation type, add the corresponding annotation data to the spliced ptm dataframe

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe containing PTM data
    database: str
        Database to extract annotation data from. Options include 'PhosphoSitePlus', 'PTMcode', 'PTMInt', 'RegPhos', 'DEPOD', or 'Combined' to combine annotations that have already been added from multiple databases
    annotation_type: str
        Type of annotation to extract. Options include 'Function', 'Process', 'Interactions', 'Disease', 'Kinase', 'Phosphatase', but depend on the specific database (run analyze.get_annotation_categories())
    file: str
        File path to annotation data. If None, will download from online source, except for PhosphoSitePlus (due to licensing restrictions). Not used for DEPOD, which is always downloaded
    check_existing: bool
        If True, skip the annotation when its column is already present in spliced_ptms

    Returns
    -------
    spliced_ptms: pd.DataFrame
        PTM data with the requested annotation added

    Raises
    ------
    ValueError
        If the database, or the annotation type for the given database, is not recognized
    """
    if check_existing:
        #databases without a registered annotation column (e.g. 'Combined') are never skipped
        annot_col = annotation_col_dict.get(database, {}).get(annotation_type)
        if annot_col is not None and annot_col in spliced_ptms.columns:
            print(f"Annotation data for {database} {annotation_type} already present in provided dataframe, skipping. If you would like to update annotation data, set check_existing = False")
            return spliced_ptms

    if database == "PhosphoSitePlus":
        if annotation_type in ['Function', 'Process', 'Interactions']:
            check_file(file, expected_extension='.gz')
            spliced_ptms = add_PSP_regulatory_site_data(spliced_ptms, file = file)
        elif annotation_type == 'Disease':
            check_file(file, expected_extension='.gz')
            spliced_ptms = add_PSP_disease_association(spliced_ptms, file = file)
        elif annotation_type == 'Kinase':
            check_file(file, expected_extension='.gz')
            spliced_ptms = add_PSP_kinase_substrate_data(spliced_ptms, file = file)
        else:
            raise ValueError(f"Annotation type {annotation_type} not recognized for PhosphoSitePlus")
    elif database == 'PTMcode':
        if annotation_type == 'Interactions':
            if file is not None:
                check_file(file, expected_extension='.gz')
                #add_PTMcode_interprotein names its file parameter 'fname'
                spliced_ptms = add_PTMcode_interprotein(spliced_ptms, fname = file)
            else:
                spliced_ptms = add_PTMcode_interprotein(spliced_ptms)
        else:
            raise ValueError(f"Annotation type {annotation_type} not recognized for PTMcode")
    elif database == 'PTMInt':
        if annotation_type == 'Interactions':
            if file is not None:
                check_file(file, expected_extension='.csv')
                spliced_ptms = add_PTMInt_data(spliced_ptms, file = file)
            else:
                spliced_ptms = add_PTMInt_data(spliced_ptms)
        else:
            raise ValueError(f"Annotation type {annotation_type} not recognized for PTMInt")
    elif database == 'RegPhos':
        if annotation_type == 'Kinase':
            if file is not None:
                check_file(file, expected_extension='.txt')
                spliced_ptms = add_RegPhos_data(spliced_ptms, file = file)
            else:
                spliced_ptms = add_RegPhos_data(spliced_ptms)
        else:
            raise ValueError(f"Annotation type {annotation_type} not recognized for RegPhos")
    elif database == 'DEPOD':
        if annotation_type == 'Phosphatase':
            #DEPOD data is always downloaded, the function does not accept a file
            spliced_ptms = add_DEPOD_phosphatase_data(spliced_ptms)
        else:
            raise ValueError(f"Annotation type {annotation_type} not recognized for DEPOD")
    elif database == 'Combined':
        if annotation_type == 'Kinase':
            if 'PSP:Kinase' not in spliced_ptms.columns:
                raise ValueError("PhosphoSitePlus kinase data not found in spliced PTM dataframe, please annotate with this first")
            if 'RegPhos:Kinase' not in spliced_ptms.columns:
                spliced_ptms = add_RegPhos_data(spliced_ptms)
            spliced_ptms = combine_KS_data(spliced_ptms)
        elif annotation_type == 'Interactions':
            spliced_ptms = combine_interaction_data(spliced_ptms)
        else:
            raise ValueError(f"Annotation type {annotation_type} not recognized for Combined")
    else:
        raise ValueError(f"Database {database} not recognized")

    return spliced_ptms
    

[docs]def combine_interaction_data(spliced_ptms, interaction_databases = ['PhosphoSitePlus', 'PTMcode', 'PTMInt', 'RegPhos', 'DEPOD', 'ELM'], include_enzyme_interactions = True):
    """
    Given annotated spliced ptm data, extract interaction data from various databases and combine into a single dataframe. This will include the interacting protein, the type of interaction, and the source of the interaction data

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Dataframe containing PTM data and associated interaction annotations from various databases
    interaction_databases: list
        List of databases to extract interaction data from. Options include 'PhosphoSitePlus', 'PTMcode', 'PTMInt', 'RegPhos', 'DEPOD'. These should already have annotation columns in the spliced_ptms dataframe, otherwise they will be ignored. For kinase-substrate interactions, if combined column is present, will use that instead of individual databases
    include_enzyme_interactions: bool
        If True, will include kinase-substrate and phosphatase interactions in the output dataframe

    Returns
    -------
    interact_data: list
        List of dataframes containing PTMs and their interacting proteins, the type of influence the PTM has on the interaction (DISRUPTS, INDUCES, or REGULATES), and the source of the interaction data

    """
    interact_data = []
    combined_added = False
    for database in interaction_databases:
        if database == 'PhosphoSitePlus' and 'PSP:ON_PROT_INTERACT' in spliced_ptms.columns:
            if not spliced_ptms['PSP:ON_PROT_INTERACT'].isna().all():
                print('PhosphoSitePlus regulatory site data found and added')
                interact = unify_interaction_data(spliced_ptms, 'PSP:ON_PROT_INTERACT', pose_config.psp_name_dict)
                interact['Source'] = database
                interact_data.append(interact)

  
        if database == 'PTMcode' and 'PTMcode:Interprotein_Interactions' in spliced_ptms.columns:
            if not spliced_ptms['PTMcode:Interprotein_Interactions'].isna().all():
                print('PTMcode data found and added')
                interact = unify_interaction_data(spliced_ptms, 'PTMcode:Interprotein_Interactions')
                interact['Source'] = database
                interact_data.append(interact)
        if database == 'PTMInt' and 'PTMInt:Interaction' in spliced_ptms.columns:
            if not spliced_ptms['PTMInt:Interaction'].isna().all():
                print('PTMInt data found and added')
                interact = unify_interaction_data(spliced_ptms, 'PTMInt:Interaction')
                interact['Source'] = database
                interact_data.append(interact)
        if database == 'ELM' and 'ELM:Interactions' in spliced_ptms.columns:
            if not spliced_ptms['ELM:Interactions'].isna().all():
                print('ELM data found and added')
                interact = unify_interaction_data(spliced_ptms, 'ELM:Interactions')
                interact['Source'] = database
                interact_data.append(interact)
        
        if include_enzyme_interactions:
            #dictionary to convert kinase names to gene names
            ks_genes_to_uniprot = {'ABL1(ABL)':'P00519', 'ACK':'Q07912', 'AURC':'Q9UQB9', 'ERK1(MAPK3)':'P27361','ERK2(MAPK1)':'P28482',  'ERK5(MAPK7)':'Q13164','JNK1(MAPK8)':'P45983', 'CK1A':'P48729', 'JNK2(MAPK9)':'P45984', 'JNK3(MAPK10)':'P53779', 'P38A(MAPK14)':'Q16539','P38B(MAPK11)':'Q15759', 'P38G(MAPK12)':'P53778','P70S6K' :'Q9UBS0', 'PAK':'Q13153', 'PKCZ':'Q05513', 'CK2A':'P19784', 'ABL2':'P42684', 'AMPKA1':'Q13131', 'AMPKA2':'Q13131', 'AURB':'Q96GD4', 'CAMK1A':'Q14012', 'CDC42BP':'Q9Y5S2','CK1D':'P48730','CK1E':'P49674','CK2B':'P67870','DMPK1':'Q09013', 'DNAPK':'P78527','DSDNA KINASE':'P78527', 'EG3 KINASE':'P49840','ERK3(MAPK6)':'Q16659','GSK3':'P49840', 'MRCKA':'Q5VT25', 'P38D(MAPK13)':'O15264','P70S6KB':'Q9UBS0','PDKC':'P78527','PKCH':'P24723','PKCI':'P41743','PKCT':'Q04759','PKD3':'O94806','PKG1':'Q13976','PKG2':'Q13237','SMMLCK':'Q15746'}
            if 'Combined:Kinase' in spliced_ptms.columns and not combined_added:
                if not spliced_ptms['Combined:Kinase'].isna().all():
                    print('Combined kinase-substrate data found and added')
                    interact = unify_interaction_data(spliced_ptms, 'Combined:Kinase', ks_genes_to_uniprot)
                    interact['Source'] = 'PSP/RegPhos'
                    interact_data.append(interact)
                    combined_added = True
            elif 'Combined:Kinase' not in spliced_ptms.columns:
                if 'RegPhos:Kinase' in spliced_ptms.columns and database == 'RegPhos':
                    if not spliced_ptms['RegPhos:Kinase'].isna().all():
                        print('RegPhos kinase-substrate data found and added')
                        interact = unify_interaction_data(spliced_ptms, 'RegPhos:Kinase', ks_genes_to_uniprot)
                        interact['Source'] = database
                        interact_data.append(interact)
                if 'PSP:Kinase' in spliced_ptms.columns and database == 'PhosphoSitePlus':
                    if not spliced_ptms['PSP:Kinase'].isna().all():
                        print('PhosphoSitePlus kinase-substrate data found and added')
                        interact = unify_interaction_data(spliced_ptms, 'PSP:Kinase', ks_genes_to_uniprot)
                        interact['Source'] = database
                        interact_data.append(interact)

            if database == 'DEPOD' and 'DEPOD:Phosphatase' in spliced_ptms.columns:
                if not spliced_ptms['DEPOD:Phosphatase'].isna().all():
                    print('DEPOD phosphatase-substrate data found and added')
                    interact = unify_interaction_data(spliced_ptms, 'DEPOD:Phosphatase')
                    interact['Source'] = database
                    interact_data.append(interact)
    
    if len(interact_data) > 0:
        interact_data = pd.concat(interact_data)
        extra_cols = [col for col in interact_data.columns if col in ['dPSI', 'Significance']]
        interact_data = interact_data.groupby(['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Interacting ID', 'Type']+extra_cols, dropna = False, as_index = False)['Source'].apply(helpers.join_unique_entries)
    
        #convert uniprot ids back to gene names for interpretability
        ptm_gene = []
        interacting_gene = []
        for i, row in interact_data.iterrows():
            ptm_gene.append(pose_config.uniprot_to_genename[row['UniProtKB Accession'].split('-')[0]].split(' ')[0]) if row['UniProtKB Accession'].split('-')[0] in pose_config.uniprot_to_genename else ptm_gene.append(row['UniProtKB Accession'])
            interacting_gene.append(pose_config.uniprot_to_genename[row['Interacting ID'].split('-')[0]].split(' ')[0]) if row['Interacting ID'].split('-')[0] in pose_config.uniprot_to_genename else interacting_gene.append(row['Interacting ID'])
        interact_data['Modified Gene'] = ptm_gene
        interact_data["Interacting Gene"] = interacting_gene
  
  
        return interact_data.drop_duplicates()
    else:
        return pd.DataFrame()



def combine_KS_data(spliced_ptms, ks_databases = ['PhosphoSitePlus', 'RegPhos'], regphos_conversion = {'ERK1(MAPK3)':'MAPK3', 'ERK2(MAPK1)':'MAPK1', 'JNK2(MAPK9)':'MAPK9','CDC2':'CDK1', 'CK2A1':'CSNK2A1', 'PKACA':'PRKACA', 'ABL1(ABL)':'ABL1'}):
    """
    Given spliced ptm information, combine kinase-substrate data from multiple databases (currently support PhosphoSitePlus and RegPhos). The combined kinase data will be added as a new column labeled 'Combined:Kinase'.

    Kinase names are read from the 'PSP:Kinase' and 'RegPhos:Kinase' columns (';'-separated strings). A requested database whose column is not present in spliced_ptms is skipped rather than raising a KeyError; if no kinase is found for a row, its combined value is NaN.

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Spliced PTM data from project module
    ks_databases: list
        List of databases to combine kinase data from. Currently support PhosphoSitePlus and RegPhos; any other entry is ignored
    regphos_conversion: dict
        Allows conversion of RegPhos names to matching names in PhosphoSitePlus. Keys are matched against the upper-cased RegPhos name.

    Returns
    -------
    spliced_ptms: pd.DataFrame
        Spliced PTM data with combined kinase data added. The input dataframe is modified in place and also returned.

    """
    if not hasattr(pose_config, 'genename_to_uniprot'):
        pose_config.genename_to_uniprot = pose_config.flip_uniprot_dict(pose_config.uniprot_to_genename)

    name_to_uniprot = pose_config.genename_to_uniprot
    uniprot_to_name = pose_config.uniprot_to_genename

    def common_name(gene):
        #common name = first gene name listed for the matching uniprot entry
        return uniprot_to_name[name_to_uniprot[gene]].split(' ')[0]

    def normalize_psp(kinase):
        #names without a known uniprot entry are kept as provided
        return common_name(kinase) if kinase in name_to_uniprot else kinase

    def normalize_regphos(kinase):
        if kinase in name_to_uniprot:
            return common_name(kinase)
        #RegPhos names can carry an alias in parentheses, e.g. 'ERK1(MAPK3)': try the part before it
        base = kinase.split('(')[0]
        if base in name_to_uniprot:
            return common_name(base)
        #fall back to the manual conversion table, otherwise the upper-cased name
        upper = kinase.upper()
        return regphos_conversion.get(upper, upper)

    db_columns = {'PhosphoSitePlus': 'PSP:Kinase', 'RegPhos': 'RegPhos:Kinase'}
    normalizers = {'PhosphoSitePlus': normalize_psp, 'RegPhos': normalize_regphos}
    #only use databases that are supported and whose annotation column has actually been added
    available = [db for db in ks_databases if db in db_columns and db_columns[db] in spliced_ptms.columns]

    ks_data = []
    for _, row in spliced_ptms.iterrows():
        combined = set()
        for db in available:
            value = row[db_columns[db]]
            if pd.isna(value):
                continue
            normalize = normalizers[db]
            combined.update(normalize(kinase) for kinase in value.split(';') if kinase)

        #sort so that the combined string is reproducible between runs
        ks_data.append(';'.join(sorted(combined)) if combined else np.nan)

    spliced_ptms['Combined:Kinase'] = ks_data
    return spliced_ptms


def check_file(fname, expected_extension = '.tsv'):
    """
    Given a file name, check if the file exists and has the expected extension. If the file does not exist or has the wrong extension, raise an error.

    Parameters
    ----------
    fname: str or os.PathLike
        File name to check
    expected_extension: str
        Expected file extension. Default is '.tsv'

    Raises
    ------
    ValueError
        If fname is None, if the path does not exist, or if the path does not end with expected_extension
    TypeError
        If fname is neither a string nor a path-like object
    """
    if fname is None:
        raise ValueError('Annotation file path must be provided')

    #normalize path-like objects (e.g. pathlib.Path) to a plain string so that the extension check below works for them too
    path = os.fspath(fname)
    if not os.path.exists(path):
        raise ValueError(f'File {fname} not found')

    if not path.endswith(expected_extension):
        raise ValueError(f'File {fname} does not have the expected extension ({expected_extension})')
    




def annotate_ptms(spliced_ptms, psp_regulatory_site_file = None, psp_ks_file = None, psp_disease_file = None, elm_interactions = False, elm_motifs = False, PTMint = False, PTMcode_interprotein = False, DEPOD = False, RegPhos = False, ptmsigdb_file = None, interactions_to_combine = ['PTMcode', 'PhosphoSitePlus', 'RegPhos', 'PTMInt'], kinases_to_combine = ['PhosphoSitePlus', 'RegPhos'], combine_similar = True):
    """
    Given spliced ptm data, add annotations from various databases. The annotations that can be added are the following:
    - PhosphoSitePlus
        - regulatory site data (file must be provided)
        - kinase-substrate data (file must be provided)
        - disease association data (file must be provided)
    - ELM
        - interaction data (can be downloaded automatically or provided as a file)
        - motif matches (elm class data can be downloaded automatically or provided as a file)
    - PTMInt
        - interaction data (can be downloaded automatically or provided as a file)
    - PTMcode
        - interprotein interactions (can be downloaded automatically or provided as a file)
    - DEPOD
        - phosphatase-substrate data (will be downloaded automatically)
    - RegPhos
        - kinase-substrate data (can be downloaded automatically or provided as a file)
    - PTMsigDB
        - file must be provided

    Parameters
    ----------
    spliced_ptms: pd.DataFrame
        Spliced PTM data from project module
    psp_regulatory_site_file: str
        File path to PhosphoSitePlus regulatory site data (expected to end with '.gz')
    psp_ks_file: str
        File path to PhosphoSitePlus kinase-substrate data (expected to end with '.gz')
    psp_disease_file: str
        File path to PhosphoSitePlus disease association data (expected to end with '.gz')
    elm_interactions: bool or str
        If True, download ELM interaction data automatically. If str, provide file path to ELM interaction data ('.tsv')
    elm_motifs: bool or str
        If True, download ELM motif data automatically. If str, provide file path to ELM motif data ('.tsv')
    PTMint: bool or str
        If True, download PTMInt data automatically. If str, provide file path to PTMInt data ('.csv')
    PTMcode_interprotein: bool or str
        If True, download PTMcode interprotein data automatically. If str, provide file path to PTMcode interprotein data ('.gz')
    DEPOD: bool
        If True, download DEPOD data automatically
    RegPhos: bool or str
        If True, download RegPhos data automatically. If str, provide file path to RegPhos data ('.txt')
    ptmsigdb_file: str
        File path to PTMsigDB data
    interactions_to_combine: list
        List of databases to combine interaction data from. Default is ['PTMcode', 'PhosphoSitePlus', 'RegPhos', 'PTMInt']
    kinases_to_combine: list
        List of databases to combine kinase-substrate data from. Default is ['PhosphoSitePlus', 'RegPhos']. Databases whose kinase column is not present in the data are left out of the combination
    combine_similar: bool
        Whether to combine annotations of similar information (kinase, interactions, etc) from multiple databases into another column labeled as 'Combined'. Default is True

    Returns
    -------
    spliced_ptms: pd.DataFrame
        Spliced PTM data with the requested annotation columns added. When combine_similar is True, 'Combined:Interactions' and 'Combined:Kinase' are added only if matching source columns are present

    Raises
    ------
    RuntimeError
        If any requested annotation could not be added (missing file, unexpected extension, wrong argument type, or a failure in the underlying annotation function). The original exception is attached as the cause
    """
    if psp_regulatory_site_file is not None:
        try:
            check_file(psp_regulatory_site_file, expected_extension='.gz')
            spliced_ptms = add_PSP_regulatory_site_data(spliced_ptms, file = psp_regulatory_site_file)
        except Exception as e:
            raise RuntimeError(f'Error adding PhosphoSitePlus regulatory site data. Error message: {e}') from e
    if psp_ks_file is not None:
        try:
            check_file(psp_ks_file, expected_extension='.gz')
            spliced_ptms = add_PSP_kinase_substrate_data(spliced_ptms, file = psp_ks_file)
        except Exception as e:
            raise RuntimeError(f'Error adding PhosphoSitePlus kinase-substrate data. Error message: {e}') from e
    if psp_disease_file is not None:
        try:
            check_file(psp_disease_file, expected_extension='.gz')
            spliced_ptms = add_PSP_disease_association(spliced_ptms, file = psp_disease_file)
        except Exception as e:
            raise RuntimeError(f'Error adding PhosphoSitePlus disease association data. Error message: {e}') from e
    if elm_interactions:
        try:
            if isinstance(elm_interactions, bool):
                spliced_ptms = add_ELM_interactions(spliced_ptms)
            elif isinstance(elm_interactions, str):
                check_file(elm_interactions, expected_extension='.tsv')
                spliced_ptms = add_ELM_interactions(spliced_ptms, file = elm_interactions)
            else:
                raise ValueError('elm_interactions must be either a boolean (download elm data automatically, slower) or a string (path to elm data tsv file, faster)')
        except Exception as e:
            raise RuntimeError(f'Error adding ELM interaction data. Error message: {e}') from e
    if elm_motifs:
        try:
            if isinstance(elm_motifs, bool):
                spliced_ptms = add_ELM_matched_motifs(spliced_ptms)
            elif isinstance(elm_motifs, str):
                check_file(elm_motifs, expected_extension='.tsv')
                spliced_ptms = add_ELM_matched_motifs(spliced_ptms, file = elm_motifs)
            else:
                raise ValueError('elm_motifs must be either a boolean (download elm data automatically, slower) or a string (path to elm data tsv file, faster)')
        except Exception as e:
            raise RuntimeError(f'Error adding ELM motif matches. Error message: {e}') from e
    if PTMint:
        try:
            if isinstance(PTMint, bool):
                spliced_ptms = add_PTMInt_data(spliced_ptms)
            elif isinstance(PTMint, str):
                check_file(PTMint, expected_extension='.csv')
                spliced_ptms = add_PTMInt_data(spliced_ptms, file = PTMint)
            else:
                raise ValueError('PTMint must be either a boolean (download PTMInt data automatically, slower) or a string (path to PTMInt data csv file, faster)')
        except Exception as e:
            raise RuntimeError(f'Error adding PTMInt interaction data. Error message: {e}') from e
    #NOTE: PTMcode intraprotein annotation is currently disabled and has no corresponding parameter in this function
    if PTMcode_interprotein:
        try:
            if isinstance(PTMcode_interprotein, bool):
                spliced_ptms = add_PTMcode_interprotein(spliced_ptms)
            elif isinstance(PTMcode_interprotein, str):
                check_file(PTMcode_interprotein, expected_extension='.gz')
                spliced_ptms = add_PTMcode_interprotein(spliced_ptms, fname = PTMcode_interprotein)
            else:
                raise ValueError('PTMcode_interprotein must be either a boolean (download PTMcode data automatically, slower) or a string (path to PTMcode data file, faster)')
        except Exception as e:
            raise RuntimeError(f'Error adding PTMcode interprotein interaction data. Error message: {e}') from e
    if DEPOD:
        try:
            spliced_ptms = add_DEPOD_phosphatase_data(spliced_ptms)
        except Exception as e:
            raise RuntimeError(f'Error adding DEPOD phosphatase data. Error message: {e}') from e
    if RegPhos:
        try:
            if isinstance(RegPhos, str):
                check_file(RegPhos, expected_extension='.txt')
                spliced_ptms = add_RegPhos_data(spliced_ptms, file = RegPhos)
            else:
                spliced_ptms = add_RegPhos_data(spliced_ptms)
        except Exception as e:
            raise RuntimeError(f'Error adding RegPhos kinase substrate data data. Error message: {e}') from e
    if ptmsigdb_file is not None:
        try:
            spliced_ptms = add_PTMsigDB_data(spliced_ptms, file = ptmsigdb_file)
        except Exception as e:
            raise RuntimeError(f'Error adding PTMsigDB data. Error message: {e}') from e

    if combine_similar:
        interaction_cols = ['PTMcode:Interprotein_Interactions', 'PSP:ON_PROT_INTERACT', 'PSP:Kinase', 'PTMInt:Interaction', 'RegPhos:Kinase', 'DEPOD:Phosphatase', 'ELM:Interactions']
        #only attempt to combine when at least one interaction column is present (an empty set is falsy)
        if set(interaction_cols).intersection(spliced_ptms.columns):
            print('\nCombining interaction data from multiple databases')
            interact = combine_interaction_data(spliced_ptms, interaction_databases = interactions_to_combine)
            if not interact.empty:
                interact['Combined:Interactions'] = interact['Interacting Gene']+'->'+interact['Type']
                interact = interact.groupby(['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform'], dropna = False, as_index = False)['Combined:Interactions'].apply(lambda x: ';'.join(np.unique(x)))
                #drop any stale combined column from a previous call so the merge does not create suffixed duplicates
                if 'Combined:Interactions' in spliced_ptms.columns:
                    spliced_ptms = spliced_ptms.drop(columns = ['Combined:Interactions'])

                spliced_ptms = spliced_ptms.merge(interact, how = 'left', on = ['UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform'])
            else:
                spliced_ptms['Combined:Interactions'] = np.nan

        #check for what kinase data is available, so that only databases with an existing kinase column are combined
        ks_columns = {'PhosphoSitePlus': 'PSP:Kinase', 'RegPhos': 'RegPhos:Kinase'}
        available_ks = [db for db in kinases_to_combine if db in ks_columns and ks_columns[db] in spliced_ptms.columns]
        if available_ks:
            spliced_ptms = combine_KS_data(spliced_ptms, ks_databases=available_ks) #add combined kinase column


    return spliced_ptms