{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reference Structure Files generation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch all reference files for an Interpro_ID of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the wildcard import looks redundant with `import CoDIAC`, but it may be what loads the\n",
    "# CoDIAC submodules used below (UniProt, PDB, PTM, ...) - confirm before removing it.\n",
    "from CoDIAC import *\n",
    "import CoDIAC\n",
    "\n",
    "# SOCS2 and PTPN11 were selected as examples for demonstrating ligand and domain-domain interfaces\n",
    "Uniprot_IDs = ['O14508', 'Q06124']\n",
    "# The domain we want to analyze within the Uniprot_IDs\n",
    "Interpro_ID = 'IPR000980'\n",
    "\n",
    "# We will be creating a lot of files; every file name below is built from these two roots\n",
    "data_root = 'Data/'\n",
    "name_root = 'SH2_'+Interpro_ID\n",
    "\n",
    "# The files we will make in this process (so that different pieces of code can be run below as needed)\n",
    "uniprot_reference_file = data_root+'Uniprot_Reference/'+name_root+'_uniprot_reference.csv' # The uniprot reference\n",
    "fasta_long_header_file = data_root + 'Uniprot_Reference/' + name_root+'_long_header.fasta'\n",
    "fasta_file = data_root + 'Uniprot_Reference/' + name_root+'.fasta'\n",
    "# Note: in addition to these 3 files, this also makes a mapping file for moving between fasta_long_header_file and fasta_file\n",
    "\n",
    "# PDB files we'll make in this process\n",
    "PDB_file = data_root + 'PDB_Reference/' + name_root + '_PDB.csv'\n",
    "PDB_file_annotated = data_root+ 'PDB_Reference/' + name_root + '_PDB_annotated.csv'\n",
    "PDB_file_filtered = data_root + 'PDB_Reference/' + name_root + '_PDB_reference.csv' # The final PDB structure file, containing only filtered proteins\n",
    "\n",
    "# PTM feature directory location\n",
    "feature_dir = data_root+'Uniprot_Reference/Features_relative_to_reference/PTM_features/'\n",
    "\n",
    "# You can set offsets here for the family, used to reduce the bounds of the N- or C-terminal (systematically across all domains in the family)\n",
    "N_OFFSET = 0\n",
    "C_OFFSET = -1\n",
    "\n",
    "# Set the number of PTMs required to be considered across the domain family for feature files (here low, since we're only considering two proteins)\n",
    "PTM_THRESHOLD = 1\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 2: Make a human reference file of the family of interest\n",
    "(Skipping Step 1, fetch of all proteins, since we're using targeted Uniprot fetch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Domain Reference File successfully created!\n",
      "Adding Interpro Domains\n",
      "Fetching domains..\n",
      "Appending domains to file..\n",
      "Interpro metadata succesfully incorporated\n"
     ]
    }
   ],
   "source": [
    "# Uniprot_IDs is the fixed list set in the configuration cell above. It could equally come from\n",
    "# fetching all UniProt IDs for an InterPro ID, or be all unique IDs in a proteome.\n",
    "uniprot_df = CoDIAC.UniProt.makeRefFile(Uniprot_IDs, uniprot_reference_file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 3: Get information about all PDB IDs that exist for the reference proteins of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Structure Reference File successfully created!\n",
      "All PDBs successfully fetched\n"
     ]
    }
   ],
   "source": [
    "# Reads the UniProt reference written in STEP 2 and writes the structure reference to PDB_file\n",
    "CoDIAC.PDB.generateStructureRefFile_fromUniprotFile(uniprot_reference_file, PDB_file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 4: Annotate the structure file with reference, for domain annotation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combines PDB_file with the UniProt reference and writes the result to PDB_file_annotated\n",
    "struct_df_out = CoDIAC.IntegrateStructure_Reference.add_reference_info_to_struct_file(PDB_file, uniprot_reference_file, PDB_file_annotated, INTERPRO=True, verbose=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 5: Reduce the structure file to just those that contain the domain of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Made Data/PDB_Reference/SH2_IPR000980_PDB_reference.csv file: 84 structures retained of 94 total\n"
     ]
    }
   ],
   "source": [
    "# Now with an appended PDB File, create an output that contains only the lines that have SH2 domains\n",
    "CoDIAC.IntegrateStructure_Reference.filter_structure_file(PDB_file_annotated, Interpro_ID, PDB_file_filtered)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reference FASTA file generation"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 6: Create the FASTA Reference file for domain of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n offset is 0 and c offset is -1\n",
      "Created files: Data/Uniprot_Reference/SH2_IPR000980.fasta and Data/Uniprot_Reference/SH2_IPR000980_mapping.csv\n"
     ]
    }
   ],
   "source": [
    "# Given the SH2 domain file, create the fasta reference file (using INTERPRO as default)\n",
    "CoDIAC.UniProt.print_domain_fasta_file(uniprot_reference_file, Interpro_ID, fasta_long_header_file, N_OFFSET, C_OFFSET, APPEND=False)\n",
    "\n",
    "# Shorten the fasta headers: still unique for each domain/protein pair, but dropping the redundant\n",
    "# information about the domains printed. This creates a shorter header, useful for reading and processing.\n",
    "key_array_order = ['uniprot', 'gene', 'domain_num', 'start', 'end']\n",
    "# The translation also creates a mapping file; its path is returned as mapping_file and reused in STEP 7\n",
    "output_fasta, mapping_file = CoDIAC.UniProt.translate_fasta_to_new_headers(fasta_long_header_file, fasta_file, key_array_order)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature and annotation files generation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 7: Create the PTM feature files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### STEP 7a: Create the ProteomeScout-based feature files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n offset is 0 and c offset is -1\n",
      "n offset is 0 and c offset is -1\n",
      "Wrote these feature files:\n",
      "['Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_Phosphoserine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_Phosphothreonine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_Phosphotyrosine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_N6-acetyllysine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_Ubiquitination.feature']\n",
      "These belong to the following fasta file:\n",
      "Data/Uniprot_Reference/SH2_IPR000980.fasta\n",
      "{'Phosphoserine': 4, 'Phosphothreonine': 2, 'Phosphotyrosine': 5, 'N6-acetyllysine': 2, 'Ubiquitination': 1}\n"
     ]
    }
   ],
   "source": [
    "feature_dir_prot = feature_dir+'ProteomeScout/'\n",
    "# No PHOSPHOSITE_PLUS flag is passed here, so this call uses the function's default PTM source\n",
    "# (ProteomeScout, per the step heading). mapping_file comes from STEP 6.\n",
    "ptm_feature_file_list_pscout, ptm_count_dict, ptm_feature_dict, mapping_dict = CoDIAC.PTM.write_PTM_features(\n",
    "    Interpro_ID, uniprot_reference_file, feature_dir_prot, mapping_file,\n",
    "    N_OFFSET, C_OFFSET, gap_threshold=0.7, num_PTM_threshold=PTM_THRESHOLD)\n",
    "print(\"Wrote these feature files:\")\n",
    "print(ptm_feature_file_list_pscout)\n",
    "print(\"These belong to the following fasta file:\")\n",
    "print(output_fasta) # comes from STEP 6 - the fasta file with the short header format\n",
    "print(ptm_count_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### STEP 7b: Create the PhosphoSitePlus-based feature files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n offset is 0 and c offset is -1\n",
      "n offset is 0 and c offset is -1\n",
      "Wrote these feature files:\n",
      "['Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphoserine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphothreonine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphotyrosine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Ubiquitination.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Acetylation.feature']\n",
      "These belong to the following fasta file:\n",
      "Data/Uniprot_Reference/SH2_IPR000980.fasta\n",
      "{'Phosphoserine': 6, 'Phosphothreonine': 2, 'Phosphotyrosine': 5, 'Ubiquitination': 4, 'Acetylation': 2}\n"
     ]
    }
   ],
   "source": [
    "feature_dir_psite = feature_dir + 'PHOSPHOSITE_PLUS/'\n",
    "# Same call as STEP 7a, but with PHOSPHOSITE_PLUS=True and its own output directory.\n",
    "# Note: ptm_count_dict, ptm_feature_dict and mapping_dict from STEP 7a are overwritten here.\n",
    "ptm_feature_file_list_psite, ptm_count_dict, ptm_feature_dict, mapping_dict = CoDIAC.PTM.write_PTM_features(\n",
    "    Interpro_ID, uniprot_reference_file, feature_dir_psite, mapping_file,\n",
    "    N_OFFSET, C_OFFSET, gap_threshold=0.7, num_PTM_threshold=PTM_THRESHOLD, PHOSPHOSITE_PLUS=True)\n",
    "print(\"Wrote these feature files:\")\n",
    "print(ptm_feature_file_list_psite)\n",
    "print(\"These belong to the following fasta file:\")\n",
    "print(output_fasta) # comes from STEP 6 - the fasta file with the short header format\n",
    "print(ptm_count_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 8: Combine feature files from ProteomeScout and PhosphoSitePlus and generate annotation tracks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build the paired list: each ProteomeScout feature file is matched with its PhosphoSitePlus counterpart (the two resources name the acetylation file differently: `N6-acetyllysine` vs. `Acetylation`), and each pair is then merged into one combined feature file.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_N6-acetyllysine.feature\n",
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_Phosphoserine.feature\n",
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_Phosphothreonine.feature\n",
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_Phosphotyrosine.feature\n",
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_Ubiquitination.feature\n"
     ]
    }
   ],
   "source": [
    "# File-name prefixes of the feature files written in STEP 7a/7b. Both directories already end in '/',\n",
    "# and the prefix is derived from Interpro_ID so this cell follows the configuration cell.\n",
    "proteomescout_base = feature_dir_prot + Interpro_ID + '_'\n",
    "PSP_base = feature_dir_psite + Interpro_ID + '_'\n",
    "\n",
    "# combined track name -> (ProteomeScout file stem, PhosphoSitePlus file stem)\n",
    "ptm_name_pairs = {\n",
    "    'N6-acetyllysine': ('N6-acetyllysine', 'Acetylation'),\n",
    "    'Phosphoserine': ('Phosphoserine', 'Phosphoserine'),\n",
    "    'Phosphothreonine': ('Phosphothreonine', 'Phosphothreonine'),\n",
    "    'Phosphotyrosine': ('Phosphotyrosine', 'Phosphotyrosine'),\n",
    "    'Ubiquitination': ('Ubiquitination', 'Ubiquitination'),\n",
    "}\n",
    "pairs = {\n",
    "    mod: [proteomescout_base + pscout_name + '.feature', PSP_base + psp_name + '.feature']\n",
    "    for mod, (pscout_name, psp_name) in ptm_name_pairs.items()\n",
    "}\n",
    "\n",
    "output_dir = feature_dir+'Combined/'\n",
    "new_feature_files = {}  # combined track name -> path of the combined feature file\n",
    "\n",
    "for mod, feature_file_pair in pairs.items():\n",
    "    feature_file = output_dir + name_root + '_' + mod + '.feature'\n",
    "    feature_combined, feature_color_dict = CoDIAC.jalviewFunctions.combine_feature_files(feature_file, feature_file_pair)\n",
    "    new_feature_files[mod] = feature_file\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}