{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reference Structure Files generation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch all reference files for an Interpro_ID of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from CoDIAC import *\n",
    "import CoDIAC\n",
    "\n",
    "Uniprot_IDs = ['O14508', 'Q06124'] #SOCS2 and PTPN11 were selected as examples for demonstrating ligand and domain-domain interfaces\n",
    "Interpro_ID = 'IPR000980' # this is the domain we want to analyze within the Uniprot_IDs\n",
    "#We will be creating a lot of files, this is how we would like them to be named\n",
    "data_root = 'Data/'\n",
    "name_root = 'SH2_'+Interpro_ID\n",
    "\n",
    "# The files we will make in this process (so that different pieces of code can be run below as needed)\n",
    "uniprot_reference_file = data_root+'Uniprot_Reference/'+name_root+'_uniprot_reference.csv' # The uniprot reference\n",
    "fasta_long_header_file = data_root + 'Uniprot_Reference/' + name_root+'_long_header.fasta'\n",
    "fasta_file = data_root + 'Uniprot_Reference/' + name_root+'.fasta'\n",
    "#note: in addition to these 3 files, this also makes a mapping file for movng between fasta_long_header_file and fasta_file\n",
    "\n",
    "#PDB Files we'll make in this process\n",
    "PDB_file = data_root + 'PDB_Reference/' + name_root + '_PDB.csv'\n",
    "PDB_file_annotated = data_root+ 'PDB_Reference/' + name_root + '_PDB_annotated.csv'\n",
    "PDB_file_filtered = data_root + 'PDB_Reference/' + name_root + '_PDB_reference.csv' #The final PDB structure file, containing only filtered proteins\n",
    "\n",
    "# PTMs feature directory location\n",
    "feature_dir = data_root+'Uniprot_Reference/Features_relative_to_reference/PTM_features/'\n",
    "\n",
    "# You can set offsets here for the family, used to reduce the bounds of the N- or C-terminal (systematically across all domains in the family)\n",
    "N_OFFSET = 0\n",
    "C_OFFSET = -1\n",
    "\n",
    "# Set the number of PTMS required to be considered across the domain family for feature files (here low, since we're only considering two proteins)\n",
    "PTM_THRESHOLD = 1\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 2: Make a human reference file of the family of interest \n",
    "(Skipping Step 1, fetch of all proteins, since we're using targeted Uniprot fetch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Domain Reference File successfully created!\n",
      "Adding Interpro Domains\n",
      "Fetching domains..\n",
      "Appending domains to file..\n",
      "Interpro metadata succesfully incorporated\n"
     ]
    }
   ],
   "source": [
    "# These uniprot_IDs (list) came from fetching all uniprot IDs for an Interpro ID, but\n",
    "# this could be a fixed list of interest, or even all unique IDs in a proteome\n",
    "uniprot_df = CoDIAC.UniProt.makeRefFile(Uniprot_IDs, uniprot_reference_file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 3: Get information about all PDB IDs that exist for the reference proteins of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Structure Reference File successfully created!\n",
      "All PDBs successfully fetched\n"
     ]
    }
   ],
   "source": [
    "CoDIAC.PDB.generateStructureRefFile_fromUniprotFile(uniprot_reference_file, PDB_file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 4: Annotate the structure file with reference, for domain annotation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct_df_out = CoDIAC.IntegrateStructure_Reference.add_reference_info_to_struct_file(PDB_file, uniprot_reference_file, PDB_file_annotated, INTERPRO=True, verbose=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 5: Reduce the structure file to just those that contain the domain of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Made Data/PDB_Reference/SH2_IPR000980_PDB_reference.csv file: 84 structures retained of 94 total\n"
     ]
    }
   ],
   "source": [
    "# Now with an appended PDB File, create an output that contains only the lines that have SH2 domains\n",
    "CoDIAC.IntegrateStructure_Reference.filter_structure_file(PDB_file_annotated, Interpro_ID, PDB_file_filtered)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reference FASTA file generation"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 6: Create the FASTA Reference file for domain of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n offset is 0 and c offset is -1\n",
      "Created files: Data/Uniprot_Reference/SH2_IPR000980.fasta and Data/Uniprot_Reference/SH2_IPR000980_mapping.csv\n"
     ]
    }
   ],
   "source": [
    "# Given the SH2 domain file, create the fasta reference file (using INTERPRO as default)\n",
    "\n",
    "CoDIAC.UniProt.print_domain_fasta_file(uniprot_reference_file, Interpro_ID, fasta_long_header_file, N_OFFSET, C_OFFSET, APPEND=False)\n",
    "\n",
    "# Shortening the fasta headers, still unique for each domain/protein pair\n",
    "# dropping the redundant information about the domains printed. This creates a shorter header, useful for reading and processing\n",
    "key_array_order= ['uniprot', 'gene', 'domain_num', 'start', 'end']\n",
    "#translation creates a mapping file \n",
    "output_fasta, mapping_file = CoDIAC.UniProt.translate_fasta_to_new_headers(fasta_long_header_file, fasta_file, key_array_order)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature and annotation files generation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 7 Create the PTM feature files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### STEP 7a create the ProteomeScout based Features files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n offset is 0 and c offset is -1\n",
      "n offset is 0 and c offset is -1\n",
      "Wrote these feature files:\n",
      "['Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_Phosphoserine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_Phosphothreonine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_Phosphotyrosine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_N6-acetyllysine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_Ubiquitination.feature']\n",
      "These belong to the following fasta file:\n",
      "Data/Uniprot_Reference/SH2_IPR000980.fasta\n",
      "{'Phosphoserine': 4, 'Phosphothreonine': 2, 'Phosphotyrosine': 5, 'N6-acetyllysine': 2, 'Ubiquitination': 1}\n"
     ]
    }
   ],
   "source": [
    "feature_dir_prot = feature_dir+'ProteomeScout/'\n",
    "ptm_feature_file_list_pscout, ptm_count_dict, ptm_feature_dict, mapping_dict = CoDIAC.PTM.write_PTM_features(Interpro_ID, uniprot_reference_file, feature_dir_prot, mapping_file, N_OFFSET, C_OFFSET, gap_threshold=0.7, num_PTM_threshold = PTM_THRESHOLD)\n",
    "print(\"Wrote these feature files:\")\n",
    "print(ptm_feature_file_list_pscout)\n",
    "print(\"These belong to the following fasta file:\")\n",
    "print(output_fasta) #comes from block above - the short header format of the fasta header\n",
    "print(ptm_count_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### STEP 7b create the PhosphoSite based Features files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n offset is 0 and c offset is -1\n",
      "n offset is 0 and c offset is -1\n",
      "Wrote these feature files:\n",
      "['Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphoserine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphothreonine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphotyrosine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Ubiquitination.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Acetylation.feature']\n",
      "These belong to the following fasta file:\n",
      "Data/Uniprot_Reference/SH2_IPR000980.fasta\n",
      "{'Phosphoserine': 6, 'Phosphothreonine': 2, 'Phosphotyrosine': 5, 'Ubiquitination': 4, 'Acetylation': 2}\n"
     ]
    }
   ],
   "source": [
    "feature_dir_psite = feature_dir + 'PHOSPHOSITE_PLUS/'\n",
    "ptm_feature_file_list_psite, ptm_count_dict, ptm_feature_dict, mapping_dict = CoDIAC.PTM.write_PTM_features(Interpro_ID, uniprot_reference_file, feature_dir_psite, mapping_file, N_OFFSET, C_OFFSET, gap_threshold=0.7, num_PTM_threshold = PTM_THRESHOLD, PHOSPHOSITE_PLUS=True)\n",
    "print(\"Wrote these feature files:\")\n",
    "print(ptm_feature_file_list_psite)\n",
    "print(\"These belong to the following fasta file:\")\n",
    "print(output_fasta) #comes from block above - the short header format of the fasta header\n",
    "print(ptm_count_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 8 combine feature files from ProteomeScout and PhosphoSitePlus and generate annotation tracks."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#paired list\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_N6-acetyllysine.feature\n",
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_Phosphoserine.feature\n",
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_Phosphothreonine.feature\n",
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_Phosphotyrosine.feature\n",
      "Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/Combined/SH2_IPR000980_Ubiquitination.feature\n"
     ]
    }
   ],
   "source": [
    "pairs = {}\n",
    "\n",
    "proteomescout_base = feature_dir_prot+'/IPR000980_'\n",
    "PSP_base = feature_dir_psite+'/IPR000980_'\n",
    "pairs['N6-acetyllysine'] = [proteomescout_base+'N6-acetyllysine.feature', PSP_base+'Acetylation.feature']\n",
    "pairs['Phosphoserine'] = [proteomescout_base+'Phosphoserine.feature', PSP_base+'Phosphoserine.feature']\n",
    "pairs['Phosphothreonine'] = [proteomescout_base+'Phosphothreonine.feature', PSP_base+'Phosphothreonine.feature']\n",
    "pairs['Phosphotyrosine'] = [proteomescout_base+'Phosphotyrosine.feature', PSP_base+'Phosphotyrosine.feature']\n",
    "pairs['Ubiquitination'] = [proteomescout_base+'Ubiquitination.feature', PSP_base+'Ubiquitination.feature']\n",
    "\n",
    "output_dir = feature_dir+'Combined/'\n",
    "new_feature_files = {}\n",
    "\n",
    "for mod in pairs.keys():\n",
    "    feature_file = output_dir+'SH2_IPR000980_'+mod+'.feature'\n",
    "    feature_combined, feature_color_dict = CoDIAC.jalviewFunctions.combine_feature_files(feature_file, pairs[mod])\n",
    "    new_feature_files[mod] = feature_file\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}