[1]:

import pandas as pd
import os
from CoDIAC import contactMap as cm
from CoDIAC import PDBHelper
from CoDIAC import pTyrLigand_helpers as pTyr_helpers
import CoDIAC

[ ]:

#N and C terminal offsets for regions of interest (ROI)
N_offset_ROI_1 = 0
C_offset_ROI_1 = -1
N_offset_ROI_2 = 0
C_offset_ROI_2 = -1

Analysis using PDB structures¶

A. Domain-domain contact interface extraction¶

STEP 1: Fetching data from PDB reference file for a PDB examined¶

[2]:

PATH = './Data/Adjacency_files/' #this is where you can find the adjacency files
ann_file = './Data/PDB_Reference/SH2_IPR000980_PDB_reference.csv' #this is the annotation file
PDB_ID = '4JGH' #SOCS family protein

ann = pd.read_csv(ann_file)
entities = CoDIAC.PDBHelper.PDBEntitiesClass(ann, PDB_ID)
entity =1 #there's only one entity in this example
pdbClass = entities.pdb_dict[entity] #this holds information about the protein crystalized, such as domains
chain_A = CoDIAC.contactMap.chainMap(PDB_ID, entity)
chain_A.construct(PATH)

[3]:

print('struct sequence:', chain_A.structSeq)
print('minimum residue number:', chain_A.return_min_residue())
print('length of residues in seq:', len(chain_A.resNums))
print('residue array size:', chain_A.arr.shape)
print('unmodelled residues list:',chain_A.unmodeled_list)

struct sequence: HMDPEFQAARLAKALRELGQTGWYWGSMTVNEAKEKLKEAPEGTFLIRDSSHSDYLLTISVKTSAGPTNLRIEYQDGKFRLDSIICVKSKLKQFDSVVHLIDYYVQMCKD----------GTVHLYLTKPLYTSAPSLQHLCRLTINKCTGAIWGLPLPTRLKDYLEEYKFQV
minimum residue number: 26
length of residues in seq: 173
residue array size: (173, 173)
unmodelled residues list: [136, 145]

[4]:

print(pdbClass.ref_seq_mutated)
print(pdbClass.ref_seq_positions[0])
print(len(pdbClass.ref_seq_mutated))

QAARLAKALRELGQTGWYWGSMTVNEAKEKLKEAPEGTFLIRDSSHSDYLLTISVKTSAGPTNLRIEYQDGKFRLDSIICVKSKLKQFDSVVHLIDYYVQMCKDKRTGPEAPRNGTVHLYLTKPLYTSAPSLQHLCRLTINKCTGAIWGLPLPTRLKDYLEEYKFQV
32
167

STEP 2: Create an entity specific class to align structure to reference sequence positions¶

[5]:

chain_A_aligned = CoDIAC.contactMap.translate_chainMap_to_RefSeq(chain_A, pdbClass) #class created to align structure to reference sequence positions

Deleting 6 amino acids
Adding 1 c-terminal gaps

[6]:

#These are the domains that are available in the structure for analysis
pdbClass.domains

[6]:

{0: {'SH2': [46, 156, 0, 0]}, 1: {'SOCS_box': [151, 197, 0, 0]}}

STEP 3: Identify domain boundaries¶

[7]:

ROI_1 = [46, 156] #SH2 domain
ROI_2 = [151, 197] #socsbox domain
fastaHeader = 'SH2|SOCS_box'

STEP 4: Generate fasta/feature files for jalview visualization¶

[11]:

chain_A_aligned.print_fasta_feature_files(ROI_1[0],N_offset_ROI_1, ROI_1[1], C_offset_ROI_1,ROI_2[0], N_offset_ROI_2,ROI_2[1],C_offset_ROI_2, 'O14508|SOCS2|1|46|156', 'SOCS_box',
                                         './Data/Feature_Fasta_files/SH2_interdomain_4JGH', append=False, use_ref_seq_aligned=True)

[10]:

#another visualization of contacts between regions of interest
chain_A_aligned.generateAnnotatedHeatMap(ROI_2[0], ROI_2[1], ROI_1[0], ROI_1[1], remove_no_contacts=True)

../_images/Examples_contacts_extraction_14_0.png

B. Ligand interface contact extraction using PDB structures¶

STEP 1: Identity PTMS from PDB reference file for a specific PDB¶

[3]:

main = pd.read_csv(ann_file)
PDB_ID='3TL0'
for name, group in main.groupby('PDB_ID'):

    for index, row in group.iterrows():
        if name == PDB_ID:
            modification_from_PDB = (row['modifications'])
            if isinstance(row['modifications'],str): #PTM dictionary
                transDict = CoDIAC.PDBHelper.return_PTM_dict(modification_from_PDB) #dictionary with ptm information from PDB reference file

[4]:

transDict

[4]:

{4: 'PTR'}

STEP 2: Generate single dictionary of contact map for each entity in the structure¶

[5]:

lig_entity = 2
SH2_entity = 1
dict_of_lig = CoDIAC.contactMap.return_single_chain_dict(main, PDB_ID, PATH, lig_entity)
dict_of_SH2 = CoDIAC.contactMap.return_single_chain_dict(main, PDB_ID, PATH, SH2_entity)

3
Adding 1 n_term positions
Adding 2 c-terminal gaps
6
Adding 4 n_term positions
Adding 3 c-terminal gaps

[6]:

dict_of_lig

[6]:

{'entity': 2,
 'PDB_ID': '3TL0',
 'pdb_class': <CoDIAC.PDBHelper.PDBClass at 0x7fd119898310>,
 'cm': <CoDIAC.contactMap.chainMap at 0x7fd10a013ee0>,
 'cm_aligned': <CoDIAC.contactMap.chainMap at 0x7fd108c29e20>}

[7]:

cm_aligned = dict_of_lig['cm_aligned']

[8]:

cm_aligned.__dict__

[8]:

{'PDB_ID': '3TL0',
 'entity': 2,
 'aaRes_dict': {},
 'arr': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 1., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 1., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 'transDict': {4: 'PTR'},
 'structSeq': 'RLNYAQLWHR',
 'resNums': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'adjacencyDict': {2: {3: 1, 4: 1},
  3: {4: 1},
  4: {5: 1},
  5: {6: 1, 7: 1},
  6: {7: 1},
  7: {8: 1}},
 'unmodeled_list': [1, 1, 8, 10],
 'match': True,
 'offset': 198,
 'ERROR_CODE': 0,
 'refseq': 'RLNYAQLWHR'}

STEP 3: Select the ligand length by specifying no of residues N and C terminal of PTM of interest (‘y’-pTyr)¶

[8]:

PTM='PTR'
for res in cm_aligned.transDict:
    if res in cm_aligned.resNums:
        if PTM in cm_aligned.transDict[res]:
            res_start, res_end, aligned_str, tick_labels = CoDIAC.pTyr_helpers.return_pos_of_interest(
                cm_aligned.resNums, cm_aligned.structSeq, res, n_term_num=5, c_term_num=5, PTR_value = 'y')

[9]:

res_start, res_end, aligned_str, tick_labels

[9]:

(1,
 9,
 '--RLNyAQLWH',
 ['R-3', 'L-2', 'N-1', 'y0', 'A1', 'Q2', 'L3', 'W4', 'H5', 'R10'])

STEP 4: Generate contacts between two single entity dictionaries with contactmaps (domains and ligands on different entities)¶

[10]:

adjList, arr = CoDIAC.contactMap.return_interChain_adj(PATH, dict_of_lig, dict_of_SH2) #outer dict keys for ligand residues - ligand to domain contacts
adjList_alt, arr_alt = CoDIAC.contactMap.return_interChain_adj(PATH, dict_of_SH2, dict_of_lig) #outer dict keys for SH2 domain - domain to ligand contacts

[11]:

adjList

[11]:

{2: {14: 1, 17: 1},
 4: {32: 1, 34: 1, 35: 1, 36: 1, 42: 1, 53: 1, 55: 1},
 3: {52: 1, 53: 1},
 5: {52: 1, 53: 1, 54: 1, 90: 1, 96: 1},
 7: {54: 1, 65: 1, 88: 1, 89: 1, 90: 1, 96: 1},
 8: {89: 1},
 6: {90: 1, 91: 1}}

[12]:

#fetching start and end residue positions of domains and ligands
domains = dict_of_SH2['pdb_class'].domains
SH2_start = int(domains[0]['SH2'][0])
SH2_stop = int(domains[0]['SH2'][1])
arr_sub, list_aa_from_sub, list_to_aa_sub = CoDIAC.contactMap.return_arr_subset_by_ROI(arr,
                                                                 res_start, res_end, dict_of_lig['cm_aligned'].return_min_residue(),
                                                                 SH2_start, SH2_stop, dict_of_SH2['cm_aligned'].return_min_residue())

[13]:

SH2_start, SH2_stop

[13]:

(4, 102)

STEP 5: Generate fasta/feature files for jalview visualization¶

[18]:

#SH2 domain-centric contacts
fasta_header = 'Q06124|PTPN11|1|4|102|lig_4'
PTM_file = './Data/Feature_Fasta_files/SH2_ligand_3TL0_test'
CoDIAC.contactMap.print_fasta_feature_files(arr_alt, dict_of_SH2['cm_aligned'].refseq,
                                                            SH2_start,N_offset_ROI_1, SH2_stop, C_offset_ROI_1,
                                                            dict_of_SH2['cm_aligned'].return_min_residue(),
                                                            res_start,C_offset_ROI_1, res_end, C_offset_ROI_2,
                                                            dict_of_lig['cm_aligned'].return_min_residue(),
                                                            fasta_header,'pTyr', PTM_file, threshold=1, append=True)

[19]:

#pTyr ligand-centric contacts
SH2_file = './Data/Feature_Fasta_files/pTyr_3TL0_test'
CoDIAC.contactMap.print_fasta_feature_files(arr, dict_of_lig['cm_aligned'].structSeq,
                                                                 res_start, N_offset_ROI_1,res_end, C_offset_ROI_1,
                                                                dict_of_lig['cm_aligned'].return_min_residue(),
                                                                 SH2_start,N_offset_ROI_2, SH2_stop, C_offset_ROI_2,
                                                                dict_of_SH2['cm_aligned'].return_min_residue(),
                                                                fasta_header,'SH2', SH2_file, threshold=1, append=True )

Analysis using AlphaFold structures¶

STEP 1: Generate mmCIF files¶

[ ]:

from CoDIAC import AlphaFold
refUniprot_file = './Data/Uniprot_Reference/SH2_IPR000980_uniprot_reference.csv'
outputDir = './Data/AlphaFold_Reference/cifFiles/'
CoDIAC.AlphaFold.get_cifFile(refUniprot_file, outputDir)

STEP 2: Generate AlphaFold reference csv file¶

[ ]:

cifFile_path = './Data/AlphaFold_Reference/cifFiles/'
outputFile = './Data/AlphaFold_Reference/SH2_AlphaFold_reference.csv'
CoDIAC.AlphaFold.makeRefFile(refUniprot_file, cifFile_path, outputFile)

STEP 3: Generate json files using Arppegio¶

Arpeggio was ran for input mmCIF files of PDBs of interest on High-performance computing system that can allocate more memory for running these jobs.

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --mem=750MB
#SBATCH --time=50:00:00
#SBATCH --partition=standard
#SBATCH -A g_bme-naeglelab
#SBATCH -o SH2_PDB.output
#SBATCH -e SH2_PDB.error

module load anaconda
conda create -n arp-env python=3.7
source activate arp-env
conda install -c openbabel openbabel
pip install biopython
pip install git+https://github.com/PDBeurope/arpeggio.git@master#egg=arpeggio

for fname in *.cif
do
arpeggio $fname -sa
done

The refactored version of Arpeggio that is being made available by PDBe has been used to generate .json files in this work. The pdbe-arpeggio (git release version 1.4.*) was used at the time of contactmap generation.

*pip package installation pip install pdbe-arpeggio can be used for this purpose as well.

STEP 4: Generate binary Adjacency text files¶

[ ]:

from CoDIAC import AdjacencyFiles
cif_path = './Data/CIF_files'
json_path = './Data/JSON_files'
adj_path='./Data/Adjacency_files'
cif_file = cif_path+'/'+PDB_ID+'.cif'
json_file = json_path+'/'+PDB_ID+'.json'
newpath = adj_path+'/'+PDB_ID
os.mkdir(newpath)   #create a directory for PDB to store its corresponding adjacency files
outfile = newpath
CoDIAC.AdjacencyFiles.ProcessArpeggio(json_file, outfile, cif_file, small_molecule=False)    #generates adjacency file
CoDIAC.AdjacencyFiles.BinaryFeatures(PDB_ID, outfile, translate_resid=True)    #generates binarized version

STEP 5: Generate domain-domain contact maps¶

[ ]:

from CoDIAC import contactMap
from CoDIAC import PDBHelper

PATH = './Data/Adjacency_files/' #path to adjacency files
ann_file = './Data/AlphaFold_Reference/SH2_AlphaFold_reference.csv' #this is the reference structure annotation file
PDB_ID = 'AF-O14508-F1' #SOCS family protein
ann = pd.read_csv(ann_file)
entities = CoDIAC.PDBHelper.PDBEntitiesClass(ann, PDB_ID)
entity = 1 #there's only one entity in this example
pdbClass = entities.pdb_dict[entity] #this holds information about the protein crystalized, such as domains
chain_A = CoDIAC.contactMap.chainMap(PDB_ID, entity)
chain_A.construct(PATH)
chain_A_aligned = CoDIAC.contactMap.translate_chainMap_to_RefSeq(chain_A, pdbClass)
ROI_1 = [46, 156] #SH2 domain
ROI_2 = [151, 197] #socsbox domain
fastaHeader = 'SH2|SOCS_box'

Print fasta and feature files that contain contacts extracted across specified domain-domain interface¶

[ ]:

chain_A_aligned.print_fasta_feature_files(ROI_1[0],N_offset_ROI_1, ROI_1[1], C_offset_ROI_1,ROI_2[0], N_offset_ROI_2,ROI_2[1],C_offset_ROI_2,
                                          'O14508|SOCS2|1|46|156', 'SOCS_box',
                                         './Data/Feature_Fasta_files/SH2_interdomain_Afold',
                                          append=False, use_ref_seq_aligned=True)