[1]:
import pandas as pd
import os
from CoDIAC import contactMap as cm
from CoDIAC import PDBHelper
from CoDIAC import pTyrLigand_helpers as pTyr_helpers
import CoDIAC
[ ]:
#N and C terminal offsets for regions of interest (ROI)
N_offset_ROI_1 = 0
C_offset_ROI_1 = -1
N_offset_ROI_2 = 0
C_offset_ROI_2 = -1
Analysis using PDB structures¶
A. Domain-domain contact interface extraction¶
STEP 1: Fetching data from PDB reference file for a PDB examined¶
[2]:
PATH = './Data/Adjacency_files/' #this is where you can find the adjacency files
ann_file = './Data/PDB_Reference/SH2_IPR000980_PDB_reference.csv' #this is the annotation file
PDB_ID = '4JGH' #SOCS family protein
ann = pd.read_csv(ann_file)
entities = CoDIAC.PDBHelper.PDBEntitiesClass(ann, PDB_ID)
entity =1 #there's only one entity in this example
pdbClass = entities.pdb_dict[entity] #this holds information about the protein crystalized, such as domains
chain_A = CoDIAC.contactMap.chainMap(PDB_ID, entity)
chain_A.construct(PATH)
[3]:
print('struct sequence:', chain_A.structSeq)
print('minimum residue number:', chain_A.return_min_residue())
print('length of residues in seq:', len(chain_A.resNums))
print('residue array size:', chain_A.arr.shape)
print('unmodelled residues list:',chain_A.unmodeled_list)
struct sequence: HMDPEFQAARLAKALRELGQTGWYWGSMTVNEAKEKLKEAPEGTFLIRDSSHSDYLLTISVKTSAGPTNLRIEYQDGKFRLDSIICVKSKLKQFDSVVHLIDYYVQMCKD----------GTVHLYLTKPLYTSAPSLQHLCRLTINKCTGAIWGLPLPTRLKDYLEEYKFQV
minimum residue number: 26
length of residues in seq: 173
residue array size: (173, 173)
unmodelled residues list: [136, 145]
[4]:
print(pdbClass.ref_seq_mutated)
print(pdbClass.ref_seq_positions[0])
print(len(pdbClass.ref_seq_mutated))
QAARLAKALRELGQTGWYWGSMTVNEAKEKLKEAPEGTFLIRDSSHSDYLLTISVKTSAGPTNLRIEYQDGKFRLDSIICVKSKLKQFDSVVHLIDYYVQMCKDKRTGPEAPRNGTVHLYLTKPLYTSAPSLQHLCRLTINKCTGAIWGLPLPTRLKDYLEEYKFQV
32
167
STEP 2: Create an entity specific class to align structure to reference sequence positions¶
[5]:
chain_A_aligned = CoDIAC.contactMap.translate_chainMap_to_RefSeq(chain_A, pdbClass) #class created to align structure to reference sequence positions
Deleting 6 amino acids
Adding 1 c-terminal gaps
[6]:
#These are the domains that are available in the structure for analysis
pdbClass.domains
[6]:
{0: {'SH2': [46, 156, 0, 0]}, 1: {'SOCS_box': [151, 197, 0, 0]}}
STEP 3: Identify domain boundaries¶
[7]:
ROI_1 = [46, 156] #SH2 domain
ROI_2 = [151, 197] #socsbox domain
fastaHeader = 'SH2|SOCS_box'
STEP 4: Generate fasta/feature files for jalview visualization¶
[11]:
chain_A_aligned.print_fasta_feature_files(ROI_1[0],N_offset_ROI_1, ROI_1[1], C_offset_ROI_1,ROI_2[0], N_offset_ROI_2,ROI_2[1],C_offset_ROI_2, 'O14508|SOCS2|1|46|156', 'SOCS_box',
'./Data/Feature_Fasta_files/SH2_interdomain_4JGH', append=False, use_ref_seq_aligned=True)
[10]:
#another visualization of contacts between regions of interest
chain_A_aligned.generateAnnotatedHeatMap(ROI_2[0], ROI_2[1], ROI_1[0], ROI_1[1], remove_no_contacts=True)
B. Ligand interface contact extraction using PDB structures¶
STEP 1: Identity PTMS from PDB reference file for a specific PDB¶
[3]:
main = pd.read_csv(ann_file)
PDB_ID='3TL0'
for name, group in main.groupby('PDB_ID'):
for index, row in group.iterrows():
if name == PDB_ID:
modification_from_PDB = (row['modifications'])
if isinstance(row['modifications'],str): #PTM dictionary
transDict = CoDIAC.PDBHelper.return_PTM_dict(modification_from_PDB) #dictionary with ptm information from PDB reference file
[4]:
transDict
[4]:
{4: 'PTR'}
STEP 2: Generate single dictionary of contact map for each entity in the structure¶
[5]:
lig_entity = 2
SH2_entity = 1
dict_of_lig = CoDIAC.contactMap.return_single_chain_dict(main, PDB_ID, PATH, lig_entity)
dict_of_SH2 = CoDIAC.contactMap.return_single_chain_dict(main, PDB_ID, PATH, SH2_entity)
3
Adding 1 n_term positions
Adding 2 c-terminal gaps
6
Adding 4 n_term positions
Adding 3 c-terminal gaps
[6]:
dict_of_lig
[6]:
{'entity': 2,
'PDB_ID': '3TL0',
'pdb_class': <CoDIAC.PDBHelper.PDBClass at 0x7fd119898310>,
'cm': <CoDIAC.contactMap.chainMap at 0x7fd10a013ee0>,
'cm_aligned': <CoDIAC.contactMap.chainMap at 0x7fd108c29e20>}
[7]:
cm_aligned = dict_of_lig['cm_aligned']
[8]:
cm_aligned.__dict__
[8]:
{'PDB_ID': '3TL0',
'entity': 2,
'aaRes_dict': {},
'arr': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
[0., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
[0., 1., 1., 0., 1., 0., 0., 0., 0., 0.],
[0., 0., 0., 1., 0., 1., 1., 0., 0., 0.],
[0., 0., 0., 0., 1., 0., 1., 0., 0., 0.],
[0., 0., 0., 0., 1., 1., 0., 1., 0., 0.],
[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
'transDict': {4: 'PTR'},
'structSeq': 'RLNYAQLWHR',
'resNums': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'adjacencyDict': {2: {3: 1, 4: 1},
3: {4: 1},
4: {5: 1},
5: {6: 1, 7: 1},
6: {7: 1},
7: {8: 1}},
'unmodeled_list': [1, 1, 8, 10],
'match': True,
'offset': 198,
'ERROR_CODE': 0,
'refseq': 'RLNYAQLWHR'}
STEP 3: Select the ligand length by specifying no of residues N and C terminal of PTM of interest (‘y’-pTyr)¶
[8]:
PTM='PTR'
for res in cm_aligned.transDict:
if res in cm_aligned.resNums:
if PTM in cm_aligned.transDict[res]:
res_start, res_end, aligned_str, tick_labels = CoDIAC.pTyr_helpers.return_pos_of_interest(
cm_aligned.resNums, cm_aligned.structSeq, res, n_term_num=5, c_term_num=5, PTR_value = 'y')
[9]:
res_start, res_end, aligned_str, tick_labels
[9]:
(1,
9,
'--RLNyAQLWH',
['R-3', 'L-2', 'N-1', 'y0', 'A1', 'Q2', 'L3', 'W4', 'H5', 'R10'])
STEP 4: Generate contacts between two single entity dictionaries with contactmaps (domains and ligands on different entities)¶
[10]:
adjList, arr = CoDIAC.contactMap.return_interChain_adj(PATH, dict_of_lig, dict_of_SH2) #outer dict keys for ligand residues - ligand to domain contacts
adjList_alt, arr_alt = CoDIAC.contactMap.return_interChain_adj(PATH, dict_of_SH2, dict_of_lig) #outer dict keys for SH2 domain - domain to ligand contacts
[11]:
adjList
[11]:
{2: {14: 1, 17: 1},
4: {32: 1, 34: 1, 35: 1, 36: 1, 42: 1, 53: 1, 55: 1},
3: {52: 1, 53: 1},
5: {52: 1, 53: 1, 54: 1, 90: 1, 96: 1},
7: {54: 1, 65: 1, 88: 1, 89: 1, 90: 1, 96: 1},
8: {89: 1},
6: {90: 1, 91: 1}}
[12]:
#fetching start and end residue positions of domains and ligands
domains = dict_of_SH2['pdb_class'].domains
SH2_start = int(domains[0]['SH2'][0])
SH2_stop = int(domains[0]['SH2'][1])
arr_sub, list_aa_from_sub, list_to_aa_sub = CoDIAC.contactMap.return_arr_subset_by_ROI(arr,
res_start, res_end, dict_of_lig['cm_aligned'].return_min_residue(),
SH2_start, SH2_stop, dict_of_SH2['cm_aligned'].return_min_residue())
[13]:
SH2_start, SH2_stop
[13]:
(4, 102)
STEP 5: Generate fasta/feature files for jalview visualization¶
[18]:
#SH2 domain-centric contacts
fasta_header = 'Q06124|PTPN11|1|4|102|lig_4'
PTM_file = './Data/Feature_Fasta_files/SH2_ligand_3TL0_test'
CoDIAC.contactMap.print_fasta_feature_files(arr_alt, dict_of_SH2['cm_aligned'].refseq,
SH2_start,N_offset_ROI_1, SH2_stop, C_offset_ROI_1,
dict_of_SH2['cm_aligned'].return_min_residue(),
res_start,C_offset_ROI_1, res_end, C_offset_ROI_2,
dict_of_lig['cm_aligned'].return_min_residue(),
fasta_header,'pTyr', PTM_file, threshold=1, append=True)
[19]:
#pTyr ligand-centric contacts
SH2_file = './Data/Feature_Fasta_files/pTyr_3TL0_test'
CoDIAC.contactMap.print_fasta_feature_files(arr, dict_of_lig['cm_aligned'].structSeq,
res_start, N_offset_ROI_1,res_end, C_offset_ROI_1,
dict_of_lig['cm_aligned'].return_min_residue(),
SH2_start,N_offset_ROI_2, SH2_stop, C_offset_ROI_2,
dict_of_SH2['cm_aligned'].return_min_residue(),
fasta_header,'SH2', SH2_file, threshold=1, append=True )
Analysis using AlphaFold structures¶
STEP 1: Generate mmCIF files¶
[ ]:
from CoDIAC import AlphaFold
refUniprot_file = './Data/Uniprot_Reference/SH2_IPR000980_uniprot_reference.csv'
outputDir = './Data/AlphaFold_Reference/cifFiles/'
CoDIAC.AlphaFold.get_cifFile(refUniprot_file, outputDir)
STEP 2: Generate AlphaFold reference csv file¶
[ ]:
cifFile_path = './Data/AlphaFold_Reference/cifFiles/'
outputFile = './Data/AlphaFold_Reference/SH2_AlphaFold_reference.csv'
CoDIAC.AlphaFold.makeRefFile(refUniprot_file, cifFile_path, outputFile)
STEP 3: Generate json files using Arppegio¶
Arpeggio was ran for input mmCIF files of PDBs of interest on High-performance computing system that can allocate more memory for running these jobs.
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --mem=750MB
#SBATCH --time=50:00:00
#SBATCH --partition=standard
#SBATCH -A g_bme-naeglelab
#SBATCH -o SH2_PDB.output
#SBATCH -e SH2_PDB.error
module load anaconda
conda create -n arp-env python=3.7
source activate arp-env
conda install -c openbabel openbabel
pip install biopython
pip install git+https://github.com/PDBeurope/arpeggio.git@master#egg=arpeggio
for fname in *.cif
do
arpeggio $fname -sa
done
The refactored version of Arpeggio that is being made available by PDBe has been used to generate .json files in this work. The pdbe-arpeggio (git release version 1.4.*) was used at the time of contactmap generation.
*pip package installation pip install pdbe-arpeggio
can be used for this purpose as well.
STEP 4: Generate binary Adjacency text files¶
[ ]:
from CoDIAC import AdjacencyFiles
cif_path = './Data/CIF_files'
json_path = './Data/JSON_files'
adj_path='./Data/Adjacency_files'
cif_file = cif_path+'/'+PDB_ID+'.cif'
json_file = json_path+'/'+PDB_ID+'.json'
newpath = adj_path+'/'+PDB_ID
os.mkdir(newpath) #create a directory for PDB to store its corresponding adjacency files
outfile = newpath
CoDIAC.AdjacencyFiles.ProcessArpeggio(json_file, outfile, cif_file, small_molecule=False) #generates adjacency file
CoDIAC.AdjacencyFiles.BinaryFeatures(PDB_ID, outfile, translate_resid=True) #generates binarized version
STEP 5: Generate domain-domain contact maps¶
[ ]:
from CoDIAC import contactMap
from CoDIAC import PDBHelper
PATH = './Data/Adjacency_files/' #path to adjacency files
ann_file = './Data/AlphaFold_Reference/SH2_AlphaFold_reference.csv' #this is the reference structure annotation file
PDB_ID = 'AF-O14508-F1' #SOCS family protein
ann = pd.read_csv(ann_file)
entities = CoDIAC.PDBHelper.PDBEntitiesClass(ann, PDB_ID)
entity = 1 #there's only one entity in this example
pdbClass = entities.pdb_dict[entity] #this holds information about the protein crystalized, such as domains
chain_A = CoDIAC.contactMap.chainMap(PDB_ID, entity)
chain_A.construct(PATH)
chain_A_aligned = CoDIAC.contactMap.translate_chainMap_to_RefSeq(chain_A, pdbClass)
ROI_1 = [46, 156] #SH2 domain
ROI_2 = [151, 197] #socsbox domain
fastaHeader = 'SH2|SOCS_box'
Print fasta and feature files that contain contacts extracted across specified domain-domain interface¶
[ ]:
chain_A_aligned.print_fasta_feature_files(ROI_1[0],N_offset_ROI_1, ROI_1[1], C_offset_ROI_1,ROI_2[0], N_offset_ROI_2,ROI_2[1],C_offset_ROI_2,
'O14508|SOCS2|1|46|156', 'SOCS_box',
'./Data/Feature_Fasta_files/SH2_interdomain_Afold',
append=False, use_ref_seq_aligned=True)