import warnings
import random
import networkx as nx
import pandas as pd
import dansy.ngramUtilities as ngramUtilities
import dansy.helper as helper
# Main class of the module: the DANSy domain n-gram network.
class dansy:
    '''
    A domain n-gram network built off either a list of proteins of interest or a reference file generated by CoDIAC. If InterPro IDs are provided, only n-grams that contain those IDs are extracted. Default values will generate a 10-gram network model.

    Parameters
    ----------
    protsOI : list
        List of UniProt IDs whose n-grams are desired to generate the n-gram network. A single string or a set is also accepted and converted to a list.
    ref : pandas DataFrame (Recommended)
        Dataframe that has been generated from CoDIAC containing both InterPro and UniProt information. A string is also accepted and is handed to ngramUtilities.import_reference_file.
    n : int (Optional)
        N-gram lengths to be extracted
    interproIDs : list (Optional)
        List of InterPro IDs to extract n-grams. If omitted, all n-grams will be extracted.
    **kwargs
        Optional settings. Recognized analysis/description keys are name, collapse, max_node_len, min_arch, readable_flag and verbose. Recognized drawing keys are pos, node_size, node_color, linewidths, edge_color, width and edgecolors. Any other key is ignored (and reported when verbose is True).

    Attributes
    ----------
    G: networkx Graph
        The network graph representation of the DANSy n-gram network
    ref: pandas DataFrame
        The reference file information for the proteins within the dataset
    n: int
        The maximum length of n-grams being extracted (read-only)
    interproIDs: list
        A list of all protein domain InterPro IDs that were found within the dataset
    protsOI: list
        The UniProt IDs for the proteins found within the dataset
    ngrams: list
        The extracted domain n-grams
    collapsed_ngrams: list
        The domain n-grams which were collapsed into other n-grams which represent the set of proteins
    adj: pandas DataFrame
        The adjacency matrix for the n-gram network for the DANSy analysis
    interpro2uniprot: dict
        The keys of InterPro IDs with values of a list of UniProt IDs that have the InterPro ID
    min_arch: int (Default: 1)
        The minimum number of domain architectures for an n-gram to be retained.
    max_node_len: int (Default: None)
        The maximum n-gram length that will be retained during the collapsing step to represent n-grams sharing the same set of proteins. This will not be larger than n (Default of 10).
    collapse: bool (Default: True)
        Whether the n-grams were collapsed
    readable_flag: bool (Default: False)
        Whether the n-grams are human-legible
    verbose: bool (Default: True)
        Whether progress statements are to be printed during calculations
    network_params: dict
        Key-value pairs of acceptable networkx drawing parameters
    '''

    # Keyword options that unpack_opts stores directly as instance attributes.
    _ATTRIBUTE_OPTS = ('name',
                       'collapse',
                       'max_node_len',
                       'min_arch',
                       'readable_flag',
                       'verbose')
    # Keyword options that are collected into network_params for drawing.
    _NETWORK_OPTS = ('pos',
                     'node_size',
                     'node_color',
                     'linewidths',
                     'edge_color',
                     'width',
                     'edgecolors')

    def __init__(self, protsOI=None, ref=None, n=10, interproIDs=None, **kwargs):
        # Bare minimum attributes required for setting up an empty n-gram network.
        self.protsOI = protsOI
        self.ref = ref
        self._n = n
        self.interproIDs = interproIDs

        # Only build the network when there is something to build it from;
        # otherwise leave an empty shell that import_precomp_data can fill.
        if (self.protsOI is not None) or (self.ref is not None):
            self.populate_ngramNet(**kwargs)
        else:
            self.network_params = {}

    @property
    def n(self):
        '''The maximum n-gram length. Read-only once the instance exists.'''
        return self._n

    @n.setter
    def n(self, value):
        raise AttributeError('The maximum length of n-gram (n) cannot be readjusted.')

    def populate_ngramNet(self, **attribs):
        '''
        This will actually generate the n-gram network object and go through the domain n-gram analysis and populate each attribute with the user defined and default parameters.
        '''
        # Resolve the proteins of interest first (pulled from the reference
        # when not given), then restrict the reference to those proteins.
        self.add_prots()
        self.add_ref_df()

        # Unpack the additional, optional parameters that alter the behavior of the network generated or provide descriptions.
        self.add_interpro_ids()
        self.set_ngram_parameters(**attribs)
        self.unpack_opts(**attribs)

        # Now generating the actual Domain N-gram Network that is the basis of this class
        self.build_dgNet()

    def add_prots(self):
        '''
        Adds in the protein list if it was not provided and normalizes it to a list.

        Raises
        ------
        ValueError
            If the proteins of interest are not a string, list or set.
        '''
        # If the protein list is not provided but the reference file is, then import the entire protein list from the reference file
        if self.protsOI is None and self.ref is not None:
            ref = self.check_ref(self.ref)
            self.protsOI = self.extract_uniprots(ref)

        # Ensuring the correct data type for downstream analysis
        if isinstance(self.protsOI, str):
            self.protsOI = [self.protsOI]
        elif isinstance(self.protsOI, set):
            self.protsOI = list(self.protsOI)
        elif not isinstance(self.protsOI, list):
            raise ValueError('UniProt IDs should be provided as a list.')

    def add_ref_df(self):
        '''
        Adds in the dataframe that contains all the reference information for the protein domain architectures.
        '''
        if self.ref is None:
            # No reference given: fall back to the proteome reference loaded by the helper module.
            ref_data = helper.import_proteome_files()
        else:
            # Ensure the reference data is a dataframe and not just the string to the reference file.
            ref_data = self.check_ref(self.ref)
        self.get_refs(ref_data)

    def add_interpro_ids(self):
        '''
        Add the list of interpro ids that will be used to find the n-grams of interest.
        '''
        # When no IDs were requested, use every ID the conversion lookup knows for this reference.
        if self.interproIDs is None:
            tokens = ngramUtilities.generate_interpro_conversion(self.ref)
            self.interproIDs = list(tokens.keys())

    def get_refs(self, ref_df):
        '''
        Stores the rows of ref_df whose 'UniProt ID' is one of the proteins of interest as the instance reference.
        '''
        self.ref = ref_df[ref_df['UniProt ID'].isin(self.protsOI)]

    def build_dgNet(self):
        '''
        Runs the full n-gram analysis on the stored reference and generates the networkx graph from the resulting adjacency matrix.
        '''
        (self.adj,
         _,
         self.interpro2uniprot,
         self.collapsed_ngrams,
         self.interpro_conversion) = ngramUtilities.full_ngram_analysis(self.ref,
                                                                        self.interproIDs,
                                                                        max_ngram=self.n,
                                                                        min_arch=self.min_arch,
                                                                        readable_flag=self.readable_flag,
                                                                        max_node_len=self.max_node_len,
                                                                        concat_flag=self.collapse,
                                                                        verbose=self.verbose)

        # Get the full list of n-grams
        self.ngrams = list(self.adj.columns)
        self.generate_network()

    def generate_network(self):
        '''
        Uses networkx to generate a network view of the n-gram network. This will remove the self-loop edges from the n-gram network, but does not touch the adjacency matrix.
        '''
        columns = self.adj.columns
        if len(columns) > 1:
            G = nx.from_pandas_adjacency(self.adj)
            # Materialize the self-loop edges before removing them so the graph is not mutated while it is being scanned.
            G.remove_edges_from(list(nx.selfloop_edges(G)))
        else:
            # In some instances all provided proteins will have the exact same domain architecture so a networkx graph generation from the matrix is somewhat useless and fails.
            # verbose is not set when the data was loaded through import_precomp_data, so fall back to the class default of True.
            if getattr(self, 'verbose', True):
                print('Only one n-gram was found in the dataset. Please double-check inputs.')
            G = nx.Graph()
            # add_nodes_from copes with both the single n-gram and the empty matrix case.
            G.add_nodes_from(columns)
        self.G = G

    def draw_network(self):
        '''
        Draws a basic version of the Graph with the networkx spring layout implementation. It is recommended to use the actual networkx package for a full implementation.
        '''
        # Default values
        draw_params = {'node_size': 1,
                       'edgecolors': 'k',
                       'linewidths': 0.1,
                       'width': 0.25,
                       'edge_color': '#808080',
                       }

        # Every recognized drawing option the user supplied overrides the default.
        # This includes node_color and pos, which have no default entry above.
        user_params = getattr(self, 'network_params', None) or {}
        draw_params.update({k: v for k, v in user_params.items() if k in self._NETWORK_OPTS})

        nx.draw(self.G, **draw_params)

    @staticmethod
    def check_ref(ref):
        '''
        Checks whether the provided reference is a string to a specific file or an already imported DataFrame. It will return the actual DataFrame that was imported.

        Raises
        ------
        DomainArchError
            If a DataFrame has neither the 'Interpro Domain Architecture IDs' nor the 'Interpro Domains' column.
        TypeError
            If ref is neither a string nor a DataFrame.
        '''
        if isinstance(ref, str):
            return ngramUtilities.import_reference_file(ref)

        if not isinstance(ref, pd.DataFrame):
            raise TypeError('Reference File is not correct')

        # Check that the reference file has the necessary information for generating the n-gram network
        if 'Interpro Domain Architecture IDs' in ref.columns:
            return ref

        if 'Interpro Domains' in ref.columns:
            # Derive the ID-only architecture on a copy so the caller's DataFrame is not modified behind their back.
            ref_df = ref.copy()
            ref_df['Interpro Domain Architecture IDs'] = ref_df['Interpro Domains'].map(create_arch_ids)
            return ref_df

        raise DomainArchError('Interpro Domain Architecture Information is missing. ')

    @staticmethod
    def extract_uniprots(ref):
        '''
        Extracts a list of the entire UniProts that were contained within a provided reference file.

        Raises
        ------
        ValueError
            If the reference has no 'UniProt ID' column.
        '''
        if 'UniProt ID' not in ref.columns:
            raise ValueError('The reference file is missing a UniProt ID column. Please check or regenerate the reference file.')
        return ref['UniProt ID'].tolist()

    def return_legible_ngram(self, ngram):
        '''
        Given an n-gram of interest. It will convert the name to a human legible version.
        '''
        # Splitting an n-gram without a pipe yields the n-gram itself, so a single domain and a multi-domain n-gram share one code path.
        return '|'.join(self.interpro_conversion[dom] for dom in ngram.split('|'))

    def ngram_protein_count(self, ngram):
        '''
        Returns the number of proteins an individual n-gram was found within.

        Raises
        ------
        ValueError
            If the n-gram is unknown, or a collapsed n-gram matches none or more than one retained n-gram.
        '''
        # collapsed_ngrams is absent when precomputed data was imported without it.
        collapsed = getattr(self, 'collapsed_ngrams', ())
        if ngram in collapsed:
            # A collapsed n-gram is counted through the retained n-gram whose string contains it.
            cands = [i for i in self.ngrams if ngram in i]
            if len(cands) > 1:
                raise ValueError('The n-gram of interest was somehow found across several non-redundant n-grams?')
            if not cands:
                raise ValueError('The collapsed n-gram could not be matched to any retained n-gram.')
            return len(self.interpro2uniprot[cands[0]])

        if ngram in self.ngrams:
            return len(self.interpro2uniprot[ngram])

        raise ValueError('The inputted n-gram is not found.')

    def unpack_opts(self, **kwargs):
        '''
        Unpacks both necessary and optional attributes that can be used for downstream analysis. It will check for a list of acceptable optional parameters that can be set with some being necessary for the analysis and others to help with record-keeping/descriptions.
        '''
        ignored_opts = []
        for k, v in kwargs.items():
            if k in self._ATTRIBUTE_OPTS:
                setattr(self, k, v)
            elif k not in self._NETWORK_OPTS:
                # Drawing options are gathered separately below; anything else is unknown.
                ignored_opts.append(k)

        self.network_params = self.unpack_net_opts(**kwargs)

        if ignored_opts and self.verbose:
            print(f'The following parameters were ignored:{ignored_opts}')

    def unpack_net_opts(self, **kwargs):
        '''
        Unpacks key value parameters that relate to the basic drawing of a network. This does not fully encompass all parameters found within an networkx function.

        Returns
        -------
        dict
            The subset of kwargs whose keys are recognized drawing options.
        '''
        return {k: v for k, v in kwargs.items() if k in self._NETWORK_OPTS}

    def set_ngram_parameters(self, **kwargs):
        '''
        Using the parameters given override defaults as necessary.
        '''
        # Default values for the n-gram analysis
        ngram_parameters = {'min_arch': 1,
                            'max_node_len': None,
                            'collapse': True,
                            'readable_flag': False,
                            'verbose': True}

        # Only keys that already have a default are taken from the user input.
        ngram_parameters.update({k: v for k, v in kwargs.items() if k in ngram_parameters})

        for k, v in ngram_parameters.items():
            setattr(self, k, v)

    def summary(self, detailed=False):
        '''
        Output a summary of key features that are represented within the n-gram network.

        Parameters
        ----------
        detailed : bool
            When True, adds the collapsed n-gram count, the edge count and the longest domain architecture length.

        Returns
        -------
        pandas DataFrame
            A single-column table indexed by the name of each summarized feature.
        '''
        # Generating the baseline dataframe that will hold the summarized data.
        index = ['name',
                 'Proteins',
                 'n-grams',
                 'Network Isolates',
                 'Network Connected Components',
                 ]
        net_sum = pd.DataFrame(index=index, columns=[''])

        net_sum.loc['name'] = getattr(self, 'name', 'Domain n-gram Network')
        net_sum.loc['Proteins'] = len(self.protsOI)
        net_sum.loc['n-grams'] = len(self.ngrams)
        net_sum.loc['Network Isolates'] = nx.number_of_isolates(self.G)
        net_sum.loc['Network Connected Components'] = nx.number_connected_components(self.G)

        if detailed:
            # collapsed_ngrams is absent when precomputed data was imported without it; report zero then.
            net_sum.loc['Collapsed n-grams'] = len(getattr(self, 'collapsed_ngrams', ()))
            net_sum.loc['Network Edges'] = nx.number_of_edges(self.G)
            net_sum.loc['Maximum Length of Protein Domain Architecture'] = self.retrieve_longest_arch(self.ref)

        return net_sum

    @staticmethod
    def retrieve_longest_arch(ref):
        '''Retrieves the longest domain architecture length.'''
        # An architecture is a pipe-separated string of IDs, so its length is the number of pieces.
        return max(len(arch.split('|')) for arch in ref['Interpro Domain Architecture IDs'])

    def import_precomp_data(self, adj, ref, interpro_ids, n, collapsed_ngrams=None, **net_attribs):
        '''
        For instances where a precomputed n-gram network was generated already, this will import the data if it does not already exist.

        Parameters
        ----------
        adj : pandas DataFrame
            The precomputed adjacency matrix whose columns are the n-grams.
        ref : pandas DataFrame or str
            The reference information, checked with check_ref.
        interpro_ids : list
            The InterPro IDs used to extract the n-grams.
        n : int
            The maximum n-gram length of the precomputed network.
        collapsed_ngrams : list (Optional)
            The n-grams that were collapsed into other n-grams.
        **net_attribs
            Drawing options; only recognized network keys are kept.

        Raises
        ------
        ValueError
            If the instance already holds an adjacency matrix, a reference or InterPro IDs.
        '''
        # Check pre-existence of any values
        if hasattr(self, 'adj') or (self.ref is not None) or (self.interproIDs is not None):
            raise ValueError('Cannot alter already computed n-gram network parameters. Please create a new instance.')

        self.adj = adj
        self.ref = self.check_ref(ref)
        self.interproIDs = interpro_ids
        self._n = n

        # Populate the n-gram lists
        self.ngrams = list(self.adj.columns)
        if collapsed_ngrams is not None:
            self.collapsed_ngrams = collapsed_ngrams

        self.validate_n()

        # Create the n-gram to UniProt mapping. This uses the preexisting function.
        _, ngram_dict = ngramUtilities.get_ngrams_from_df(self.ref, self.interproIDs, arch_num=1, max_ngram=self.n)

        # For the n-grams that were already collapsed removing from the n-gram dict to reduce redundant information.
        if collapsed_ngrams is not None:
            for ngram in collapsed_ngrams:
                # pop tolerates a collapsed n-gram that was not re-extracted.
                ngram_dict.pop(ngram, None)
        self.interpro2uniprot = ngram_dict

        # Unpacking only network options
        self.network_params = self.unpack_net_opts(**net_attribs)
        self.generate_network()

    def validate_n(self):
        '''
        Ensure that the n that has been provided matches the n-gram model that has been provided.

        Raises
        ------
        AttributeError
            If any (collapsed) n-gram is longer than n, or if n does not match the longest n-gram even though longer architectures exist.
        '''
        # Check n-grams within the adjacency matrix
        cur_max_n = max(len(x.split('|')) for x in self.ngrams)
        max_possible = self.retrieve_longest_arch(self.ref)

        # First checks that there is no mismatch in longest extracted n-gram.
        if cur_max_n > self.n:
            raise AttributeError('The provided max n-gram length is shorter than the longest n-gram in the provided n-gram list.')

        # An absent or empty collapsed list has nothing to validate.
        collapsed = getattr(self, 'collapsed_ngrams', None)
        if collapsed is not None and len(collapsed) > 0:
            col_max = max(len(x.split('|')) for x in collapsed)
            if col_max > self.n:
                raise AttributeError('The provided max n-gram length is shorter than the longest n-gram in the provided collapsed n-gram list.')

        # If longer architectures exist than the longest n-gram found, then n must have been the limiting factor and has to equal that longest n-gram.
        if (cur_max_n < max_possible) and (self.n != cur_max_n):
            raise AttributeError('The provided max n-gram does not match the provided n-gram list.')

        # A larger n than any architecture is harmless, so this only warns.
        if self.n > max_possible:
            warnings.warn('The maximum possible n-gram that can be extracted was smaller than the desired maximum n-gram length.')

    def retrieve_protein_info(self, prot=None, ngram=None):
        '''
        Retrieves the reference information of proteins of interest. If an InterPro ID/n-gram is provided it will use that instead and search for the proteins with that ID and return that instead.

        Raises
        ------
        TypeError
            If neither or both of prot and ngram are provided.
        '''
        # Identity checks are used so array-like prot inputs are not compared element-wise against None.
        if prot is None and ngram is None:
            raise TypeError('Either a protein list or an n-gram are necessary. Please provide either one.')
        if prot is not None and ngram is not None:
            raise TypeError('Please provide either a protein list or an n-gram and not both.')

        if ngram is not None:
            prot = self.interpro2uniprot[ngram]

        # isin needs a list-like, so wrap a lone ID.
        if isinstance(prot, str):
            prot = [prot]

        return self.ref[self.ref['UniProt ID'].isin(prot)]

    def retrieve_random_ids(self, num, iters=50, seed=882):
        '''
        Generator of random UniProt IDs from the base network.

        Parameters
        ----------
        num : int
            Number of UniProt IDs in each random draw.
        iters : int
            Number of draws that are generated.
        seed : int
            Seed that determines the per-draw seeds, making the draws reproducible.

        Yields
        ------
        list
            num UniProt IDs sampled without replacement from the reference.
        '''
        # Private generators give the same draws as seeding the random module would, without disturbing the caller's global random state.
        seedlist = random.Random(seed).sample(range(100000), k=iters)
        # Sorting makes the draw independent of the row order of the reference.
        full_id_list = sorted(self.ref['UniProt ID'].tolist())

        for draw_seed in seedlist:
            yield random.Random(draw_seed).sample(full_id_list, k=num)
def create_arch_ids(arch):
    '''
    This takes the architectures returned from the InterPro module of CoDIAC, which follows the format Name:InterPro_ID:Start:Stop for each domain separated by a semicolon and creates the full architecture with only the IDs separated by a pipe (|).

    Parameters
    ----------
    - arch: str
        The full domain architecture string containing all the domain information

    Returns
    -------
    - arch_id: str
        The abbreviated domain architecture of only InterPro IDs

    Raises
    ------
    DomainArchError
        If arch is not a string or any domain entry lacks the colon-separated ID field.
    '''
    try:
        # The InterPro ID is the second colon-separated field of every domain entry.
        ids_list = [domain.split(':')[1] for domain in arch.split(';')]
    except (AttributeError, IndexError, TypeError) as err:
        # Only parsing failures are translated; interrupts and exits propagate untouched.
        raise DomainArchError('The domain architecture information was incorrectly formatted.') from err
    return '|'.join(ids_list)
class DomainArchError(TypeError):
    '''
    Custom error for domain architectures that are not properly formatted or missing.

    Subclasses TypeError, so handlers that catch TypeError also catch this error.
    '''