Source code for dansy.dansy

import warnings
import random
import networkx as nx
import pandas as pd
import dansy.ngramUtilities as ngramUtilities
import dansy.helper as helper

class dansy:
    '''
    A domain n-gram network built off either a list of proteins of interest or a reference file generated by CoDIAC.
    If InterPro IDs are provided will extract n-grams that contain only those IDs. Default values will generate a
    10-gram network model.

    Parameters
    ----------
    protsOI : list
        List of UniProt IDs whose n-grams are desired to generate the n-gram network.
    ref : pandas DataFrame (Recommended)
        Dataframe that has been generated from CoDIAC containing both InterPro and UniProt information.
    n : int (Optional)
        N-gram lengths to be extracted
    interproIDs : list (Optional)
        List of Interpro IDs to extract n-grams. If omitted, all n-grams will be extracted.

    Attributes
    ----------
    G : networkx Graph
        The network graph representation of the DANSy n-gram network
    ref : pandas DataFrame
        The reference file information for the proteins within the dataset
    n : int
        The maximum length of n-grams being extracted
    interproIDs : list
        A list of all protein domain InterPro IDs that were found within the dataset
    protsOI : list
        The UniProt IDs for the proteins found within the dataset
    ngrams : list
        The extracted domain n-grams
    collapsed_ngrams : list
        The domain n-grams which were collapsed into other n-grams which represent the set of proteins
    adj : pandas DataFrame
        The adjacency matrix for the n-gram network for the DANSy analysis
    interpro2uniprot : dict
        The keys of InterPro IDs with values of a list of UniProt IDs that have the InterPro ID
    min_arch : int (Default: 1)
        The minimum number of domain architectures for an n-gram to be retained.
    max_node_len : int
        The maximum n-gram length that will be retained during the collapsing step to represent n-grams sharing
        the same set of proteins. This will not be larger than n (Default of 10).
    collapse : bool
        Whether the n-grams were collapsed
    readable_flag : bool
        Whether the n-grams are human-legible
    verbose : bool
        Whether progress statements are to be printed during calculations
    network_params : dict
        Key-value pairs of acceptable networkx drawing parameters
    '''

    # Keyword names that are recognised as network drawing parameters.
    _NETWORK_OPTS = ('pos', 'node_size', 'node_color', 'linewidths', 'edge_color', 'width', 'edgecolors')

    def __init__(self, protsOI=None, ref=None, n=10, interproIDs=None, **kwargs):
        # Bare minimum attributes required for setting up an empty n-gram network.
        self.protsOI = protsOI
        self.ref = ref
        self._n = n
        self.interproIDs = interproIDs

        # If any input that can generate the n-gram network was included, populate and generate the network.
        if (self.protsOI is not None) or (self.ref is not None):
            self.populate_ngramNet(**kwargs)
        else:
            self.network_params = {}

    @property
    def n(self):
        '''The maximum n-gram length (read-only).'''
        return self._n

    @n.setter
    def n(self, value):
        raise AttributeError('The maximum length of n-gram (n) cannot be readjusted.')

    def populate_ngramNet(self, **attribs):
        '''
        This will actually generate the n-gram network object and go through the domain n-gram analysis and
        populate each attribute with the user defined and default parameters.
        '''

        # Getting the proteins of interest and the reference data that will be used for the analysis.
        self.add_prots()
        self.add_ref_df()

        # Unpack the additional, optional parameters that alter the behavior of the network generated or
        # provide descriptions. Defaults are set first so that unpack_opts can rely on self.verbose.
        self.add_interpro_ids()
        self.set_ngram_parameters(**attribs)
        self.unpack_opts(**attribs)

        # Now generating the actual Domain N-gram Network that is the basis of this class
        self.build_dgNet()

    # Adding in protein of interest
    def add_prots(self):
        '''
        Adds in the protein list if it was not provided and normalizes it to a list.

        Raises
        ------
        ValueError
            If the proteins of interest are not a str, list, tuple, set or frozenset.
        '''

        # If the protein list is not provided but the reference file is, then import the entire protein list
        # from the reference file
        if self.protsOI is None and self.ref is not None:
            ref = self.check_ref(self.ref)
            self.protsOI = self.extract_uniprots(ref)

        # Ensuring the correct data type for downstream analysis
        if isinstance(self.protsOI, str):
            self.protsOI = [self.protsOI]
        elif isinstance(self.protsOI, list):
            pass
        elif isinstance(self.protsOI, (set, frozenset, tuple)):
            self.protsOI = list(self.protsOI)
        else:
            raise ValueError('UniProt IDs should be provided as a list.')

    # Adding in the reference dataframe for the n-gram analysis
    def add_ref_df(self):
        '''
        Adds in the dataframe that contains all the reference information for the protein domain architectures.
        '''

        if self.ref is None:
            # Import the complete proteome reference file if no reference file has been specified.
            ref_data = helper.import_proteome_files()
        else:
            # Ensure the reference data is a dataframe and not just the string to the reference file.
            ref_data = self.check_ref(self.ref)

        self.get_refs(ref_data)

    def add_interpro_ids(self):
        '''
        Add the list of interpro ids that will be used to find the n-grams of interest.
        '''

        # Setting up the InterPro IDs that will be used to extract the n-grams
        if self.interproIDs is None:
            tokens = ngramUtilities.generate_interpro_conversion(self.ref)
            self.interproIDs = list(tokens.keys())

    # Getting the reference data frame used for each instance of the class
    def get_refs(self, ref_df):
        '''Stores the rows of ref_df whose 'UniProt ID' is among the proteins of interest as self.ref.'''
        self.ref = ref_df[ref_df['UniProt ID'].isin(self.protsOI)]

    # Building the actual domain n-gram network
    def build_dgNet(self):
        '''
        Runs ngramUtilities.full_ngram_analysis with the instance parameters, stores its results and then
        generates the networkx graph.
        '''
        (self.adj,
         _,
         self.interpro2uniprot,
         self.collapsed_ngrams,
         self.interpro_conversion) = ngramUtilities.full_ngram_analysis(self.ref, self.interproIDs,
                                                                        max_ngram=self.n,
                                                                        min_arch=self.min_arch,
                                                                        readable_flag=self.readable_flag,
                                                                        max_node_len=self.max_node_len,
                                                                        concat_flag=self.collapse,
                                                                        verbose=self.verbose)

        # Get the full list of n-grams
        self.ngrams = list(self.adj.columns)

        self.generate_network()

    def generate_network(self):
        '''
        Uses networkx to generate a network view of the n-gram network. This will remove the self-loop edges
        from the n-gram network, but does not touch the adjacency matrix.
        '''

        # In some instances all provided proteins will have the exact same domain architecture so a networkx
        # graph generation from the adjacency matrix is somewhat useless and fails.
        if len(self.adj.columns) > 1:
            G = nx.from_pandas_adjacency(self.adj)
            G.remove_edges_from(nx.selfloop_edges(G))
        else:
            # verbose is not set when the network comes from import_precomp_data, so fall back to the
            # class default (True) instead of failing with an AttributeError.
            if getattr(self, 'verbose', True):
                print('Only one n-gram was found in the dataset. Please double-check inputs.')
            G = nx.Graph()
            # add_nodes_from also copes with an adjacency matrix that has no columns at all.
            G.add_nodes_from(self.adj.columns)

        self.G = G

    # Utility to draw basic network, but is recommended to use the actual networkx package.
    def draw_network(self):
        '''
        Draws a basic version of the Graph with the networkx spring layout implementation. It is recommended
        to use the actual networkx package for a full implementation.
        '''

        # Default values
        basic_network_params = {'node_size': 1,
                                'edgecolors': 'k',
                                'linewidths': 0.1,
                                'width': 0.25,
                                'edge_color': '#808080',
                                }

        # Retrieving any of the network values that were set as attributes and placing into the network
        # parameters. Every recognised drawing option is forwarded, including those without a default
        # here (pos and node_color).
        if self.network_params:
            for k in self._NETWORK_OPTS:
                if k in self.network_params:
                    basic_network_params[k] = self.network_params[k]

        nx.draw(self.G, **basic_network_params)

    # Check the reference file status
    @staticmethod
    def check_ref(ref):
        '''
        Checks whether the provided reference is a string to a specific file or an already imported DataFrame.
        It will return the actual DataFrame that was imported.

        Raises
        ------
        DomainArchError
            If a DataFrame has neither of the InterPro domain architecture columns.
        TypeError
            If ref is neither a string nor a DataFrame.
        '''

        if isinstance(ref, str):
            return ngramUtilities.import_reference_file(ref)

        if not isinstance(ref, pd.DataFrame):
            raise TypeError('Reference File is not correct')

        # Check that the reference file has the necessary information for generating the n-gram network
        if 'Interpro Domain Architecture IDs' in ref.columns:
            # The IDs version of the domain architecture is already present
            return ref

        if 'Interpro Domains' in ref.columns:
            # Build the IDs column on a new frame so the DataFrame handed in by the caller is not modified.
            arch_ids = ref['Interpro Domains'].map(create_arch_ids)
            return ref.assign(**{'Interpro Domain Architecture IDs': arch_ids})

        raise DomainArchError('Interpro Domain Architecture Information is missing. ')

    # Extract all UniProt IDs from a reference file if only the reference file is to be used
    @staticmethod
    def extract_uniprots(ref):
        '''
        Extracts a list of the entire UniProts that were contained within a provided reference file.

        Raises
        ------
        ValueError
            If the reference has no 'UniProt ID' column.
        '''

        if 'UniProt ID' not in ref.columns:
            raise ValueError('The reference file is missing a UniProt ID column. Please check or regenerate the reference file.')

        return ref['UniProt ID'].tolist()

    # Return a human readable version of specified n-grams
    def return_legible_ngram(self, ngram):
        '''
        Given an n-gram of interest. It will convert the name to a human legible version.
        '''

        if '|' in ngram:
            # Convert each domain of the n-gram separately and stitch them back together.
            return '|'.join([self.interpro_conversion[dom] for dom in ngram.split('|')])

        return self.interpro_conversion[ngram]

    def ngram_protein_count(self, ngram):
        '''
        Returns the number of proteins an individual n-gram was found within.

        Raises
        ------
        ValueError
            If the n-gram is unknown, or a collapsed n-gram maps to zero or to several retained n-grams.
        '''

        if ngram in self.collapsed_ngrams:
            # A collapsed n-gram is looked up through the retained n-gram that contains it.
            cands = [i for i in self.ngrams if ngram in i]
            if len(cands) > 1:
                raise ValueError('The n-gram of interest was somehow found across several non-redundant n-grams?')
            if not cands:
                # Previously this surfaced as a bare IndexError.
                raise ValueError('The collapsed n-gram of interest is not represented by any n-gram in the network.')
            return len(self.interpro2uniprot[cands[0]])

        if ngram in self.ngrams:
            return len(self.interpro2uniprot[ngram])

        raise ValueError('The inputted n-gram is not found.')

    def unpack_opts(self, **kwargs):
        '''
        Unpacks both necessary and optional attributes that can be used for downstream analysis. It will check
        for a list of acceptable optional parameters that can be set with some being necessary for the analysis
        and others to help with record-keeping/descriptions.
        '''

        opts = ('name', 'collapse', 'max_node_len', 'min_arch', 'readable_flag', 'verbose')
        ignored_opts = []
        for k, v in kwargs.items():
            if k in opts:
                setattr(self, k, v)
            elif k not in self._NETWORK_OPTS:
                # Network options are handled separately by unpack_net_opts below.
                ignored_opts.append(k)

        self.network_params = self.unpack_net_opts(**kwargs)

        if ignored_opts and self.verbose:
            print(f'The following parameters were ignored:{ignored_opts}')

    def unpack_net_opts(self, **kwargs):
        '''
        Unpacks key value parameters that relate to the basic drawing of a network. This does not fully
        encompass all parameters found within an networkx function.
        '''

        return {k: v for k, v in kwargs.items() if k in self._NETWORK_OPTS}

    def set_ngram_parameters(self, **kwargs):
        '''
        Using the parameters given override defaults as necessary.
        '''

        # Default values for the n-gram analysis
        ngram_parameters = {'min_arch': 1,
                            'max_node_len': None,
                            'collapse': True,
                            'readable_flag': False,
                            'verbose': True}

        # Only recognised parameters are overridden; everything else is left for unpack_opts.
        for k in ngram_parameters:
            if k in kwargs:
                ngram_parameters[k] = kwargs[k]

        for k, v in ngram_parameters.items():
            setattr(self, k, v)

    def summary(self, detailed=False):
        '''
        Output a summary of key features that are represented within the n-gram network.

        Parameters
        ----------
        detailed : bool
            If True, also report the collapsed n-gram count, edge count and longest domain architecture.

        Returns
        -------
        net_sum : pandas DataFrame
            Single-column dataframe indexed by the summarized feature names.
        '''

        # Generating the baseline dataframe that will hold the summarized data.
        index = ['name',
                 'Proteins',
                 'n-grams',
                 'Network Isolates',
                 'Network Connected Components',
                 ]
        net_sum = pd.DataFrame(index=index, columns=[''])

        if 'name' in self.__dict__:
            net_sum.loc['name'] = self.name
        else:
            net_sum.loc['name'] = 'Domain n-gram Network'

        net_sum.loc['Proteins'] = len(self.protsOI)
        net_sum.loc['n-grams'] = len(self.ngrams)
        net_sum.loc['Network Isolates'] = nx.number_of_isolates(self.G)
        net_sum.loc['Network Connected Components'] = nx.number_connected_components(self.G)

        if detailed:
            net_sum.loc['Collapsed n-grams'] = len(self.collapsed_ngrams)
            net_sum.loc['Network Edges'] = nx.number_of_edges(self.G)
            net_sum.loc['Maximum Length of Protein Domain Architecture'] = self.retrieve_longest_arch(self.ref)

        return net_sum

    @staticmethod
    def retrieve_longest_arch(ref):
        '''Retrieves the longest domain architecture length.'''
        arch_max = max(ref['Interpro Domain Architecture IDs'].apply(lambda x: len(x.split('|'))))
        return arch_max

    def import_precomp_data(self, adj, ref, interpro_ids, n, collapsed_ngrams=None, **net_attribs):
        '''
        For instances where a precomputed n-gram network was generated already, this will import the data if it
        does not already exist.

        Raises
        ------
        ValueError
            If the instance already holds an adjacency matrix, reference or InterPro IDs.
        '''

        # Check pre-existence of any values
        if ('adj' in self.__dict__) or (self.ref is not None) or (self.interproIDs is not None):
            raise ValueError('Cannot alter already computed n-gram network parameters. Please create a new instance.')

        self.adj = adj
        self.ref = self.check_ref(ref)
        self.interproIDs = interpro_ids
        self._n = n

        # Populate the n-gram lists
        self.ngrams = list(self.adj.columns)
        if collapsed_ngrams is not None:
            self.collapsed_ngrams = collapsed_ngrams

        self.validate_n()

        # Create the n-gram to UniProt mapping. This uses the preexisting function.
        _, ngram_dict = ngramUtilities.get_ngrams_from_df(self.ref, self.interproIDs, arch_num=1, max_ngram=self.n)

        # For the n-grams that were already collapsed removing from the n-gram dict to reduce redundant
        # information.
        if collapsed_ngrams is not None:
            for ngram in collapsed_ngrams:
                del ngram_dict[ngram]
        self.interpro2uniprot = ngram_dict

        # Unpacking only network options
        self.network_params = self.unpack_net_opts(**net_attribs)

        self.generate_network()

    def validate_n(self):
        '''
        Ensure that the n that has been provided matches the n-gram model that has been provided.

        Raises
        ------
        AttributeError
            If an n-gram is longer than n, or n does not match the provided n-gram list.
        '''

        # Check n-grams within the adjacency matrix. default=0 keeps an empty list from crashing max().
        cur_max_n = max((len(x.split('|')) for x in self.ngrams), default=0)
        max_possible = self.retrieve_longest_arch(self.ref)

        # First checks that there is no mismatch in longest extracted n-gram.
        # NOTE(review): these checks fire when an n-gram is longer than n, although the message reads the
        # other way around. The text is left unchanged in case callers match on it.
        if cur_max_n > self.n:
            raise AttributeError('The provided max n-gram length is longer than n-grams in the provided n-gram list.')

        if 'collapsed_ngrams' in self.__dict__:
            col_max = max((len(x.split('|')) for x in self.collapsed_ngrams), default=0)
            if col_max > self.n:
                raise AttributeError('The provided max n-gram length is longer than n-grams in the provided n-gram list.')

        # Now checking whether there may have just been a mistake of the longest n-gram: the longest n-gram is
        # shorter than the longest possible architecture and does not match the provided n.
        if (cur_max_n < max_possible) and (self.n != cur_max_n):
            raise AttributeError('The provided max n-gram does not match the provided n-gram list.')

        if self.n > max_possible:
            warnings.warn('The maximum possible n-gram that can be extracted was smaller than the desired maximum n-gram length.')

    def retrieve_protein_info(self, prot=None, ngram=None):
        '''
        Retrieves the reference information of proteins of interest. If an InterPro ID/n-gram is provided it
        will use that instead and search for the proteins with that ID and return that instead.

        Raises
        ------
        TypeError
            If neither or both of prot and ngram are provided.
        '''

        # Identity checks are used so that array-like protein lists (e.g. a pandas Series) do not trigger an
        # element-wise comparison against None.
        if prot is None and ngram is None:
            raise TypeError('Either a protein list or an n-gram are necessary. Please provide either one.')
        if prot is not None and ngram is not None:
            raise TypeError('Please provide either a protein list or an n-gram and not both.')

        if ngram is not None:
            prot = self.interpro2uniprot[ngram]

        if isinstance(prot, str):
            prot = [prot]

        prot_info = self.ref[self.ref['UniProt ID'].isin(prot)]

        return prot_info

    def retrieve_random_ids(self, num, iters=50, seed=882):
        '''
        Generator of random UniProt IDs from the base network.

        Parameters
        ----------
        num : int
            Number of UniProt IDs in each random draw.
        iters : int
            Number of random draws to generate.
        seed : int
            Seed used to derive the per-draw seeds.

        Yields
        ------
        rand_ids : list
            A random sample of num UniProt IDs from the reference.
        '''

        # Private generator instances give the same draws as seeding the module-level generator, but leave
        # the global random state of the caller untouched between yields.
        seedlist = random.Random(seed).sample(range(100000), k=iters)
        full_id_list = sorted(self.ref['UniProt ID'].tolist())
        for draw_seed in seedlist:
            yield random.Random(draw_seed).sample(full_id_list, k=num)
def create_arch_ids(arch):
    '''
    This takes the architectures returned from the InterPro module of CoDIAC, which follows the format
    Name:InterPro_ID:Start:Stop for each domain separated by a semicolon and creates the full architecture
    with only the IDs separated by a pipe (|).

    Parameters
    ----------
        - arch: str
            The full domain architecture string containing all the domain information

    Returns
    -------
        - arch_id: str
            The abbreviated domain architecture of only InterPro IDs

    Raises
    ------
        - DomainArchError
            If arch is not a string or a domain entry has no InterPro ID field.
    '''

    try:
        # The InterPro ID is the second colon-separated field of every domain entry.
        ids_list = [dom.split(':')[1] for dom in arch.split(';')]
    except (AttributeError, IndexError, TypeError) as err:
        # AttributeError/TypeError: arch is not a str (e.g. a missing value); IndexError: entry without ':'.
        # Only formatting failures are translated, so interrupts and unrelated errors propagate untouched.
        raise DomainArchError('The domain architecture information was incorrectly formatted.') from err

    return '|'.join(ids_list)


class DomainArchError(TypeError):
    '''
    Custom error for domain architectures that are not properly formatted or missing.
    '''
    pass