# Source code for dansy.dansy

import warnings
import random
import networkx as nx
import pandas as pd
import dansy.ngramUtilities as ngramUtilities
import dansy.helper as helper


# [docs]
class dansy:
    '''
    A domain n-gram network built off either a list of proteins of interest or a reference file generated by CoDIAC. If InterPro IDs are provided, only n-grams that contain those IDs are extracted. Default values will generate a 10-gram network model.

    Parameters
    -----------
    protsOI : list
        List of UniProt IDs whose n-grams are desired to generate the n-gram network. A single ID string, a set or a tuple is also accepted and converted to a list.
    ref : pandas DataFrame (Recommended)
        Dataframe that has been generated from CoDIAC containing both InterPro and UniProt information. A string is handed to ``ngramUtilities.import_reference_file``.
    n : int (Optional)
        N-gram lengths to be extracted
    interproIDs : list (Optional)
        List of Interpro IDs to extract n-grams. If omitted, all n-grams will be extracted.
    **kwargs
        Optional analysis parameters (``min_arch``, ``max_node_len``, ``collapse``, ``readable_flag``, ``verbose``), a descriptive ``name`` and basic networkx drawing parameters (``pos``, ``node_size``, ``node_color``, ``linewidths``, ``edge_color``, ``width``, ``edgecolors``). Only used when ``protsOI`` and/or ``ref`` is provided.

    Attributes
    ----------
    G: networkx Graph
        The network graph representation of the DANSy n-gram network
    ref: pandas DataFrame
        The reference file information for the proteins within the dataset
    n: int
        The maximum length of n-grams being extracted (read-only)
    interproIDs: list
        A list of all protein domain InterPro IDs that were found within the dataset
    protsOI: list
        The UniProt IDs for the proteins found within the dataset
    ngrams: list
        The extracted domain n-grams
    collapsed_ngrams: list
        The domain n-grams which were collapsed into other n-grams which represent the set of proteins
    adj: pandas DataFrame
        The adjacency matrix for the n-gram network for the DANSy analysis
    interpro2uniprot: dict
        The keys of InterPro IDs with values of a list of UniProt IDs that have the InterPro ID
    min_arch: int (Default: 1)
        The minimum number of domain architectures for an n-gram to be retained.
    max_node_len: int
        The maximum n-gram length that will be retained during the collapsing step to represent n-grams sharing the same set of proteins. This will not be larger than n (Default of 10).
    collapse: bool
        Whether the n-grams were collapsed
    readable_flag: bool
        Whether the n-grams are human-legible
    verbose: bool
        Whether progress statements are to be printed during calculations
    network_params: dict
        Key-value pairs of acceptable networkx drawing parameters
    '''

    # Optional keyword arguments that are stored directly as attributes.
    _OPTS = ('name',
             'collapse',
             'max_node_len',
             'min_arch',
             'readable_flag',
             'verbose')

    # Keyword arguments that are collected into network_params and used when drawing.
    _NETWORK_OPTS = ('pos',
                     'node_size',
                     'node_color',
                     'linewidths',
                     'edge_color',
                     'width',
                     'edgecolors')

    def __init__(self,protsOI = None, ref = None,n = 10,interproIDs = None, **kwargs):

        # Bare minimum attributes required for setting up an empty n-gram network.
        self.protsOI = protsOI
        self.ref = ref
        self._n = n
        self.interproIDs = interproIDs

        # Only build the network when there is something to build it from; otherwise leave an empty shell
        # that can later be filled through import_precomp_data.
        if (self.protsOI is not None) or (self.ref is not None):
            self.populate_ngramNet(**kwargs)
        else:
            self.network_params = {}

    @property
    def n(self):
        '''The maximum n-gram length. Read-only once the instance exists.'''
        return self._n

    @n.setter
    def n(self, value):
        raise AttributeError('The maximum length of n-gram (n) cannot be readjusted.')

    def populate_ngramNet(self,**attribs):
        '''
        Generates the n-gram network: resolves the proteins of interest and the reference data, applies the user defined and default parameters, and then builds the network.
        '''
        # Proteins and reference data first, as everything downstream depends on them.
        self.add_prots()
        self.add_ref_df()

        # Unpack the additional, optional parameters that alter the behavior of the network generated or provide descriptions.
        self.add_interpro_ids()
        self.set_ngram_parameters(**attribs)
        self.unpack_opts(**attribs)

        # Now generating the actual Domain N-gram Network that is the basis of this class
        self.build_dgNet()

    # Adding in protein of interest
    def add_prots(self):
        '''
        Normalizes ``protsOI`` to a list. If no protein list was provided but a reference was, every UniProt ID of the reference is used.

        Raises
        ------
        ValueError
            If ``protsOI`` is neither a string, list, set nor tuple.
        '''

        # If the protein list is not provided but the reference file is, then import the entire protein list from the reference file
        if self.protsOI is None and self.ref is not None:
            ref = self.check_ref(self.ref)
            self.protsOI = self.extract_uniprots(ref)

        # Ensuring the correct data type for downstream analysis
        if isinstance(self.protsOI, str):
            self.protsOI = [self.protsOI]
        elif isinstance(self.protsOI, (set, frozenset, tuple)):
            self.protsOI = list(self.protsOI)
        elif not isinstance(self.protsOI, list):
            raise ValueError('UniProt IDs should be provided as a list.')

    # Adding in the reference dataframe for the n-gram analysis
    def add_ref_df(self):
        '''
        Adds in the dataframe that contains all the reference information for the protein domain architectures, restricted to the proteins of interest.
        '''
        # Import the complete proteome reference file if no reference file has been specified.
        if self.ref is None:
            ref_data = helper.import_proteome_files()

        # Ensure the reference data is a dataframe and not just the string to the reference file.
        else:
            ref_data = self.check_ref(self.ref)
        self.get_refs(ref_data)

    def add_interpro_ids(self):
        '''
        Add the list of interpro ids that will be used to find the n-grams of interest. When none were provided, the keys returned by ``ngramUtilities.generate_interpro_conversion`` for the reference are used.
        '''
        if self.interproIDs is None:
            tokens = ngramUtilities.generate_interpro_conversion(self.ref)
            self.interproIDs = list(tokens.keys())

    # Getting the reference data frame used for each instance of the class
    def get_refs(self,ref_df):
        '''Stores the rows of ``ref_df`` whose 'UniProt ID' is one of the proteins of interest as ``self.ref``.'''
        self.ref = ref_df[ref_df['UniProt ID'].isin(self.protsOI)]

    # Building the actual domain n-gram network
    def build_dgNet(self):
        '''
        Runs ``ngramUtilities.full_ngram_analysis`` with the stored parameters, stores its results and builds the networkx graph.
        '''
        # The second returned value is not used by this class.
        (self.adj,
         _,
         self.interpro2uniprot,
         self.collapsed_ngrams,
         self.interpro_conversion) = ngramUtilities.full_ngram_analysis(
            self.ref,
            self.interproIDs,
            max_ngram=self.n,
            min_arch=self.min_arch,
            readable_flag=self.readable_flag,
            max_node_len=self.max_node_len,
            concat_flag=self.collapse,
            verbose=self.verbose)

        # Get the full list of n-grams
        self.ngrams = list(self.adj.columns)
        self.generate_network()

    def generate_network(self):
        '''
        Uses networkx to generate a network view of the n-gram network. This will remove the self-loop edges from the n-gram network, but does not touch the adjacency matrix.
        '''
        # In some instances all provided proteins will have the exact same domain architecture so a networkx graph generation is somewhat useless and fails
        if len(self.adj.columns) > 1:
            G = nx.from_pandas_adjacency(self.adj)
            G.remove_edges_from(nx.selfloop_edges(G))
        else:
            # verbose is not set when the instance was filled through import_precomp_data, so fall back to the class default.
            if getattr(self, 'verbose', True):
                if len(self.adj.columns) == 1:
                    print('Only one n-gram was found in the dataset. Please double-check inputs.')
                else:
                    print('No n-grams were found in the dataset. Please double-check inputs.')
            G = nx.Graph()
            # add_nodes_from also copes with an adjacency matrix that has no columns at all.
            G.add_nodes_from(self.adj.columns)
        self.G = G

    # Utility to draw basic network, but is recommended to use the actual networkx package.
    def draw_network(self):
        '''
        Draws a basic version of the Graph with the networkx spring layout implementation. It is recommended to use the actual networkx package for a full implementation.
        '''

        # Default values
        draw_params = {'node_size':1,
                       'edgecolors':'k',
                       'linewidths':0.1,
                       'width':0.25,
                       'edge_color':'#808080',
                       }

        # Every supported drawing option the user set overrides or extends the defaults. This includes
        # options without a default here, such as 'pos' and 'node_color'.
        user_params = getattr(self, 'network_params', None) or {}
        for key in self._NETWORK_OPTS:
            if key in user_params:
                draw_params[key] = user_params[key]

        nx.draw(self.G,**draw_params)

    # Check the reference file status
    @staticmethod
    def check_ref(ref):
        '''
        Checks whether the provided reference is a string to a specific file or an already imported DataFrame. It will return the actual DataFrame.

        A DataFrame that only has the 'Interpro Domains' column gets the 'Interpro Domain Architecture IDs' column added on a copy, so the DataFrame passed in by the caller is left untouched.

        Raises
        ------
        DomainArchError
            If a DataFrame has neither of the two domain architecture columns.
        TypeError
            If ``ref`` is neither a string nor a DataFrame.
        '''
        if isinstance(ref, str):
            return ngramUtilities.import_reference_file(ref)
        if not isinstance(ref, pd.DataFrame):
            raise TypeError('Reference File is not correct')

        # Check that the reference file has the necessary information for generating the n-gram network
        domains_col = 'Interpro Domains'
        ids_col = 'Interpro Domain Architecture IDs'
        if ids_col in ref.columns: # The IDs version of the domain architecture is already present
            return ref
        if domains_col in ref.columns:
            ref_df = ref.copy()
            ref_df[ids_col] = ref_df[domains_col].map(create_arch_ids)
            return ref_df

        raise DomainArchError('Interpro Domain Architecture Information is missing. ')

    # Extract all UniProt IDs from a reference file if only the reference file is to be used
    @staticmethod
    def extract_uniprots(ref):
        '''
        Extracts a list of the entire UniProts that were contained within a provided reference file.

        Raises
        ------
        ValueError
            If the reference has no 'UniProt ID' column.
        '''
        if 'UniProt ID' not in ref.columns:
            raise ValueError('The reference file is missing a UniProt ID column. Please check or regenerate the reference file.')

        return ref['UniProt ID'].tolist()

    # Return a human readable version of specified n-grams
    def return_legible_ngram(self, ngram):
        '''
        Given an n-gram of interest, converts each pipe-separated domain through ``interpro_conversion`` and returns the pipe-joined result.
        '''
        # Splitting a 1-gram yields a single element, so one code path serves both cases.
        return '|'.join(self.interpro_conversion[dom] for dom in ngram.split('|'))

    def ngram_protein_count(self, ngram):
        '''
        Returns the number of proteins an individual n-gram was found within.

        For a collapsed n-gram the count of the single retained n-gram containing it is returned.

        Raises
        ------
        ValueError
            If the n-gram is not found, or if a collapsed n-gram is contained in several retained n-grams.
        '''
        # collapsed_ngrams is absent when precomputed data was imported without it.
        collapsed = getattr(self, 'collapsed_ngrams', None) or ()

        if ngram in collapsed:
            # Compare on whole domains: wrapping both sides in the separator prevents one domain name
            # matching inside a longer one (e.g. 'SH2' inside 'SH2_like').
            needle = f'|{ngram}|'
            cands = [i for i in self.ngrams if needle in f'|{i}|']
            if len(cands) > 1:
                raise ValueError('The n-gram of interest was somehow found across several non-redundant n-grams?')
            if not cands:
                raise ValueError('The inputted n-gram is not found.')
            return len(self.interpro2uniprot[cands[0]])

        if ngram in self.ngrams:
            return len(self.interpro2uniprot[ngram])

        raise ValueError('The inputted n-gram is not found.')

    def unpack_opts(self, **kwargs):
        '''
        Unpacks both necessary and optional attributes that can be used for downstream analysis. Recognized options are set as attributes, drawing options are stored in ``network_params`` and anything else is ignored (and reported when verbose).
        '''
        ignored_opts = []

        for k,v in kwargs.items():
            if k in self._OPTS:
                setattr(self, k, v)
            elif k not in self._NETWORK_OPTS:
                # Drawing options are handled by unpack_net_opts below.
                ignored_opts.append(k)

        self.network_params = self.unpack_net_opts(**kwargs)

        if ignored_opts and getattr(self, 'verbose', True):
            print(f'The following parameters were ignored:{ignored_opts}')

    def unpack_net_opts(self, **kwargs):
        '''
        Returns the subset of ``kwargs`` that relates to the basic drawing of a network. This does not fully encompass all parameters found within an networkx function.
        '''
        return {k: v for k,v in kwargs.items() if k in self._NETWORK_OPTS}

    def set_ngram_parameters(self, **kwargs):
        '''
        Sets the n-gram analysis parameters as attributes, using the given values to override defaults as necessary.
        '''

        # Default values for the n-gram analysis
        ngram_parameters = {'min_arch':1,
                            'max_node_len':None,
                            'collapse':True,
                            'readable_flag':False,
                            'verbose':True}

        for k,v in kwargs.items():
            if k in ngram_parameters:
                ngram_parameters[k] = v

        for k,v in ngram_parameters.items():
            setattr(self, k, v)

    def summary(self, detailed = False):
        '''
        Output a summary of key features that are represented within the n-gram network as a single-column DataFrame. With ``detailed`` the collapsed n-gram count, edge count and longest domain architecture are added.
        '''

        # Generating the baseline dataframe that will hold the summarized data.
        index = ['name',
                 'Proteins',
                 'n-grams',
                 'Network Isolates',
                 'Network Connected Components',
                 ]

        net_sum = pd.DataFrame(index=index,columns=[''])
        # Only a name set on the instance counts.
        net_sum.loc['name'] = self.__dict__.get('name', 'Domain n-gram Network')
        net_sum.loc['Proteins'] = len(self.protsOI)
        net_sum.loc['n-grams'] = len(self.ngrams)
        net_sum.loc['Network Isolates'] = nx.number_of_isolates(self.G)
        net_sum.loc['Network Connected Components'] = nx.number_connected_components(self.G)

        if detailed:
            # collapsed_ngrams is absent when precomputed data was imported without it.
            net_sum.loc['Collapsed n-grams'] = len(getattr(self, 'collapsed_ngrams', None) or [])
            net_sum.loc['Network Edges'] = nx.number_of_edges(self.G)
            net_sum.loc['Maximum Length of Protein Domain Architecture'] = self.retrieve_longest_arch(self.ref)

        return net_sum

    @staticmethod
    def retrieve_longest_arch(ref):
        '''Retrieves the longest domain architecture length, i.e. the largest number of pipe-separated IDs in the 'Interpro Domain Architecture IDs' column.'''

        arch_max = max(ref['Interpro Domain Architecture IDs'].apply(lambda x: len(x.split('|'))))

        return arch_max

    def import_precomp_data(self,adj,ref,interpro_ids,n,collapsed_ngrams = None,**net_attribs):
        '''
        For instances where a precomputed n-gram network was generated already, this will import the data if it does not already exist.

        Raises
        ------
        ValueError
            If the instance already holds an adjacency matrix, reference or InterPro IDs.
        '''

        # Check pre-existence of any values
        if ('adj' in self.__dict__) or (self.ref is not None) or (self.interproIDs is not None):
            raise ValueError('Cannot alter already computed n-gram network parameters. Please create a new instance.')

        self.adj = adj
        self.ref = self.check_ref(ref)
        self.interproIDs = interpro_ids
        self._n = n

        # The proteins are otherwise never set on this path, which breaks summary().
        if self.protsOI is None and 'UniProt ID' in self.ref.columns:
            self.protsOI = self.extract_uniprots(self.ref)

        # Populate the n-gram lists
        self.ngrams = list(self.adj.columns)
        if collapsed_ngrams is not None:
            self.collapsed_ngrams = collapsed_ngrams

        self.validate_n()

        # Create the n-gram to UniProt mapping. This uses the preexisting function.
        _,ngram_dict = ngramUtilities.get_ngrams_from_df(self.ref, self.interproIDs,arch_num=1, max_ngram=self.n)

        # For the n-grams that were already collapsed removing from the n-gram dict to reduce redundant information.
        if collapsed_ngrams is not None:
            for ngram in collapsed_ngrams:
                # A collapsed n-gram missing from the mapping needs no removal, so do not fail on it.
                ngram_dict.pop(ngram, None)
        self.interpro2uniprot = ngram_dict

        # Unpacking only network options
        self.network_params = self.unpack_net_opts(**net_attribs)

        self.generate_network()

    def validate_n(self):
        '''
        Ensure that the n that has been provided matches the n-gram model that has been provided.

        Raises
        ------
        AttributeError
            If an n-gram (or collapsed n-gram) is longer than n, or if n differs from the longest n-gram while longer architectures exist in the reference.

        Warns
        -----
        UserWarning
            If n is larger than the longest domain architecture in the reference.
        '''

        # Check n-grams within the adjacency matrix. default=0 keeps an empty list from raising.
        cur_max_n = max((len(x.split('|')) for x in self.ngrams), default=0)
        max_possible = self.retrieve_longest_arch(self.ref)

        # First checks that there is no mismatch in longest extracted n-gram.
        if cur_max_n > self.n:
            raise AttributeError('The provided max n-gram length is shorter than n-grams in the provided n-gram list.')

        if 'collapsed_ngrams' in self.__dict__:
            col_max = max((len(x.split('|')) for x in self.collapsed_ngrams), default=0)
            if col_max > self.n:
                raise AttributeError('The provided max n-gram length is shorter than n-grams in the provided n-gram list.')

        # If longer n-grams could have been extracted from the reference, the longest n-gram present must equal n.
        if (cur_max_n < max_possible) and (self.n != cur_max_n):
            raise AttributeError('The provided max n-gram does not match the provided n-gram list.')
        if self.n >  max_possible:
            warnings.warn('The maximum possible n-gram that can be extracted was smaller than the desired maximum n-gram length.')

    def retrieve_protein_info(self, prot = None, ngram = None):
        '''
        Retrieves the reference information of proteins of interest. If an InterPro ID/n-gram is provided it will use that instead and return the proteins mapped to it in ``interpro2uniprot``.

        Raises
        ------
        TypeError
            If neither or both of ``prot`` and ``ngram`` are provided.
        '''

        # Identity checks, since == on a pandas/numpy collection is element-wise and cannot be used in a condition.
        if prot is None and ngram is None:
            raise TypeError('Either a protein list or an n-gram are necessary. Please provide either one.')
        if prot is not None and ngram is not None:
            raise TypeError('Please provide either a protein list or an n-gram and not both.')

        if ngram is not None:
            prot = self.interpro2uniprot[ngram]

        if isinstance(prot, str):
            prot = [prot]

        return self.ref[self.ref['UniProt ID'].isin(prot)]

    def retrieve_random_ids(self,num, iters = 50, seed = 882):
        '''
        Generator of random UniProt IDs from the base network. Yields ``iters`` lists of ``num`` IDs sampled without replacement; the same seed gives the same lists.
        '''
        # Private generators produce the same values as seeding the random module would, without
        # resetting the process-wide random state for the caller.
        seedlist = random.Random(seed).sample(range(100000), k=iters)
        # Sorted so that the sampling does not depend on the row order of the reference.
        full_id_list = sorted(self.ref['UniProt ID'].tolist())
        for iter_seed in seedlist:
            yield random.Random(iter_seed).sample(full_id_list, k = num)



def create_arch_ids(arch):
    '''
    This takes the architectures returned from the InterPro module of CoDIAC, which follows the format Name:InterPro_ID:Start:Stop for each domain separated by a semicolon and creates the full architecture with only the IDs separated by a pipe (|).

    Parameters
    ----------
        - arch: str
            The full domain architecture string containing all the domain information

    Returns
    -------
        - arch_id: str
            The abbreviated domain architecture of only InterPro IDs

    Raises
    ------
        - DomainArchError
            If arch is not a string or any domain entry has no second colon-separated field
    '''
    try:
        # The InterPro ID is the second colon-separated field of every domain entry.
        ids_list = [domain.split(':')[1] for domain in arch.split(';')]
    except (AttributeError, IndexError, TypeError) as err:
        # Only formatting problems are translated; the original error is kept as the cause for debugging.
        raise DomainArchError('The domain architecture information was incorrectly formatted.') from err

    return '|'.join(ids_list)

class DomainArchError(TypeError):
    '''Raised when domain architecture information is missing or is not properly formatted.'''