Source code for dansy.dedansy

import warnings
import random
import itertools
import time
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import dansy.ngramUtilities as ngramUtilities
import dansy.dedansy_algorithm as algorithm
from dansy.dansy import dansy
from dansy.network_separation_helpers import network_separation,build_network_reference_dict
from dansy.enrichment_plotting_helpers import plot_functional_scores, gather_enrichment_results, calc_ngram_fpr_vals, plot_enriched_ngrams


class DEdansy(dansy):
    '''
    A container class of multiple domain n-gram networks related to a
    differentially expressed dataset generated using the DESeq analysis
    pipeline. It provides methods to analyze and contrast pairs of domain
    architecture subnetworks to understand changes in the functional
    molecular ecosystems available to different conditions.

    Parameters
    ----------
    dataset : pandas DataFrame
        The expression dataset that contains the proteins of interest and
        the expression values used to designate differentially expressed
        genes/proteins.
    uniprot_ref : pandas DataFrame (Optional)
        The base reference file that contains all the proteins of interest
        within the dataset. Note: including this is recommended when
        multiple DEdansy instances that share a common set of proteins are
        being instantiated.
    n : int (Optional)
        Length of n-grams to extract. Default is 10.
    id_conv : pandas DataFrame (Optional)
        A dataframe with a column of UniProt IDs and a column of the IDs
        used in the expression dataset, to help in converting between them.
    conv_cols : str (Optional)
        The name of the matching ID column in the id_conv dataframe.
        Assumes the naming convention generated from pybiomart.
    data_ids : str (Optional)
        The name of the column in the dataset dataframe for the IDs to be
        converted to UniProt IDs.
    penalty : 'dynamic' or int (Optional)
        The value, or 'dynamic', for the penalty used during network
        separation calculations.
    run_conversion : bool
        Whether the dataset IDs have to be converted using a provided
        id_conv dataframe.
    kwargs
        Additional keyword arguments for generating the DANSy network. See
        dansy.set_ngram_parameters for details; the acceptable values are
        reproduced below:
        - 'min_arch'
        - 'max_node_len'
        - 'collapse'
        - 'readable_flag'
        - 'verbose'

    Attributes
    ----------
    At initialization:

    dataset : pandas DataFrame
        The expression dataset for DANSy analysis.
    ref : pandas DataFrame
        The reference file information for the proteins within the dataset.
    n : int
        The maximum length of n-grams being extracted.
    interproIDs : list
        A list of all protein domain InterPro IDs that were found within
        the dataset.
    protsOI : list
        The UniProt IDs for the proteins found within the dataset.
    ngrams : list
        The extracted domain n-grams.
    collapsed_ngrams : list
        The domain n-grams which were collapsed into other n-grams that
        represent the same set of proteins.
    G : networkx Graph
        The network graph representation of the DANSy n-gram network.
    adj : pandas DataFrame
        The adjacency matrix of the n-gram network for the DANSy analysis.
    interpro2uniprot : dict
        Keys of InterPro IDs with values of a list of UniProt IDs that
        carry the InterPro ID.
    id_conversion_dict : dict
        A dictionary containing the conversion of the provided gene/protein
        ID to a UniProt ID. If UniProt IDs were provided, this is a dict of
        UniProt to UniProt IDs.
    data_id_col : str
        The ID column used in the dataset for conversion.
    network_params : dict
        Key-value pairs of acceptable networkx drawing parameters.
    min_arch : int (Default: 1)
        The minimum number of domain architectures for an n-gram to be
        retained.
    max_node_len : int
        The maximum n-gram length retained during the collapsing step to
        represent n-grams sharing the same set of proteins. This will not
        be larger than n (default of 10).
    collapse : bool
        Whether the n-grams were collapsed.
    readable_flag : bool
        Whether the n-grams are human-legible.
    verbose : bool
        Whether progress statements are printed during calculations.

    After establishing comparison metadata:

    comp_metadata : pandas DataFrame
        The metadata information for the comparisons of interest in the
        dataset.

    After DEG calculations:

    up_DEGs/down_DEGs : list
        Lists of UniProt IDs that were designated as up or down for a
        specific condition.
    up_ngrams/down_ngrams : list
        The n-grams for either the up- or down-regulated genes/proteins.
    alpha : float
        The p-value cutoff for designating DEGs.
    fcthres : float
        The fold-change cutoff for designating DEGs.
    pval_data_col : str
        The column name in the dataset used for the p-value data for DEGs.
    fc_data_col : str
        The column name in the dataset used for the fold-change data for
        DEGs.

    After creating deDANSy scores:

    scores : pandas DataFrame
        The scores, p-values, and FPR values for the deDANSy Separation and
        Distinction scores for the comparisons.
    raw_dists : pandas DataFrame
        The raw distribution of values for each score.
    fpr_dists : pandas DataFrame
        The raw distribution of FPR values for each score.
    '''

    def __init__(self, dataset, uniprot_ref=None, n=10, id_conv=None,
                 conv_cols='Gene stable ID', data_ids='gene_id',
                 penalty='dynamic', run_conversion=True, **kwargs):
        # Bare minimum attributes required for setting up an empty n-gram network.
        self.dataset = dataset
        self.ref = uniprot_ref
        self._n = n
        self.interproIDs = None

        # Converting the dataset IDs for individual genes to the UniProt IDs
        if 'dbl_check' in kwargs:
            check_IDs_flag = kwargs['dbl_check']
        else:
            check_IDs_flag = False
        if run_conversion:
            if id_conv is None:
                raise ValueError('Missing an ID converting dataframe.')
            else:
                self.id_conversion_dict = create_id_conv(dataset, id_conv, conv_cols, data_ids, check_IDs_flag)
                self.protsOI = convert_2_uniprotIDs(dataset, id_conv, conv_cols, data_ids, check_IDs_flag)
        else:
            self.protsOI = dataset[data_ids].tolist()
            self.id_conversion_dict = {k: [k] for k in self.protsOI}

        # Saving the data_id column for instances when the DEGs have to be calculated.
        if isinstance(data_ids, list):
            self.data_id_col = data_ids[0]
        else:
            self.data_id_col = data_ids
        self.conversion_id_col = conv_cols

        # Now making sure there is a common reference that can be used for
        # generating the n-gram networks.
        if self.ref is None:
            self.add_ref_df()
        self.populate_ngramNet(**kwargs)
        if self.verbose:
            print('Building the reference network information.')
        self.ref_data = build_network_reference_dict(self, penalty=penalty)
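    # Usage sketch: the inputs below are hypothetical stand-ins, not part of
    # the package -- `deseq_df` for a DESeq2 results table with a 'gene_id'
    # column, `biomart_df` for a pybiomart export with 'Gene stable ID' and
    # 'UniProtKB/Swiss-Prot ID' columns.
    #
    #   ddn = DEdansy(deseq_df, n=10, id_conv=biomart_df,
    #                 conv_cols='Gene stable ID', data_ids='gene_id',
    #                 penalty='dynamic')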
    def DEG_network_sep(self, force_run=False):
        '''
        Computes the network separation of the two conditions that designate
        individual genes as differentially expressed. Silently returns nan
        if the calculation fails.
        '''
        # Make sure the differentially expressed genes have been defined.
        if not (hasattr(self, 'up_DEGs') and hasattr(self, 'down_DEGs')):
            raise ValueError('DEGs do not exist. Please run calc_DEG_ngrams.')

        if len(self.up_ngrams) == 0 or len(self.down_ngrams) == 0:
            ns = np.nan
        else:
            try:
                # Generating networks for the individual DEG conditions
                up_net = self.G.subgraph(self.up_ngrams)
                dn_net = self.G.subgraph(self.down_ngrams)
                ns = network_separation(up_net, dn_net, self.ref_data, force_run=force_run)
            except Exception:
                ns = np.nan
        return ns
    def calc_DEG_ngrams(self, comp, batch_mode=False):
        '''
        Defines the DEG UniProt IDs and the associated n-grams for the
        dataset of interest.
        '''
        # Check whether a set of DEGs already exists; if so, it will be overwritten.
        if hasattr(self, 'up_DEGs'):
            verbose_flag = True
            if hasattr(self, 'G_collapsed'):
                del self.G_collapsed
        else:
            verbose_flag = False
        # This is only for running several comparisons while keeping track of general progress.
        if batch_mode:
            verbose_flag = False

        # Extracting the data columns for the comparison of interest to
        # define the differentially expressed genes/proteins.
        compOI_info = self.comp_metadata.loc[comp]
        data_cols = [compOI_info['fc_col'], compOI_info['pval_col']]
        fc_thres = compOI_info['fc_thres']
        alpha = compOI_info['alpha']

        # Reducing the large dataframe to the info needed for generating the DEG networks
        cols = [self.data_id_col] + data_cols
        dataset_OI = self.dataset.filter(cols)

        # Getting the differentially expressed genes
        deg_up = list(dataset_OI[(dataset_OI[data_cols[1]] <= alpha) & (dataset_OI[data_cols[0]] > fc_thres)][self.data_id_col])
        deg_dn = list(dataset_OI[(dataset_OI[data_cols[1]] <= alpha) & (dataset_OI[data_cols[0]] < -fc_thres)][self.data_id_col])

        # Keeping a record of the data columns for a summary
        self.pval_data_col = data_cols[1]
        self.fc_data_col = data_cols[0]
        # Recording the thresholds for DEG values
        self.alpha = alpha
        self.fcthres = fc_thres

        # Converting to UniProt IDs to highlight the n-grams within the
        # network that were retained for each condition.
        up_DEGs = [v for k, v in self.id_conversion_dict.items() if k in deg_up]
        up_DEGs = set(itertools.chain.from_iterable(up_DEGs))
        down_DEGs = [v for k, v in self.id_conversion_dict.items() if k in deg_dn]
        down_DEGs = set(itertools.chain.from_iterable(down_DEGs))

        # Due to random sampling issues, make sure the DEGs are in
        # alphabetical order and convert them to lists.
        up_DEGs = sorted(up_DEGs)
        down_DEGs = sorted(down_DEGs)
        self.set_DEG_ngrams(up_DEGs, down_DEGs, verbose=verbose_flag)
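    # Sketch (comparison name hypothetical): calc_DEG_ngrams reads the
    # thresholds from comp_metadata, so create_contrast_metadata must have
    # been run first; afterwards the DEG sets and n-grams are attributes.
    #
    #   ddn.calc_DEG_ngrams('treated_vs_control')
    #   len(ddn.up_DEGs), len(ddn.down_DEGs)
    #   ddn.DEG_network_sep()   # returns a float, or nan if a side is empty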
    def set_DEG_ngrams(self, up_DEGs, down_DEGs, collapse=True, verbose=True):
        '''
        Actually sets the DEGs so that the n-grams are calculated, while
        allowing custom sets to be generated for very specific purposes.
        Using this directly is not recommended.
        '''
        if hasattr(self, 'up_DEGs') and verbose:
            print('Will be overwriting existing DEG data.')

        # Setting the DEGs
        self.up_DEGs = up_DEGs
        self.down_DEGs = down_DEGs

        # Getting the n-grams associated with the collective DEGs
        up_ngram_cands = [k for k, v in self.interpro2uniprot.items() if set(v).intersection(self.up_DEGs)]
        down_ngram_cands = [k for k, v in self.interpro2uniprot.items() if set(v).intersection(self.down_DEGs)]

        # Getting the n-gram candidates
        up_ngram_dict = {k: set(v).intersection(self.up_DEGs) for k, v in self.interpro2uniprot.items() if k in up_ngram_cands}
        down_ngram_dict = {k: set(v).intersection(self.down_DEGs) for k, v in self.interpro2uniprot.items() if k in down_ngram_cands}

        # Collapsing these to the non-redundant ones
        if collapse:
            up_ngram_dict, _ = ngramUtilities.concatenate_ngrams(up_ngram_dict)
            down_ngram_dict, _ = ngramUtilities.concatenate_ngrams(down_ngram_dict)

        # Exporting to the object
        self.up_ngrams = list(up_ngram_dict.keys())
        self.down_ngrams = list(down_ngram_dict.keys())
    def plot_DEG_ns(self, pos=[], deg_labels=[], large_cc_mode=False):
        '''
        Using the defined differentially expressed genes, displays a network
        graph of the n-grams that are associated with or shared between the
        DEG conditions.
        '''
        # Setting default labels, otherwise using the provided label names for the legend.
        if deg_labels:
            up_label = deg_labels[0]
            down_label = deg_labels[1]
        else:
            up_label = 'Up'
            down_label = 'Down'

        # Retrieving the n-grams across both DEG conditions to filter out any
        # isolates/connected components in the reference network that are not used.
        all_deg_ngrams = set(self.up_ngrams).union(self.down_ngrams)
        if large_cc_mode:
            large_ref_cc = max(nx.connected_components(self.G), key=len)
            all_deg_ngrams = all_deg_ngrams.intersection(large_ref_cc)

        # Setting up the node color list for plotting
        node_colors = []
        for node in self.G.subgraph(all_deg_ngrams):
            if (node in self.up_ngrams) and (node in self.down_ngrams):
                node_colors.append('tab:purple')
            elif node in self.up_ngrams:
                node_colors.append('tab:cyan')
            elif node in self.down_ngrams:
                node_colors.append('tab:red')
            else:
                node_colors.append('tab:gray')

        # Dropping connected components not shared between the DEGs and the reference network
        if large_cc_mode:
            cc_2_keep = set(large_ref_cc)
        else:
            cc_2_keep = set()
            for cc in nx.connected_components(self.G):
                if set(cc).intersection(all_deg_ngrams):
                    cc_2_keep.update(cc)

        # Getting the node positions for network drawing if not supplied.
        if pos == [] and hasattr(self, 'network_params'):
            if 'pos' in self.network_params:
                pos = self.network_params['pos']
            else:
                pos = nx.spring_layout(self.G, k=0.05)
        elif pos == []:
            pos = nx.spring_layout(self.G, k=0.05)

        # Getting all the basic network parameters that are defaults in the
        # n-gram networks.
        basic_network_params = {'node_size': 1,
                                'edgecolors': 'k',
                                'linewidths': 0.1,
                                'width': 0.25,
                                'edge_color': '#808080'}
        net_draw_params = {}
        for param in basic_network_params:
            if param in self.network_params:
                net_draw_params[param] = self.network_params[param]
            else:
                self.network_params[param] = basic_network_params[param]
                net_draw_params[param] = basic_network_params[param]

        # Now drawing and adding legends
        plt.figure(figsize=(2, 2), dpi=300)
        nx.draw(self.G.subgraph(cc_2_keep), pos, node_color='tab:gray', alpha=0.1, **net_draw_params)
        nx.draw(self.G.subgraph(all_deg_ngrams), pos, node_color=node_colors, **net_draw_params)

        # Legend drawing
        plt.scatter([], [], c='tab:cyan', s=1, label=up_label)
        plt.scatter([], [], c='tab:red', s=1, label=down_label)
        plt.scatter([], [], c='tab:purple', s=1, label='Both')
        plt.legend(bbox_to_anchor=(1, 0.5), frameon=False)
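    # Sketch: reusing one precomputed layout keeps the plots comparable
    # across comparisons (label strings are illustrative).
    #
    #   pos = nx.spring_layout(ddn.G, k=0.05, seed=0)
    #   ddn.plot_DEG_ns(pos=pos, deg_labels=['Up (treated)', 'Down (treated)'],
    #                   large_cc_mode=True)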
    def deg_summary(self, detailed=False):
        '''
        Provides a summary of the DEG information that has been used within
        this dataset.
        '''
        summary_df = pd.DataFrame(index=['p-value threshold', 'Fold change threshold',
                                         'Up Regulated DEGs', 'Down Regulated DEGs'],
                                  columns=[''])
        vals = [self.alpha, self.fcthres, len(self.up_DEGs), len(self.down_DEGs)]
        for i, v in zip(summary_df.index, vals):
            summary_df.loc[i] = v
        if detailed:
            extended_inds = ['Data ID column', 'p-val data column', 'FC data column',
                             'Up n-grams', 'Down n-grams', 'Common n-grams']
            # Getting common n-gram counts
            common = set(self.up_ngrams).intersection(self.down_ngrams)
            extended_vals = [self.data_id_col, self.pval_data_col, self.fc_data_col,
                             len(self.up_ngrams), len(self.down_ngrams), len(common)]
            for i, v in zip(extended_inds, extended_vals):
                summary_df.loc[i] = v
        return summary_df
    def ngram_DEG_hypergeom(self, condition):
        '''
        Performs the hypergeometric over-representation test on the n-grams
        associated with the different conditions.

        Parameters
        ----------
        condition : str
            Which condition to perform the test on. Must be either Up or
            Down (case insensitive).

        Returns
        -------
        p_vals : dict
            The p-value of each n-gram given the condition, relative to the
            whole potential universe of the n-gram reference background of
            the n-gram network.
        '''
        # Make the condition input all lower case
        condition = condition.lower()
        if condition == 'up':
            ngramsOI = self.up_ngrams
            degsOI = self.up_DEGs
        elif condition == 'down':
            ngramsOI = self.down_ngrams
            degsOI = self.down_DEGs
        else:
            raise ValueError('Please specify either up or down.')
        p = ngram_enrichment(self, prots=degsOI, ngrams=ngramsOI)
        return p
    def create_contrast_metadata(self, comparisons, fc_cols, pval_cols, fcs=1, alphas=0.05, delim=None):
        '''
        Creates the metadata for the deDANSy object that contains the
        information for each comparison of interest and the cutoffs used to
        define differentially expressed genes/proteins (DEGs). The same
        number of columns must be provided for both the fold-change and
        p-value columns.

        Parameters
        ----------
        comparisons : list
            All comparisons that will be used, matching what is provided in
            the deDANSy dataset.
        fc_cols : str or list
            Either a stem or a list of columns used for the fold changes to
            define DEGs. (Use the delim parameter and a stem if recurring
            patterns are used where the comparison is at the end.)
        pval_cols : str or list
            Either a stem or a list of columns used for the p-values to
            define DEGs. (Use the delim parameter and a stem if recurring
            patterns are used where the comparison is at the end.)
        fcs : float or list
            The fold-change cutoff. Can either be a single value or a list
            of values of the same size as comparisons.
        alphas : float or list
            The p-value cutoff. Can either be a single value or a list of
            values of the same size as comparisons.
        delim : str (Optional)
            Delimiter used in column names separating the stem of the column
            from the comparison. If None, the stems are used as the full
            column names, which is only feasible for a single comparison.

        Returns
        -------
        comp_meta : pandas DataFrame
            DataFrame where each row is a single comparison.
        '''
        # Checking parameter inputs
        fc_list_flag = False
        if isinstance(fcs, list):
            fc_list_flag = True
            if len(fcs) == 1:
                fcs = fcs[0]
                fc_list_flag = False
            elif len(fcs) != len(comparisons):
                raise ValueError('Provide either a single value or a list of fold-change cutoffs, one per comparison.')
        alpha_list_flag = False
        if isinstance(alphas, list):
            alpha_list_flag = True
            if len(alphas) == 1:
                alpha_list_flag = False
                alphas = alphas[0]
            elif len(alphas) != len(comparisons):
                raise ValueError('Provide either a single value or a list of p-value cutoffs, one per comparison.')

        col_lists = isinstance(fc_cols, list) and isinstance(pval_cols, list)
        if col_lists and len(fc_cols) != len(pval_cols):
            raise ValueError('The number of fold change columns and p-value columns does not match.')
        elif col_lists and len(fc_cols) != len(comparisons):
            if len(fc_cols) > 1:
                raise ValueError('The number of data columns does not match the number of comparisons.')
        col_strs = isinstance(fc_cols, str) and isinstance(pval_cols, str)
        if delim is None and col_strs:
            if len(comparisons) > 1:
                raise ValueError('Cannot create feasible data columns without a delimiter.')
        if (not col_lists) and (not col_strs):
            raise ValueError('The data columns have to be either lists or strings, not a mixture of the two.')

        # Now creating the DataFrame to populate with values
        comp_meta = pd.DataFrame(columns=['fc_col', 'fc_thres', 'pval_col', 'alpha'], index=comparisons)
        for i, c in enumerate(comparisons):
            if col_strs:
                if delim is None:
                    fcol = fc_cols
                    pcol = pval_cols
                else:
                    fcol = delim.join([fc_cols, c])
                    pcol = delim.join([pval_cols, c])
            else:
                fcol = fc_cols[i]
                pcol = pval_cols[i]
            comp_meta.loc[c, 'fc_col'] = fcol
            comp_meta.loc[c, 'pval_col'] = pcol
            # Get the corresponding values for the cutoffs
            f = fcs[i] if fc_list_flag else fcs
            a = alphas[i] if alpha_list_flag else alphas
            comp_meta.loc[c, 'fc_thres'] = f
            comp_meta.loc[c, 'alpha'] = a

        if hasattr(self, 'comp_metadata'):
            self.comp_metadata = pd.concat([self.comp_metadata, comp_meta], axis=0)
        else:
            self.comp_metadata = comp_meta
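    # Sketch of the stem + delimiter convention (column names hypothetical):
    # with fc_cols='log2FoldChange', pval_cols='padj', and delim='_', the row
    # for comparison 'treated_vs_control' resolves to the data columns
    # 'log2FoldChange_treated_vs_control' and 'padj_treated_vs_control'.
    #
    #   ddn.create_contrast_metadata(['treated_vs_control', 'ko_vs_wt'],
    #                                fc_cols='log2FoldChange',
    #                                pval_cols='padj',
    #                                fcs=1, alphas=0.05, delim='_')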
    def calculate_scores(self, comps, fpr_trials=50, min_pval=-10, num_ss_trials=100,
                         processes=1, seed=None, verbose=True, overwrite=False):
        '''
        Calculates the separation and distinct functional neighborhood
        scores for a deDANSy instance, creating new attributes that contain
        all the information of interest. If scores for a condition have
        already been generated, this raises a warning and, when requested,
        overwrites the existing scores.

        Parameters
        ----------
        comps : list or str
            The conditions that will be compared.
        fpr_trials : int (Optional)
            Number of FPR trials to perform.
        min_pval : int (Optional)
            The log10 transform of the minimum p-value to use for the
            p-value pruning sweep step.
        num_ss_trials : int (Optional)
            The number of subsampled (and random) networks used to build
            distributions for comparing the network separation and IQR.
        processes : int (Optional)
            Number of processes to use if multiprocessing is desired.
            (4-8 is recommended when feasible.)
        seed : int
            Seed for random numbers. If not provided, the system time is used.
        verbose : bool (Optional)
            Whether progress statements should be printed.
        overwrite : bool (Optional)
            Whether existing scores for a condition should be overwritten.

        Returns
        -------
        The following attributes are added or adjusted:

        scores : pandas DataFrame
            All scores, p-values, and FPR values for each condition.
        raw_dists : pandas DataFrame
            For each condition, the raw values that made up the
            distributions for each score.
        fpr_dists : pandas DataFrame
            For each condition, the p-values from the random FPR trials for
            calculating the FPR.
        '''
        # Check for existing scores for all conditions provided
        if hasattr(self, 'scores'):
            a = set(comps).intersection(self.scores.index)
            if len(a) >= 1:
                if overwrite:
                    warnings.warn('At least one condition has existing scores that will be overwritten.')
                else:
                    warnings.warn('At least one condition has existing scores that will be kept.')
        else:
            a = set()

        temp_scores, temp_raw_dists, temp_fpr_dists = algorithm.calculate_scores(
            self, comps, fpr_trials, min_pval, num_ss_trials, processes, seed, verbose)

        if hasattr(self, 'scores'):
            cur_scores = self.scores
            if overwrite:
                cur_scores.update(temp_scores)
            new_scores = cur_scores.combine_first(temp_scores)
            new_raw_dist = self.raw_dists.combine_first(temp_raw_dists)
            new_fpr_dist = self.fpr_dists.combine_first(temp_fpr_dists)
        else:
            new_scores = temp_scores
            new_raw_dist = temp_raw_dists
            new_fpr_dist = temp_fpr_dists
        self.scores = new_scores
        self.raw_dists = new_raw_dist
        self.fpr_dists = new_fpr_dist
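    # Sketch (argument values illustrative): scores accumulate across calls
    # via combine_first, so previously scored conditions are kept unless
    # overwrite=True is passed.
    #
    #   ddn.calculate_scores(['treated_vs_control'], fpr_trials=50,
    #                        num_ss_trials=100, processes=4, seed=42)
    #   ddn.scores   # Separation/Distinction scores, p-values, and FPRs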
    def plot_scores(self, show_FPR=True, aspect=0.9, order=None):
        '''
        Creates the bubble plots for the separation and distinction scores.
        This is a wrapper for the base plotting function found in the
        enrichment_plotting_helpers module.

        Parameters
        ----------
        show_FPR : bool (Optional)
            Whether the FPR legend handles should be displayed.
        aspect : float (Optional)
            The aspect ratio for each score plot.
        order : dict (Optional)
            Key-value pairs for each comparison and the order in which they
            should be displayed on the axis.

        Returns
        -------
        ax : matplotlib Axes
            The axes of the resulting plot.
        '''
        x = self.scores.copy()
        x['Separation_Significance'] = x['Separation_FPR'] <= 0.05
        x['Distinction_Significance'] = x['Distinction_FPR'] <= 0.05
        ax = plot_functional_scores(x, show_FPR, aspect=aspect, order=order)
        return ax
    def calculate_ngram_enrichment(self, comparison, fpr_trials=100, seed=None):
        '''
        Calculates the n-gram enrichment for the comparison of interest to
        find the most significant n-grams.

        Parameters
        ----------
        comparison : str
            The comparison of interest that contains the up- and
            down-regulated genes/proteins.
        fpr_trials : int (Optional, Default: 100)
            The number of trials used to calculate the false positive rate.
        seed : int (Optional)
            Seed for the random state.

        Returns
        -------
        Creates the new attribute:

        ngram_results : dict of pandas DataFrame
            All the n-gram enrichment values that pass the significance
            threshold for the comparison of interest.
        '''
        # Setting the random state
        if seed is None:
            random.seed(int(time.time()))
        else:
            random.seed(seed)

        # First determine and store the enriched n-grams for the comparison of interest
        self.calc_DEG_ngrams(comparison)
        enriched_ngrams = {}
        for i in ['Up', 'Down']:
            enriched_ngrams[i] = self.ngram_DEG_hypergeom(i)

        # Get the original DEGs for defining the number of proteins to randomly choose
        true_up = self.up_DEGs
        true_dn = self.down_DEGs
        total_degs = len(set(true_up).union(true_dn))
        frac_up = len(true_up) / total_degs

        # Getting the random protein generator
        rand_genes = self.retrieve_random_ids(num=total_degs, iters=fpr_trials)
        rand_ngram_pvals = {'Up': {}, 'Down': {}}
        for i in range(fpr_trials):
            cur_DEGs = next(rand_genes)
            orig_up = random.sample(cur_DEGs, k=round(len(cur_DEGs) * frac_up))
            orig_dn = list(set(cur_DEGs).difference(orig_up))
            self.set_DEG_ngrams(up_DEGs=orig_up, down_DEGs=orig_dn, verbose=False)
            rand_up_hyper = self.ngram_DEG_hypergeom('Up')
            rand_dn_hyper = self.ngram_DEG_hypergeom('Down')
            for j, c_dir in zip([rand_up_hyper, rand_dn_hyper], ['Up', 'Down']):
                for node in j:
                    if node not in rand_ngram_pvals[c_dir]:
                        rand_ngram_pvals[c_dir][node] = []
                    rand_ngram_pvals[c_dir][node].append(j[node])

        # Now getting the FPR values and putting the results in the correct order
        fpr_dict = calc_ngram_fpr_vals(enriched_ngrams, rand_ngram_pvals)
        res = gather_enrichment_results({'Up': enriched_ngrams['Up'], 'Down': enriched_ngrams['Down']}, fpr_dict)
        if not hasattr(self, 'ngram_results'):
            self.ngram_results = {}
        self.ngram_results[comparison] = res
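    # Sketch (comparison name hypothetical): results land in
    # self.ngram_results[comparison] and feed the plotting/report helpers.
    #
    #   ddn.calculate_ngram_enrichment('treated_vs_control',
    #                                  fpr_trials=100, seed=42)
    #   ddn.get_ngram_results('treated_vs_control')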
    def plot_ngram_enrichment(self, comparison, p=0.05, q=0.05, show_FPR=True, **kwargs):
        '''
        Creates a bubble plot of the enriched n-grams for the up- and
        down-regulated n-grams of the comparison of interest. The resulting
        plot can be partially customized through additional keyword
        parameters. This function is a wrapper for the plot_enriched_ngrams
        function from the enrichment_plotting_helpers module, but specific
        to the deDANSy object instance.

        Parameters
        ----------
        comparison : str
            The comparison to plot the enriched n-grams of.
        p : float (Optional)
            The p-value threshold to limit the n-grams to plot. Default is
            0.05, but when combined with the quantile (q) the effective
            cutoff can be lower.
        q : float (Optional)
            The quantile cutoff of values to plot. Default is 0.05.
        show_FPR : bool
            Whether to show the FPR legend portion.
        kwargs : optional keywords
            - 'palette', 'edgecolor', 'linewidth', 'sizes' adjust the
              seaborn scatterplot aesthetics.
            - 'loc', 'bbox_to_anchor', 'handletextpad' adjust the matplotlib
              legend information.
            - 'ax' specifies a matplotlib Axes to plot onto.

        Returns
        -------
        seaborn/matplotlib plot
        '''
        resOI = self.ngram_results[comparison]
        plot_enriched_ngrams(resOI, dansyOI=self, p=p, q=q, show_FPR=show_FPR, **kwargs)
        plt.xlim(-0.5, 1.5)
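    # Sketch: adjusting the documented aesthetic and legend kwargs (exact
    # values illustrative, forwarded to seaborn/matplotlib).
    #
    #   ddn.plot_ngram_enrichment('treated_vs_control', p=0.01,
    #                             sizes=(5, 100), bbox_to_anchor=(1.4, 0.5))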
    def get_ngram_results(self, comparison):
        '''
        Returns the n-gram enrichment results of the comparison of interest.

        Parameters
        ----------
        comparison : str
            The comparison of interest.

        Returns
        -------
        res : pandas DataFrame
            The n-gram enrichment dataframe.
        '''
        temp = self.ngram_results[comparison]
        # Convert the n-gram IDs to legible names
        temp['ngram_legible'] = temp['ngram'].apply(lambda x: self.return_legible_ngram(x))
        # Now putting the columns into a more logical order
        col_order = ['variable', 'ngram_legible', 'ngram', 'p', '-log10(p)', 'FPR', 'FPR <= 0.05']
        temp = temp[col_order]
        return temp
## Helper functions for converting the IDs around
def convert_2_uniprotIDs(df, id_conv, conv_col='Gene stable ID', data_id_cols='gene_id', dbl_check=False):
    '''
    Takes a dataframe of interest and converts the specified column to
    UniProt IDs, given a second dataframe consisting of UniProt IDs, ENSEMBL
    IDs, and gene names/synonyms retrieved using pybiomart. If desired, a
    double check ensures all IDs are retrieved when an archived version of
    ENSEMBL has been used and gene names are requested instead.
    '''
    # Getting all possible UniProt IDs given the IDs of interest.
    ensembl_uni_dict = create_id_conv(df, id_conv=id_conv, conv_col=conv_col,
                                      data_id_cols=data_id_cols, dbl_check=dbl_check)
    uniprot_IDs = set(itertools.chain.from_iterable(ensembl_uni_dict.values()))
    return uniprot_IDs


def create_id_conv(df, id_conv, conv_col='Gene stable ID', data_id_cols='gene_id', dbl_check=False):
    '''
    Internal function that makes a dict to convert the ENSEMBL IDs (by
    default) to UniProt IDs.
    '''
    # Check if a list is provided for data_id_cols or a single/default value.
    if isinstance(data_id_cols, list):
        data_id_col = data_id_cols[0]
        if not dbl_check:
            warnings.warn('More than one column name was provided for converting, '
                          'but a double check was not designated. Ignoring additional inputs.')
    else:
        data_id_col = data_id_cols

    # Retrieve all ENSEMBL IDs which are found within the dataset of interest
    success_mapping_ids = set(id_conv[id_conv[conv_col].isin(df[data_id_col])]['Gene stable ID'])
    if dbl_check:
        gene_dbl_chck = df[~df[data_id_col].isin(success_mapping_ids)][data_id_cols[1]].dropna()
        cands = id_conv[id_conv['Gene name'].isin(gene_dbl_chck)]
        success_mapping_ids.update(cands['Gene stable ID'])

    id_dict = id_conv[id_conv['Gene stable ID'].isin(success_mapping_ids)].filter(
        ['Gene stable ID', 'UniProtKB/Swiss-Prot ID']).drop_duplicates().dropna().to_dict('tight')
    id_dict = id_dict['data']

    # To ensure that all potential UniProt IDs are accounted for, store them
    # in lists, as some Ensembl IDs map to multiple proteins.
    id_conversion = {}
    for conversion_data in id_dict:
        ensembl = conversion_data[0]
        uniprot = conversion_data[1]
        if ensembl in id_conversion:
            id_conversion[ensembl].append(uniprot)
        else:
            id_conversion[ensembl] = [uniprot]
    return id_conversion
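# Illustrative sketch of the conversion output (toy values, not package
# data): create_id_conv returns a one-to-many mapping because a single
# Ensembl gene can map to several UniProt entries.
#
#   df = pd.DataFrame({'gene_id': ['ENSG000A', 'ENSG000B']})
#   id_conv = pd.DataFrame(
#       {'Gene stable ID': ['ENSG000A', 'ENSG000A', 'ENSG000B'],
#        'UniProtKB/Swiss-Prot ID': ['P11111', 'P22222', 'P33333']})
#   create_id_conv(df, id_conv)
#   # -> {'ENSG000A': ['P11111', 'P22222'], 'ENSG000B': ['P33333']}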
# Helper functions for n-gram enrichment.
def ngram_enrichment(dansy, prots, collapse=True, **kwargs):
    '''
    Calculates the enrichment of n-grams associated with a subset of
    proteins within a DANSy object by Fisher's exact test.

    Note: This method can analyze specific n-grams within the protein
    subset. However, this process should only be accessed through the
    deDANSy object and is not recommended for direct use.

    Parameters
    ----------
    dansy : DANSy
        The DANSy object which contains the proteins of interest. (This can
        be either the differential expression or the standard class.)
    prots : list
        List containing the proteins of interest for enrichment analysis.
    collapse : bool
        Whether the n-grams should be collapsed to their most informative
        and non-redundant n-grams.
    kwargs : optional keyword arguments (Not recommended to use)
        ngrams : list
            List of n-grams that are to be analyzed specifically.
    '''
    p_vals = ngram_subset_enrichment(prots, dansy.protsOI, dansy, collapse=collapse, **kwargs)
    return p_vals


def ngram_subset_enrichment(protsOI, full_prots, dansy_bkg, collapse=True, **kwargs):
    '''
    Performs Fisher's exact test for n-grams found in a subset of proteins
    that may be found in the full list of proteins. This is the more general
    version that does not require using the full DANSy protein list.

    Parameters
    ----------
    protsOI : list
        List containing the proteins of interest for enrichment analysis.
    full_prots : list
        List containing the proteins that make up the full background list.
    dansy_bkg : DANSy object
        The DANSy that the proteins and n-grams are associated with.
    collapse : bool
        Whether the n-grams should be collapsed to their most informative
        and non-redundant n-grams.
    kwargs : key, value mappings (Not recommended)
        ngrams : list
            List of n-grams that are to be analyzed specifically.

    Returns
    -------
    p_vals : dict
        Key-value pairs of n-grams and their enrichment p-value.
    '''
    if 'ngrams' in kwargs:
        ngrams = kwargs['ngrams']
        # Make sure all n-grams are found in at least one of the proteins.
        # If any are not, raise an error.
        internal_check = [len(set(v).intersection(protsOI)) == 0
                          for k, v in dansy_bkg.interpro2uniprot.items() if k in ngrams]
        if any(internal_check):
            raise ValueError('At least one provided n-gram is not found in the proteins of interest.')
    else:
        ngrams = []

    # Ensure that the foreground proteins are found within the background
    # (removing any which are not).
    subset_prots = list(set(protsOI).intersection(full_prots))
    M = len(subset_prots)
    N = len(full_prots)
    p_vals = {}
    full_check = full_prots == dansy_bkg.protsOI

    # Get the n-grams of the proteins of interest if they were not provided
    if ngrams:
        ngramsOI = ngrams
    else:
        ngram_cands = [k for k, v in dansy_bkg.interpro2uniprot.items() if set(v).intersection(subset_prots)]
        ngram_dict = {k: set(v).intersection(subset_prots)
                      for k, v in dansy_bkg.interpro2uniprot.items() if k in ngram_cands}
        if collapse:
            ngram_dict, _ = ngramUtilities.concatenate_ngrams(ngram_dict)
        ngramsOI = list(ngram_dict.keys())

    # Use the sf of the hypergeometric distribution to get p-values
    # (equivalent to a one-sided Fisher's exact test).
    for node in ngramsOI:
        # Skip the filtering if using the full background
        if full_check:
            full_prot_list = dansy_bkg.interpro2uniprot[node]
        else:
            full_prot_list = [u for u in dansy_bkg.interpro2uniprot[node] if u in full_prots]
        # Get the counts for the hypergeometric distribution and calculate
        # the sf (equivalent to 1 - cdf).
        k = len(set(full_prot_list).intersection(subset_prots))
        n = len(set(full_prot_list))
        p_vals[node] = stats.hypergeom.sf(k - 1, N, n, M)
    return p_vals
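
if __name__ == '__main__':
    # Minimal self-check with illustrative numbers: the hypergeometric
    # survival function used in ngram_subset_enrichment matches a one-sided
    # Fisher's exact test on the corresponding 2x2 contingency table.
    k, n, M, N = 5, 10, 20, 100   # hits, n-gram proteins, subset size, background size
    p_hyper = stats.hypergeom.sf(k - 1, N, n, M)
    table = [[k, n - k], [M - k, N - n - (M - k)]]
    p_fisher = stats.fisher_exact(table, alternative='greater')[1]
    assert abs(p_hyper - p_fisher) < 1e-9
    print(f'P(X >= {k}): hypergeom sf = {p_hyper:.4g}, Fisher exact = {p_fisher:.4g}')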