Source code for dansy.enrichment_helpers

import random
import numpy as np
import scipy.stats as stats
import multiprocessing as mp
import dansy.network_separation_helpers as ns_helpers

# Defining a function to calculate cohen's D since I do not want to make the assumption that there is equal variance between the random and subsampled distributions. 
[docs] def cohen_d(a,b): ''' Calculates the Cohen's d effect size of 2 lists of values, where the second is considered the "control" group. Parameters: ----------- a,b: list or numpy array Distribution of values to compare the effect size of Returns: -------- d: numpy float Cohen's d effect size (Returns nan if all values are nan) ''' # For certain cases there might be all nans so just check that first if all(np.isnan(a)) and all(np.isnan(b)): d = np.nan else: # Means x1 = np.nanmean(a) x2 = np.nanmean(b) # Sizes n1 = len(a) n2 = len(b) # Defining the pooled standard deviation (Note: need to account for the definition by cohen that uses N-1 for the variance, which numpy does not use) s = np.sqrt(((n1-1)*np.nanstd(a,ddof=1)**2 + (n2 -1)*np.nanstd(b, ddof=1)**2)/(n1+n2-2)) if s > 0: d = (x1-x2)/s elif np.isnan(s): d = np.nan else: #Shouldn't ever get to this d = 0 return d
def hypergeom_prune_ns(dedansy, sweep): ''' Builds a distribution of network separation values between two conditions on the deDANSy network based on the enrichment pruning over more restrictive p-value thresholds. Parameters: ----------- - dedansy: deDANSy object The object containing all the enriched n-grams for both conditions of interest - sweep: list List of thresholds to use for pruning of differential expression status Returns: -------- - ns_sweep: list Network separation values for each of the thresholds provided ''' # Get the enrichment p-values for each n-gram of both conditions up_hyper_vals = dedansy.ngram_DEG_hypergeom('Up') dn_hyper_vals = dedansy.ngram_DEG_hypergeom('Down') ns_sweep = [] for i in sweep: # Get only n-grams that pass the threshold up_check = [k for k,v in up_hyper_vals.items() if v <= i] dn_check = [k for k,v in dn_hyper_vals.items() if v <= i] # Setting up the networks for determining the network separation up_net = dedansy.G.subgraph(up_check) dn_net = dedansy.G.subgraph(dn_check) if len(up_net.nodes()) > 0 and len(dn_net.nodes())> 0: ns = ns_helpers.network_separation(up_net, dn_net, dedansy.ref_data) else: ns = np.nan ns_sweep.append(ns) return ns_sweep # Below is a handful of helper functions to try and do network separation on a number of iterations to create a null distribution of DEG network separation values def get_random_count_dist(count_weights): ''' Creates list of counts for different domain architecture lengths to use in subsampling and random gene selection to calculate robust enrichment of different n-grams. This creates a small bit of randomness to ensure some leeway of genes chosen to not bias n-grams of specific lengths being consistently over-represented. The total distribution of domain architectures from 0 to 10 are individual elements, while those >10 will be accumulated as these represent <5% of the human proteome. Parameters: ----------- - count_weights: list List of counts of the total distribution of domain architecture lengths of the measured universe in the deDANSy object Returns: -------- - rand_counts: list List of random integers for each domain architecture length ''' rand_counts = [] for i in count_weights[1:11]: #Skip the 0 length architectures as they do not contribute to the network # Give leeway of about +/-15% for randomly choosen an integer of similar size to the original count i_min = np.floor(i*.85) i_max = np.ceil(i*1.15) if i_min == 0 and i_max == 0: r = 0 else: r = random.randrange(i_min, i_max) rand_counts.append(r) # For the 10+ architectures choose between 0 and the actual value since going above will really skew n-grams if count_weights[11] > 0: rand_counts.append(random.randrange(0,count_weights[11])) else: rand_counts.append(0) return rand_counts def get_random_id_indices(arch_len_array, rand_count_list): ''' Get the indices of randomly chosen genes to use as a null distribution. Parameters: ----------- - arch_len_array: list The list of protein architecture lengths to choose randomly from - rand_count_list: list The number of genes to choose with each architecture length Returns: -------- - rand_4_analysis: list List of indices from the original dataframe containing all protein information. ''' rand_4_analysis = [] for i in range(1,11): cands = arch_len_array[arch_len_array == i] rand_ids = random.sample(list(cands.index), k=rand_count_list[i-1]) # Using indices as these are conserved across all the analysis and have smallish memory footprint rand_4_analysis += rand_ids # For the >10 n-grams now getting candidates cands = arch_len_array[arch_len_array > 10] rand_ids = random.sample(list(cands.index), k=rand_count_list[10]) rand_4_analysis += rand_ids return rand_4_analysis def designate_rand_DEGs(rand_indices, ref_df, up_fraction): ''' Designates randomly chosen genes as either up or down regulated while preserving the relative fraction of each condition. Parameters: ----------- - rand_indices: list List of protein indices from the reference data frame that were randomly chosen - ref_df: pandas DataFrame The reference dataframe that contains all protein info - up_fraction: float The relative fraction of up to down regulated genes/proteins being expressed Returns: -------- - rand_DEGs: dict Up and down lists of UniProt IDs from the randomly chosen genes ''' # Now double-checking the distribution random_prots = ref_df.loc[rand_indices] # Now designating the random genes as either up or down based on the proportion of DEGs that were for both rand_up_num = round(up_fraction*len(random_prots)) # Alternative to the random sampling which has issues with reproducibility when called multiple times rand_prot_set = sorted(random_prots['UniProt ID'].tolist()) #random.shuffle(rand_prot_set) #rand_up_chosen = rand_prot_set[0:rand_up_num] #rand_down_chosen = rand_prot_set[rand_up_num:] rand_up_chosen = sorted(random.sample(rand_prot_set, k=rand_up_num)) rand_down_chosen = sorted(set(rand_prot_set).difference(rand_up_chosen)) rand_DEGs = {'Up':rand_up_chosen,'Down':rand_down_chosen} return rand_DEGs def get_random_net_sep_metrics(normalized_arch_weights, complete_arch_dist, dedansy,deg_ratio,pvals= np.logspace(0,-10, num=21),return_dist = False): ''' Performs the network separation and pruning analysis on randomly chosen genes from the dedansy object. Parameters: ----------- - normalized_arch_weights: list The relative fraction of the domain architecture lengths from 0-10 and 10+ from the deDANSy object of differentially expressed genes/proteins - complete_arch_dist: list The proteins that are found for each domain architecture length to choose gene randomly from - dedansy: deDANSy object The object that contains all the protein information being analyzed - deg_ratio: float The relative fraction of up to down regulated genes/proteins - pvals: numpy array (Optional) The threshold p-values to use for pruning analysis - return_dist: bool (Optional) Whether the distribution of p-values during the p-value sweep is return. Default is False as this is a niche request Returns: -------- - o: tuple Tuple containing both the network separation prior to pruning and the interquartile range of network separation values during the pruning analysis ''' # Setting the randomly chosen genes rcd = get_random_count_dist(normalized_arch_weights) rii = get_random_id_indices(complete_arch_dist,rcd) r_prots = designate_rand_DEGs(rii, dedansy.ref,deg_ratio) dedansy.set_DEG_ngrams(r_prots['Up'],r_prots['Down'], verbose=False) # Getting the two (or three) metrics ns = dedansy.DEG_network_sep() ns_vals = hypergeom_prune_ns(dedansy, pvals) ns_iqr = stats.iqr(ns_vals, nan_policy='omit') if return_dist: o = (ns,ns_iqr,ns_vals) else: o = (ns,ns_iqr) return o def get_subsample_net_sep_metrics(dedansy,orig_DEGs, rand_prot_nums,pvals= np.logspace(0,-10, num=21),return_dist = False): ''' Performing the network separation and pruning analysis on a subsampled set of proteins from the differentially expressed genes/proteins. Parameters: - dedansy: deDANSy object The dansy object containing all expression and protein information - orig_DEGs: list List of two elements which contain lists of all the UniProt IDs that are either up or down (in that order) regulated - rand_prot_nums: list Number of randomly chosen proteins for each condition to ensure we match the randomly chosen genes - pvals: numpy array The threshold p-values to use during the pruning step - return_dist: bool Whether the distribution of p-values during the p-value sweep is return. Default is False as this is a niche request Returns: -------- - o: tuple Tuple containing both the network separation prior to pruning and the interquartile range of network separation values during the pruning analysis ''' # Setting up the original differentially expressed genes/proteins and randomly sampling them orig_up = orig_DEGs[0] orig_dn = orig_DEGs[1] rand_up = random.sample(orig_up, k=rand_prot_nums[0]) rand_dn = random.sample(orig_dn, k=rand_prot_nums[1]) # Setting the subsampled genes and getting the metrics of interest dedansy.set_DEG_ngrams(rand_up,rand_dn, verbose=False) ns = dedansy.DEG_network_sep() ns_vals = hypergeom_prune_ns(dedansy,pvals) ns_iqr = stats.iqr(ns_vals, nan_policy='omit') if return_dist: o = (ns,ns_iqr,ns_vals) else: o = (ns,ns_iqr) return o
[docs] def individual_trial_calc(dedansy, arch_weights, comp_arch_dist, ratio, sweep, originals,dist_flag = False, seed =123): ''' Calculates the subsampled and randomly chosen genes network separtion and pruning analysis. Parameters: ----------- - dedansy: deDANSy object The dansy object containing all expression and protein information - arch_weights: list The relative fraction of the domain architecture lengths from 0-10 and 10+ from the deDANSy object of differentially expressed genes/proteins - comp_arch_dist: list The proteins that are found for each domain architecture length to choose gene randomly from - ratio: float The relative fraction of up to down regulated genes/proteins - sweep: numpy array The threshold p-values to use for pruning analysis - originals: list List of two elements which contain lists of all the UniProt IDs that are either up or down (in that order) regulated - dist_flag: bool (Optional) Whether the distribution of p-values during the p-value sweep is return. Default is False as this is a niche request - seed: int (Optional) Random number generator seed to ensure reproducibility. Default is 123 Returns: -------- - list of tuples The random and then subsampled data of the unpruned network separation, the interquartile range of network separation during pruning, and if requested the distribution of values ''' random.seed(seed) rand_data = get_random_net_sep_metrics(normalized_arch_weights=arch_weights, complete_arch_dist=comp_arch_dist, dedansy=dedansy, deg_ratio=ratio, pvals=sweep, return_dist=dist_flag) # Since the random function above generates the sizes will pass this on. rand_szs = (len(dedansy.up_DEGs), len(dedansy.down_DEGs)) subsample_data = get_subsample_net_sep_metrics(dedansy=dedansy, orig_DEGs=originals, rand_prot_nums=rand_szs, pvals=sweep, return_dist=dist_flag) return list(rand_data + subsample_data)
[docs] def calculate_separation_stability(dedansy, num_trials = 50, pval_sweep = np.logspace(0,-10,21), return_distributions = False,processes = 1, verbose=True): ''' Generates the distribution of network separation and interquartile range of network separation values during pruning for generating the different scores for deDANSy analysis. This is the key function of the algorithm that gathers all the results. Parameters: ----------- - dedansy: deDANSy object The dansy object containing all expression and protein information - num_trials: int (Optional, Default 50) The number of trials for subsampling and random gene selection - pval_sweep: numpy array The threshold p-values to use for pruning analysis - return_distribution: bool Whether the distribution of p-values during the p-value sweep is return. Default is False as this is a niche request - processes: int Number of multiprocessing workers to use for analysis - verbose: bool Whether progress statements should be displayed Returns: -------- - o: list of tuples The full distribution of values for each individual trial of the subsampled and randomly chosen genes ''' # Setting up the inputs as necessary for the dependent functions by getting the information about the measured universe proteins orig_up = dedansy.up_DEGs orig_dn = dedansy.down_DEGs original_DEGs = list(orig_up) + list(orig_dn) deg_info = dedansy.retrieve_protein_info(original_DEGs) arch_lens = deg_info['Interpro Domain Architecture IDs'].apply(lambda x: len(x.split('|'))) max_length = max([max(arch_lens),11]) deg_arch_dist = np.histogram(arch_lens, bins=range(max_length+1)) complete_arch_dist = dedansy.ref['Interpro Domain Architecture IDs'].apply(lambda x: len(x.split('|'))) weight_list = deg_arch_dist[0][0:11] weight_list = np.append(weight_list,np.sum(deg_arch_dist[0][11:])) # Now normalizing the list 70% of the total number of DEGs available weight_list_n = np.round(weight_list/sum(weight_list)*(sum(weight_list)*0.7)) up_frac = len(dedansy.up_DEGs)/len(deg_info) # For reproducibility (mostly only for when multiprocessing is enacted) creating a seed list that will be passed to the individual trials function seedlist = random.sample(range(50*num_trials), num_trials) if processes == 1: # Intializing some of the variables of the for loop rand_ns = np.zeros(num_trials) subsample_ns = np.zeros(num_trials) rand_iqr = np.zeros(num_trials) subsample_iqr = np.zeros(num_trials) if return_distributions: rand_ns_dists = [] subsample_ns_dists = [] # Progress statement if verbose: print('Starting calculations') for i in range(num_trials): a = individual_trial_calc(dedansy, weight_list_n,complete_arch_dist,up_frac,pval_sweep,[orig_up,orig_dn],dist_flag=return_distributions, seed=seedlist[i]) # Unpacking the values into the datastructures above rand_ns[i] = a[0] rand_iqr[i] = a[1] if return_distributions: rand_ns_dists.append(a[2]) subsample_ns[i] = a[3] subsample_iqr[i] = a[4] subsample_ns_dists.append(a[5]) else: subsample_ns[i] = a[2] subsample_iqr[i] = a[3] else: if verbose: print('Will do multiprocessing') # Setting up the arguments to be passed to multiple processes args = [(dedansy, weight_list_n,complete_arch_dist,up_frac,pval_sweep,[orig_up,orig_dn],return_distributions, seedlist[i]) for i in range(num_trials)] if __name__ == 'dansy.enrichment_helpers': pool = mp.Pool(processes=processes) with pool as p: a = p.starmap(individual_trial_calc, args,chunksize=5) pool.close() # In case but not necessary # Unpacking the values into the datastructures above rand_ns = [x[0] for x in a] rand_iqr = [x[1] for x in a] if return_distributions: rand_ns_dists = [x[2] for x in a] subsample_ns = [x[3] for x in a] subsample_iqr = [x[4] for x in a] subsample_ns_dists= [x[5] for x in a] else: subsample_ns = [x[2] for x in a] subsample_iqr = [x[3] for x in a] # Prepping the export if return_distributions: o = [rand_ns,rand_iqr,rand_ns_dists,subsample_ns,subsample_iqr,subsample_ns_dists] else: o = [rand_ns,rand_iqr,subsample_ns,subsample_iqr] return o
[docs] def retrieve_fpr_checks(dedansy,num_DEGs,fpr_trials = 50, num_internal_trials = 50, deg_ratios = 0.7, processes = 1, seed =123): ''' Performs a false positive rate check on the values for the differentially expressed genes by performing the same subsampling and random gene selection analysis with random genes that were measured to determine if findings are true positives. This process can be highly time consuming based on the number of FPR trials that are to conducted. Parameters: ----------- - dedansy: deDANSy object The dansy object containing all expression and protein information - num_DEGs: int The number of differentially expressed genes in the deDANSy object - fpr_trials: int (Optional, Default 50) The number of False Positive Rate trials to conduct - num_internal_trials: int (Optional, Default 50) The number of trials for subsampling and random gene selection that will be used for all the FPR trials. (Should match what was done on the analysis of interest) - deg_ratios: float Relative ratio of differentially expressed genes to designate as up or down - processes: int Number of multiprocessing workers to use for analysis - seed Random number generator seed Returns: -------- - internal_fpr: list List of p-values for the network separation and interquartile range during pruning ''' # Setting up the random DEGs generator rand_DEGs = dedansy.retrieve_random_ids(num=num_DEGs, iters=fpr_trials,seed = seed) internal_fpr = [] for i in range(fpr_trials): cur_DEGs = next(rand_DEGs) cur_DEGs = sorted(cur_DEGs) # In case of weird issues with random sampling using sorted orig_up = random.sample(cur_DEGs, k=round(len(cur_DEGs)*deg_ratios)) orig_dn = sorted(set(cur_DEGs).difference(orig_up)) dedansy.set_DEG_ngrams(up_DEGs=orig_up, down_DEGs=orig_dn, verbose=False) temp = calculate_separation_stability(dedansy,num_trials=num_internal_trials,processes=processes,verbose = False) rand_iqr = [x for x in temp[1]] subsample_iqr = [x for x in temp[3]] rand_full_ns = [x for x in temp[0]] actual_full_ns = [x for x in temp[2]] # Now getting some of the stats and results temp_iqr_res = stats.mannwhitneyu(rand_iqr,subsample_iqr) temp_ns_res = stats.mannwhitneyu(rand_full_ns, actual_full_ns) internal_fpr.append((temp_ns_res[1], temp_iqr_res[1])) return internal_fpr
[docs] def calculate_fpr(actual_res, random_res): ''' For each result of interest calculates the False positive rate Parameters: ----------- - actual_res: list The results from conducting the analysis - random_res: list The p-values from the random FPR trials Returns: -------- - tuple of FPR values ''' ns_res = actual_res[0] iqr_res = actual_res[1] ns_rand_res = [x[0] for x in random_res] iqr_rand_res = [x[1] for x in random_res] # Now calculating the fpr values x = [i <= ns_res for i in ns_rand_res] ns_fpr = sum(x)/len(x) x = [i <= iqr_res for i in iqr_rand_res] iqr_fpr = sum(x)/len(x) return (ns_fpr, iqr_fpr)