import warnings
import random
import itertools
import time
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import dansy.ngramUtilities as ngramUtilities
import dansy.dedansy_algorithm as algorithm
from dansy.dansy import dansy
from dansy.network_separation_helpers import network_separation,build_network_reference_dict
from dansy.enrichment_plotting_helpers import plot_functional_scores, gather_enrichment_results, calc_ngram_fpr_vals, plot_enriched_ngrams
class DEdansy(dansy):
'''
    A container class of multiple domain n-gram networks related to a differential expression dataset generated using the DESeq analysis pipeline. It provides methods to analyze and contrast pairs of domain architecture subnetworks to understand changes in the functional molecular ecosystems available to different conditions.
Parameters
----------
dataset: pandas DataFrame
The expression dataset that contains the proteins of interest and expression values to designate differentially expressed genes/proteins
uniprot_ref: pandas DataFrame (Optional)
        The base reference file that contains all the proteins of interest within the dataset. Note: It is recommended to provide this when multiple DEdansy objects sharing a common set of proteins are being instantiated.
n: int (Optional)
Length of n-grams to extract. Default is 10
id_conv: pandas DataFrame (Optional)
        A dataframe with a column of UniProt IDs and a column of the IDs used in the expression dataset, to aid in converting between the two
conv_cols: str (Optional)
        The name of the column in the id_conv dataframe whose IDs match those in the dataset. Assumes the naming convention generated by pybiomart
data_ids: str (Optional)
The name of the column in the dataset dataframe for the IDs to be converted to UniProt IDs
penalty: 'dynamic' or int (Optional)
The value or 'dynamic' for the penalty during network separation calculations.
run_conversion: bool
Whether the dataset IDs have to be converted using a provided id_conv dataframe
kwargs
        Additional keyword arguments for generating the DANSy network. See dansy.set_ngram_parameters for details; acceptable values are reproduced below:
- 'min_arch'
- 'max_node_len'
- 'collapse'
- 'readable_flag'
- 'verbose'
Attributes
----------
-----------------
At Initialization
-----------------
dataset: pandas DataFrame
The expression dataset for DANSy analysis
ref: pandas DataFrame
The reference file information for the proteins within the dataset
n: int
The maximum length of n-grams being extracted
interproIDs: list
A list of all protein domain InterPro IDs that were found within the dataset
protsOI: list
The UniProt IDs for the proteins found within the dataset
ngrams: list
The extracted domain n-grams
collapsed_ngrams: list
        The domain n-grams that were collapsed into other n-grams representing the same set of proteins
G: networkx Graph
The network graph representation of the DANSy n-gram network
adj: pandas DataFrame
The adjacency matrix for the n-gram network for the DANSy analysis
interpro2uniprot: dict
The keys of InterPro IDs with values of a list of UniProt IDs that have the InterPro ID
id_conversion_dict: dict
A dictionary containing the conversion of the provided gene/protein ID to a UniProt ID. If UniProt IDs were provided, returns a dict of UniProt to UniProt IDs.
data_id_cols: str
The ID column used in the dataset for conversion
network_params: dict
Key-value pairs of acceptable networkx drawing parameters
min_arch: int (Default: 1)
The minimum number of domain architectures for an n-gram to be retained.
max_node_len: int
The maximum n-gram length that will be retained during the collapsing step to represent n-grams sharing the same set of proteins. This will not be larger than n (Default of 10).
collapse: bool
Whether the n-grams were collapsed
readable_flag: bool
Whether the n-grams are human-legible
verbose: bool
Whether progress statements are to be printed during calculations
--------------------------------------
After Establishing Comparison MetaData
--------------------------------------
comp_metadata: pandas DataFrame
The metadata information for comparisons of interest in the dataset
----------------------
After DEG Calculations
----------------------
up_DEGs/down_DEGs: list
        List of UniProt IDs that were designated as up- or down-regulated for a specific condition
up_ngrams/down_ngrams: list
The n-grams for either up- or down-regulated proteins/genes
alpha: float
The p-value cutoff for designating DEGs
fcthres: float
The fold-change cutoff for designating DEGs
pval_data_col: str
        The column name in the dataset used for the p-value data for DEGs
fc_data_col: str
The column name in the dataset used for fold-change data for DEGs
-----------------------------
After creating deDANSy scores
-----------------------------
scores: pandas DataFrame
The scores, p-values, and FPR values for the deDANSy Separation and Distinction scores for the comparisons
raw_dists: pandas DataFrame
The raw distribution of values for each score
fpr_dists: pandas DataFrame
The raw distribution of FPR values for each score
'''
def __init__(self,dataset,uniprot_ref = None,n = 10, id_conv = None, conv_cols = 'Gene stable ID', data_ids = 'gene_id', penalty = 'dynamic',run_conversion = True, **kwargs):
# Bare minimum attributes required for setting up an empty n-gram network.
self.dataset = dataset
self.ref = uniprot_ref
self._n = n
self.interproIDs = None
# Converting the dataset ids for individual genes to the UniProt IDs
        check_IDs_flag = kwargs.get('dbl_check', False)
if run_conversion:
if id_conv is None:
raise ValueError('Missing an ID converting dataframe.')
else:
self.id_conversion_dict = create_id_conv(dataset, id_conv, conv_cols,data_ids,check_IDs_flag)
self.protsOI = convert_2_uniprotIDs(dataset,id_conv, conv_cols,data_ids, check_IDs_flag)
else:
self.protsOI = dataset[data_ids].tolist()
self.id_conversion_dict = {k:[k] for k in self.protsOI}
# Saving the data_id column for instances when the DEGs have to be calculated.
if isinstance(data_ids, list):
self.data_id_col = data_ids[0]
else:
self.data_id_col = data_ids
self.conversion_id_col = conv_cols
# Now making sure there is a common reference that can be used for generating the n-gram networks.
if self.ref is None:
self.add_ref_df()
self.populate_ngramNet(**kwargs)
if self.verbose:
print('Building the reference network information.')
self.ref_data = build_network_reference_dict(self, penalty=penalty)
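    # Example (a hedged sketch, not part of the API): constructing a DEdansy from a
    # DESeq2-style results table. `expr_df` and `conv_df` are hypothetical; `conv_df`
    # is assumed to follow the pybiomart column naming used throughout this module.
    #
    #   conv_df = pd.DataFrame({'Gene stable ID': ['ENSG00000141510'],
    #                           'UniProtKB/Swiss-Prot ID': ['P04637'],
    #                           'Gene name': ['TP53']})
    #   dd = DEdansy(expr_df, id_conv=conv_df, conv_cols='Gene stable ID',
    #                data_ids='gene_id', n=10, penalty='dynamic')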
def DEG_network_sep(self, force_run = False):
'''
        Computes the network separation between the n-gram subnetworks of the up- and down-regulated genes of a comparison. Silently returns NaN if the calculation fails.
'''
        # Get the UniProt IDs for the differentially expressed genes
        if not (hasattr(self, 'up_DEGs') and hasattr(self, 'down_DEGs')):
            raise ValueError('DEGs do not exist. Please run calc_DEG_ngrams.')
if len(self.up_ngrams) == 0 or len(self.down_ngrams) == 0:
ns = np.nan
else:
try:
# Generating networks for the individual DEG conditions
up_net = self.G.subgraph(self.up_ngrams)
dn_net = self.G.subgraph(self.down_ngrams)
ns = network_separation(up_net,dn_net, self.ref_data,force_run=force_run)
            except Exception:
ns = np.nan
return ns
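    # A hedged sketch of the standard network-separation measure of Menche et al.
    # (s_AB = <d_AB> - (<d_AA> + <d_BB>)/2); the imported network_separation helper
    # may differ in details such as the penalty applied to disconnected node pairs:
    #
    #   def _nearest(G, node, targets):
    #       # Shortest path from `node` to its nearest target (self excluded).
    #       return min(nx.shortest_path_length(G, node, t) for t in targets if t != node)
    #
    #   def _sep(G, A, B):
    #       d_AA = np.mean([_nearest(G, a, A) for a in A])
    #       d_BB = np.mean([_nearest(G, b, B) for b in B])
    #       d_AB = np.mean([_nearest(G, v, B if v in A else A) for v in A | B])
    #       return d_AB - (d_AA + d_BB) / 2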
def calc_DEG_ngrams(self, comp, batch_mode = False):
'''
        Defines the DEG UniProt IDs and the associated n-grams for the dataset of interest.
'''
# Checking if there was a set of DEGs that already exists and printing statement saying it will be overwritten.
if hasattr(self,'up_DEGs'):
verbose_flag = True
if hasattr(self, 'G_collapsed'):
del self.G_collapsed
else:
verbose_flag = False
# This is only for running several and wanting to keep track of general progress.
if batch_mode:
verbose_flag = False
        # Now extracting the data columns for the comparison of interest to define the differentially expressed genes/proteins
compOI_info = self.comp_metadata.loc[comp]
data_cols = [compOI_info['fc_col'], compOI_info['pval_col']]
fc_thres = compOI_info['fc_thres']
alpha = compOI_info['alpha']
        # Reducing the large dataframe to just the info needed for generating the DEG networks
cols = [self.data_id_col]+data_cols
dataset_OI = self.dataset.filter(cols)
# Getting the differentially expressed genes
deg_up = list(dataset_OI[(dataset_OI[data_cols[1]] <= alpha) & (dataset_OI[data_cols[0]] > fc_thres)][self.data_id_col])
deg_dn = list(dataset_OI[(dataset_OI[data_cols[1]] <= alpha) & (dataset_OI[data_cols[0]] < -fc_thres)][self.data_id_col])
# Keeping record of the data columns for a summary
self.pval_data_col = data_cols[1]
self.fc_data_col = data_cols[0]
# Recording the thresholds for DEG values
self.alpha = alpha
self.fcthres = fc_thres
        # Converting to UniProt IDs to highlight the n-grams within the network that were retained for each condition
        up_DEGs = [v for k, v in self.id_conversion_dict.items() if k in deg_up]
        up_DEGs = set(itertools.chain.from_iterable(up_DEGs))
        down_DEGs = [v for k, v in self.id_conversion_dict.items() if k in deg_dn]
        down_DEGs = set(itertools.chain.from_iterable(down_DEGs))
        # To avoid random-sampling reproducibility issues, sort the DEGs alphabetically and convert back to lists
up_DEGs = sorted(up_DEGs)
down_DEGs = sorted(down_DEGs)
self.set_DEG_ngrams(up_DEGs,down_DEGs, verbose=verbose_flag)
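    # Worked example of the thresholding above on a toy frame (hypothetical column
    # names; mirrors the alpha/fc_thres logic in calc_DEG_ngrams):
    #
    #   toy = pd.DataFrame({'gene_id': ['g1', 'g2', 'g3'],
    #                       'log2FC': [2.3, -1.7, 0.9],
    #                       'padj':   [0.01, 0.01, 0.20]})
    #   up = toy[(toy['padj'] <= 0.05) & (toy['log2FC'] > 1)]['gene_id']   # -> ['g1']
    #   dn = toy[(toy['padj'] <= 0.05) & (toy['log2FC'] < -1)]['gene_id']  # -> ['g2']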
def set_DEG_ngrams(self, up_DEGs, down_DEGs,collapse = True, verbose=True):
'''
        This sets the DEGs directly rather than calculating them, allowing custom sets to be generated for very specific purposes. It is not recommended to call this directly.
'''
if hasattr(self,'up_DEGs') and verbose:
print('Will be overwriting existing DEG data.')
# Setting the DEGs
self.up_DEGs = up_DEGs
self.down_DEGs = down_DEGs
# Now getting the n-grams associated with the collective DEGs
up_ngram_cands = [k for k,v in self.interpro2uniprot.items() if set(v).intersection(self.up_DEGs)]
down_ngram_cands = [k for k,v in self.interpro2uniprot.items() if set(v).intersection(self.down_DEGs)]
# Getting the n-gram candidates
up_ngram_dict = {k:set(v).intersection(self.up_DEGs) for k,v in self.interpro2uniprot.items() if k in up_ngram_cands}
down_ngram_dict = {k:set(v).intersection(self.down_DEGs) for k,v in self.interpro2uniprot.items() if k in down_ngram_cands}
# Now collapsing these to the non-redundant ones
if collapse:
up_ngram_dict,_ = ngramUtilities.concatenate_ngrams(up_ngram_dict)
down_ngram_dict,_ = ngramUtilities.concatenate_ngrams(down_ngram_dict)
# Exporting to the object
        self.up_ngrams = list(up_ngram_dict)
        self.down_ngrams = list(down_ngram_dict)
def plot_DEG_ns(self,pos = [],deg_labels=[], large_cc_mode=False):
'''
Using the defined differentially expressed genes displaying a network graph of the n-grams that are associated or shared between the DEG conditions.
'''
# Setting default labels otherwise setting the label names for the legend.
if deg_labels:
up_label = deg_labels[0]
down_label = deg_labels[1]
else:
up_label = 'Up'
down_label = 'Down'
# Retrieving the n-grams across both DEG conditions to filter out any isolates/connected components in the reference network that are not used.
all_deg_ngrams = set(self.up_ngrams).union(self.down_ngrams)
if large_cc_mode:
large_ref_cc = max(nx.connected_components(self.G), key=len)
all_deg_ngrams = all_deg_ngrams.intersection(large_ref_cc)
# Setting up the node color list for plotting
node_colors = []
for node in self.G.subgraph(all_deg_ngrams):
if (node in self.up_ngrams) and (node in self.down_ngrams):
node_colors.append('tab:purple')
elif node in self.up_ngrams:
node_colors.append('tab:cyan')
elif node in self.down_ngrams:
node_colors.append('tab:red')
else:
node_colors.append('tab:gray')
# Dropping connected components not shared with the DEGs and the reference network
if large_cc_mode:
cc_2_keep = set(large_ref_cc)
else:
cc_2_keep = set()
for cc in nx.connected_components(self.G):
if set(cc).intersection(all_deg_ngrams):
cc_2_keep.update(cc)
# Getting the node positions for network drawing if not supplied.
        if pos == [] and hasattr(self, 'network_params') and 'pos' in self.network_params:
            pos = self.network_params['pos']
        elif pos == []:
            pos = nx.spring_layout(self.G, k=0.05)
# Getting all the basic network parameters that are default in the n-gram networks
# Default values
basic_network_params = {'node_size':1,
'edgecolors':'k',
'linewidths':0.1,
'width':0.25,
'edge_color':'#808080',
}
net_draw_params = {}
for param in basic_network_params:
if param in self.network_params:
net_draw_params[param] = self.network_params[param]
else:
self.network_params[param] = basic_network_params[param]
net_draw_params[param] = basic_network_params[param]
# Now drawing and adding legends
plt.figure(figsize=(2,2), dpi=300)
nx.draw(self.G.subgraph(cc_2_keep), pos, node_color = 'tab:gray', alpha = 0.1, **net_draw_params)
nx.draw(self.G.subgraph(all_deg_ngrams), pos, node_color = node_colors,**net_draw_params)
# Legend drawing
plt.scatter([],[],c='tab:cyan',s=1, label=up_label)
plt.scatter([],[],c='tab:red',s=1, label=down_label)
plt.scatter([],[],c='tab:purple',s=1, label='Both')
plt.legend(bbox_to_anchor=(1,0.5),frameon=False)
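    # Example usage (hedged; the comparison name and labels are hypothetical):
    #
    #   dd.calc_DEG_ngrams('treated_vs_control')
    #   pos = nx.spring_layout(dd.G, k=0.05, seed=7)
    #   dd.plot_DEG_ns(pos=pos, deg_labels=['Treated', 'Control'], large_cc_mode=True)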
def deg_summary(self, detailed = False):
'''
This provides a summary of the DEG information that has been used within this dataset.
'''
summary_df = pd.DataFrame(index=['p-value threshold', 'Fold change threshold', 'Up Regulated DEGs','Down Regulated DEGs'],columns=[''])
vals = [self.alpha, self.fcthres, len(self.up_DEGs), len(self.down_DEGs)]
for i,v in zip(summary_df.index, vals):
summary_df.loc[i] = v
if detailed:
            extended_inds = ['Data ID column', 'p-val data column', 'FC data column', 'Up n-grams', 'Down n-grams', 'Common n-grams']
# Getting common ngram counts
common = set(self.up_ngrams).intersection(self.down_ngrams)
extended_vals = [self.data_id_col, self.pval_data_col,self.fc_data_col, len(self.up_ngrams), len(self.down_ngrams), len(common)]
for i,v in zip(extended_inds, extended_vals):
summary_df.loc[i] = v
return summary_df
def ngram_DEG_hypergeom(self, condition):
'''
Performs the hypergeometric over-representation test on the n-grams associated with different conditions.
Parameters:
-----------
- condition: str
            Which condition to perform the test on; must be either 'Up' or 'Down' (case-insensitive)
Returns:
--------
- p_vals: dict
The p-value of each n-gram given the condition relative to the whole potential universe of the n-gram reference background of the n-gram network.
'''
# Make the condition input all lower case
condition = condition.lower()
if condition == 'up':
ngramsOI = self.up_ngrams
degsOI = self.up_DEGs
elif condition == 'down':
ngramsOI = self.down_ngrams
degsOI = self.down_DEGs
else:
raise ValueError('Please specify either up or down.')
p = ngram_enrichment(self, prots = degsOI, ngrams=ngramsOI)
return p
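    # Example (hedged): after calc_DEG_ngrams, rank the up-regulated n-grams by
    # their over-representation p-value:
    #
    #   p_up = dd.ngram_DEG_hypergeom('Up')
    #   top10 = sorted(p_up.items(), key=lambda kv: kv[1])[:10]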
def calculate_scores(self, comps, fpr_trials=50, min_pval = -10, num_ss_trials = 100, processes = 1, seed=None, verbose=True, overwrite=False):
'''
        This calculates the separation and distinct functional neighborhood scores for a deDANSy instance, creating new attributes containing all the information of interest. If scores for a condition already exist, a warning is raised and they are overwritten only when overwrite=True.
Parameters
-----------
comps : list or str
The conditions that will be compared
fpr_trials : int (Optional)
Number of FPR trials to perform.
min_pval : int (Optional)
The log10 transform of the minimum p-value to use for the p-value pruning sweep step.
num_ss_trials : int (Optional)
The number of subsampled (and random) networks used to build distributions for comparing the network separation and IQR
processes : int (Optional)
Number of processes to use if multiprocessing is desired. (Recommended having 4-8 when feasible)
        seed : int (Optional)
            Seed for random numbers. If not provided, the system time is used.
        verbose : bool (Optional)
            Whether progress statements are printed during calculations.
        overwrite : bool (Optional)
            Whether existing scores for a condition should be overwritten.
Returns
--------
The following attributes are added or adjusted:
scores : pandas DataFrame
All scores, p-values, and FPR values for each condition
raw_dists : pandas DataFrame
For each condition the raw values that made up the distributions for each score
fpr_dists : pandas DataFrame
For each condition the p-values from the random FPR trials for calculating the FPR'''
# Check for existing scores for all conditions provided
if hasattr(self, 'scores'):
a = set(comps).intersection(self.scores.index)
if len(a) >= 1:
if overwrite:
warnings.warn('At least one condition has existing scores that will be overwritten.')
else:
warnings.warn('At least one condition has existing scores that will be kept.')
else:
a = set()
temp_scores, temp_raw_dists, temp_fpr_dists = algorithm.calculate_scores(self, comps, fpr_trials,min_pval, num_ss_trials,processes,seed, verbose)
if hasattr(self, 'scores'):
cur_scores = self.scores
if overwrite:
cur_scores.update(temp_scores)
new_scores = cur_scores.combine_first(temp_scores)
new_raw_dist = self.raw_dists.combine_first(temp_raw_dists)
new_fpr_dist = self.fpr_dists.combine_first(temp_fpr_dists)
else:
new_scores = temp_scores
new_raw_dist = temp_raw_dists
new_fpr_dist = temp_fpr_dists
self.scores = new_scores
self.raw_dists = new_raw_dist
self.fpr_dists = new_fpr_dist
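    # Example (hedged; the comparison name is hypothetical and must match a row of
    # comp_metadata). The FPR columns referenced are those used by plot_scores below.
    #
    #   dd.calculate_scores(['treated_vs_control'], fpr_trials=50,
    #                       num_ss_trials=100, processes=4, seed=0)
    #   dd.scores[['Separation_FPR', 'Distinction_FPR']]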
def plot_scores(self, show_FPR = True, aspect = 0.9, order = None):
'''
This creates the bubble plots for the separation and distinction scores. This is a wrapper function for the base plotting function found in the enrichment_plotting_helpers.
Parameters
----------
show_FPR : bool (Optional)
Whether the FPR legend handles should be displayed.
aspect : float (Optional)
The aspect ratio for each score plot.
order : dict (Optional)
Key-value pairs for each comparison and what order they should be displayed on the axis.
Returns
--------
ax : matplotlib Axes
The axes of the resulting plot
'''
x = self.scores.copy()
x['Separation_Significance'] = x['Separation_FPR'] <= 0.05
x['Distinction_Significance'] = x['Distinction_FPR'] <= 0.05
ax = plot_functional_scores(x, show_FPR,aspect=aspect, order=order)
return ax
def calculate_ngram_enrichment(self,comparison, fpr_trials = 100, seed = None):
'''
Calculates the n-gram enrichment for the comparison of interest to find the most significant n-grams.
Parameters:
-----------
- comparison: str
The comparison of interest that contains the up- and down-regulated genes/proteins
- fpr_trials: int (Optional, Default: 100)
The number of trials used to calculate the false positive rate
- seed: int (Optional)
Seed for the random state
        Returns:
-------
Creates the new attribute
- ngram_results: dict of pandas DataFrame
All the n-gram enrichment values that pass the significance threshold for the comparison of interest
'''
# Setting the random state
if seed is None:
random.seed(int(time.time()))
else:
random.seed(seed)
# First determine and store the enriched n-grams for the comparison of interest
self.calc_DEG_ngrams(comparison)
enriched_ngrams = {}
for i in ['Up', 'Down']:
enriched_ngrams[i] = self.ngram_DEG_hypergeom(i)
# Get the original DEGs for defining number of proteins to randomly choose
true_up = self.up_DEGs
true_dn = self.down_DEGs
total_degs = len(set(true_up).union(true_dn))
frac_up = len(true_up)/total_degs
# Getting the random protein generator
rand_genes = self.retrieve_random_ids(num=total_degs, iters=fpr_trials)
rand_ngram_pvals = {'Up':{}, 'Down':{}}
for i in range(fpr_trials):
cur_DEGs = next(rand_genes)
orig_up = random.sample(cur_DEGs, k=round(len(cur_DEGs)*frac_up))
orig_dn = list(set(cur_DEGs).difference(orig_up))
self.set_DEG_ngrams(up_DEGs=orig_up, down_DEGs=orig_dn, verbose=False)
rand_up_hyper = self.ngram_DEG_hypergeom('Up')
rand_dn_hyper = self.ngram_DEG_hypergeom('Down')
for j,c_dir in zip([rand_up_hyper,rand_dn_hyper],['Up','Down']):
for node in j:
if node not in rand_ngram_pvals[c_dir]:
rand_ngram_pvals[c_dir][node] = []
rand_ngram_pvals[c_dir][node].append(j[node])
# Now getting the FPR values and putting the results in the correct order
fpr_dict = calc_ngram_fpr_vals(enriched_ngrams, rand_ngram_pvals)
res = gather_enrichment_results({'Up':enriched_ngrams['Up'],'Down':enriched_ngrams['Down']}, fpr_dict)
if hasattr(self,'ngram_results'):
self.ngram_results[comparison] = res
else:
self.ngram_results = {}
self.ngram_results[comparison] = res
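    # Example (hedged): compute enrichment for one comparison, then retrieve the
    # tidied results table:
    #
    #   dd.calculate_ngram_enrichment('treated_vs_control', fpr_trials=100, seed=0)
    #   res = dd.get_ngram_results('treated_vs_control')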
def plot_ngram_enrichment(self, comparison, p = 0.05, q=0.05, show_FPR = True, **kwargs):
'''
        Creates a bubble plot of the enriched n-grams for the up- and down-regulated n-grams of the comparison of interest. The resulting plot can be partially customized with additional keyword parameters. This function is a wrapper for the plot_enriched_ngrams function from the enrichment_plotting_helpers module, specific to the deDANSy object instance.
Parameters:
-----------
- comparison: str
The comparison to plot the enriched n-grams of
        - p: float (Optional)
            The p-value threshold for n-grams to plot. Default is 0.05; combined with the quantile cutoff q, the effective threshold can be lower.
        - q: float (Optional)
            The quantile cutoff of values to plot. Default is 0.05.
        - show_FPR: bool (Optional)
            Flag whether to show the FPR legend portion.
kwargs: optional keywords
- 'palette','edgecolor', 'linewidth', 'sizes' that adjust the seaborn scatterplot aesthetics.
- 'loc', 'bbox_to_anchor', 'handletextpad' to adjust matplotlib legend information.
- 'ax' to specify a matplotlib Axes to plot onto
Returns:
--------
seaborn/matplotlib plot
'''
resOI = self.ngram_results[comparison]
plot_enriched_ngrams(resOI, dansyOI=self, p = p, q=q, show_FPR=show_FPR, **kwargs)
plt.xlim(-0.5,1.5)
def get_ngram_results(self,comparison):
'''
Returns the n-gram enrichment results of the comparison of interest.
Parameters:
-----------
- comparison: str
The comparison of interest
Returns:
--------
- res: pandas DataFrame
The n-gram enrichment dataframe
'''
temp = self.ngram_results[comparison]
# Convert the n-gram IDs to legible names
temp['ngram_legible'] = temp['ngram'].apply(lambda x: self.return_legible_ngram(x))
# Now putting the columns into a more logical order
col_order = ['variable', 'ngram_legible', 'ngram', 'p', '-log10(p)', 'FPR', 'FPR <= 0.05']
temp = temp[col_order]
return temp
## Helper functions for converting the IDs around
def convert_2_uniprotIDs(df, id_conv, conv_col = 'Gene stable ID', data_id_cols = 'gene_id', dbl_check = False):
'''
    This takes a dataframe of interest and converts the specified column to UniProt IDs, given a second dataframe consisting of UniProt IDs, ENSEMBL IDs, and gene names/synonyms retrieved using pybiomart. If desired, a double check is performed to ensure all IDs are retrieved when an archived version of ENSEMBL was used and gene names are requested instead.
'''
# Now getting all possible UniProt IDs given the IDs of interest.
ensembl_uni_dict = create_id_conv(df, id_conv=id_conv,conv_col=conv_col, data_id_cols=data_id_cols, dbl_check=dbl_check)
uniprot_IDs = set(itertools.chain.from_iterable(ensembl_uni_dict.values()))
return uniprot_IDs
def create_id_conv(df, id_conv, conv_col = 'Gene stable ID', data_id_cols = 'gene_id', dbl_check = False):
'''
This is an internal function that makes a dict to convert the ensembl IDs (by default) to UniProt IDs.
'''
    # Check if a list is provided for the data_id_cols or if a single/default value was provided.
    if isinstance(data_id_cols, list):
        data_id_col = data_id_cols[0]
        if not dbl_check:
            warnings.warn('More than one column name was provided for converting, but a double check was not designated. Ignoring additional inputs.')
    else:
        data_id_col = data_id_cols
# Retrieve all ENSEMBL IDs which are found within the dataset of interest
success_mapping_ids = set(id_conv[id_conv[conv_col].isin(df[data_id_col])]['Gene stable ID'])
if dbl_check:
gene_dbl_chck = df[~df[data_id_col].isin(success_mapping_ids)][data_id_cols[1]].dropna()
cands = id_conv[id_conv['Gene name'].isin(gene_dbl_chck)]
success_mapping_ids.update(cands['Gene stable ID'])
id_dict = id_conv[id_conv['Gene stable ID'].isin(success_mapping_ids)].filter(['Gene stable ID', 'UniProtKB/Swiss-Prot ID']).drop_duplicates().dropna().to_dict('tight')
id_dict = id_dict['data']
    # To ensure all potential UniProt IDs are accounted for, collect them in lists, since some Ensembl IDs map to multiple proteins.
id_conversion = {}
for conversion_data in id_dict:
ensembl = conversion_data[0]
uniprot = conversion_data[1]
if ensembl in id_conversion:
id_conversion[ensembl].append(uniprot)
else:
id_conversion[ensembl] = [uniprot]
return id_conversion
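# Example (hedged sketch) of the id_conv layout create_id_conv expects, using the
# pybiomart column names assumed above. TP53/BRCA1 accessions are real Swiss-Prot
# IDs; 'Q00000' is a hypothetical secondary mapping to show the list-valued output:
#
#   id_conv = pd.DataFrame({
#       'Gene stable ID': ['ENSG00000141510', 'ENSG00000141510', 'ENSG00000012048'],
#       'UniProtKB/Swiss-Prot ID': ['P04637', 'Q00000', 'P38398'],
#       'Gene name': ['TP53', 'TP53', 'BRCA1']})
#   data = pd.DataFrame({'gene_id': ['ENSG00000141510', 'ENSG00000012048']})
#   create_id_conv(data, id_conv)
#   # -> {'ENSG00000141510': ['P04637', 'Q00000'], 'ENSG00000012048': ['P38398']}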
# Helper functions for n-gram enrichment.
def ngram_enrichment(dansy, prots, collapse = True, **kwargs):
'''
Calculates the enrichment of n-grams associated with a subset of proteins within a DANSy object by Fisher's exact test.
    Note: This method can analyze specific n-grams within the protein subset. However, that option is intended for internal use by the deDANSy object and is not recommended for direct use.
Parameters
----------
- dansy: DANSY
The DANSy object which contains the proteins of interest. (This can be either the differential expression or standard class.)
- prots: list
List containing the proteins of interest for enrichment analysis.
- collapse: bool
Whether the n-grams should be collapsed to their most informative and non-redundant n-grams.
- kwargs: optional keyword arguments (Not recommended to use)
- ngrams: list
List of n-grams that are to be analyzed specifically.
'''
    p_vals = ngram_subset_enrichment(prots, dansy.protsOI, dansy, collapse=collapse, **kwargs)
return p_vals
def ngram_subset_enrichment(protsOI, full_prots,dansy_bkg, collapse = True, **kwargs):
    '''Perform Fisher's exact test for n-grams found in a subset of proteins drawn from a full list of proteins. This is the more general version that does not require using the full DANSy protein list.
Parameters:
-----------
- protsOI: list
List containing the proteins of interest for enrichment analysis.
- full_prots: list
List containing the proteins that make up the full background list.
- dansy_bkg: DANSy object
The DANSy that the proteins and n-grams are associated with
- collapse: bool
Whether the n-grams should be collapsed to their most informative and non-redundant n-grams.
- kwargs: key, value mappings (Not recommended)
Additional keyword arguments:
- ngrams: list
List of n-grams that are to be analyzed specifically.
Returns:
--------
- p_vals: dict
Key-value pairs of n-grams and their enrichment p-value.
'''
if 'ngrams' in kwargs:
ngrams = kwargs['ngrams']
        # Making sure all n-grams are found in at least one of the proteins. If any are not, raise an error.
internal_check = [len(set(v).intersection(protsOI)) == 0 for k,v in dansy_bkg.interpro2uniprot.items() if k in ngrams]
if any(internal_check):
raise ValueError('At least one provided n-gram is not found in the proteins of interest.')
else:
ngrams = []
subset_prots = list(set(protsOI).intersection(full_prots)) # Ensure that the foreground proteins are found within the background (removing any which are not)
M = len(subset_prots)
N = len(full_prots)
p_vals = {}
if full_prots == dansy_bkg.protsOI:
full_check = True
else:
full_check = False
# Get the n-grams of the proteins of interest if they were not provided
if ngrams:
ngramsOI = ngrams
else:
ngram_cands = [k for k,v in dansy_bkg.interpro2uniprot.items() if set(v).intersection(subset_prots)]
ngram_dict = {k:set(v).intersection(subset_prots) for k,v in dansy_bkg.interpro2uniprot.items() if k in ngram_cands}
if collapse:
ngram_dict,_ = ngramUtilities.concatenate_ngrams(ngram_dict)
        ngramsOI = list(ngram_dict)
# Now using the cdf/sf of the hypergeometric distribution to get p-values (This is equivalent to Fisher's exact)
for node in ngramsOI:
# Skip the filtering if using the full background
if full_check:
full_prot_list = dansy_bkg.interpro2uniprot[node]
else:
full_prot_list = [u for u in dansy_bkg.interpro2uniprot[node] if u in full_prots]
# Now getting the numbers for the hypergeom distribution and calculating the cdf (or really sf since it is equivalent to 1-cdf)
k = len(set(full_prot_list).intersection(subset_prots))
n = len(set(full_prot_list))
p = stats.hypergeom.sf(k-1,N, n,M)
p_vals[node] = p
return p_vals
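# A self-contained check (hedged illustration with made-up counts) of the comment
# above: the hypergeometric survival function matches the one-sided Fisher's exact
# test for over-representation.
#
#   k, N, n, M = 8, 1000, 40, 100   # hits, background size, n-gram proteins, subset size
#   p_hyper = stats.hypergeom.sf(k - 1, N, n, M)
#   table = [[k, n - k], [M - k, N - n - M + k]]
#   p_fisher = stats.fisher_exact(table, alternative='greater')[1]
#   assert np.isclose(p_hyper, p_fisher)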