Source code for kstar.calculate

import pandas as pd
import numpy as np
import scipy.stats as stats
from statsmodels.stats import multitest
import multiprocessing
from collections import defaultdict
import pickle
import os
from datetime import datetime
import itertools
from itertools import repeat
import concurrent.futures

from kstar import config, helpers
from kstar.normalize import generate_random_experiments, calculate_fpr

[docs]class KinaseActivity:
    """
    Kinase Activity calculates the estimated activity of kinases given an experiment using hypergeometric distribution.
    Hypergeometric distribution examines the number of protein sites found to be active in evidence compared to the 
    number of protein sites attributed to a kinase on a provided network.

    Parameters
    ----------
    evidence : pandas df
        a dataframe that contains (at minimum, but can have more) data columms as evidence to use in analysis and KSTAR_ACCESSION and KSTAR_SITE
    data_columns: list
        list of the columns containing the abundance values, which will be used to determine which sites will be used as evidence for activity prediction in each sample
    logger : Logger object
        keeps track of kstar analysis, including any errors that occur
    phospho_type: string, either 'Y' or 'ST'
        indicates the phoshpo modification of interest
    
    Attributes
    ----------
    -------------------
    Upon Initialization
    -------------------
    evidence: pandas dataframe
        inputted evidence column
    data_columns: list
        list of columns containing abundance values, which will be used to determine which sites will be used as evidence. If inputted data_columns parameter was None, this lists includes in column in evidence prefixed by 'data:'
    logger : Logger object
        keeps track of kstar analysis, including any errors that occur
    phospho_type: string
        indicated phosphomod of interest
    network_directory: string
        directory where kinase substrate networks can be downloaded, as indicated in config.py
    normalized: bool
        indicates whether normalization analysis has been performed
    aggregate: string
        the type of aggregation to use when determining binary evidence, either 'count' or 'mean'. Default is 'count'.
    threshold: float
        cutoff to use when determining what sites to use for each experiment
    greater: bool
        indicates whether sites with greater or lower abundances than the threshold will be used
    run_data: string
        indicates the date that kinase activity object was initialized
    
    ---------------------------------
    After Hypergeometric Calculations
    ---------------------------------
    activities_list: pandas dataframe
        p-values obtained for all pruned networks indicating statistical enrichment of a kinase's substrates for each network, based on hypergeometric 
        tests
    activities: pandas dataframe
        median p-values obtained from the activities_list object for each experiment/kinase
    agg_activities: pandas dataframe
    
    -----------------------------------
    After Random Enrichment Calculation
    -----------------------------------
    random_experiments: pandas dataframe
        contains information about the sites randomly sampled for each random experiment
    
    random_kinact: KinaseActivity object
        KinaseActivity object containing random activities predicted from each of the random experiments
    
    ---------------------------
    After Mann Whitney Analysis
    ---------------------------
    activities_mann_whitney: pandas dataframe
        p-values obtained from comparing the real distribution of p-values to the distribution of p-values from random datasets, based
        the Mann Whitney U-test
    fpr_mann_whitney: pandas dataframe
        false positive rates for predicted kinase activities
        
    """
    def __init__(self, evidence, logger, data_columns = None, phospho_type = 'Y'):
        
        self.phospho_type = phospho_type
        self.set_evidence(evidence)
        

        self.networks = defaultdict()
        self.network_sizes = defaultdict()
        self.logger = logger
        self.normalizers = defaultdict()
        self.network_directory = config.NETWORK_DIR

        self.num_networks = None

        self.activities_list = None
        self.agg_activities = None
        self.activities = None


        #Properties for normalized information
        #self.normalizers = defaultdict()
        #self.normalized = False
        #self.normalized_activities_list = None
        #self.normalized_agg_activities = None
        #self.activities_normalized = None
        #self.fpr_activity = None
        
        #Location of random experiments
        self.randomized = False
        self.random_experiments = None



        self.aggregate = 'mean'
        self.threshold = 1.0
        self.greater = True

        self.run_date = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
        
        #if data columns is None, set data columns to be columns with data: in front
        self.set_data_columns(data_columns = data_columns)
   

[docs]    def check_data_columns(self):
        """
        Checks data columns to make sure column is in evidence and that evidence filtered on that data column 
        has at least one point of evidence. Removes all columns that do not meet criteria
        """
        new_data_columns = []
        evidence = self.evidence.groupby([config.KSTAR_ACCESSION, config.KSTAR_SITE]).agg(self.aggregate).reset_index()
        for col in self.data_columns:
            if col in self.evidence.columns:
                if self.greater:
                    if len(evidence[evidence[col] >= self.threshold]) > 0:
                        new_data_columns.append(col)
                    else:
                        self.logger.warning(f"{col} does not have any evidence")
                else:
                    if len(evidence[evidence[col] <= self.threshold]) > 0:
                        new_data_columns.append(col)
                    else:
                        self.logger.warning(f"{col} does not have any evidence")
            else:
                self.logger.warning(f"{col} not in evidence")
        self.data_columns = new_data_columns

[docs]    def set_data_columns(self, data_columns = None):
        """
        Sets the data columns to use in the kinase activity calculation
        If data_columns is None or an empty list then set data_columns to 
        be all columns that start with data:

        Checks all set columns to make sure columns are vaild after filtering evidence
        """
        if data_columns is None or data_columns == []:
            self.data_columns = []
            for col in self.evidence.columns:
                if col.startswith('data:'):
                    self.data_columns.append(col)  
        else:
            self.data_columns = data_columns
        self.check_data_columns()    

[docs]    def calculate_random_activities(self, logger, num_random_experiments=150, PROCESSES = 1):
        """
        Generate random experiments and calculate the kinase activities for these random experiments
        """
        self.logger.info("Running Randomization Pipeline")
        pool = multiprocessing.Pool(processes = PROCESSES)
        self.num_random_experiments = num_random_experiments
        self.logger.info("Generating random experiments")
        self.random_experiments = generate_random_experiments.build_random_experiments(
            self.evidence_binary, 
            config.HUMAN_REF_COMPENDIA, 
            num_random_experiments,
            self.phospho_type, 
            self.data_columns,
            pool,
            PROCESSES = PROCESSES)
        
        self.logger.info("Calculating random kinase activities")
        self.random_kinact = KinaseActivity(self.random_experiments, logger, phospho_type=self.phospho_type)
        self.random_kinact.add_networks_batch(self.networks)
        self.random_kinact.calculate_kinase_activities( agg='count', threshold=1.0, greater=True, PROCESSES = PROCESSES )
        self.random_kinact.agg_activities = self.random_kinact.aggregate_activities()
        self.random_kinact.activities = self.random_kinact.summarize_activities()

    
    #def run_normalization(self, logger, num_random_experiments=150, target_alpha=0.05, PROCESSES = 1):
    #    """
    #    Run entire normaliation pipeline (Generate random experiments, calculate kinase activities for random experiments, and normalize p-values based on provided target_alpha)
    #    """
    #    self.logger.info("Running Normalization Pipeline")
    #    pool = multiprocessing.Pool(processes = PROCESSES)
    #    self.normalized = True
    #    self.num_random_experiments = num_random_experiments
    #    self.logger.info("Generating random experiments")
    #    self.random_experiments = generate_random_experiments.build_random_experiments(
    #        self.evidence_binary, 
    #        config.HUMAN_REF_COMPENDIA, 
    #        num_random_experiments,
    #        self.phospho_type, 
    #        self.data_columns,
    #        pool,
    #        PROCESSES = PROCESSES)

        
    #    self.logger.info("Calculating random kinase activities")
    #    self.random_kinact = KinaseActivity(self.random_experiments, logger, phospho_type=self.phospho_type)
    #    self.random_kinact.add_networks_batch(self.networks)
    #    self.random_kinact.calculate_kinase_activities( agg='count', threshold=1.0, greater=True, PROCESSES = PROCESSES )
        
    #    self.logger.info("Normalizing Activities")
    #    self.normalize(target_alpha=target_alpha)



    #def normalize(self, target_alpha=0.05):
    #    """
    #    Given random_kinace.activities, summarize, calculate FPR, and normalize. Called in the run_normalization pipelin
    #    and useful for changing the target_alpha value, without requiring generation and calculation of random datasets
    #    such as in reading a project from slim format.
    #
    #    Parameters
    #    ----------
    #    target_alpha: float
    #        FPR target value between 0 and 1
    #
    #
    #    """
    #    if target_alpha > 1 or target_alpha < 0:
    #        self.logger.info("ERROR: Target alpha should be between 0 an 1, setting to default of 0.05")
    #        target_alpha = 0.05
    #    self.random_kinact.aggregate_activities()
    #    self.random_kinact.activities = self.random_kinact.summarize_activities()

    #    self.normalizers, self.fpr_activity = calculate_fpr.generate_fpr_values(self.activities, self.random_kinact.activities, target_alpha)

    #    self.normalize_activities(default_normalization=target_alpha)
    #    self.activities_normalized = self.summarize_activities(self.normalized_agg_activities,'median_normalized_activity',normalized=True)
        

    def add_networks_batch(self,networks):
        for nid, network in networks.items():
            self.add_network(nid, network)
        self.num_networks = len(networks)

    def add_network(self, network_id, network, network_size = None):
        """
        Add network to be analyzed

        Parameters
        ----------
        network_id : str
            name of the network
        network : pandas DataFrame
            network with columns substrate_id, site, kinase_id
        """
        self.networks[network_id] = network
        if network_size is not None and isinstance(network_size, int):
            self.network_sizes[network_id] = network_size
        else:
            self.network_sizes[network_id] = len(network.groupby([config.KSTAR_ACCESSION, config.KSTAR_SITE]).size())
            self.logger.info(f'ADD NETWORK : Number of Accession Sites : {self.network_sizes[network_id]}')
    
[docs]    def get_run_date(self):
        """
        return date that kinase activities were run
        """
        return self.run_date

    def set_evidence(self, evidence):
        """
        Evidence to use in analysis

        Parameters
        ----------
        evidence : pandas DataFrame
            substrate sites with activity seen. 
            columns : dict for column mapping
                substrate : Uniprot ID (P12345)
                site : phosphorylation site (Y123)
        """
        columns = list(evidence.columns)
        if config.KSTAR_ACCESSION in columns and config.KSTAR_SITE in columns:
            phospho_type = tuple(self.phospho_type)
            self.evidence = evidence[evidence[config.KSTAR_SITE].str.startswith(phospho_type)].copy()
        else:
            self.logger.warning(f"Evidence not set. Evidence columns must include '{config.KSTAR_ACCESSION}' and '{config.KSTAR_SITE}' keys")
    
    #def set_normalizers(self, normalizers):
    #    """
    #    Sets the Kinase normalization values to be used in calculating the updated p-values
    #    Updated p-values normalized via original p-value / normalization-factor * normalization_factor

    #    Parameters
    #    ----------
    #    normalizers : dict or pandas df
    #        Kinase Activity Normalizers
    #        index : kinase
    #        columns : data_column normalization values
    #    """
    #    self.normalizers = normalizers

    # def calculate_hypergeometric_single_network(self, evidence, network_id):
    #     """
    #     Hypergeometric Cumulative Distribution Function calculated for each kinase given evidence
    #         k : number of times kinase seen in evidence
    #         M : number of unique sites in network
    #         n : number of times kinase seen in network
    #         N : size of evidence
        
    #     Parameters
    #     ----------
    #     evidence : pandas df
    #         subset of kstar evidence that has been filtered to only include evidence associated with experimetn
    #     network_id : str
    #         network to use for analysis
        
    #     Returns
    #     -------
    #     results : pandas df
    #         Hypergeometric results of evidence for given network
    #         index : kinase_id
    #         columns
    #             frequency : number of times kinase seen in network
    #             kinase_activity : activity derived from hypergometric cdf 
    #     """
        
    #     network = self.networks[network_id]
    #     intersect = pd.merge(network, evidence, how='inner',
    #         on=[config.KSTAR_ACCESSION, config.KSTAR_SITE])
    #     counts = intersect.groupby(config.KSTAR_KINASE).size()
    #     N = len(intersect.groupby([config.KSTAR_ACCESSION, config.KSTAR_SITE]).size())
    #     results = pd.DataFrame(counts, columns = ['frequency'])
    #     results['kinase_activity'] = 1.0

    #     K = network.groupby(config.KSTAR_KINASE).size()
        
    #     kinases = counts.index
    #     for kin in kinases:
    #         k = 0
    #         if counts.loc[kin] > 0:
    #             k = counts.loc[kin] - 1
    #         prb = stats.hypergeom.sf(
    #             k = int(k), 
    #             M = int(self.network_sizes[network_id]), 
    #             n = int(K.loc[kin]), 
    #             N = int(N)
    #             )
    #         results.at[kin, 'kinase_activity'] = prb
    #     results['network'] = network_id
    #     return results
        
    # def calculate_hypergeometric_activities(self, evidence, name):
    #     """
    #     Perform hypergeometric kinase activity analysis given evidence on all networks
        
    #     Parameters
    #     ----------
    #     evidence : pandas df
    #         subset of class evidence variable where data is filtered based on experiment
    #     name : str
    #         name of experiment being performed
            
    #     Returns
    #     ---------
    #     fdr_act : pd DataFrame
    #         network : network name, from networks key
    #         frequency : number of times kinase was seen in subgraph of evidence and network
    #         kinase_activity : hypergeometric kinase activity
    #         fdr_corrected_kinase_activity : kinase activity after fdr correction
    #         significant : whether kinase activity is significant based on fdr alpha
    #     combined : pd DataFrame
    #         significant : number of networks where kinase was found to be significant
    #         fraction_significant : fraction of networks kinase was found to be significant through FDR correction
    #         avg_p_value : combined p-values of kinase using mean
    #         median_p_value : combined p-values of kinase using median
    #     """
        
    #     self.logger.info(f"Running hypergeometric analysis on {name}")
        
    #     results = []
    #     for network_id in self.networks.keys(): # calculate kinase activity within each network 
    #         result = self.calculate_hypergeometric_single_network(evidence, network_id) 
    #         results.append(result)

    #     # combine results into single dataframe
    #     hyp_act = pd.concat(results)
    #     hyp_act = hyp_act.reset_index()
    #     hyp_act['data'] = name
    #     return hyp_act

[docs]    def create_binary_evidence(self, agg = 'mean', threshold = 1.0,  greater = True):
        """
        Returns a binary evidence data frame according to the parameters passed in for method for aggregating
        duplicates and considering whether a site is included as evidence or not

        Parameters
        ----------
        threshold : float
            threshold value used to filter rows 
        agg : {'count', 'mean'}
            method to use when aggregating duplicate substrate-sites. 
            'count' combines multiple representations and adds if values are non-NaN
            'mean' uses the mean value of numerical data from multiple representations of the same peptide.
                NA values are droped from consideration.
        greater: Boolean
            whether to keep sites that have a numerical value >=threshold (TRUE, default) or <=threshold (FALSE)
        
        Returns
        -------
        evidence_binary : pd.DataFrame
            Matches the evidence dataframe of the kinact object, but with 0 or 1 if a site is included or not.
            This is uniquified and rows that are never used are removed.
        
        """

        evidence = self.evidence.groupby([config.KSTAR_ACCESSION, config.KSTAR_SITE]).agg(agg).reset_index()
        
        #set the binary evidence for whether a site is included
        evidence_binary = evidence.copy()
        for col in self.data_columns:
            if greater:
                evidence_binary[col].mask(evidence[col] >= threshold, 1, inplace=True)
                evidence_binary[col].mask(evidence[col] < threshold, 0, inplace=True)
            else:
                evidence_binary[col].mask(evidence[col] <= threshold, 1, inplace=True)
                evidence_binary[col].mask(evidence[col] > threshold, 0, inplace=True)

        #remove phosphorylation sites that were not selected in any experiment (useful for very large experiments where removing the need to copy data reduces time)
        evidence_binary.drop(evidence_binary[evidence_binary[self.data_columns].sum(axis=1) == 0].index, inplace = True) 
        return evidence_binary


[docs]    def calculate_kinase_activities(self, agg = 'mean', threshold = 1.0,  greater = True, PROCESSES = 1):
        """
        Calculates combined activity of experiments based that uses a threshold value to determine if an experiment sees a site or not
        To use values use 'mean' as agg
            mean aggregation drops NA values from consideration
        To use count use 'count' as agg - present if not na

        Parameters
        ----------
        data_columns : list
            columns that represent experimental result, if None, takes the columns that start with `data:'' in experiment. 
            Pass this value in as a list, if seeking to calculate on fewer than all available data columns
        threshold : float
            threshold value used to filter rows 
        agg : {'count', 'mean'}
            method to use when aggregating duplicate substrate-sites. 
            'count' combines multiple representations and adds if values are non-NaN
            'mean' uses the mean value of numerical data from multiple representations of the same peptide.
                NA values are droped from consideration.
        greater: Boolean
            whether to keep sites that have a numerical value >=threshold (TRUE, default) or <=threshold (FALSE)
        
        Returns
        -------
        activities : dict
            key : experiment
            value : pd DataFrame
                network : network name, from networks key
                kinase : kinase examined
                frequency : number of times kinase was seen in subgraph of evidence and network
                kinase_activity : hypergeometric kinase activity
        
        """

        #for each phosphoType, check that the 
        self.aggregate = agg
        self.threshold = threshold
        self.greater = greater
        #self.set_data_columns(data_columns)

        self.evidence_binary = self.create_binary_evidence(agg = self.aggregate, threshold = self.threshold,  greater = self.greater)
        
        # if no data columns are provided use all columns that start with data:
        # data columns that filtered have no evidence are removed
        self.logger.info(f"Kinase Activity will be run on the following data columns: {','.join(self.data_columns)}")
        

        # MULTIPROCESSING
        if PROCESSES > 1:
            pool = multiprocessing.Pool(processes = PROCESSES)
            
            filtered_evidence_list  = [self.evidence_binary[self.evidence_binary[col] ==1 ] for col in self.data_columns] 
            networks = itertools.repeat(self.networks)  
            network_sizes = itertools.repeat(self.network_sizes)
            iterable = zip(filtered_evidence_list, networks, network_sizes, self.data_columns)
            activities_list = pool.starmap(calculate_hypergeometric_activities, iterable)
        
        # SINGLE CORE PROCESSING
        else:
            activities_list =[]
            for col in self.data_columns:
                filtered_evidence = self.evidence_binary[self.evidence_binary[col] == 1]
                act = calculate_hypergeometric_activities(filtered_evidence, self.networks, self.network_sizes, col)
                act['data'] = col
                activities_list.append(act)
        self.num_networks = len(self.network_sizes)

        self.activities_list = pd.concat(activities_list)
        return self.activities_list

    def summarize_activities(self, activities = None, method = 'median_activity', normalized = False):
        """
        Builds a single combined dataframe from the provided activities such that 
        each piece of evidence is given a single column. Values are based on the method selected.
        The method must be a column in the activities

        Parameters
        ----------
        activities : dict
            hypergeometric activities that have previously been summarized by network.
            key : experiment name
            value : hypergeometric activity
        method : str
            The column in the hypergeometric activity to use for summarizing data
        
        Returns
        ---------
        activity_summary : pandas DataFrame

        """
        if activities is None:
            activities = self.agg_activities
        available_methods = list(activities.columns)
        available_methods.remove('data')
        if method not in available_methods:
            raise ValueError(f"the method '{method}' is not in the availble methods. \nAvailable methods include : {', '.join(available_methods)}")


        activity_summary = activities.pivot(index = config.KSTAR_KINASE, columns ='data', values = method).reset_index().rename_axis(None, axis=1).set_index(config.KSTAR_KINASE)
        activity_summary = activity_summary[self.data_columns]
        return activity_summary

    def aggregate_activities(self, activities = None):
        """
        Aggregate network activity using median for all activities

        Parameters
        ---------
        activities : dict
            key : Experiment
            value : kinase activity result
        
        Returns 
        ---------
        summaries : dict
            key : experiment
            value : summarized kinase activities accross networks
        """
        if activities is None:
            activities = self.activities_list
        self.agg_activities = activities.groupby(['data', config.KSTAR_KINASE ]).agg(
            median_activity = ('kinase_activity', 'median'),
        ).reset_index()
        return self.agg_activities
    
    #def normalize_activities(self, activities = None, default_normalization = 0.05, normalization_multiplier = 0.05):
    #    """
    #    Generates the normalized activities and corresponding summary statistics

    #    Parameters
    #    ----------
    #    activities : dict
    #        hypergeometric activities generated by KSTAR algorithm
    #        key : experiment
    #        value : hypergeometric result
    #    default_normalization: float
    #        value to normalize to, if no normalizers are set.
        
    #    Returns
    #    --------
    #    normalized_activities : dict
    #        normalized Kinase Activity results
    #        key : experiment
    #        value : normalized results
    #    summarized_activities : dict
    #        summary statistics of normalized kinase activity
    #        key : experiment
    #        value : summary statistics
    #    """
    #    self.normalized = True
    #    
    #    if activities is None:
    #        activities = self.activities_list
    #    normalized_activities = []
    #    for data in self.data_columns:
    #        if type(self.normalizers) is dict:
    #            normalizers = self.normalizers
    #        elif type(self.normalizers) is pd.DataFrame and data #in self.normalizers.columns:
    #            #print("setting normalizers")
    #            normalizers = self.normalizers[data].to_dict()
    #        else:
    #            normalizers = {}
    #        activity = activities[activities['data'] == data]
    #        normalized_activities.append(
    #            self.calculate_normalized_activity(
    #                activity, 
    #                normalizers, 
    #                default_normalization, 
    #                normalization_multiplier))
    #    self.normalized_activities_list = pd.concat(normalized_activities)
    #    self.aggregate_normalized_activities(self.normalized_activities_list)
        
    #    return self.normalized_activities_list, self.normalized_agg_activities
    
    
    #def calculate_normalized_activity(self, kinase_activity, normalizers, default_normalization = 0.05, normalization_multiplier = 0.05):
    #    """
    #    Activity is normalized based on kinase normalization factors. 
    #    Added Columns : 
    #        Normalization Factor : normalization factor for given Kinase
    #        Significant : 1 if Kinase Activity <= Normalization Factor
    #        Normalized Activity : ( Kinase Activity ) / ( Normalization Factor ) * Normalization Multiplier

    #    Parameters
    #    ----------
    #    kinase_activity : pandas df
    #        hypergeometric activity calculated through KSTAR algorithm
    #    default_normalization : float
    #        Normalization factor to use if one not provided
    #    normalization_multiplier : float
    #        normalization multiplier to use for calculated #normalized kinase activity
    #    """

    #    normalized_activity = kinase_activity.copy()
    #    normalized_activity['Normalization Factor'] = normalized_activity[config.KSTAR_KINASE].apply(lambda name: normalizers[name] if name in normalizers.keys() else default_normalization)
    #    if len(self.networks) > 0:
    #        normalized_activity['Significant'] = normalized_activity.apply(lambda row: (row['kinase_activity'] <= row['Normalization Factor'] / len( self.networks ) ) * 1, axis = 1)
    #    else:
    #        normalized_activity['Significant'] = normalized_activity.apply(lambda row: (row['kinase_activity'] <= row['Normalization Factor']) * 1, axis = 1)
    #    normalized_activity['Normalized Activity'] = normalized_activity.apply(lambda row: np.clip( row['kinase_activity'] / row['Normalization Factor'] * normalization_multiplier, 0.0, 1.0 ), axis = 1)
    #    return normalized_activity
    
    
    #def aggregate_normalized_activities(self, normalized_activities):
    #    """
    #    Summarizes normalized kinase activity results of all networks by kinase
    #    Summaries provided : 
    #        median original activity
    #        average original activity
    #        median normalized activity
    #        average normalized activity
    #        count significant
    #        fraction significant
        
    #    Parameters
    #    ----------
    #    normalized_activity : pandas df
    #        Normalized kinase activty
        
    #    Returns
    #    --------
    #    summary : pandas df
    #        summarized data of all networks by kinase
    #    """
    #    self.normalized_agg_activities = normalized_activities.groupby(['data',config.KSTAR_KINASE]).agg(
    #        median_original_activity = ('kinase_activity', 'median'),
    #        median_normalized_activity = ('Normalized Activity', 'median'),
    #        count_significant = ('Significant', 'sum'),
    #        fraction_significant = ('Significant', 'mean')
    #    ).reset_index()
    #    return self.normalized_agg_activities

    def  find_pvalue_limits(self, data_columns, agg = 'count', threshold = 1.0):
        """
        For each data column and network find the lowest p-value achievable and how many 
        seen sites are required to get to that limit.
        Assumptions
            - kinase size in network is same for all kinases

        Parameters
        ----------
        data_columns : list
            what columns in evidence to compare
        agg : str
            aggregate function - what function to use for determining if site is present
                count : use when using activity_count
                mean : use when using activity_threshold
        threshold : float
            threshold to use in determining if site present in evidence
        
        Returns
        -------
        all_limits : pandas DataFrame
            p-value limits of each column for each network
            columns:
                evidence        evidence data column
                network         network being compared
                kinase          kinase being evaluated
                evidence_size   size of evidence
                limit_size      number of sites to get non-zero p-value
                p-value          p-value generated
        limit_summary : pandas DataFrame
            summary of all_limits by taking average over by evidence
        """

        
        evidence = self.evidence.groupby([config.KSTAR_ACCESSION, config.KSTAR_SITE]).agg(agg)
        all_rows = []

        kinase_sizes = {}
        for nid, network in self.networks.items():
            # kinase_sizes[nid] = network.groupby(config.KSTAR_KINASE).size()
            kinase_sizes[nid] = int(network.groupby(config.KSTAR_KINASE).size().mean())
            

        for col in data_columns:
            print(col)
            filtered_evidence = evidence[evidence[col] > threshold]
            N = len(filtered_evidence.groupby([config.KSTAR_ACCESSION, config.KSTAR_SITE]).size())
            for nid, M in self.network_sizes.items():
                # kinases = list(kinase_sizes[nid].index)
                # for kinase in kinases:
                # n = int(kinase_sizes[nid].loc[kinase])
                n = kinase_sizes[nid]

                for k in range(N,0,-1):
                    pvalue = stats.hypergeom.sf(
                        k = k, 
                        M = M, 
                        n = n, 
                        N = N
                    )
                    # pvalue = 1 - prb
                    if pvalue > 0:
                        break
                row = {
                    'evidence' : col,
                    'network' : nid,
                    # 'kinase' : kinase,
                    'evidence_size' : N,
                    'limit_size' : k,
                    'kinase_size' : n,
                    'network_size' : M,
                    'p-value' : pvalue
                    }
                all_rows.append(row)
        all_limits = pd.DataFrame(all_rows)
        limit_summary = all_limits.groupby('evidence').mean()
        return all_limits, limit_summary

[docs]    def calculate_Mann_Whitney_activities_sig(self, log, number_sig_trials = 100, PROCESSES = 1):
        """
        For a kinact_dict, where random generation and activity has already been run for the phospho_types of interest, 
        this will calculate the Mann-Whitney U test for comparing the array of p-values for real data 
        to those of random data, across the number of networks used.
        It will also calculate the false positive rate for a pvalue, given observations of a random bootstrapping analysis
        
        Parameters
        ----------
        kinact_dict: dictionary
            A dictionary of kinact objects, with keys 'Y' and/or 'ST'
        log: logger 
            Logger for logging activity messages
        phospho_types: {['Y', 'ST'], ['Y'], ['ST']}
            Which substrate/kinaset-type to run activity for: Both ['Y, 'ST'] (default), Tyrosine ['Y'], or Serine/Threonine ['ST']
        number_sig_trials: int
            Maximum number of significant trials to run

            
        
        Returns
        -------

        """
        #First, check that objects are correct and values can be found
        if not isinstance(self.random_kinact.activities_list, pd.DataFrame):
            raise ValueError("Random activities do not exist, please run kstar_activity.normalize_analysis")

        if number_sig_trials > self.num_random_experiments:
            log.info("Warning: number of trials for Mann Whitney exceeds number available, using %d instead of %d"%(self.num_random_experiments, number_sig_trials))
            number_sig_trials = self.num_random_experiments

        self.activities_mann_whitney = pd.DataFrame(index=self.activities.index, columns=self.activities.columns)
        self.fpr_mann_whitney = pd.DataFrame(index=self.activities.index, columns=self.activities.columns)
        #for every kinase and every dataset, calculate and assemble dataframes of activities and significance values

        for exp in self.data_columns:
            log.info("MW Working on %s: "%(exp))

            #Get a subset of the random and real activites for this experiment
            activities_sub = self.activities_list[self.activities_list['data']==exp]
            rand_activities_sub = self.random_kinact.activities_list[self.random_kinact.activities_list['data'].str.startswith(exp)]

            pval_arr = []
            fpr_arr = []
            with concurrent.futures.ProcessPoolExecutor(max_workers=PROCESSES) as executor:
                for pval, fpr in executor.map(calculate_MannWhitney_one_experiment_one_kinase, repeat(activities_sub), repeat(rand_activities_sub), repeat(self.num_networks), self.activities.index, repeat(exp), repeat(number_sig_trials)):
                    pval_arr.append(pval)
                    fpr_arr.append(fpr)

            self.activities_mann_whitney[exp] = pval_arr
            self.fpr_mann_whitney[exp] = fpr_arr
    




def calculate_hypergeometric_single_network(evidence, network, network_size, network_id ):
        """
        Hypergeometric Cumulative Distribution Function calculated for each kinase given evidence
            k : number of times kinase seen in evidence
            M : number of unique sites in network
            n : number of times kinase seen in network
            N : size of evidence
        
        Parameters
        ----------
        evidence : pandas df
            subset of kstar evidence that has been filtered to only include evidence associated with experiment
        network_id : str
            network to use for analysis
        
        Returns
        -------
        results : pandas df
            Hypergeometric results of evidence for given network
            index : kinase_id
            columns
                frequency : number of times kinase seen in network
                kinase_activity : activity derived from hypergometric cdf 
        """
        
        
        intersect = pd.merge(network, evidence, how='inner',
            on=[config.KSTAR_ACCESSION, config.KSTAR_SITE])
        
        counts = intersect.groupby(config.KSTAR_KINASE).size()
        N = len(intersect.groupby([config.KSTAR_ACCESSION, config.KSTAR_SITE]).size())
        results = pd.DataFrame(counts, columns = ['frequency'])

        results['kinase_activity'] = 1.0

        K = network.groupby(config.KSTAR_KINASE).size()
        
        kinases = counts.index
        for kin in kinases:
            k = 0
            if counts.loc[kin] > 0:
                k = counts.loc[kin] - 1
            prb = stats.hypergeom.sf(
                k = int(k), 
                M = int(network_size), 
                n = int(K.loc[kin]), 
                N = int(N)
                )
            results.at[kin, 'kinase_activity'] = prb

        kinases = network[config.KSTAR_KINASE].unique()
        for kin in kinases:
            if kin not in results.index:
                results.at[kin,'frequency'] = 0
                results.at[kin,'kinase_activity'] = 1.0
        results['network'] = network_id
        return results
        
def calculate_hypergeometric_activities(evidence, networks, network_sizes, name):
        """
        Perform hypergeometric kinase activity analysis given evidence on all networks
        
        Parameters
        ----------
        evidence : pandas df
            subset of class evidence variable where data is filtered based on experiment
        name : str
            name of experiment being performed
            
        Returns
        ---------
        fdr_act : pd DataFrame
            network : network name, from networks key
            frequency : number of times kinase was seen in subgraph of evidence and network
            kinase_activity : hypergeometric kinase activity
            fdr_corrected_kinase_activity : kinase activity after fdr correction
            significant : whether kinase activity is significant based on fdr alpha
        combined : pd DataFrame
            significant : number of networks where kinase was found to be significant
            fraction_significant : fraction of networks kinase was found to be significant through FDR correction
            avg_p_value : combined p-values of kinase using mean
            median_p_value : combined p-values of kinase using median
        """
        
        # self.logger.info(f"Running hypergeometric analysis on {name}")
        
        results = []
        for network_id in networks.keys(): # calculate kinase activity within each network 
            result =calculate_hypergeometric_single_network(evidence, networks[network_id], network_sizes[network_id], network_id) 
            results.append(result)

        # combine results into single dataframe
        hyp_act = pd.concat(results)
        hyp_act = hyp_act.reset_index()
        hyp_act['data'] = name
        return hyp_act

"""
****************************************
Methods for Mann Whitney analysis
****************************************
"""

def calculate_fpr_Mann_Whitney(random_kinase_activity_array, number_sig_trials):
    """
    Given an mxn array of kinase activities from m random experiments across n networks
    use bootstrapping to calculate an empirical p-value at which the false positive rate is controlled. 
    This function takes one of m random experiments and calculates the Mann Whitney U pvalue 
    then finds the pvalue at which the target_alpha is achieved
    Parameters
    ----------
    random_kinase_activity_array: np.array
        See calculate_MannWhitney_one_experiment_one_kinase for unwrapping all activities for a kinase and experiment
    number_sig_trials:
        Number of random trials to perform, where one random set is tested number_sig_trials against the full background
    
    Returns
    -------
    random_stats: np.array 
        A vector of Mann Whitney p-values that is m long, representing pvalues from m bootstrap tests
    
    """
    #calculate the significance by taking each experiment 
    [m, n] = random_kinase_activity_array.shape
    if number_sig_trials > m: 
        print("Warning, using %d, maximum number for significance"%(m))
        number_sig_trials = m
    random_stats = np.empty([number_sig_trials])
    for i in range(0, number_sig_trials):
        #take out one vector as real
        sample = random_kinase_activity_array[i,:]
        bgnd = np.delete(random_kinase_activity_array, i, 0) #remove the sample before testing
        [stat, random_stats[i]] = stats.mannwhitneyu(-np.log10(sample), -np.log10(bgnd.reshape(bgnd.size)), alternative='greater')
    return random_stats

def calculate_MannWhitney_one_experiment_one_kinase(kinact_activities, rand_activities, number_networks, kinase, experiment, number_sig_trials):
    """
    For a given kinact object, where random generation and activity has already been run, this will calculate
    the Mann-Whitney U test between the p-values across all networks for the given experiment name 
    and from the random networks. It will also calculate the significance value for the given test 
    based on the target_alpha value by using each random set as a real set to bootstrap. 
    
    Parameters
    ----------
    kinact: kinact object
        A kinact object (not a dictionary)
    kinase: str
        Kinase name to measure significance for
    experiment: str
        Experiment name to measure significance for

    Returns
    -------
    p-value: float
        p-value that results from Mann Whitney U test
    fpr_value: float
        the false positive rate where the p_value for the real experiment lies, given the random experiments
    """
    
    
    kinase_activity_list = kinact_activities[(kinact_activities[config.KSTAR_KINASE]==kinase) & (kinact_activities['data']==experiment)].kinase_activity.values
    
    random_kinase_activity_array = np.empty([number_sig_trials, number_networks])

    for i in range(0, number_sig_trials):
        # get the kinase activity values for all networks for a random set
        experiment_name = experiment+':'+str(i)
        random_kinase_activity_array[i,:]=rand_activities[(rand_activities[config.KSTAR_KINASE]==kinase) & (rand_activities['data']==experiment_name)].kinase_activity.values
       
    [stat, p_value] = stats.mannwhitneyu(-np.log10(kinase_activity_list), -np.log10(random_kinase_activity_array.reshape(random_kinase_activity_array.size)), alternative='greater')
    
    randomStats = calculate_fpr_Mann_Whitney(random_kinase_activity_array, number_sig_trials)
    
    #sig_value = calculate_fpr.single_experiment_kinase_fpr(randomStats, target_alpha)
    fpr_value = calculate_fpr.single_pvalue_fpr(randomStats, p_value)
    #sig_value = 1
    
    return p_value, fpr_value

"""
****************************************
Methods for running KSTAR pipeline
****************************************
"""
[docs]def enrichment_analysis(experiment, log, networks, phospho_types =['Y', 'ST'], data_columns = None, agg = 'mean', threshold = 1.0,  greater = True, PROCESSES = 1):
    """
    Function to establish a kstar KinaseActivity object from an experiment with an activity log
    add the networks, calculate, aggregate, and summarize the hypergeometric enrichment into a final activity object. Should be followed by 
    randomized_analyis, then Mann_Whitney_analysis.

    Parameters
    ----------
    experiment: pandas df
        experiment dataframe that has been mapped, includes KSTAR_SITE, KSTAR_ACCESSION, etc.
    log: logger object
        Log to write activity log error and update to
    networks: dictionary of dictionaries
        Outer dictionary keys are 'Y' and 'ST'.
        Establish a network by loading a pickle of desired networks. See the helpers and config file for this.
        If downloaded from FigShare, then the GLOBAL network pickles in config file can be loaded
        For example: networks['Y'] = pickle.load(open(config.NETWORK_Y_PICKLE, "rb" ))
    phospho_types: {['Y', 'ST'], ['Y'], ['ST']}
        Which substrate/kinaset-type to run activity for: Both ['Y, 'ST'] (default), Tyrosine ['Y'], or Serine/Threonine ['ST']
    data_columns : list
        columns that represent experimental result, if None, takes the columns that start with `data:'' in experiment. 
        Pass this value in as a list, if seeking to calculate on fewer than all available data columns
    agg : {'count', 'mean'}
        method to use when aggregating duplicate substrate-sites. 
        'count' combines multiple representations and adds if values are non-NaN
        'mean' uses the mean value of numerical data from multiple representations of the same peptide.
            NA values are droped from consideration.
    threshold : float
        threshold value used to filter rows
    greater: Boolean
        whether to keep sites that have a numerical value >=threshold (TRUE, default) or <=threshold (FALSE)
    
    Returns
    -------
    kinactDict: dictionary of Kinase Activity Objects
        Outer keys are phosphoTypes run 'Y' and 'ST'
        Includes the activities dictionary (see calculate_kinase_activities)
        aggregation of activities across networks (see aggregate activities)
        activity summary (see summarize_activities)

    """


    kinact_dict = {}

    # For each phosphoType of interest, establish a kinase activity object on a filtered dataset and run, aggregate, and summarize activity
    for phospho_type in phospho_types:
        #first check that networks for the phosphotypes were passed in
        if phospho_type not in networks:
            print("ERROR: Please pass networks as dictionary with phosphotype key")
        #filter the experiment (log how many are of that type)
        if phospho_type == 'ST':
            experiment_sub = experiment[(experiment.KSTAR_SITE.str.contains('S')) | (experiment.KSTAR_SITE.str.contains('T'))]
            log.info("Running Serine/Threonine Kinase Activity Analysis")
        elif phospho_type == 'Y':
            experiment_sub = experiment[(experiment.KSTAR_SITE.str.contains('Y'))]
            log.info("Running Tyrosine Kinase Activity Analysis")

        else:
            print("ERROR: Did not recognize phosphoType %s, which should only include 'Y' or 'ST' "%(phospho_type))
            return
        kinact = KinaseActivity(experiment_sub, log, data_columns = data_columns, phospho_type=phospho_type)
        kinact.add_networks_batch(networks[phospho_type])
        kinact.calculate_kinase_activities(agg=agg, threshold=threshold, greater=greater, PROCESSES = PROCESSES)
        kinact.aggregate_activities()
        kinact.activities = kinact.summarize_activities()
        kinact_dict[phospho_type] = kinact
    return kinact_dict

#def normalize_analysis(kinact_dict, log, num_random_experiments=150, target_alpha = 0.05, PROCESSES = 1):
#    """
#    Creates random experiments, drawn from the human phosphoproteome, according to the distribution of the number of compendia
#    that each data column in the experiment has for num_random_experiments. Kinase activity calculation is then run on every random experiment
#    and then assembled for each data column to calculate the pvalue at which a kinase hits the target_alpha value of false positives. 
#    Finally, it takes these empirically defined pvalue corrections and normalizes the kinase activities according to these, such that
#    the real experimental data also has the target_alpha value. 
#
#    Parameters
#    ----------
#    kinact_dict: KinaseActivities dictionary
#        Has keys ['Y'] and/or ['ST'] and values that are KinaseActivity objects. These objects are modified to add normalization
#    log: logger 
#        Logger for logging activity messages
#    num_random_experiments: int
#        Number of random experiments, for each data column, to create and run activity from
#    target_alpha: float 
#        Value between 0 and 1 that is the target false positive rate. 
#
#
#    """
#
#    if target_alpha > 1 or target_alpha < 0: 
#        raise ValueError('ERROR: target_alpha must be value between 0 and 1')
#
#    for phospho_type, kinact in kinact_dict.items():
#        kinact.run_normalization(log, num_random_experiments, target_alpha, PROCESSES = PROCESSES)
        
[docs]def randomized_analysis(kinact_dict, log, num_random_experiments=150, PROCESSES = 1):
    """
    Creates random experiments, drawn from the human phosphoproteome, according to the distribution of the number of compendia
    that each data column in the experiment has for num_random_experiments. Kinase activity calculation is then run on every random experiment. 

    Parameters
    ----------
    kinact_dict: KinaseActivities dictionary
        Has keys ['Y'] and/or ['ST'] and values that are KinaseActivity objects. These objects are modified to add normalization
    log: logger 
        Logger for logging activity messages
    num_random_experiments: int
        Number of random experiments, for each data column, to create and run activity from
    """

    for phospho_type, kinact in kinact_dict.items():
        kinact.calculate_random_activities(log, num_random_experiments, PROCESSES = PROCESSES)
        
[docs]def Mann_Whitney_analysis(kinact_dict, log, number_sig_trials = 100, PROCESSES = 1):
    """
    For a kinact_dict, where random generation and activity has already been run for the phospho_types of interest, 
    this will calculate the Mann-Whitney U test for comparing the array of p-values for real data 
    to those of random data, across the number of networks used.
    It will also calculate the false positive rate for a pvalue, given observations of a random bootstrapping analysis
        
    Parameters
    ----------
    kinact_dict: dictionary
        A dictionary of kinact objects, with keys 'Y' and/or 'ST'
    log: logger 
        Logger for logging activity messages
    number_sig_trials: int
        Maximum number of significant trials to run
    """

    for phospho_type, kinact in kinact_dict.items():
        kinact.calculate_Mann_Whitney_activities_sig(log, number_sig_trials = number_sig_trials, PROCESSES = PROCESSES)
        
    
        
    
[docs]def save_kstar(kinact_dict, name, odir, PICKLE=True):
        """
        Having performed kinase activities (run_kstar_analyis), save each of the important dataframes to files and the final pickle
        Saves an activities, aggregated_activities, summarized_activities tab-separated files
        Saves a pickle file of dictionary 
        
        Parameters
        ----------
        kinact_dict: dictionary of Kinase Activity Objects
            Outer keys are phosphoTypes run 'Y' and 'ST'
            Includes the activities dictionary (see calculate_kinase_activities)
            aggregation of activities across networks (see aggregate activities)
            activity summary (see summarize_activities)
        name: string 
            The name to use when saving activities
        odir:  string
            Outputdirectory to save files and pickle to
        PICKLE: boolean
            Whether to save the entire pickle file 

        Returns
        -------
        Nothing

        """

        if not os.path.exists(f"{odir}/RESULTS"): 
            os.mkdir(f"{odir}/RESULTS") 
        
        for phospho_type in kinact_dict:
            kinact = kinact_dict[phospho_type]
            name_out = f"{name}_{phospho_type}"
            kinact.activities_list.to_csv(f"{odir}/RESULTS/{name_out}_activities_list.tsv", sep = '\t')
            kinact.agg_activities.to_csv(f"{odir}/RESULTS/{name_out}_aggregated_activities.tsv", sep = '\t', index = False)
            kinact.activities.to_csv(f"{odir}/RESULTS/{name_out}_activities.tsv", sep = '\t', index = True)
            kinact.evidence_binary.to_csv(f"{odir}/RESULTS/{name_out}_binarized_experiment.tsv", sep='\t', index=False)
            
            if hasattr(kinact, 'random_kinact'):
                kinact.random_kinact.activities_list.to_csv(f"{odir}/RESULTS/{name_out}_random_activities_list.tsv", sep = '\t')
                kinact.random_kinact.agg_activities.to_csv(f"{odir}/RESULTS/{name_out}_random_aggregated_activities.tsv", sep = '\t', index = False)
                kinact.random_kinact.activities.to_csv(f"{odir}/RESULTS/{name_out}_random_activities.tsv", sep = '\t', index = True)
                kinact.random_experiments.to_csv(f"{odir}/RESULTS/{name_out}_random_experiments.tsv", sep = '\t', index = False)
                kinact.random_kinact.evidence.to_csv(f"{odir}/RESULTS/{name_out}_random_experiments.tsv", sep = '\t', index = False)
                
            #if kinact.normalized:

            #    kinact.normalized_activities_list.to_csv(f"{odir}/RESULTS/{name_out}_normalized_activities_list.tsv", sep = '\t', index = True)
            #    kinact.normalized_agg_activities.to_csv(f"{odir}/RESULTS/{name_out}_normalized_aggregated_activities.tsv", sep = '\t', index = True)
            #    kinact.activities_normalized.to_csv(f"{odir}/RESULTS/{name_out}_activities_normalized.tsv", sep = '\t', index = True)
            #    kinact.fpr_activity.to_csv(f"{odir}/RESULTS/{name_out}_activities_fpr.tsv", sep='\t', index=True)


            if hasattr(kinact, 'activities_mann_whitney'):
                kinact.activities_mann_whitney.to_csv(f"{odir}/RESULTS/{name_out}_mann_whitney_activities.tsv", sep='\t', index = True) 
                kinact.fpr_mann_whitney.to_csv(f"{odir}/RESULTS/{name_out}_mann_whitney_fpr.tsv", sep='\t', index = True) 

        if PICKLE:
            pickle.dump( kinact_dict, open( f"{odir}/RESULTS/{name}_kinact.p", "wb" ) )

[docs]def save_kstar_slim(kinact_dict, name, odir):
    """
    Having performed kinase activities (run_kstar_analyis), save each of the important dataframes, minimizing the memory storage needed to get back 
    to a rebuilt version for plotting results and analysis. For each phospho_type in the kinact_dict, this will save three .tsv files for every activities
    analysis run, two additional if random analysis was run, and two more if Mann Whitney based analysis was run. It also creates a readme file of the parameter values 
    used 
    
    Parameters
    ----------
    kinact_dict: dictionary of Kinase Activity Objects
        Outer keys are phosphoTypes run 'Y' and 'ST'
        Includes the activities dictionary (see calculate_kinase_activities)
        aggregation of activities across networks (see aggregate activities)
        activity summary (see summarize_activities)
    name: string 
        The name to use when saving activities
    odir:  string
        Outputdirectory to save files and pickle to

    Returns
    -------
    Nothing

    """

    if not os.path.exists(f"{odir}/RESULTS"): 
        os.mkdir(f"{odir}/RESULTS") 


    param_dict = {}
    
    for phospho_type in kinact_dict:

        kinact = kinact_dict[phospho_type]
        name_out = f"{name}_{phospho_type}"


        param_temp = {}
        param_temp['data_columns'] = kinact.data_columns
        param_temp['aggregate'] = kinact.aggregate
        param_temp['greater'] = kinact.greater
        param_temp['threshold'] = kinact.threshold
        param_temp['run_date']= kinact.run_date
        param_temp['mann_whitney'] = False
        param_temp['randomized'] = False
        param_temp['num_networks'] = kinact.num_networks
        param_temp['network_directory'] = kinact.network_directory


        kinact.activities_list.to_csv(f"{odir}/RESULTS/{name_out}_activities_list.tsv", sep = '\t')
        kinact.activities.to_csv(f"{odir}/RESULTS/{name_out}_activities.tsv", sep = '\t', index = True)
        kinact.evidence_binary.to_csv(f"{odir}/RESULTS/{name_out}_binarized_experiment.tsv", sep='\t', index=False)

        #if kinact.normalized:
        #    param_temp['normalized'] = True
        #    kinact.activities_normalized.to_csv(f"{odir}/RESULTS/{name_out}_activities_normalized.tsv", sep = '\t', index = True)
        #    kinact.fpr_activity.to_csv(f"{odir}/RESULTS/{name_out}_activities_fpr.tsv", sep='\t', index=True)
            
        if hasattr(kinact, 'random_kinact'):
            param_temp['randomized'] = True
            param_temp['num_random_experiments'] = kinact.num_random_experiments
            kinact.random_kinact.activities_list.to_csv(f"{odir}/RESULTS/{name_out}_random_activities_list.tsv", sep = '\t')
            kinact.random_kinact.evidence.to_csv(f"{odir}/RESULTS/{name_out}_random_experiments.tsv", sep = '\t', index = False)

        if hasattr(kinact, 'activities_mann_whitney'):
            param_temp['mann_whitney'] = True
            kinact.activities_mann_whitney.to_csv(f"{odir}/RESULTS/{name_out}_mann_whitney_activities.tsv", sep='\t', index = True) 
            kinact.fpr_mann_whitney.to_csv(f"{odir}/RESULTS/{name_out}_mann_whitney_fpr.tsv", sep='\t', index = True) 

        param_dict[phospho_type] = param_temp

    #save the parameters in a pickle file for reinstantiating object information
    pickle.dump(param_dict, open(f"{odir}/RESULTS/{name}_params.p", "wb"))


[docs]def from_kstar_slim(name, odir, log):
    """
    Given the name and output directory of a saved kstar analyis, load the parameters and minimum dataframes needed for reinstantiating a kinact object
    This minimum list will allow you to repeat normalization or mann whitney at a different false positive rate threshold and plot results.

    Parameters
    ----------
    name: string 
        The name to used when saving activities and mapped data
    odir:  string
        Output directory of saved files and parameter pickle
    log: logger 
        Logger for logging activity messages
    """

    #First check for the param file
    try:
        param_dict = pickle.load( open(f"{odir}/RESULTS/{name}_params.p", "rb"))
    except:
        print(f"ERROR: Cannot find parameter dictionary file in RESULTS: {odir}/RESULTS/{name}_params.p")
        log.info(f"ERROR: Cannot find parameter dictionary file in RESULTS: {odir}/RESULTS/{name}_params.p")
        return
    kinact_dict = {}
    for phospho_type in param_dict.keys():
        params = param_dict[phospho_type]
        name_out = f"{name}_{phospho_type}"
        #instantiate an object and update values according to 

        #check that the minimum file set exists so we can use binary_evidence file as the experiment
        evidence_binary = pd.read_csv(f"{odir}/RESULTS/{name_out}_binarized_experiment.tsv", sep='\t')


        kinact = KinaseActivity(evidence_binary, log, phospho_type=phospho_type)

        kinact.activities_list = pd.read_csv(f"{odir}/RESULTS/{name_out}_activities_list.tsv", sep = '\t', index_col = 0)
        kinact.activities = pd.read_csv(f"{odir}/RESULTS/{name_out}_activities.tsv", sep = '\t', index_col = config.KSTAR_KINASE)
        kinact.evidence_binary = evidence_binary

        if params['mann_whitney']:
            #read mann_whitney and load
            kinact.activities_mann_whitney = pd.read_csv(f"{odir}/RESULTS/{name_out}_mann_whitney_activities.tsv", sep='\t', index_col = config.KSTAR_KINASE) 
            kinact.fpr_mann_whitney = pd.read_csv(f"{odir}/RESULTS/{name_out}_mann_whitney_fpr.tsv", sep='\t', index_col = config.KSTAR_KINASE) 
            params.pop('mann_whitney', None)
            
        #if params['normalized']:
        #    kinact.activities_normalized= pd.read_csv(f"{odir}/RESULTS/{name_out}_activities_normalized.tsv", sep = '\t', index_col = config.KSTAR_KINASE)
        #    kinact.fpr_activity = pd.read_csv(f"{odir}/RESULTS/{name_out}_activities_fpr.tsv", sep='\t', index_col=config.KSTAR_KINASE)
        
        if params['randomized']:
            rand_experiments = pd.read_csv(f"{odir}/RESULTS/{name_out}_random_experiments.tsv", sep = '\t')

            kinact.random_kinact = KinaseActivity(rand_experiments, log, phospho_type=phospho_type)
            kinact.random_kinact.activities_list = pd.read_csv(f"{odir}/RESULTS/{name_out}_random_activities_list.tsv", sep = '\t')
            kinact.random_kinact.evidence = pd.read_csv(f"{odir}/RESULTS/{name_out}_random_experiments.tsv", sep = '\t')
            #set data columns according to file
            kinact.random_kinact.set_data_columns(data_columns=None)


        for param_name in params:
            setattr(kinact, param_name, params[param_name])
        kinact_dict[phospho_type] = kinact
    return kinact_dict
    
[docs]def from_kstar_nextflow(name, odir, log = None):
    """
    Given the name and output directory of a saved kstar analyis from the nextflow pipeline, load the results into new kinact object with
    the minimum dataframes required for analysis (binary experiment, hypergeometric activities, normalized activities, mann whitney activities)

    Parameters
    ----------
    name: string 
        The name to used when saving activities and mapped data
    odir:  string
        Output directory of saved files
    log: logger
        logger used when loading nextflow data into kinase activity object. If not provided, new logger will be created.
    """
    #create new logger if not provided
    if log is None:
        log = helpers.get_logger(f"activity_{name}", f"{odir}/activity_{name}.log")
    
    kinact_dict = {}
    for phospho_type in ['Y','ST']:
        #check to see if folder for phosphotype is present (have Y or ST activities been calculated)
        if os.path.isdir(f'{odir}/{phospho_type}'):
            #check that the minimum file set exists so we can use binary_evidence file as the experiment
            evidence_binary = pd.read_csv(f"{odir}/{phospho_type}/binary_experiment/{name}_binarized_experiment.tsv", sep='\t')


            kinact = KinaseActivity(evidence_binary, log, phospho_type=phospho_type)

            kinact.activities_list = pd.read_csv(f"{odir}/{phospho_type}/hypergeometric_activity/{name}_activities_list.tsv", sep = '\t', index_col = 0)
            kinact.activities = pd.read_csv(f"{odir}/{phospho_type}/hypergeometric_activity/{name}_activities.tsv", sep = '\t', index_col = config.KSTAR_KINASE)
            kinact.evidence_binary = evidence_binary

            #read mann_whitney and load
            kinact.activities_mann_whitney = pd.read_csv(f"{odir}/{phospho_type}/mann_whitney/{name}_mann_whitney_activities.tsv", sep='\t', index_col = config.KSTAR_KINASE) 
            kinact.fpr_mann_whitney = pd.read_csv(f"{odir}/{phospho_type}/mann_whitney/{name}_mann_whitney_fpr.tsv", sep='\t', index_col = config.KSTAR_KINASE) 
            #rand_experiments = pd.read_csv(f"{odir}/{phospho_type}/{name_out}_random_experiments.tsv", sep = '\t')
            #kinact.random_kinact = KinaseActivity(rand_experiments, log, phospho_type=phospho_type)
            #kinact.random_kinact.activities_list = pd.read_csv(f"{odir}/{phospho_type}/{name}_random_activities_list.tsv", sep = '\t', index_col=0)
            #kinact.random_kinact.evidence = pd.read_csv(f"{odir}/{phospho_type}/{name_out}_random_experiments.tsv", sep = '\t')
            #set data columns according to file
            #kinact.random_kinact.set_data_columns(data_columns=None)

            kinact_dict[phospho_type] = kinact
            
    
    #check to see if any files were loaded
    if kinact_dict.keys() == 0:
        print('KSTAR results were not found for either Y or ST')
        
    return kinact_dict