Analysis of CPTAC Breast Cancer Patient Samples (Mertens 2016)

[1]:
#Supporting packages for analysis
import numpy as np
import pandas as pd


#KSTAR imports
from kstar import config, helpers, calculate
from kstar.plot import DotPlot


#Set matplotlib defaults for arial 12 point font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
rcParams['font.size'] = 12
import matplotlib.pyplot as plt


#where supplementary data was downloaded to (From https://figshare.com/articles/dataset/KSTAR_Supplementary_Data/14919726)
SUPPLEMENTS_DIR = './'

#Directory where KSTAR Supplemental data was set
odir = SUPPLEMENTS_DIR+'Supplements/SupplementaryData/BreastCancer/CPTAC_Mertins2016/'

#load the Mann Whitney activities and FPR for Tyrosine predictions,
#it will be faster and less data than loading all KSTAR outputs
activities = pd.read_csv(odir+'RESULTS/TCGA_Y_mann_whitney_activities.tsv', sep='\t', index_col=0)
fpr = pd.read_csv(odir+'RESULTS/TCGA_Y_mann_whitney_fpr.tsv', sep='\t', index_col=0)

s1 = pd.read_csv(odir+'brca_clinical_data_her2Determination_oneHeader.csv')




#load kinase map from supplementary data
KINASE_MAP =  pd.read_csv(SUPPLEMENTS_DIR+'SupplementaryData/Map/globalKinaseMap.csv', index_col = 0)
#set preferred kinase names from the kinase map (make a kinase_dict)
kinase_dict = {}
for kinase in activities.index:
    kinase_dict[kinase] = KINASE_MAP.loc[kinase,'Preferred Name']

Plot using agglomerative clustering on both kinases and patients, and add context to each sample

[2]:

#Setup a label dictionary that makes it easier to see patient sample names mapDict = {} cols = activities.columns arr_Ids = [] for index, row in s1.iterrows(): id_val = row['Complete TCGA ID'] #remove the starting TCGA- or CPTAC- and find the match in the column header of data and map it patient = id_val.strip('TCGA-') patient = patient.strip('CPTAC-') mapDict[id_val] = '' #default if it's not found for col in cols: if patient in col: mapDict[id_val] = col break arr_Ids.append(mapDict[id_val]) s1['ID'] = arr_Ids # Set the column labels to just have the labels x_label_dict = {} for label in cols: l = label.strip('data:patient:') l_arr = l.split('.') x_label_dict[label] = l_arr[0]
[3]:
results = activities
results = -np.log10(results)


#Setup a figure with a context strip at the top for HER2 status and activity dots on the below axis
fig, axes = plt.subplots(figsize = (20, 20),
        nrows = 3, ncols = 2,
        sharex = 'col',
        sharey = 'row',
        gridspec_kw = {
            'height_ratios':[.1,0.05, 1],
            'width_ratios':[0.1,1]
        },)
fig.subplots_adjust(wspace=0, hspace=0)

dots = DotPlot(results,
               fpr,
               figsize = (20,18),
               dotsize = 5,
               legend_title='-log10(p-value)',
               x_label_dict=x_label_dict,
               kinase_dict = kinase_dict)
#dots.drop_kinases_with_no_significance()

#Cluster changes the sorting of the values array, so be sure to plot context last so that it is in the same sort.
dots.cluster(orientation = 'left', ax = axes[2,0], method='ward')
dots.cluster(orientation = 'top', ax = axes[0,1], method='ward')
#dots.context(ax=axes[1,1],info = s1, id_column = 'Sample', context_columns = ['Human Tumor Her2'], orientation = 'top', dotsize =200, markersize= 15 )
#dots.context(ax=axes[1,1],info = s1, id_column = 'ID', context_columns = ['AJCC Stage', 'HER2 Final Status', 'PAM50 mRNA'], orientation = 'top', dotsize =40, markersize= 20 )
dots.context(ax=axes[1,1],info = s1, id_column = 'ID', context_columns = ['HER2 Final Status', 'PAM50 mRNA', 'RPPA Clusters'], orientation = 'top', dotsize =40, markersize= 20 )

dots.dotplot(ax = axes[2,1])

plt.savefig(odir+'CPTAC_all.pdf', bbox_inches='tight')
../_images/Examples_CPTAC_Analysis_4_0.png

Plot ERBB2 activities with each samples associated HER2 status

[4]:
s1['HER2 Status'] = s1['HER2 Final Status']
[5]:
#Plot just the ERBB2 kinase activity and ERBB2 status

kinases = list(activities.index)
kinases.remove('ERBB2')
results = activities
results = results.sort_values('ERBB2', axis=1)

results = -np.log10(results)


#Setup a figure with a context strip at the top for HER2 status and activity dots on the below axis
fig, axes = plt.subplots(figsize = (9,0.75),
        nrows = 2,
        sharex = 'col',
        sharey = 'row',
        gridspec_kw = {
            'height_ratios':[.5,.5]
        },)
fig.subplots_adjust(wspace=0, hspace=0)

dots = dotplot.DotPlot(results,
                       fpr,
                       figsize = (9,0.75),
                       dotsize = 3,
                       legend_title='-log10(p-value)',
                       x_label_dict=x_label_dict)

#drop all but ERBB2
dots.drop_kinases(kinases)


dots.context(ax=axes[0],info = s1, id_column = 'ID', context_columns = ['HER2 Status'], orientation = 'top', dotsize =10, markersize= 10 )

dots.dotplot(ax = axes[1])

[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x1214e8150>
../_images/Examples_CPTAC_Analysis_7_1.png

Measure the number of cross-predictions between HER2-status and HER2/ERBB2 activity

[9]:
def returnPredictionRates(kinasePred, s1):
    """
    Given a binary prediction of kinase status, what is the prediction rate
    """
    ERBB2_pred_dict = {}
    HER2_status_dict = {}
    for patient_ID in kinasePred.keys():
        if patient_ID in s1.ID.values:
            ERBB2_pred_dict[patient_ID] = kinasePred[patient_ID]
            status = s1[s1['ID']==patient_ID]['HER2 Final Status'].values[0]
            if status == 'Positive':
                HER2_status_dict[patient_ID] = True
            elif status == 'Negative':
                HER2_status_dict[patient_ID] = False
            else:
                HER2_status_dict[patient_ID] = 'Equivocal'
    comp_df = pd.DataFrame.from_dict(ERBB2_pred_dict, orient='index', columns =['HER2 Prediction'])
    temp = pd.DataFrame.from_dict(HER2_status_dict, orient='index', columns=['HER2 Status'])
    comp_df = comp_df.join(temp)
    trueNeg = 0
    truePos = 0
    falsePos = 0
    falseNeg = 0
    for index, row in comp_df.iterrows():
            if not row['HER2 Status']: #negative for HER2 at biopsy
                if not row['HER2 Prediction']:
                    trueNeg +=1
                else:
                    falsePos += 1
            else: #positive for HER2
                if not row['HER2 Prediction']:
                    falseNeg += 1
                else:
                    truePos +=1
    TPR = truePos/(truePos + falsePos)
    FPR = falsePos/(truePos+falsePos)
    return comp_df, [truePos, falsePos, trueNeg, falseNeg]
[10]:

#generate the results, at a specific FPR, what % do we predict. comp_df = {} arrayDict ={} alpha = 0.05 significance = fpr <= alpha ERBB2_prediction = significance.loc['ERBB2'] comp_df, arrayVals = returnPredictionRates(ERBB2_prediction, s1) [truePos, falsePos, trueNeg, falseNeg] = arrayVals
[11]:
comp_df #the dataframe that combines the HER2 activity prediction and corresponding status
[11]:
HER2 Prediction HER2 Status
data:patient:A2-A0CM.07TCGA False False
data:patient:A2-A0D0.06TCGA False False
data:patient:A2-A0D1.07TCGA True True
data:patient:A2-A0D2.31TCGA False False
data:patient:A2-A0EQ.08TCGA False True
... ... ...
data:patient:E2-A10A.02TCGA False False
data:patient:E2-A150.27TCGA False False
data:patient:E2-A154.03TCGA False False
data:patient:E2-A158.29TCGA True False
data:patient:E2-A159.24TCGA False False

97 rows × 2 columns

[14]:
# HER2-negative patients that are predicted to have activity
HER2neg = comp_df[comp_df['HER2 Status']==False]
HER2neg
print('Number of HER2 Negative patients=%d, Number Considered HER2 active=%d'%(len(HER2neg), HER2neg['HER2 Prediction'].sum()))
Number of HER2 Negative patients=73, Number Considered HER2 active=12
[15]:
# HER2-negative patients that are predicted to have activity
HER2pos = comp_df[comp_df['HER2 Status']==True]
print('Number of HER2 Positive patients=%d, Number Considered HER2 active=%d'%(len(HER2pos), HER2pos['HER2 Prediction'].sum()))
Number of HER2 Positive patients=23, Number Considered HER2 active=10
[16]:
print("Check: there are %d patients and %d status"%(len(comp_df), truePos+trueNeg+falsePos+falseNeg))
Check: there are 97 patients and 97 status