Source code for ptm_pose.splicing_tools.MAJIQ
from ptm_pose import helpers, pose_config, project
from ptm_pose.splicing_tools.base import GenericDataset
import pandas as pd
[docs]
class MAJIQ_Dataset(GenericDataset):
    """
    Given splice quantification from the MAJIQ algorithm, annotate with PTMs that are found in the differentially included regions.

    Parameters
    ----------
    voila_tsv_file: str
        Path to the MAJIQ output TSV file, which contains the splice quantification data.
    samp1_name: str
        Name of the first sample in the MAJIQ output. This will be used to identify the sample in MAJIQ output (column ``{samp1_name}_mean_psi``) and label the columns in the output dataframe.
    samp2_name: str
        Name of the second sample in the MAJIQ output. This will be used to identify the sample in MAJIQ output (column ``{samp2_name}_mean_psi``) and label the columns in the output dataframe.
    min_dpsi: float
        Delta PSI cutoff for filtering out non-changing lsvs. Forwarded to GenericDataset. Default is 0.
    alpha: float
        Significance threshold for filtering out non-changing lsvs. Forwarded to GenericDataset, with 'probability_non_changing' given as the significance column. Default is 0.05.
    coordinate_type: str
        indicates the coordinate system used for the start and end positions. Either hg38 or hg19. Default is 'hg38'.

    Attributes
    ----------
    splice_data: pandas.DataFrame
        processed DataFrame containing the MAJIQ splice quantification data with each row representing a unique junction of an lsv
    samp1_name: str
        Name of the first sample, as passed to the constructor.
    samp2_name: str
        Name of the second sample, as passed to the constructor.

    Raises
    ------
    KeyError
        If any of the per-junction columns expected in the MAJIQ output (including the two sample-specific ``*_mean_psi`` columns) is missing from the file.
    """

    def __init__(self, voila_tsv_file, samp1_name, samp2_name, min_dpsi = 0, alpha = 0.05, coordinate_type = 'hg38'):
        # header = 10: pandas takes the column names from row index 10 and ignores the rows before it
        # (presumably the voila metadata preamble -- TODO confirm against the MAJIQ version in use)
        majiq = pd.read_csv(voila_tsv_file, sep = '\t', header = 10)

        # columns that hold one ';'-separated value per junction of the lsv
        lsv_cols = ['mean_dpsi_per_lsv_junction', 'probability_changing', 'probability_non_changing', f'{samp1_name}_mean_psi', f'{samp2_name}_mean_psi', 'de_novo_junctions', 'junctions_coords']

        # fail early with an actionable message (same exception type as the plain column lookup would raise)
        missing_cols = [col for col in lsv_cols if col not in majiq.columns]
        if missing_cols:
            raise KeyError(f"Expected column(s) {missing_cols} not found in '{voila_tsv_file}'. Check that samp1_name and samp2_name match the sample names used in the MAJIQ output.")

        #separate lsvs with multiple junctions into unique rows
        for col in lsv_cols:
            majiq[col] = majiq[col].str.split(';')
        # all columns are exploded together so the values of each junction stay on the same row
        majiq = majiq.explode(column = lsv_cols)

        #convert to numeric (junction coordinates and de novo flags are left as strings)
        for col in lsv_cols:
            if col not in ('junctions_coords', 'de_novo_junctions'):
                majiq[col] = pd.to_numeric(majiq[col])

        #extract start and stop of junctions from junctions_coords ('start-end'), splitting only once
        junction_bounds = majiq['junctions_coords'].str.split('-')
        majiq['junction_start'] = junction_bounds.str[0].astype(int)
        majiq['junction_end'] = junction_bounds.str[1].astype(int)

        super().__init__(splice_data=majiq, min_dpsi=min_dpsi, alpha=alpha, coordinate_type=coordinate_type, chromosome_col = 'seqid', strand_col = 'strand', region_start_col = 'junction_start', region_end_col = 'junction_end', dpsi_col = 'mean_dpsi_per_lsv_junction', sig_col = 'probability_non_changing', event_id_col = 'lsv_id', start_coordinate_system = '1-based')
        self.samp1_name = samp1_name
        self.samp2_name = samp2_name

    def run_pose(self, extra_cols = None, PROCESSES = 1, **kwargs):
        """
        Run PTM projection on the MAJIQ junctions by delegating to ``project_ptms_generic``.

        Parameters
        ----------
        extra_cols: list or None
            Forwarded unchanged to ``project_ptms_generic``. Default is None.
        PROCESSES: int
            Forwarded unchanged to ``project_ptms_generic``. Default is 1.
        **kwargs
            Any additional keyword arguments, forwarded unchanged to ``project_ptms_generic``.
        """
        self.project_ptms_generic(extra_cols = extra_cols, PROCESSES = PROCESSES, **kwargs)

    def run_nease(self):
        """
        Run NEASE on the MAJIQ data by delegating to ``run_nease_generic``.
        """
        self.run_nease_generic()