Source code for dansy.helper

import os
from dansy.config import DANSY_DATA_DIR, DANSY_PROTEOME_VERSION
import dansy.ngramUtilities as ngramUtilities

def import_proteome_files(ref_file_dir=DANSY_DATA_DIR, ref_file_suffix=DANSY_PROTEOME_VERSION):
    '''
    Imports the files that are used for the generation of the reference
    dataframe of the complete canonical proteome.

    Every entry of ``ref_file_dir`` whose name ends with ``ref_file_suffix``
    is collected and the resulting list of paths is handed to
    ``ngramUtilities.import_reference_file``.

    Note: Need to adjust this so it looks in only one folder from here on out.

    Parameters:
    -----------
        - ref_file_dir: str
            Directory that is searched (non-recursively) for reference files.
            Defaults to DANSY_DATA_DIR.
        - ref_file_suffix: str
            String of the suffix of the reference files to be used.
            Defaults to DANSY_PROTEOME_VERSION.

    Returns:
    --------
        - ref_df: pandas DataFrame
            Whatever ``ngramUtilities.import_reference_file`` returns for the
            matching files; per the original documentation, a dataframe
            containing the InterPro, UniProt, and PDB information of
            individual proteins as retrieved via CoDIAC.

    Raises:
    -------
        - FileNotFoundError
            If no entry in ``ref_file_dir`` ends with ``ref_file_suffix``
            (``os.listdir`` also raises it when the directory is missing).
    '''
    # os.listdir returns entries in arbitrary order; sort the names so the
    # list of files handed to the importer is the same on every run/platform.
    all_refs = [
        os.path.join(ref_file_dir, file_name)
        for file_name in sorted(os.listdir(ref_file_dir))
        if file_name.endswith(ref_file_suffix)
    ]

    if not all_refs:
        raise FileNotFoundError(f'No reference file with the suffix {ref_file_suffix} was found')

    return ngramUtilities.import_reference_file(all_refs)