Source code for kstar.helpers

from Bio import SeqIO
import logging
import argparse
import urllib.parse
import urllib.request

def process_fasta_file(fasta_file):
    """
    For configuration, convert the global fasta sequence file into a
    sequence dictionary that can be used in mapping.

    Parameters
    ----------
    fasta_file : str
        file location of fasta file

    Returns
    -------
    sequences : dict
        {acc : sequence} dictionary generated from fasta file

    Raises
    ------
    IndexError
        if a record identifier contains no '|' separator, since the
        accession is taken from the second '|'-delimited field
    """
    sequences = {}
    # Open the file in a context manager so the handle is closed even if
    # parsing fails part-way through.
    with open(fasta_file) as handle:
        for entry in SeqIO.parse(handle, 'fasta'):
            # NOTE(review): the left-hand side of this split was lost in the
            # listing under review; `entry.id` is the presumed original
            # (e.g. an identifier shaped like `sp|P12345|NAME`) -- confirm.
            acc = entry.id.split('|')[1].strip()
            sequences[acc] = str(entry.seq)
    return sequences
def get_logger(name, filename):
    """
    Return the logger called `name`, making sure it writes to `filename`.

    The logger level is set to DEBUG. A file handler for `filename` is
    attached only if the logger does not already have one for that same
    file, so calling this function repeatedly with the same arguments does
    not produce duplicated log lines or leak open file handles.

    Parameters
    ----------
    name : str
        log name
    filename : str
        location to store log file

    Returns
    -------
    logger : logging.Logger
        logger configured to write to `filename`
    """
    import os  # function-scope import: `os` is not imported at module level

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # FileHandler stores its target as an absolute path in `baseFilename`,
    # so compare against the absolute form of the requested file.
    target = os.path.abspath(filename)
    for existing in logger.handlers:
        if isinstance(existing, logging.FileHandler) and existing.baseFilename == target:
            return logger

    handler = logging.FileHandler(filename)
    log_format = logging.Formatter(
        '%(asctime)s\t%(name)s\t%(levelname)s:\t%(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
    )
    handler.setFormatter(log_format)
    logger.addHandler(handler)
    return logger
def string_to_boolean(string):
    """
    Converts string to boolean

    Matching is case-insensitive. A value that is already a bool is
    returned unchanged.

    Parameters
    ----------
    string : str or bool
        input string; one of 'yes', 'true', 't', 'y', '1' for True or
        'no', 'false', 'f', 'n', '0' for False

    Returns
    -------
    result : bool
        output boolean

    Raises
    ------
    argparse.ArgumentTypeError
        if the string is not one of the recognised spellings
    """
    if isinstance(string, bool):
        # Pass an existing boolean through (previously this returned the
        # builtin `str` type object by mistake).
        return string

    lowered = string.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
def convert_acc_to_uniprot(df, acc_col_name, acc_col_type, acc_uni_name):
    """
    Given an experimental dataframe (df) with an accession column
    (acc_col_name) that is not uniprot, use uniprot to append an accession
    column of uniprot IDs.

    The dataframe is modified in place (the new column is assigned on `df`)
    and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with at least a column of accession of interest
    acc_col_name : string
        name of column to convert FROM
    acc_col_type : string
        Uniprot string designation of the accession type to convert FROM
        (presumably one of the abbreviations listed in UniProt's ID-mapping
        documentation -- confirm against the service in use)
    acc_uni_name : string
        name of new column

    Returns
    -------
    appended_df : pandas.DataFrame
        Input dataframe with an appended acc column of uniprot IDs;
        accessions with no mapping in the response get 'NotFound'
    """
    # Query every distinct accession once, as a whitespace-separated list.
    unique_accs = set(df[acc_col_name].values)
    query_string = ' '.join(str(acc) for acc in unique_accs)

    # NOTE(review): the URL literal was empty in the listing under review;
    # this is the presumed original legacy mapping endpoint. UniProt appears
    # to have retired it in favour of the REST ID-mapping service, which uses
    # different 'from'/'to' designations -- confirm before relying on it.
    url = 'https://www.uniprot.org/uploadlists/'
    params = {
        'from': acc_col_type,
        'to': 'ACC',
        'format': 'tab',
        'query': query_string,
    }
    data = urllib.parse.urlencode(params).encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()

    # Each response line is "<from accession>\t<uniprot accession>".
    ref_to_uni = {}
    for line in response.decode('utf-8').split('\n'):
        fields = line.split('\t')
        if len(fields) < 2:
            # Blank or malformed line: nothing to map.
            continue
        from_val = fields[0]
        if acc_col_type == 'P_GI':
            # Re-attach the 'gi|' prefix so keys match the dataframe values.
            from_val = 'gi|' + from_val
        ref_to_uni[from_val] = fields[1]

    # Look up every row's accession, marking unmapped ones explicitly.
    df[acc_uni_name] = [
        ref_to_uni.get(acc, 'NotFound') for acc in df[acc_col_name]
    ]
    return df