# Source code for kstar.helpers

from Bio import SeqIO
import logging
import argparse
import urllib.parse
import urllib.request

def process_fasta_file(fasta_file):
	"""
	For configuration, to convert the global fasta sequence file into a sequence dictionary that can be used in mapping

	The accession is taken as the second '|'-delimited field of each record id
	(e.g. ``sp|P12345|NAME_HUMAN`` gives ``P12345``). If two records share an
	accession, the later record overwrites the earlier one.

	Parameters
	---------
	fasta_file : str
		file location of fasta file
	
	Returns
	---------
	sequences : dict
		{acc : sequence} dictionary generated from fasta file

	Raises
	---------
	IndexError
		if a record id contains no '|' separator
	"""
	# SeqIO.parse is lazy, so the records must be consumed while the file is
	# still open; the with-block guarantees the handle is closed afterwards.
	with open(fasta_file) as handle:
		return {
			entry.id.split('|')[1].strip(): str(entry.seq)
			for entry in SeqIO.parse(handle, 'fasta')
		}


def get_logger(name, filename):
	"""
	Finds and returns the logger called `name`, making sure it writes to `filename`.

	The logger level is set to DEBUG. A file handler for `filename` is attached
	only if the logger does not already have one for that same file, so calling
	this function repeatedly with the same arguments does not duplicate log lines.

	Parameters
	----------
	name : str
		log name
	filename : str
		location to store log file

	Returns
	----------
	logger : logging.Logger
		logger with a file handler writing to `filename`
	"""
	# Local import: os is not among the imports at the top of this module.
	import os

	logger = logging.getLogger(name)
	logger.setLevel(logging.DEBUG)

	# FileHandler stores its target as an absolute path in baseFilename, so
	# compare against the absolute form of the requested file.
	log_path = os.path.abspath(filename)
	for existing in logger.handlers:
		if isinstance(existing, logging.FileHandler) and existing.baseFilename == log_path:
			return logger

	handler = logging.FileHandler(filename)
	log_format = logging.Formatter('%(asctime)s\t%(name)s\t%(levelname)s:\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
	handler.setFormatter(log_format)
	logger.addHandler(handler)
	return logger

def string_to_boolean(string):
	"""
	Converts string to boolean

	Matching is case-insensitive. Values that are already booleans are
	returned unchanged.

	Parameters
	----------
	string : str or bool
		input string; one of 'yes', 'true', 't', 'y', '1' for True or
		'no', 'false', 'f', 'n', '0' for False

	Returns
	----------
	result : bool
		output boolean

	Raises
	----------
	argparse.ArgumentTypeError
		if the string is not one of the recognised values
	"""
	if isinstance(string, bool):
		# Pass booleans straight through (previously returned the `str` type).
		return string
	lowered = string.lower()
	if lowered in ('yes', 'true', 't', 'y', '1'):
		return True
	if lowered in ('no', 'false', 'f', 'n', '0'):
		return False
	raise argparse.ArgumentTypeError('Boolean value expected.')

def convert_acc_to_uniprot(df, acc_col_name, acc_col_type, acc_uni_name):
	"""
	Given an experimental dataframe (df) with an accession column (acc_col_name) that is not uniprot, use uniprot 
	to append an accession column of uniprot IDS

	The unique, non-missing accessions are sent to the UniProt mapping service in a
	single POST request. If there are no accessions to map, no request is made.
	When the service returns several mappings for one accession, the last one
	in the response is kept.

	Parameters
	----------
	df: pandas.DataFrame
		Dataframe with at least a column of accession of interest. The new column is added to this object in place.
	acc_col_name: string
		name of column to convert FROM
	acc_col_type: string
		Uniprot string designation of the accession type to convert FROM, see https://www.uniprot.org/help/api_idmapping
	acc_uni_name: string
		name of new column

	Returns
	-------
	appended_df: pandas.DataFrame
		Input dataframe with an appended acc column of uniprot IDs; rows whose accession
		could not be mapped get the value 'NotFound'

	"""
	# Unique identifiers only; missing values are dropped so that they are not
	# sent to the service as the literal text "nan".
	acc_vals = df[acc_col_name].dropna().unique()

	ref_to_uni = {}
	if len(acc_vals) > 0:
		# NOTE(review): this is UniProt's legacy mapping endpoint; UniProt now
		# documents a REST ID-mapping API with different from/to database
		# names. Confirm this URL still responds before relying on it.
		url = 'https://www.uniprot.org/uploadlists/'
		params = {
			'from': acc_col_type,
			'to': 'ACC',
			'format': 'tab',
			'query': ' '.join(str(acc) for acc in acc_vals),
		}

		data = urllib.parse.urlencode(params).encode('utf-8')
		req = urllib.request.Request(url, data)
		with urllib.request.urlopen(req) as f:
			response = f.read().decode('utf-8')

		for line in response.splitlines():
			fields = line.split('\t')
			if len(fields) < 2:
				# blank or malformed line: nothing to map
				continue
			from_val = fields[0]
			if acc_col_type == 'P_GI':
				# Re-add the 'gi|' prefix to the returned id; presumably the
				# dataframe stores GI accessions as 'gi|<number>' - verify against caller.
				from_val = 'gi|' + str(from_val)
			ref_to_uni[from_val] = fields[1]

	# Look up every row's accession, falling back to 'NotFound'.
	df[acc_uni_name] = [ref_to_uni.get(acc, 'NotFound') for acc in df[acc_col_name]]
	return df