Skip to content
Snippets Groups Projects
Commit 7d14cc67 authored by sauves's avatar sauves
Browse files

finished compute_tpm

parent 7329dc65
No related branches found
No related tags found
No related merge requests found
......@@ -3,21 +3,44 @@ import pandas as pd
import os
import argparse
import pdb
import datetime
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-GE', dest = 'GE_FILE', default = None, type = str, help = 'name of Gene Expression matrix file in RAW counts')
parser.add_argument('-GL', dest = 'GL_FILE', default = None, type = str, help = 'name of Gene LIST file (genes must appear in same order than in the GE matrix file.)')
parser.add_argument('-n', dest = 'NORMALISATION_PRTCL', default = 'TPM', type = str, help = 'type of normalisation used')
parser.add_argument('-conrsion_file', dest = 'CONVERSION_FILE', default = '~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt', help= 'name of file used to retrieve the following gene info: (1) "ensembl ID" to "gene symbol" or gene_name (2) transcript type ie. "protein_coding" "pseudogene" etc. (3) transcript length including all UTRs' )
parser.add_argument('-n', dest = 'NORMALIZATION_PRTCL', default = 'TPM', type = str, help = 'type of normalisation used')
parser.add_argument('-conversion_file', dest = 'CONVERSION_FILE', default = '~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt', help= 'name of file used to retrieve the following gene info: (1) "ensembl ID" to "gene symbol" or gene_name (2) transcript type ie. "protein_coding" "pseudogene" etc. (3) transcript length including all UTRs' )
parser.add_argument('-transcript_type', dest = 'TPT_TYPE', default = ['protein_coding'], type = str, nargs = '+', help= 'transcript types used for gene prefiltering')
parser.add_argument('-o', dest = 'OUTPUT_FILE', default = 'GE_FILTERED_log10[TPMx1024_plus_one]_{}.txt'.format(datetime.datetime.now().isoformat()[:-10]),type = str, help = 'name of output file')
parser.add_argument('-v', dest = 'VERBOSE',type = int, help = 'level of verbosity' , default = 0)
args = parser.parse_args()
if args.VERBOSE > 0: print('Starting [{}] normalization of input file [{}] at {}\nReading in files...'.format(args.NORMALIZATION_PRTCL, args.GE_FILE, datetime.datetime.now().isoformat()[:-10]))
CONVERSION = pd.read_csv(args.CONVERSION_FILE, sep = '\t')
CONVERSION.columns = ['tpt_type', 'gene_name', 'strand', 'length','gencode_anno', 'go_domain', 'go_acc','chr','ensblID']
KEEP = CONVERSION[np.any(np.array(CONVERSION.tpt_type).reshape((CONVERSION.shape[0], 1)) == args.TPT_TYPE, axis = 1)][['gene_name', 'ensblID','tpt_type', 'length']]
GL = pd.read_csv(args.GL_FILE, sep = '\t')
pdb.set_trace()
GE = pd.read_csv(args.GE_FILE, sep = '\t')
CONVERSION.columns = ['tpt_type', 'gene_name', 'strand', 'length','gencode_anno', 'go_domain', 'go_acc','chr','ensbl_stable_ID']
KEEP = CONVERSION[np.any(np.array(CONVERSION.tpt_type).reshape((CONVERSION.shape[0], 1)) == args.TPT_TYPE, axis = 1)][['gene_name', 'ensbl_stable_ID','tpt_type', 'length']].groupby('ensbl_stable_ID')[['gene_name','length']].max()
GL = pd.read_csv(args.GL_FILE, names = ['ensbl_stable_ID'])
GL['ensblID'] = [ensg.split('.')[0] for ensg in GL.ensbl_stable_ID]
GL['idx'] = np.arange(GL.shape[0])
GE = pd.read_csv(args.GE_FILE, sep = ' ')
if args.VERBOSE > 0: print('Filtering on {}...'.format(args.TPT_TYPE))
KEEP['ensblID'] = [ensg.split('.')[0] for ensg in KEEP.index]
GL_KEEP = GL.merge(KEEP)
GE_KEEP = GE.iloc[np.array(GL_KEEP.idx)]
if args.VERBOSE > 0: print('{} Normalizing...'.format(args.NORMALIZATION_PRTCL) )
RPK = GE_KEEP.values / np.array(GL_KEEP.length).reshape((GL_KEEP.shape[0], 1)) * 1000
per_million = RPK.sum(axis = 0) / 1e6
TPM = RPK / per_million.reshape((1, RPK.shape[1]))
lTPM = np.log10(1024 * TPM + 1)
lTPM_DF = pd.DataFrame(lTPM, columns = GE_KEEP.columns)
lTPM_DF.index = GL_KEEP.gene_name
if 'res' not in os.listdir('.'):
os.mkdir('res')
OUTFILE = os.path.join('res', args.OUTPUT_FILE)
if args.VERBOSE > 0 : print('Writing to file {} ...'.format(OUTFILE))
lTPM_DF.to_csv(OUTFILE)
if args.VERBOSE > 0 : print('All done!')
if __name__ == '__main__':
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment