added tpm script

7329dc65 · sauves · 3486e519 · 7329dc65
Commit 7329dc65 authored 5 years ago by sauves
--- a/compute_tpm.py
+++ b/compute_tpm.py
+import numpy as np
+import pandas as pd 
+import os
+import argparse
+import pdb
+def main(argv=None):
+        """Parse the command line and load the input tables for normalisation.
+
+        Steps performed, in order:
+          1. Parse the command-line options.
+          2. Read the tab-separated conversion file and rename its columns
+             positionally (the file is expected to have nine columns).
+          3. Keep only the rows whose transcript type is one of the requested
+             types, restricted to the gene name, Ensembl ID, transcript type
+             and length columns.
+          4. Read the tab-separated gene list and gene expression files.
+
+        NOTE: the loaded tables (KEEP, GL, GE) and the '-n' option are not
+        used any further in this block; the normalisation itself is not
+        implemented here.
+
+        Args:
+            argv: optional list of command-line tokens. ``None`` (the default)
+                makes argparse read them from ``sys.argv[1:]``.
+
+        Raises:
+            SystemExit: raised by argparse when '-GE' or '-GL' is missing or
+                an option is malformed.
+        """
+        parser = argparse.ArgumentParser()
+        # Both input files are mandatory: fail early with a usage message rather
+        # than letting pd.read_csv(None) raise later on.
+        parser.add_argument('-GE',  dest = 'GE_FILE', default = None, type = str, required = True, help = 'name of Gene Expression matrix file in RAW counts')
+        parser.add_argument('-GL', dest = 'GL_FILE', default = None, type = str, required = True, help = 'name of Gene LIST file (genes must appear in same order than in the GE matrix file.)')
+        parser.add_argument('-n', dest = 'NORMALISATION_PRTCL', default = 'TPM', type = str, help = 'type of normalisation used')
+        # '-conrsion_file' is the historical (misspelled) flag and is kept so
+        # existing invocations still work; '-conversion_file' is an alias.
+        parser.add_argument('-conrsion_file', '-conversion_file', dest = 'CONVERSION_FILE', default = '~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt', help= 'name of file used to retrieve the following gene info: (1) "ensembl ID" to "gene symbol" or gene_name (2) transcript type ie. "protein_coding" "pseudogene" etc. (3) transcript length including all UTRs' )
+        parser.add_argument('-transcript_type', dest = 'TPT_TYPE', default = ['protein_coding'], type = str, nargs = '+', help= 'transcript types used for gene prefiltering')
+        args = parser.parse_args(argv)
+
+        CONVERSION = pd.read_csv(args.CONVERSION_FILE, sep = '\t')
+        # Columns are renamed by position, so the file's column order matters.
+        CONVERSION.columns = ['tpt_type', 'gene_name', 'strand', 'length','gencode_anno', 'go_domain', 'go_acc','chr','ensblID']
+        # Row filter: keep genes whose transcript type is among the requested ones.
+        wanted_type = CONVERSION.tpt_type.isin(args.TPT_TYPE)
+        KEEP = CONVERSION.loc[wanted_type, ['gene_name', 'ensblID','tpt_type', 'length']]
+
+        GL = pd.read_csv(args.GL_FILE, sep = '\t')
+        # The leftover pdb.set_trace() debugging breakpoint was removed here: it
+        # stopped every run at an interactive prompt.
+        GE = pd.read_csv(args.GE_FILE, sep = '\t')
+
+# Run the command-line entry point only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == '__main__':
+        main()