Corrected some NaN bugs (use np.nansum when computing the per-million scaling factor)

a08ea562 · sauves · 00cd0641 · a08ea562
Commit a08ea562 authored 5 years ago by sauves
--- a/compute_tpm.py
+++ b/compute_tpm.py
@@ -10,7 +10,7 @@ def main():
        parser.add_argument('-GE',  dest = 'GE_FILE', default = None, type = str, help = 'name of Gene Expression matrix file in RAW counts')
        parser.add_argument('-GL', dest = 'GL_FILE', default = None, type = str, help = 'name of Gene LIST file (genes must appear in same order than in the GE matrix file.)')
        parser.add_argument('-n', dest = 'NORMALIZATION_PRTCL', default = 'TPM', type = str, help = 'type of normalisation used')
-        parser.add_argument('-conversion_file', dest = 'CONVERSION_FILE', default = '~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt', help= 'name of file used to retrieve the following gene info: (1) "ensembl ID" to "gene symbol" or gene_name (2) transcript type ie. "protein_coding" "pseudogene" etc. (3) transcript length including all UTRs' )
+        parser.add_argument('-conversion_file', dest = 'CONVERSION_FILE', default = '/u/sauves/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt', help= 'name of file used to retrieve the following gene info: (1) "ensembl ID" to "gene symbol" or gene_name (2) transcript type ie. "protein_coding" "pseudogene" etc. (3) transcript length including all UTRs' )
        parser.add_argument('-transcript_type', dest = 'TPT_TYPE', default = ['protein_coding'], type = str, nargs = '+', help= 'transcript types used for gene prefiltering') 
        parser.add_argument('-o', dest = 'OUTPUT_FILE', default = 'GE_FILTERED_log10[TPMx1024_plus_one]_{}.txt'.format(datetime.datetime.now().isoformat()[:-10]),type = str, help = 'name of output file')
        parser.add_argument('-v', dest = 'VERBOSE',type = int, help = 'level of verbosity' , default = 0)
@@ -30,7 +30,7 @@ def main():
        GE_KEEP = GE.iloc[np.array(GL_KEEP.idx)]
        if args.VERBOSE > 0: print('{} Normalizing...'.format(args.NORMALIZATION_PRTCL) )
        RPK = GE_KEEP.values / np.array(GL_KEEP.length).reshape((GL_KEEP.shape[0], 1)) * 1000
-        per_million = RPK.sum(axis = 0) / 1e6
+        per_million = np.nansum(RPK, axis = 0) / 1e6
        TPM = RPK / per_million.reshape((1, RPK.shape[1]))
        lTPM = np.log10(1024 * TPM + 1)
        lTPM_DF = pd.DataFrame(lTPM, columns = GE_KEEP.columns)
@@ -40,7 +40,7 @@ def main():
        OUTFILE = os.path.join('res', args.OUTPUT_FILE)
        if args.VERBOSE > 0 : print('Writing to file {} ...'.format(OUTFILE))
        lTPM_DF.to_csv(OUTFILE)
-        if args.VERBOSE > 0 : print('All done!')
+        if args.VERBOSE > 0 : print('All done! at {}'.format(datetime.datetime.now().isoformat()[:-10]))

 if __name__ == '__main__':
        main()