Skip to content
Snippets Groups Projects
Commit a08ea562 authored by sauves's avatar sauves
Browse files

Corrected some NaN bugs

parent 00cd0641
No related branches found
No related tags found
No related merge requests found
......@@ -10,7 +10,7 @@ def main():
parser.add_argument('-GE', dest = 'GE_FILE', default = None, type = str, help = 'name of Gene Expression matrix file in RAW counts')
parser.add_argument('-GL', dest = 'GL_FILE', default = None, type = str, help = 'name of Gene LIST file (genes must appear in same order than in the GE matrix file.)')
parser.add_argument('-n', dest = 'NORMALIZATION_PRTCL', default = 'TPM', type = str, help = 'type of normalisation used')
parser.add_argument('-conversion_file', dest = 'CONVERSION_FILE', default = '~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt', help= 'name of file used to retrieve the following gene info: (1) "ensembl ID" to "gene symbol" or gene_name (2) transcript type ie. "protein_coding" "pseudogene" etc. (3) transcript length including all UTRs' )
parser.add_argument('-conversion_file', dest = 'CONVERSION_FILE', default = '/u/sauves/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt', help= 'name of file used to retrieve the following gene info: (1) "ensembl ID" to "gene symbol" or gene_name (2) transcript type ie. "protein_coding" "pseudogene" etc. (3) transcript length including all UTRs' )
parser.add_argument('-transcript_type', dest = 'TPT_TYPE', default = ['protein_coding'], type = str, nargs = '+', help= 'transcript types used for gene prefiltering')
parser.add_argument('-o', dest = 'OUTPUT_FILE', default = 'GE_FILTERED_log10[TPMx1024_plus_one]_{}.txt'.format(datetime.datetime.now().isoformat()[:-10]),type = str, help = 'name of output file')
parser.add_argument('-v', dest = 'VERBOSE',type = int, help = 'level of verbosity' , default = 0)
......@@ -30,7 +30,7 @@ def main():
GE_KEEP = GE.iloc[np.array(GL_KEEP.idx)]
if args.VERBOSE > 0: print('{} Normalizing...'.format(args.NORMALIZATION_PRTCL) )
RPK = GE_KEEP.values / np.array(GL_KEEP.length).reshape((GL_KEEP.shape[0], 1)) * 1000
per_million = RPK.sum(axis = 0) / 1e6
per_million = np.nansum(RPK, axis = 0) / 1e6
TPM = RPK / per_million.reshape((1, RPK.shape[1]))
lTPM = np.log10(1024 * TPM + 1)
lTPM_DF = pd.DataFrame(lTPM, columns = GE_KEEP.columns)
......@@ -40,7 +40,7 @@ def main():
OUTFILE = os.path.join('res', args.OUTPUT_FILE)
if args.VERBOSE > 0 : print('Writing to file {} ...'.format(OUTFILE))
lTPM_DF.to_csv(OUTFILE)
if args.VERBOSE > 0 : print('All done!')
if args.VERBOSE > 0 : print('All done! at {}'.format(datetime.datetime.now().isoformat()[:-10]))
# Script entry point: call main() only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
main()
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment