library(dplyr)
library(tidyr)
library(data.table)
library(DESeq2)
library(BiocParallel)
library(stringr)

# enable paralellism
register(MulticoreParam(8))
# 
patients = fread('~/leucegene/E19/tables/PATIENTS/LEUCEGENE_452_LABELS.txt')

# read in GE gene list file 
GL = fread('/u/sauves/leucegene/data/star/COUNTS/GENE_LIST_all-unstranded_STAR_COUNTS_RAW_T.txt', header = F)
# read in GE files 
GE.protocol.counts.t = fread('/u/sauves/leucegene/data/star/COUNTS/GE_all-unstranded_STAR_COUNTS_RAW_T.txt') %>% mutate(ensblID = GL$V1)

# this is the final cts file
GE.protocol.counts.t.na_rm = GE.protocol.counts.t[which(!is.na(GE.protocol.counts.t %>% select(-ensblID) %>% rowSums())),] # remove NAs (DESEQ doesn't like it!)
rownames(GE.protocol.counts.t.na_rm) = GE.protocol.counts.t.na_rm$ensblID
GE.protocol.counts.t.na_rm.prep = GE.protocol.counts.t.na_rm %>% select(-ensblID)

# this is the coldata file 
coldata = data.frame( condition = as.factor(t(patients[18,2:ncol(patients)])))
rownames(coldata) = names(patients)[2:ncol(patients)]

#perform deseq analysis
dds = DESeqDataSetFromMatrix(countData = GE.protocol.counts.t.na_rm.prep, colData = coldata, design = ~condition)
expressed  = rowSums(counts(dds)) >= 10
dds = dds[expressed,]
dds = DESeq(dds)
res = results(dds)
write.csv(as.data.frame(res), file = paste('~/leucegene/E19/tables/DESEQ2/DESeq2_DEG_PROTEIN_CODING_', nrow(dds),'x452_PROTOCOL',COUNT_PROTOCOL,'_stranded_vs_n_stranded.txt', sep = '', collapse = ''))