library(dplyr) library(tidyr) library(data.table) library(DESeq2) library(BiocParallel) library(stringr) # enable paralellism register(MulticoreParam(8)) # patients = fread('~/leucegene/E19/tables/PATIENTS/LEUCEGENE_452_LABELS.txt') # read in GE gene list file GL = fread('/u/sauves/leucegene/data/star/COUNTS/GENE_LIST_all-unstranded_STAR_COUNTS_RAW_T.txt', header = F) # read in GE files GE.protocol.counts.t = fread('/u/sauves/leucegene/data/star/COUNTS/GE_all-unstranded_STAR_COUNTS_RAW_T.txt') %>% mutate(ensblID = GL$V1) # this is the final cts file GE.protocol.counts.t.na_rm = GE.protocol.counts.t[which(!is.na(GE.protocol.counts.t %>% select(-ensblID) %>% rowSums())),] # remove NAs (DESEQ doesn't like it!) rownames(GE.protocol.counts.t.na_rm) = GE.protocol.counts.t.na_rm$ensblID GE.protocol.counts.t.na_rm.prep = GE.protocol.counts.t.na_rm %>% select(-ensblID) # this is the coldata file coldata = data.frame( condition = as.factor(t(patients[18,2:ncol(patients)]))) rownames(coldata) = names(patients)[2:ncol(patients)] #perform deseq analysis dds = DESeqDataSetFromMatrix(countData = GE.protocol.counts.t.na_rm.prep, colData = coldata, design = ~condition) expressed = rowSums(counts(dds)) >= 10 dds = dds[expressed,] dds = DESeq(dds) res = results(dds) write.csv(as.data.frame(res), file = paste('~/leucegene/E19/tables/DESEQ2/DESeq2_DEG_PROTEIN_CODING_', nrow(dds),'x452_PROTOCOL',COUNT_PROTOCOL,'_stranded_vs_n_stranded.txt', sep = '', collapse = ''))