added R files

46359ae1 · sauves · a08ea562 · 46359ae1 · 46359ae1 · 46359ae1
Commit 46359ae1 authored 5 years ago by sauves
--- a/assemble_gene_expression_matrix.R
+++ b/assemble_gene_expression_matrix.R
+library(dplyr)
+library(tidyr)
+library(data.table)
+library(DESeq2)
+library(BiocParallel)
+library(stringr)
+patients = fread('~/leucegene/E19/tables/PATIENTS/LEUCEGENE_452_LABELS.txt')
+sample_filepaths = data.frame( files = as.character(system('ls -d /u/leucegene/data/rnaseq/star/*/*/ReadsPerGene.out.tab',intern = T)))
+sample_filepaths$pname = (str_split(sample_filepaths$files, '/') 
+                          %>% unlist() 
+                          %>% matrix(ncol = 9, byrow = T))[,8]
+# specifiy which protocol to consider 
+COUNT_PROTOCOL = 'all-unstranded'
+# create protocol file 
+protocol = data.frame(pname = names(patients), strandedness = t(patients[18,])) %>%
+  left_join(sample_filepaths, by = 'pname') %>% 
+  filter(!is.na(files)) %>% 
+  mutate(protocol1.column = 2,
+         protocol3.column = ifelse(strandedness == 'stranded', 4, 2)) 
+write.table(protocol, '~/leucegene/E19/tables/DESEQ2/PROTOCOLS.txt', quote = F)
+assembleGeneExpressionMatrix = function(protocolDF, keep, count.protocol = 'all-unstranded', transpose = T){
+  # init returned DF
+  retDF = select(keep, ensblID)
+  # start filling df 
+  for (patient in protocolDF$pname){
+    # retrieve file 
+    patientDF = fread(as.character(protocolDF$files[protocolDF$pname == patient]))
+    # keepcols
+    keepcols = which(c('ensbl_stable_ID','all-unstranded','reverse', 'strand-specific') %in% c('ensbl_stable_ID', count.protocol))
+    # select columns based on count protocol
+    patientDF = patientDF %>% select(keepcols) %>% 
+      separate(1, c('ensblID', 'version'), sep = '\\.')
+    # join to Returned DF
+    retDF = retDF %>% left_join(patientDF %>% select(-version), by = 'ensblID')
+    names(retDF)[ncol(retDF)] = patient
+    i = 1
+    idx = as.numeric(rownames(protocolDF)[protocolDF$pname == patient])
+    modulo = as.integer(length(protocolDF$pname)/10)
+    if (as.numeric(idx) %% modulo  == 0){
+      print(paste(paste(rep('#', 1 + idx *10 / length(protocolDF$pname) ), collapse = ''), round(idx * 100 / length(protocolDF$pname)),'% complete',paste('[',idx,'/',length(protocolDF$pname),']', collapse = ''), collapse = '\t'))
+    }
+  }
+  retDF
+}
+# read in conversion file 
+conversion = fread('~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt')
+names(conversion)[c(1,2,9)] = c('tpt_type', 'gene_name', 'ensbl_stable_ID') 
+keep = conversion %>% filter(tpt_type == 'protein_coding') %>% 
+  group_by(gene_name, ensbl_stable_ID) %>% 
+  summarise(n = n()) %>% 
+  ungroup() %>% 
+  separate(ensbl_stable_ID, c('ensblID', 'version'), sep = '\\.')
+# read in GE files 
+GE.protocol.counts.t = assembleGeneExpressionMatrix(protocol, keep, count.protocol = COUNT_PROTOCOL, transpose = T)
+# write assembled GE data
+write.table(GE.protocol.counts.t[,2:ncol(GE.protocol.counts.t)], file = paste('~/leucegene/data/star/COUNTS/GE_', COUNT_PROTOCOL, '_STAR_COUNTS_RAW_T.txt', sep = '',collapse = ''), quote = F, row.names = F)
+write.table(GE.protocol.counts.t[,1], file = paste('~/leucegene/data/star/COUNTS/GENE_LIST_', COUNT_PROTOCOL, '_STAR_COUNTS_RAW_T.txt', sep = '',collapse = ''), quote = F, row.names = F)
--- a/run_deseq.R
+++ b/run_deseq.R
+library(dplyr)
+library(tidyr)
+library(data.table)
+library(DESeq2)
+library(BiocParallel)
+library(stringr)
+# enable paralellism
+register(MulticoreParam(8))
+# 
+patients = fread('~/leucegene/E19/tables/PATIENTS/LEUCEGENE_452_LABELS.txt')
+# read in GE gene list file 
+GL = fread('/u/sauves/leucegene/data/star/COUNTS/GENE_LIST_all-unstranded_STAR_COUNTS_RAW_T.txt', header = F)
+# read in GE files 
+GE.protocol.counts.t = fread('/u/sauves/leucegene/data/star/COUNTS/GE_all-unstranded_STAR_COUNTS_RAW_T.txt') %>% mutate(ensblID = GL$V1)
+# this is the final cts file
+GE.protocol.counts.t.na_rm = GE.protocol.counts.t[which(!is.na(GE.protocol.counts.t %>% select(-ensblID) %>% rowSums())),] # remove NAs (DESEQ doesn't like it!)
+rownames(GE.protocol.counts.t.na_rm) = GE.protocol.counts.t.na_rm$ensblID
+GE.protocol.counts.t.na_rm.prep = GE.protocol.counts.t.na_rm %>% select(-ensblID)
+# this is the coldata file 
+coldata = data.frame( condition = as.factor(t(patients[18,2:ncol(patients)])))
+rownames(coldata) = names(patients)[2:ncol(patients)]
+#perform deseq analysis
+dds = DESeqDataSetFromMatrix(countData = GE.protocol.counts.t.na_rm.prep, colData = coldata, design = ~condition)
+expressed  = rowSums(counts(dds)) >= 10
+dds = dds[expressed,]
+dds = DESeq(dds)
+res = results(dds)
+write.csv(as.data.frame(res), file = paste('~/leucegene/E19/tables/DESEQ2/DESeq2_DEG_PROTEIN_CODING_', nrow(dds),'x452_PROTOCOL',COUNT_PROTOCOL,'_stranded_vs_n_stranded.txt', sep = '', collapse = ''))
--- a/run_limma.R
+++ b/run_limma.R
+library(dplyr)
+library(tidyr)
+library(data.table)
+library(edgeR)
+# library(BiocParallel)
+library(stringr)
+COUNT_PROTOCOL = 'strand-specific'
+GE.counts.tpm.prt_cod = fread(paste('~/leucegene/data/star/COUNTS/GE_', COUNT_PROTOCOL, '_STAR_COUNTS_log10[TPMx1024_plus_one]_T.txt', sep = '',collapse = '')) 
+duplicates = (GE.counts.tpm.prt_cod %>% select(gene_name) %>% group_by(gene_name) %>% summarise(n = n())%>% filter(n >= 2))$gene_name
+GE.counts.tpm.prt_cod = GE.counts.tpm.prt_cod %>% filter(!(gene_name %in% duplicates))
+rownames(GE.counts.tpm.prt_cod) = GE.counts.tpm.prt_cod$gene_name
+limma.counts = GE.counts.tpm.prt_cod[,2:ncol(GE.counts.tpm.prt_cod)] 
+not_nas = which(!(is.na(rowSums(limma.counts))))
+limma.counts = limma.counts[not_nas,]
+# read in conversion file 
+conversion = fread('~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt')
+names(conversion)[c(1,2,9)] = c('tpt_type', 'gene_name', 'ensbl_stable_ID') 
+patients = fread('~/leucegene/E19/tables/PATIENTS/LEUCEGENE_452_LABELS.txt')
+# this is the coldata file 
+coldata = data.frame( condition = as.factor(t(patients[18,2:ncol(patients)]))) %>% mutate(condition = str_replace(condition, '-', '_'))
+rownames(coldata) = names(patients)[2:ncol(patients)]
+groups = coldata$condition
+# figdir = '~/leucegene/E19/figures/JUN6'
+# system(paste('mkdir', figdir))
+# pdf(paste(figdir, paste('limma_fig1_',COUNT_PROTOCOL,'.pdf', sep = '', collapse = ''), sep = '/'))
+# plotMDS(limma.counts, col = as.numeric(groups))
+# dev.off()
+mm = model.matrix(~0 + groups)
+y = voom(limma.counts, mm, plot = T)
+fit = lmFit(y, mm)
+contr <- makeContrasts(groupsnon_stranded-groupsstranded, levels = colnames(coef(fit)))
+tmp <- contrasts.fit(fit, contr)
+tmp <- eBayes(tmp)
+top.table <- topTable(tmp, sort.by = "logFC", n = Inf)
+tabledir = '/u/sauves/leucegene/E19/tables/LIMMA-VOOM'
+tablename = paste('LIMMA_VOOM_DEG_PROTEIN_CODING_',nrow(top.table),'x', ncol(top.table),'_PRTCL_',COUNT_PROTOCOL,'_stranded_vs_n_stranded.txt', collapse = '', sep = '')
+write.table(top.table, paste(tabledir, tablename, sep = '/'))
+# head(top.table, 20)