Skip to content
Snippets Groups Projects
Commit 46359ae1 authored by sauves's avatar sauves
Browse files

added R files

parent a08ea562
No related branches found
Tags 0.76.3
No related merge requests found
library(dplyr)
library(tidyr)
library(data.table)
library(DESeq2)
library(BiocParallel)
library(stringr)
patients = fread('~/leucegene/E19/tables/PATIENTS/LEUCEGENE_452_LABELS.txt')
sample_filepaths = data.frame( files = as.character(system('ls -d /u/leucegene/data/rnaseq/star/*/*/ReadsPerGene.out.tab',intern = T)))
sample_filepaths$pname = (str_split(sample_filepaths$files, '/')
%>% unlist()
%>% matrix(ncol = 9, byrow = T))[,8]
# specifiy which protocol to consider
COUNT_PROTOCOL = 'all-unstranded'
# create protocol file
protocol = data.frame(pname = names(patients), strandedness = t(patients[18,])) %>%
left_join(sample_filepaths, by = 'pname') %>%
filter(!is.na(files)) %>%
mutate(protocol1.column = 2,
protocol3.column = ifelse(strandedness == 'stranded', 4, 2))
write.table(protocol, '~/leucegene/E19/tables/DESEQ2/PROTOCOLS.txt', quote = F)
assembleGeneExpressionMatrix = function(protocolDF, keep, count.protocol = 'all-unstranded', transpose = T){
# init returned DF
retDF = select(keep, ensblID)
# start filling df
for (patient in protocolDF$pname){
# retrieve file
patientDF = fread(as.character(protocolDF$files[protocolDF$pname == patient]))
# keepcols
keepcols = which(c('ensbl_stable_ID','all-unstranded','reverse', 'strand-specific') %in% c('ensbl_stable_ID', count.protocol))
# select columns based on count protocol
patientDF = patientDF %>% select(keepcols) %>%
separate(1, c('ensblID', 'version'), sep = '\\.')
# join to Returned DF
retDF = retDF %>% left_join(patientDF %>% select(-version), by = 'ensblID')
names(retDF)[ncol(retDF)] = patient
i = 1
idx = as.numeric(rownames(protocolDF)[protocolDF$pname == patient])
modulo = as.integer(length(protocolDF$pname)/10)
if (as.numeric(idx) %% modulo == 0){
print(paste(paste(rep('#', 1 + idx *10 / length(protocolDF$pname) ), collapse = ''), round(idx * 100 / length(protocolDF$pname)),'% complete',paste('[',idx,'/',length(protocolDF$pname),']', collapse = ''), collapse = '\t'))
}
}
retDF
}
# read in conversion file
conversion = fread('~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt')
names(conversion)[c(1,2,9)] = c('tpt_type', 'gene_name', 'ensbl_stable_ID')
keep = conversion %>% filter(tpt_type == 'protein_coding') %>%
group_by(gene_name, ensbl_stable_ID) %>%
summarise(n = n()) %>%
ungroup() %>%
separate(ensbl_stable_ID, c('ensblID', 'version'), sep = '\\.')
# read in GE files
GE.protocol.counts.t = assembleGeneExpressionMatrix(protocol, keep, count.protocol = COUNT_PROTOCOL, transpose = T)
# write assembled GE data
write.table(GE.protocol.counts.t[,2:ncol(GE.protocol.counts.t)], file = paste('~/leucegene/data/star/COUNTS/GE_', COUNT_PROTOCOL, '_STAR_COUNTS_RAW_T.txt', sep = '',collapse = ''), quote = F, row.names = F)
write.table(GE.protocol.counts.t[,1], file = paste('~/leucegene/data/star/COUNTS/GENE_LIST_', COUNT_PROTOCOL, '_STAR_COUNTS_RAW_T.txt', sep = '',collapse = ''), quote = F, row.names = F)
library(dplyr)
library(tidyr)
library(data.table)
library(DESeq2)
library(BiocParallel)
library(stringr)
# enable paralellism
register(MulticoreParam(8))
#
patients = fread('~/leucegene/E19/tables/PATIENTS/LEUCEGENE_452_LABELS.txt')
# read in GE gene list file
GL = fread('/u/sauves/leucegene/data/star/COUNTS/GENE_LIST_all-unstranded_STAR_COUNTS_RAW_T.txt', header = F)
# read in GE files
GE.protocol.counts.t = fread('/u/sauves/leucegene/data/star/COUNTS/GE_all-unstranded_STAR_COUNTS_RAW_T.txt') %>% mutate(ensblID = GL$V1)
# this is the final cts file
GE.protocol.counts.t.na_rm = GE.protocol.counts.t[which(!is.na(GE.protocol.counts.t %>% select(-ensblID) %>% rowSums())),] # remove NAs (DESEQ doesn't like it!)
rownames(GE.protocol.counts.t.na_rm) = GE.protocol.counts.t.na_rm$ensblID
GE.protocol.counts.t.na_rm.prep = GE.protocol.counts.t.na_rm %>% select(-ensblID)
# this is the coldata file
coldata = data.frame( condition = as.factor(t(patients[18,2:ncol(patients)])))
rownames(coldata) = names(patients)[2:ncol(patients)]
#perform deseq analysis
dds = DESeqDataSetFromMatrix(countData = GE.protocol.counts.t.na_rm.prep, colData = coldata, design = ~condition)
expressed = rowSums(counts(dds)) >= 10
dds = dds[expressed,]
dds = DESeq(dds)
res = results(dds)
write.csv(as.data.frame(res), file = paste('~/leucegene/E19/tables/DESEQ2/DESeq2_DEG_PROTEIN_CODING_', nrow(dds),'x452_PROTOCOL',COUNT_PROTOCOL,'_stranded_vs_n_stranded.txt', sep = '', collapse = ''))
library(dplyr)
library(tidyr)
library(data.table)
library(edgeR)
# library(BiocParallel)
library(stringr)
COUNT_PROTOCOL = 'strand-specific'
GE.counts.tpm.prt_cod = fread(paste('~/leucegene/data/star/COUNTS/GE_', COUNT_PROTOCOL, '_STAR_COUNTS_log10[TPMx1024_plus_one]_T.txt', sep = '',collapse = ''))
duplicates = (GE.counts.tpm.prt_cod %>% select(gene_name) %>% group_by(gene_name) %>% summarise(n = n())%>% filter(n >= 2))$gene_name
GE.counts.tpm.prt_cod = GE.counts.tpm.prt_cod %>% filter(!(gene_name %in% duplicates))
rownames(GE.counts.tpm.prt_cod) = GE.counts.tpm.prt_cod$gene_name
limma.counts = GE.counts.tpm.prt_cod[,2:ncol(GE.counts.tpm.prt_cod)]
not_nas = which(!(is.na(rowSums(limma.counts))))
limma.counts = limma.counts[not_nas,]
# read in conversion file
conversion = fread('~/leucegene/E19/tables/CONVERSIONS/gene_id_conversions.txt')
names(conversion)[c(1,2,9)] = c('tpt_type', 'gene_name', 'ensbl_stable_ID')
patients = fread('~/leucegene/E19/tables/PATIENTS/LEUCEGENE_452_LABELS.txt')
# this is the coldata file
coldata = data.frame( condition = as.factor(t(patients[18,2:ncol(patients)]))) %>% mutate(condition = str_replace(condition, '-', '_'))
rownames(coldata) = names(patients)[2:ncol(patients)]
groups = coldata$condition
# figdir = '~/leucegene/E19/figures/JUN6'
# system(paste('mkdir', figdir))
# pdf(paste(figdir, paste('limma_fig1_',COUNT_PROTOCOL,'.pdf', sep = '', collapse = ''), sep = '/'))
# plotMDS(limma.counts, col = as.numeric(groups))
# dev.off()
mm = model.matrix(~0 + groups)
y = voom(limma.counts, mm, plot = T)
fit = lmFit(y, mm)
contr <- makeContrasts(groupsnon_stranded-groupsstranded, levels = colnames(coef(fit)))
tmp <- contrasts.fit(fit, contr)
tmp <- eBayes(tmp)
top.table <- topTable(tmp, sort.by = "logFC", n = Inf)
tabledir = '/u/sauves/leucegene/E19/tables/LIMMA-VOOM'
tablename = paste('LIMMA_VOOM_DEG_PROTEIN_CODING_',nrow(top.table),'x', ncol(top.table),'_PRTCL_',COUNT_PROTOCOL,'_stranded_vs_n_stranded.txt', collapse = '', sep = '')
write.table(top.table, paste(tabledir, tablename, sep = '/'))
# head(top.table, 20)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment