Last updated: 2018-12-01
workflowr checks: (Click a bullet for more information)Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.
Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.
set.seed(20181119)
The command set.seed(20181119)
was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.
Great job! Recording the operating system, R version, and package versions is critical for reproducibility.
wflow_publish
or wflow_git_commit
). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
Ignored files:
Ignored: .Rproj.user/
Ignored: analysis/Quality_metrics_cache/
Ignored: analysis/figure/
Unstaged changes:
Modified: analysis/Quality_metrics.Rmd
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. # 01-Nov-2018
# obtain the gene expression profile of all gRNAs individually
library(edgeR)
library(Rfast)
library(data.table)
library(cellrangerRkit)
#################
# exp_matrix <- exp_matrix_backup
##############
gbm <- load_cellranger_matrix("../NSC_merged_05_07_08_new/") # load the GBM with NEW data
exp_matrix <- gbm[, colSums(exprs(gbm)) > 11000] # get cells above background value
use_genes <-get_nonzero_genes(exp_matrix)
exp_matrix <- exp_matrix[use_genes, ]
exp_matrix <- normalize_barcode_sums_to_median(exp_matrix)
# Select cells that have at least 1 reads in gRNA count
exp_matrix <- exp_matrix[, colMaxs(as.matrix(exprs(exp_matrix[
(length(exp_matrix@featureData@data$id)-75): # should be 75 or 76?
(length(exp_matrix@featureData@data$id)), ])), value = T) > 0] #more than 1 count
# make cell count for cells contain unique RNA
exp_matrix <- exp_matrix[, # Select cells that have one gRNA expression more than 3x of all others
(colMaxs(as.matrix(exprs(exp_matrix[
(length(exp_matrix@featureData@data$id)-75):
(length(exp_matrix@featureData@data$id)), ])), value = T)) >
(colSums(exprs(exp_matrix[(length(exp_matrix@featureData@data$id)-75):
(length(exp_matrix@featureData@data$id)), ]))*3/4)]
# exp_GeneBCMatrix <- exp_matrix # output from the last step
#############
# find the gRNA for each cell by taking the rowMax index of gRNA columns(29847:29922)
# will return a data.frame with entries followed by .xx since colnames should be unique
exp_matrix <- as.matrix(exprs(exp_matrix))
exp_matrix_backup <- exp_matrix # make a backup in case of need
#############
exp_matrix <- exp_matrix_backup
############
# total UMI count: 77,997,203
#scale exp_matrix to 1,000,000 counts -- NO, will significantly decrease the statistic power
# exp_matrix <- exp_matrix*(1000000/77997203)
### Make the gRNA list
# gRNA_list <- as.data.frame(rownames(exp_matrix
# [(nrow(exp_matrix)-76 +
# colMaxs(
# as.matrix(
# exp_matrix[
# ((nrow(exp_matrix)-75):(nrow(exp_matrix)))
# ,]
# ), value = F)
# ),
# ]))
gRNA_list <- as.data.frame(rownames(exp_matrix
[(nrow(exp_matrix)-76 +
colMaxs(
exp_matrix[
((nrow(exp_matrix)-75):(nrow(exp_matrix)))
,]
, value = F)
),
]))
colnames(gRNA_list) <- "gRNA"
gRNA_ASoC_list <- gRNA_list[!duplicated(gRNA_list$gRNA), ]
gRNA_ASoC_list <- gRNA_ASoC_list[order(gRNA_ASoC_list)]
# exp_matrix$cell_type <- as.factor(gRNA_dist$gRNA)
gRNA_ASoC_list <- gRNA_ASoC_list[-c(31,32,33,37,38,39,40,41,51,52,59,60,61,65,66,67)]
#############
# exp_matrix now: rownames=ENSG, colnames=barcode
# exp_matrix <- as.matrix(t(exp_matrix))
# not using
cell_type_index <- as.vector(gRNA_list$gRNA) # use this variable as index, length = 2522
############ Main Program ######
gRNA_ASoC_list_backup <- gRNA_ASoC_list # set a backup, will use the full list after debugging
# gRNA_ASoC_list <- gRNA_ASoC_list[1:2] # list for debugging, 2 elements only
# gRNA_ASoC_list <- gRNA_ASoC_list_backup
for(i in 1:length(gRNA_ASoC_list)) {
print(gRNA_ASoC_list[i])
### make correspondence of Gene_Symbol and Gene_id
# write_out_with_gene_name <- as.data.frame(get_CRISPRi_result(gRNA_ASoC_list[i],
# TPM_filter = F, TPM_threshold = 0.01))
write_out_with_gene_name <- as.data.frame(get_CRISPRi_result(gRNA_ASoC_list[i],
TPM_filter = F, TPM_threshold = 0.01)) # no filter set
write_out_with_gene_name$Geneid <- rownames(write_out_with_gene_name)
write_out_with_gene_name_output <- merge(write_out_with_gene_name, ENSG_coord_gene_gencodev28, by = "Geneid")
write_out_with_gene_name_output <- write_out_with_gene_name_output[order(write_out_with_gene_name_output$PValue), ]
write.table(write_out_with_gene_name_output, append = F,
row.names = F, col.names = T, sep = "\t", quote = F,
file = paste("output/", gRNA_ASoC_list[i], "_gRNA.txt", collapse = "", sep = ""))
}
############
############## custom functions##############
get_CRISPRi_result <- function(gRNA_name, TPM_filter = FALSE, TPM_threshold = 0.01) {
# TPM filter takes only genes with an estimated TPM above 1
# in more than 25% of the considered cells
# TPM_filter <- TRUE # temporary
# TPM_threshold <- 0.01 # temporaray
# if(TPM_filter) {
# exp_matrix <- exp_matrix[
# rowSums(exp_matrix > TPM_threshold)
# > trunc(ncol(exp_matrix/4)), ]
# }
# make two matrix using grepl, separate the target gRNA and control gRNA (EGFP/neg)
control_matrix <- exp_matrix[ , grepl("EGFP", cell_type_index) |
grepl("neg", cell_type_index)]
# gRNA_name <- "VPS45_2_gene" # temporary
gRNA_matrix <- exp_matrix[ , grepl(gRNA_name, cell_type_index)]
# prepare the input matrix for edgeR
## merge the two matrices as one, ensure the gRNA is the first
matrix_combined <- as.matrix(cbind(control_matrix, gRNA_matrix))
# matrix_combined <- matrix_combined[order(cell_type_index), ]
## Trim the tailing gRNA artificial genes, total 75
# matrix_combined_transposed <- t(matrix_combined[, -((ncol(matrix_combined)-75):
# ncol(matrix_combined))])
matrix_combined_transposed <- matrix_combined[1:(nrow(matrix_combined)-76), ]
### Assign rownames as ENSG gene identifiers
rownames(matrix_combined_transposed) <- gsub("\\..*", "", rownames(matrix_combined_transposed))
# rownames(matrix_combined_transposed) <- gsub("\\..*", "",
# colnames(matrix_combined[ , 1:(ncol(matrix_combined)-76)]))
# colnames(matrix_combined_transposed) <- rownames(matrix_combined)
# print(scale(colMeans(matrix_combined_transposed > 0)))
# Run edgeR use edgeRQLFDetRate, nrow(control_matrix) should be 139
group <- factor(c(rep("ctrl", len = ncol(control_matrix)),
rep("gRNA", len = ncol(gRNA_matrix))))
# make DGEList()
main_DGE <- DGEList(counts = matrix_combined_transposed, group = group, remove.zeros = T)
# Use edgeRQLFDetRate flow from now on
main_DGE <- calcNormFactors(main_DGE)
cdr <- scale(colMeans(matrix_combined_transposed > 0)) # DetRate is applied here
design <- model.matrix(~ cdr + group) # cdr (~ cdr + group)
main_DGE <- estimateDisp(main_DGE, design = design)
fit <- glmQLFit(main_DGE, design = design) # fit
qlf <- glmQLFTest(fit) # QLF vs LRT
# tt <- topTags(qlf, n = 100)
exp_table <- qlf$table
exp_table$FDR <- p.adjust(exp_table$PValue, "fdr")
# write.table(group, append = F,
# row.names = F, col.names = T, sep = "\t", quote = F,
# file = paste("output/", gRNA_ASoC_list[i], "_group.txt", collapse = ""))
return(exp_table)
}
sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-apple-darwin11.4.2 (64-bit)
Running under: OS X El Capitan 10.11.6
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] workflowr_1.1.1 Rcpp_0.12.17 digest_0.6.12
[4] rprojroot_1.2 R.methodsS3_1.7.1 backports_1.0.5
[7] git2r_0.18.0 magrittr_1.5 evaluate_0.10
[10] stringi_1.1.5 whisker_0.3-2 R.oo_1.22.0
[13] R.utils_2.7.0 rmarkdown_1.10 tools_3.3.2
[16] stringr_1.2.0 yaml_2.1.16 htmltools_0.3.6
[19] knitr_1.20
This reproducible R Markdown analysis was created with workflowr 1.1.1