Last updated: 2018-12-01

workflowr checks: (Click a bullet for more information)
  • R Markdown file: up-to-date

    Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.

  • Environment: empty

    Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.

  • Seed: set.seed(20181119)

    The command set.seed(20181119) was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.

  • Session information: recorded

    Great job! Recording the operating system, R version, and package versions is critical for reproducibility.

  • Repository version: fdd5647

    Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated.

    Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use wflow_publish or wflow_git_commit). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
    
    Ignored files:
        Ignored:    .Rproj.user/
        Ignored:    analysis/Quality_metrics_cache/
        Ignored:    analysis/figure/
    
    Unstaged changes:
        Modified:   analysis/Quality_metrics.Rmd
    
    
    Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes.
Expand here to see past versions:
    File Version Author Date Message
    Rmd fdd5647 szhao06 2018-12-01 qc
    html fdd5647 szhao06 2018-12-01 qc
    Rmd b5d096d simingz 2018-11-29 migrate to rcc
    html 6360503 simingz 2018-11-20 Build site.
    Rmd 8be64d0 simingz 2018-11-20 First


Siwei’s original code

# 01-Nov-2018
# obtain the gene expression profile of all gRNAs individually
library(edgeR)
library(Rfast)
library(data.table)
library(cellrangerRkit)
#################
# exp_matrix <- exp_matrix_backup
##############
gbm <- load_cellranger_matrix("../NSC_merged_05_07_08_new/") # load the GBM with NEW data 
exp_matrix <- gbm[, colSums(exprs(gbm)) > 11000] # get cells above background value
use_genes <-get_nonzero_genes(exp_matrix)
exp_matrix <- exp_matrix[use_genes, ]
exp_matrix <- normalize_barcode_sums_to_median(exp_matrix)

# Select cells that have at least 1 reads in gRNA count
exp_matrix <- exp_matrix[, colMaxs(as.matrix(exprs(exp_matrix[
  (length(exp_matrix@featureData@data$id)-75): # should be 75 or 76?
    (length(exp_matrix@featureData@data$id)), ])), value = T) > 0] #more than 1 count

# make cell count for cells contain unique RNA
exp_matrix <- exp_matrix[, # Select cells that have one gRNA expression more than 3x of all others
                         (colMaxs(as.matrix(exprs(exp_matrix[
                           (length(exp_matrix@featureData@data$id)-75):
                             (length(exp_matrix@featureData@data$id)), ])), value = T)) >
                           (colSums(exprs(exp_matrix[(length(exp_matrix@featureData@data$id)-75):
                                                       (length(exp_matrix@featureData@data$id)), ]))*3/4)]

# exp_GeneBCMatrix <- exp_matrix # output from the last step
#############

# find the gRNA for each cell by taking the rowMax index of gRNA columns(29847:29922)
# will return a data.frame with entries followed by .xx since colnames should be unique
exp_matrix <- as.matrix(exprs(exp_matrix))
exp_matrix_backup <- exp_matrix # make a backup in case of need

#############
exp_matrix <- exp_matrix_backup
############
# total UMI count: 77,997,203
#scale exp_matrix to 1,000,000 counts -- NO, will significantly decrease the statistic power
# exp_matrix <- exp_matrix*(1000000/77997203)


### Make the gRNA list
# gRNA_list <- as.data.frame(rownames(exp_matrix
#                                     [(nrow(exp_matrix)-76 +
#                                         colMaxs(
#                                           as.matrix(
#                                             exp_matrix[
#                                               ((nrow(exp_matrix)-75):(nrow(exp_matrix)))
#                                               ,]                                             
#                                           ), value = F)
#                                     ),
#                                       ]))
gRNA_list <- as.data.frame(rownames(exp_matrix
                                    [(nrow(exp_matrix)-76 +
                                        colMaxs(
                                            exp_matrix[
                                              ((nrow(exp_matrix)-75):(nrow(exp_matrix)))
                                              ,]                                             
                                          , value = F)
                                    ),
                                      ]))

colnames(gRNA_list) <- "gRNA"

gRNA_ASoC_list <- gRNA_list[!duplicated(gRNA_list$gRNA), ]
gRNA_ASoC_list <- gRNA_ASoC_list[order(gRNA_ASoC_list)]
# exp_matrix$cell_type <- as.factor(gRNA_dist$gRNA)
gRNA_ASoC_list <- gRNA_ASoC_list[-c(31,32,33,37,38,39,40,41,51,52,59,60,61,65,66,67)]
#############
# exp_matrix now: rownames=ENSG, colnames=barcode


# exp_matrix <- as.matrix(t(exp_matrix)) 
# not using
cell_type_index <- as.vector(gRNA_list$gRNA) # use this variable as index, length = 2522

############ Main Program ######
gRNA_ASoC_list_backup <- gRNA_ASoC_list # set a backup, will use the full list after debugging

# gRNA_ASoC_list <- gRNA_ASoC_list[1:2] # list for debugging, 2 elements only


# gRNA_ASoC_list <- gRNA_ASoC_list_backup

for(i in 1:length(gRNA_ASoC_list)) {
  print(gRNA_ASoC_list[i])
  ### make correspondence of Gene_Symbol and Gene_id
  # write_out_with_gene_name <- as.data.frame(get_CRISPRi_result(gRNA_ASoC_list[i], 
  #                                                              TPM_filter = F, TPM_threshold = 0.01))
  write_out_with_gene_name <- as.data.frame(get_CRISPRi_result(gRNA_ASoC_list[i], 
                                                               TPM_filter = F, TPM_threshold = 0.01)) # no filter set
  write_out_with_gene_name$Geneid <- rownames(write_out_with_gene_name)
  write_out_with_gene_name_output <- merge(write_out_with_gene_name, ENSG_coord_gene_gencodev28, by = "Geneid")
  write_out_with_gene_name_output <- write_out_with_gene_name_output[order(write_out_with_gene_name_output$PValue), ]
  write.table(write_out_with_gene_name_output, append = F, 
              row.names = F, col.names = T, sep = "\t", quote = F, 
              file = paste("output/", gRNA_ASoC_list[i], "_gRNA.txt", collapse = "", sep = ""))
}

############

############## custom functions##############

get_CRISPRi_result <- function(gRNA_name, TPM_filter = FALSE, TPM_threshold = 0.01) {

    # TPM filter takes only genes with an estimated TPM above 1
  # in more than 25% of the considered cells
  # TPM_filter <- TRUE # temporary
  # TPM_threshold <- 0.01 # temporaray
  # if(TPM_filter) {
  #   exp_matrix <- exp_matrix[ 
  #     rowSums(exp_matrix > TPM_threshold) 
  #     > trunc(ncol(exp_matrix/4)), ]
  # }
  # make two matrix using grepl, separate the target gRNA and control gRNA (EGFP/neg)
  control_matrix <- exp_matrix[ , grepl("EGFP", cell_type_index) |
                                 grepl("neg", cell_type_index)]
  # gRNA_name <- "VPS45_2_gene" # temporary
  gRNA_matrix <- exp_matrix[ , grepl(gRNA_name, cell_type_index)]
 
  # prepare the input matrix for edgeR
  ## merge the two matrices as one, ensure the gRNA is the first
  matrix_combined <- as.matrix(cbind(control_matrix, gRNA_matrix))

  # matrix_combined <- matrix_combined[order(cell_type_index), ]
  ## Trim the tailing gRNA artificial genes, total 75
  # matrix_combined_transposed <- t(matrix_combined[, -((ncol(matrix_combined)-75):
  #                                                       ncol(matrix_combined))])
  matrix_combined_transposed <- matrix_combined[1:(nrow(matrix_combined)-76), ]
  ### Assign rownames as ENSG gene identifiers
  rownames(matrix_combined_transposed) <- gsub("\\..*", "", rownames(matrix_combined_transposed))
  # rownames(matrix_combined_transposed) <- gsub("\\..*", "", 
  #                                              colnames(matrix_combined[ , 1:(ncol(matrix_combined)-76)]))
  # colnames(matrix_combined_transposed) <- rownames(matrix_combined)

  # print(scale(colMeans(matrix_combined_transposed > 0)))
  # Run edgeR use edgeRQLFDetRate, nrow(control_matrix) should be 139
  group <- factor(c(rep("ctrl", len = ncol(control_matrix)), 
                    rep("gRNA", len = ncol(gRNA_matrix))))
  # make DGEList()
  main_DGE <- DGEList(counts = matrix_combined_transposed, group = group, remove.zeros = T)
  # Use edgeRQLFDetRate flow from now on
  main_DGE <- calcNormFactors(main_DGE)
  cdr <- scale(colMeans(matrix_combined_transposed > 0)) # DetRate is applied here
  design <- model.matrix(~ cdr + group) # cdr (~ cdr + group)
  main_DGE <- estimateDisp(main_DGE, design = design)
  fit <- glmQLFit(main_DGE, design = design) # fit
  qlf <- glmQLFTest(fit) # QLF vs LRT
  # tt <- topTags(qlf, n = 100)
  exp_table <- qlf$table
  exp_table$FDR <- p.adjust(exp_table$PValue, "fdr")
  
  # write.table(group, append = F, 
  #             row.names = F, col.names = T, sep = "\t", quote = F, 
  #             file = paste("output/", gRNA_ASoC_list[i], "_group.txt", collapse = ""))
  
  return(exp_table)
}

Session information

sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-apple-darwin11.4.2 (64-bit)
Running under: OS X El Capitan 10.11.6

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] workflowr_1.1.1   Rcpp_0.12.17      digest_0.6.12    
 [4] rprojroot_1.2     R.methodsS3_1.7.1 backports_1.0.5  
 [7] git2r_0.18.0      magrittr_1.5      evaluate_0.10    
[10] stringi_1.1.5     whisker_0.3-2     R.oo_1.22.0      
[13] R.utils_2.7.0     rmarkdown_1.10    tools_3.3.2      
[16] stringr_1.2.0     yaml_2.1.16       htmltools_0.3.6  
[19] knitr_1.20       

This reproducible R Markdown analysis was created with workflowr 1.1.1