Data integration - Normalization
suppressMessages(library(Matrix))
suppressMessages(library(SingleCellExperiment))
suppressMessages(library(ggplot2))
suppressMessages(library(scran))
suppressMessages(library(scater))
suppressMessages(library(ggpubr))
sce <- readRDS("/mnt/nmorais-nfs/marta/pA_karine/r_session/e12h-preliminary-analysis/integration/data/sce-total.rds")
df <- readRDS("/mnt/nmorais-nfs/marta/pA_karine/r_session/e12h-preliminary-analysis/integration/data/qc-df.rds")
logmat <- log2(counts(sce)+1)
df1 <- data.frame(total_counts = df$library_size,
                 total_features = df$total_features,
                 sample = sce$sample,
                 sorting = sce$sorting,
                 dataset = sce$dataset,
                 condition = sce$condition)
df2 <- data.frame(log10_total_counts = log10(df$library_size),
                  log10_total_features = log10(df$total_features),
                  sample = sce$sample)
varMatrix <- getVarianceExplained(logmat, variables = df1)
varMatrix[1:5,]
##               total_counts total_features     sample    sorting     dataset
## a              4.466088077    5.299560374 9.56103743 9.45667067 9.454462503
## A030001D20Rik  0.120626638    0.099983181 0.03523434 0.01813587 0.016851073
## A030005L19Rik  0.057712044    0.043820024 0.15454537 0.06542176 0.059112264
## A030014E15Rik  0.003305317    0.002250046 0.03705156 0.01595485 0.009999625
## A130010J15Rik  1.839822552    1.822475712 0.30735145 0.22298430 0.209924108
##                 condition
## a             6.992200656
## A030001D20Rik 0.023883849
## A030005L19Rik 0.100616812
## A030014E15Rik 0.007623449
## A130010J15Rik 0.198262641
          plotExplanatoryVariables(
            varMatrix) +
  ggtitle("All samples after QC") +
  theme(text = element_text(size=20)) +
 scale_color_manual(values=c( "dodgerblue" ,"purple" ,"red", "green3","orange", "blue"))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

make_plot <- function(df, x, y, title, ylab){
  ggplot(df, aes(x=x, y=y, fill=x)) +
  
  geom_violin(show.legend = FALSE) + 
  theme_bw() +
  theme(text = element_text(size=20), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylab(ylab) + xlab("Sample") +
  scale_fill_manual(values=sample_colors) +
    ggtitle(title)
}
sample_colors <- c("blue", "green", "orange", "yellow3", "pink", "red", "dodgerblue", "coral1",
                   "purple", "magenta", "cyan", "grey")
make_plot(df1, df1$sample, df1$total_counts, "All samples after QC", "Library size")

make_plot(df2, df2$sample, df2$log10_total_counts, "All samples after QC", "Library size (log10)")

make_plot(df1, df1$sample, df1$total_features, "All samples after QC", "Total features")

make_plot(df2, df2$sample, df2$log10_total_features, "All samples after QC", "Total features (log10)")

Remove lowly expressed genes:

keep_genes <- rowSums(counts(sce)) > 10
table(keep_genes)
## keep_genes
##  TRUE 
## 20515
sce <- sce[keep_genes,]
qclust <- quickCluster(sce)
sce <- computeSumFactors(sce, clusters=qclust)
summary(sizeFactors(sce))   
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1313  0.5151  0.7136  1.0000  1.1406 36.7784
sce <- logNormCounts(sce, 
                     size_factors = sizeFactors(sce),
                     log = TRUE,
                     pseudo_count = 1, 
                     exprs_values = "counts"
                     )
mat <- assays(sce)$logcounts
total_counts <- colSums((2**mat)-1)
total_counts[1:5]
## AAACCCAAGTGGAAGA-1_mct1 AAACCCACAGTGTGGA-1_mct1 AAACCCACATGTGCTA-1_mct1 
##                13985.65                14202.51                14759.26 
## AAACCCAGTTGGTACT-1_mct1 AAACGAAAGATCACTC-1_mct1 
##                10060.42                14615.34
df3 <- data.frame(total_counts = total_counts,
                 total_features = df$total_features,
                 sample = sce$sample,
                 sorting = sce$sorting,
                 dataset = sce$dataset,
                 condition = sce$condition)
df4 <- data.frame(log10_total_counts = log10(total_counts),
                  log10_total_features = log10(df$total_features),
                  sample = sce$sample,
                  sorting = sce$sorting)
make_plot(df3, df3$sample, df3$total_counts, "All samples after library size norm", "Library size") 

make_plot(df4, df4$sample, df4$log10_total_counts, "All samples after library size norm", "Library size (log10)") +ylim(3.5,5)

make_plot(df3, df3$sorting, df3$total_counts, "All samples after library size norm", "Library size") 

make_plot(df4, df4$sorting, df4$log10_total_counts, "All samples after library size norm", "Library size (log10)") +ylim(3.5,5)

normVarMatrix <- getVarianceExplained(assays(sce)$logcounts, variables = df3[,c("total_counts", "total_features", "sample", "condition", "sorting", "dataset")])
          plotExplanatoryVariables(
            normVarMatrix,
            variables = variables)  + 
  ggtitle("All samples - after library size norm") +
  theme(text = element_text(size=20)) +
 scale_color_manual(values=c( "green3","purple" ,"orange","red", "dodgerblue" , "blue"))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

#saveRDS(sce, "/mnt/nmorais-nfs/marta/pA_karine/r_session/e12h-preliminary-analysis/integration/data/sce-total.rds")
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggpubr_0.5.0                scater_1.22.0              
##  [3] scran_1.22.1                scuttle_1.4.0              
##  [5] ggplot2_3.4.2               SingleCellExperiment_1.16.0
##  [7] SummarizedExperiment_1.24.0 Biobase_2.54.0             
##  [9] GenomicRanges_1.46.1        GenomeInfoDb_1.30.1        
## [11] IRanges_2.28.0              S4Vectors_0.32.4           
## [13] BiocGenerics_0.40.0         MatrixGenerics_1.6.0       
## [15] matrixStats_0.63.0          Matrix_1.5-3               
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7              tools_4.1.2              
##  [3] backports_1.4.1           bslib_0.6.1              
##  [5] utf8_1.2.4                R6_2.5.1                 
##  [7] irlba_2.3.5.1             vipor_0.4.5              
##  [9] colorspace_2.1-0          withr_3.0.0              
## [11] tidyselect_1.2.0          gridExtra_2.3            
## [13] compiler_4.1.2            cli_3.6.2                
## [15] BiocNeighbors_1.12.0      DelayedArray_0.20.0      
## [17] labeling_0.4.3            sass_0.4.8               
## [19] scales_1.3.0              digest_0.6.34            
## [21] rmarkdown_2.26            XVector_0.34.0           
## [23] pkgconfig_2.0.3           htmltools_0.5.7          
## [25] sparseMatrixStats_1.6.0   highr_0.10               
## [27] fastmap_1.1.1             limma_3.50.3             
## [29] rlang_1.1.3               rstudioapi_0.15.0        
## [31] DelayedMatrixStats_1.16.0 farver_2.1.1             
## [33] jquerylib_0.1.4           generics_0.1.3           
## [35] jsonlite_1.8.8            BiocParallel_1.28.3      
## [37] dplyr_1.1.4               car_3.1-1                
## [39] RCurl_1.98-1.10           magrittr_2.0.3           
## [41] BiocSingular_1.10.0       GenomeInfoDbData_1.2.7   
## [43] Rcpp_1.0.12               ggbeeswarm_0.7.1         
## [45] munsell_0.5.0             fansi_1.0.6              
## [47] abind_1.4-5               viridis_0.6.2            
## [49] lifecycle_1.0.4           yaml_2.3.8               
## [51] edgeR_3.36.0              carData_3.0-5            
## [53] zlibbioc_1.40.0           grid_4.1.2               
## [55] parallel_4.1.2            ggrepel_0.9.5            
## [57] dqrng_0.3.0               lattice_0.20-45          
## [59] cowplot_1.1.1             beachmat_2.10.0          
## [61] locfit_1.5-9.7            metapod_1.2.0            
## [63] knitr_1.45                pillar_1.9.0             
## [65] igraph_1.3.5              ggsignif_0.6.4           
## [67] ScaledMatrix_1.2.0        glue_1.7.0               
## [69] evaluate_0.23             vctrs_0.6.5              
## [71] gtable_0.3.4              purrr_1.0.2              
## [73] tidyr_1.3.1               cachem_1.0.8             
## [75] xfun_0.42                 rsvd_1.0.5               
## [77] broom_1.0.5               rstatix_0.7.2            
## [79] viridisLite_0.4.2         tibble_3.2.1             
## [81] beeswarm_0.4.0            cluster_2.1.4            
## [83] bluster_1.4.0             statmod_1.5.0