Data integration - Normalization
suppressMessages(library(Matrix))
suppressMessages(library(SingleCellExperiment))
suppressMessages(library(ggplot2))
suppressMessages(library(scran))
suppressMessages(library(scater))
suppressMessages(library(ggpubr))
sce <- readRDS("/mnt/nmorais-nfs/marta/pA_karine/r_session/e12h-preliminary-analysis/integration/data/sce-total.rds")
df <- readRDS("/mnt/nmorais-nfs/marta/pA_karine/r_session/e12h-preliminary-analysis/integration/data/qc-df.rds")
logmat <- log2(counts(sce)+1)
df1 <- data.frame(total_counts = df$library_size,
total_features = df$total_features,
sample = sce$sample,
sorting = sce$sorting,
dataset = sce$dataset,
condition = sce$condition)
df2 <- data.frame(log10_total_counts = log10(df$library_size),
log10_total_features = log10(df$total_features),
sample = sce$sample)
varMatrix <- getVarianceExplained(logmat, variables = df1)
varMatrix[1:5,]
## total_counts total_features sample sorting dataset
## a 4.466088077 5.299560374 9.56103743 9.45667067 9.454462503
## A030001D20Rik 0.120626638 0.099983181 0.03523434 0.01813587 0.016851073
## A030005L19Rik 0.057712044 0.043820024 0.15454537 0.06542176 0.059112264
## A030014E15Rik 0.003305317 0.002250046 0.03705156 0.01595485 0.009999625
## A130010J15Rik 1.839822552 1.822475712 0.30735145 0.22298430 0.209924108
## condition
## a 6.992200656
## A030001D20Rik 0.023883849
## A030005L19Rik 0.100616812
## A030014E15Rik 0.007623449
## A130010J15Rik 0.198262641
plotExplanatoryVariables(
varMatrix) +
ggtitle("All samples after QC") +
theme(text = element_text(size=20)) +
scale_color_manual(values=c( "dodgerblue" ,"purple" ,"red", "green3","orange", "blue"))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
make_plot <- function(df, x, y, title, ylab){
ggplot(df, aes(x=x, y=y, fill=x)) +
geom_violin(show.legend = FALSE) +
theme_bw() +
theme(text = element_text(size=20), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylab(ylab) + xlab("Sample") +
scale_fill_manual(values=sample_colors) +
ggtitle(title)
}
sample_colors <- c("blue", "green", "orange", "yellow3", "pink", "red", "dodgerblue", "coral1",
"purple", "magenta", "cyan", "grey")
make_plot(df1, df1$sample, df1$total_counts, "All samples after QC", "Library size")
make_plot(df2, df2$sample, df2$log10_total_counts, "All samples after QC", "Library size (log10)")
make_plot(df1, df1$sample, df1$total_features, "All samples after QC", "Total features")
make_plot(df2, df2$sample, df2$log10_total_features, "All samples after QC", "Total features (log10)")
Remove lowly expressed genes:
keep_genes <- rowSums(counts(sce)) > 10
table(keep_genes)
## keep_genes
## TRUE
## 20515
sce <- sce[keep_genes,]
qclust <- quickCluster(sce)
sce <- computeSumFactors(sce, clusters=qclust)
summary(sizeFactors(sce))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1313 0.5151 0.7136 1.0000 1.1406 36.7784
sce <- logNormCounts(sce,
size_factors = sizeFactors(sce),
log = TRUE,
pseudo_count = 1,
exprs_values = "counts"
)
mat <- assays(sce)$logcounts
total_counts <- colSums((2**mat)-1)
total_counts[1:5]
## AAACCCAAGTGGAAGA-1_mct1 AAACCCACAGTGTGGA-1_mct1 AAACCCACATGTGCTA-1_mct1
## 13985.65 14202.51 14759.26
## AAACCCAGTTGGTACT-1_mct1 AAACGAAAGATCACTC-1_mct1
## 10060.42 14615.34
df3 <- data.frame(total_counts = total_counts,
total_features = df$total_features,
sample = sce$sample,
sorting = sce$sorting,
dataset = sce$dataset,
condition = sce$condition)
df4 <- data.frame(log10_total_counts = log10(total_counts),
log10_total_features = log10(df$total_features),
sample = sce$sample,
sorting = sce$sorting)
make_plot(df3, df3$sample, df3$total_counts, "All samples after library size norm", "Library size")
make_plot(df4, df4$sample, df4$log10_total_counts, "All samples after library size norm", "Library size (log10)") +ylim(3.5,5)
make_plot(df3, df3$sorting, df3$total_counts, "All samples after library size norm", "Library size")
make_plot(df4, df4$sorting, df4$log10_total_counts, "All samples after library size norm", "Library size (log10)") +ylim(3.5,5)
normVarMatrix <- getVarianceExplained(assays(sce)$logcounts, variables = df3[,c("total_counts", "total_features", "sample", "condition", "sorting", "dataset")])
plotExplanatoryVariables(
normVarMatrix,
variables = variables) +
ggtitle("All samples - after library size norm") +
theme(text = element_text(size=20)) +
scale_color_manual(values=c( "green3","purple" ,"orange","red", "dodgerblue" , "blue"))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
#saveRDS(sce, "/mnt/nmorais-nfs/marta/pA_karine/r_session/e12h-preliminary-analysis/integration/data/sce-total.rds")
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggpubr_0.5.0 scater_1.22.0
## [3] scran_1.22.1 scuttle_1.4.0
## [5] ggplot2_3.4.2 SingleCellExperiment_1.16.0
## [7] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [9] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [11] IRanges_2.28.0 S4Vectors_0.32.4
## [13] BiocGenerics_0.40.0 MatrixGenerics_1.6.0
## [15] matrixStats_0.63.0 Matrix_1.5-3
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 tools_4.1.2
## [3] backports_1.4.1 bslib_0.6.1
## [5] utf8_1.2.4 R6_2.5.1
## [7] irlba_2.3.5.1 vipor_0.4.5
## [9] colorspace_2.1-0 withr_3.0.0
## [11] tidyselect_1.2.0 gridExtra_2.3
## [13] compiler_4.1.2 cli_3.6.2
## [15] BiocNeighbors_1.12.0 DelayedArray_0.20.0
## [17] labeling_0.4.3 sass_0.4.8
## [19] scales_1.3.0 digest_0.6.34
## [21] rmarkdown_2.26 XVector_0.34.0
## [23] pkgconfig_2.0.3 htmltools_0.5.7
## [25] sparseMatrixStats_1.6.0 highr_0.10
## [27] fastmap_1.1.1 limma_3.50.3
## [29] rlang_1.1.3 rstudioapi_0.15.0
## [31] DelayedMatrixStats_1.16.0 farver_2.1.1
## [33] jquerylib_0.1.4 generics_0.1.3
## [35] jsonlite_1.8.8 BiocParallel_1.28.3
## [37] dplyr_1.1.4 car_3.1-1
## [39] RCurl_1.98-1.10 magrittr_2.0.3
## [41] BiocSingular_1.10.0 GenomeInfoDbData_1.2.7
## [43] Rcpp_1.0.12 ggbeeswarm_0.7.1
## [45] munsell_0.5.0 fansi_1.0.6
## [47] abind_1.4-5 viridis_0.6.2
## [49] lifecycle_1.0.4 yaml_2.3.8
## [51] edgeR_3.36.0 carData_3.0-5
## [53] zlibbioc_1.40.0 grid_4.1.2
## [55] parallel_4.1.2 ggrepel_0.9.5
## [57] dqrng_0.3.0 lattice_0.20-45
## [59] cowplot_1.1.1 beachmat_2.10.0
## [61] locfit_1.5-9.7 metapod_1.2.0
## [63] knitr_1.45 pillar_1.9.0
## [65] igraph_1.3.5 ggsignif_0.6.4
## [67] ScaledMatrix_1.2.0 glue_1.7.0
## [69] evaluate_0.23 vctrs_0.6.5
## [71] gtable_0.3.4 purrr_1.0.2
## [73] tidyr_1.3.1 cachem_1.0.8
## [75] xfun_0.42 rsvd_1.0.5
## [77] broom_1.0.5 rstatix_0.7.2
## [79] viridisLite_0.4.2 tibble_3.2.1
## [81] beeswarm_0.4.0 cluster_2.1.4
## [83] bluster_1.4.0 statmod_1.5.0
