Sozen, B., Jorgensen, V., Weatherbee, B.A.T., Chen, S., Zhu, M., and Zernicka-Goetz, M. (2021). Reconstructing aspects of human embryogenesis with pluripotent stem cells. Nat. Commun. 12, 1–13.

  • BioProject Accession: PRJNA738498
  • GEO Accession: GSE178326



Load required packages.

library(tidyverse)
library(Matrix)
library(patchwork)
library(extrafont)
# library(magrittr)
Sys.time()
## [1] "2021-09-24 01:35:43 CDT"

Data preparation

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)

Data loading

PROJECT_DIR <- "/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation"

Matrix

ad <- reticulate::import(module = "anndata", convert = TRUE)
print(ad$`__version__`)
## [1] "0.7.5"
BACKED <- NULL
adata <- ad$read_h5ad(
    filename = file.path(
        PROJECT_DIR,
        "raw/public/PRJNA738498",
        "matrix",
        "adata.h5ad"
    ),
    backed = BACKED
)

matrix_readcount_use <- adata |> convert_adata()
matrix_readcount_use |> dim()
## [1] 33538  6231

Embedding

EMBEDDING_FILE <- "embedding_ncomponents18_ccc1_seed20210719.csv.gz"

embedding <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw/public/PRJNA738498/",
        "clustering/PRJNA738498/exploring/Scanpy_Harmony",
        EMBEDDING_FILE
    )
)

embedding |> head()

Metadata

cell_metadata <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw/public/PRJNA738498",
        "matrix",
        "cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        group = stringr::str_remove(
            string = sample_id,
            pattern = "-.+$"
        ),
        group = case_when(
            group == "2D" ~ "hEPSCs in 2D",
            group == "D5" ~ "Day 5 hEP-structures",
            group == "D6" ~ "Day 6 hEP-structures",
            group == "unknown" ~ "hEPSCs, unknown",
            is.na(group) ~ "Natural human embryos"
        ),
        group = factor(
            group,
            levels = c(
                "Natural human embryos",
                "Day 5 hEP-structures",
                "Day 6 hEP-structures",
                "hEPSCs in 2D",
                "hEPSCs, unknown"
            )
        )
    ) |>
    dplyr::left_join(
        adata$obs |>
            tibble::rownames_to_column(var = "cell") |>
            dplyr::select(-batch),
        by = c("cell" = "cell")
    )


# cell_metadata$sample_id |> table(exclude = "")
# cell_metadata$developmental_stage |> table(exclude = "")
# cell_metadata |> dplyr::count(group)


cell_metadata |>
    dplyr::count(group, name = "num_cells") |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median")
group num_cells
Natural human embryos 542
Day 5 hEP-structures 2013
Day 6 hEP-structures 2057
hEPSCs in 2D 228
hEPSCs, unknown 1391
purrr::walk(list(matrix_readcount_use, cell_metadata), \(x) {
    print(object.size(x), units = "auto", standard = "SI")
})
## 134.8 MB
## 905 kB

Single-cell transcriptome analysis

embedding <- embedding |>
    dplyr::left_join(
        cell_metadata |>
            dplyr::select(
                cell, group:mt_percentage
            ),
        by = c("cell" = "cell")
    )
embedding |>
    dplyr::group_by(
        group
    ) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        num_features = median(num_features),
        median_mt_percentage = median(mt_percentage)
    ) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median")
group num_cells median_umis num_features median_mt_percentage
Natural human embryos 542 1525.0 788 0.07816558
Day 5 hEP-structures 2013 3916.0 1542 0.05452046
Day 6 hEP-structures 2057 5634.0 2146 0.04631579
hEPSCs in 2D 228 4658.5 1740 0.07582252
embedding |>
    dplyr::group_by(
        leiden
    ) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        num_features = median(num_features),
        median_mt_percentage = median(mt_percentage)
    ) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median")
leiden num_cells median_umis num_features median_mt_percentage
0 650 7229.0 2548.0 0.044980507
1 575 5098.0 2001.0 0.047900969
2 519 4596.0 1750.0 0.055059253
3 486 3592.5 1367.5 0.053230059
4 467 1615.0 828.0 0.076511094
5 381 5207.0 1995.0 0.045398773
6 331 5504.0 2094.0 0.044977861
7 329 5760.0 2061.0 0.053837597
8 259 563.0 293.0 0.286012526
9 252 2873.5 1286.0 0.051105461
10 212 4263.0 1723.5 0.087807433
11 172 3567.5 1502.5 0.055306764
12 96 4089.0 1676.5 0.043022000
13 83 1504.0 798.0 0.163967611
14 17 5581.0 1828.0 0.062293144
15 11 825.0 347.0 0.001129944

Embedding visualization

x_column <- "x_umap_min_dist=0.1"
y_column <- "y_umap_min_dist=0.1"


GEOM_POINT_SIZE <- 0.4
EMBEDDING_TITLE_PREFIX <- "UMAP"
RASTERISED <- TRUE

Clustering & UMI & MT

p_embedding_leiden <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$leiden |> as.factor(),
    label = paste(EMBEDDING_TITLE_PREFIX, "Leiden", sep = "; "),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

CB_POSITION <- c(0.8, 0.325)
p_embedding_UMI <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = log10(embedding$num_umis),
    label = paste(EMBEDDING_TITLE_PREFIX, "UMI", sep = "; "),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1.5,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        y = CB_POSITION[2],
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_MT <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$mt_percentage,
    label = paste(EMBEDDING_TITLE_PREFIX, "MT%", sep = "; "),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1.5,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        y = CB_POSITION[2],
        legend_key_size = 2,
        legend_text_size = 5
    )


purrr::reduce(
    list(
        p_embedding_leiden,
        p_embedding_UMI,
        p_embedding_MT
    ),
    `+`
) +
    patchwork::plot_layout(ncol = 3) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )

purrr::map(c(0.4, 0.6, 0.8), \(x) {
    values <- embedding |>
        dplyr::mutate(
            value = case_when(
                mt_percentage >= x ~ "1",
                TRUE ~ "0"
            )
        ) |>
        dplyr::pull(value)

    plot_embedding(
        embedding = embedding[, c(x_column, y_column)],
        color_values = values |> as.factor(),
        label = glue::glue(
            "{EMBEDDING_TITLE_PREFIX}; MT% >= {scales::percent(x)}; Cells: {sum(as.numeric(values))}"
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = FALSE,
        geom_point_size = GEOM_POINT_SIZE * 1.5,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        ) +
        scale_color_manual(
            values = c("grey70", "salmon")
        )
}) |>
    purrr::reduce(
        `+`
    ) +
    patchwork::plot_layout(ncol = 3) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )

Batch & group

p_embedding_batch <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$batch |> as.factor(),
    label = paste(EMBEDDING_TITLE_PREFIX, "Batch", sep = "; "),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_group <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$group |> as.factor(),
    label = paste(EMBEDDING_TITLE_PREFIX, "Group", sep = "; "),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

values <- embedding |>
    dplyr::mutate(
        value = case_when(
            num_features < 200 ~ "1",
            TRUE ~ "0"
        )
    ) |>
    dplyr::pull(value)

p_embedding_low_complexity <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = values |> as.factor(),
    label = paste(EMBEDDING_TITLE_PREFIX, "Genes < 200", sep = "; "),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE * 1.5,
    sort_values = TRUE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        values = c("grey70", "salmon")
    )
purrr::reduce(
    list(
        p_embedding_batch,
        p_embedding_group,
        p_embedding_low_complexity
    ),
    `+`
) +
    patchwork::plot_layout(ncol = 3) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )

Composition

p_barplot_composition_batch <- calc_group_composition(
    data = embedding,
    x = "leiden",
    group = "batch"
) |>
    dplyr::mutate(
        leiden = factor(leiden)
    ) |>
    plot_barplot(
        x = "leiden",
        y = "percentage",
        z = "batch",
        legend_ncol = 1
    )

p_barplot_group <- calc_group_composition(
    data = embedding,
    x = "leiden",
    group = "group"
) |>
    dplyr::mutate(
        leiden = factor(leiden)
    ) |>
    plot_barplot(
        x = "leiden",
        y = "percentage",
        z = "group",
        legend_ncol = 1
    )

p_barplot_combined <- list(
    p_barplot_composition_batch,
    p_barplot_group
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(nrow = 2, guides = "collect") +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )
p_barplot_combined

Expression

Embedding


FEATURES_SELECTED <- c(
    "ENSG00000204531_POU5F1",
    "ENSG00000111704_NANOG",
    "ENSG00000171872_KLF17",
    "ENSG00000186103_ARGFX",
    #
    "ENSG00000164736_SOX17",
    "ENSG00000125798_FOXA2",
    "ENSG00000136574_GATA4",
    "ENSG00000134853_PDGFRA",
    #
    "ENSG00000179348_GATA2",
    "ENSG00000070915_SLC12A3",
    "ENSG00000165556_CDX2",
    "ENSG00000007866_TEAD3"
)
purrr::map(FEATURES_SELECTED, \(x) {
    selected_feature <- x

    cat(selected_feature, "\n")
    values <- log10(calc_cpm(matrix_readcount_use[, embedding$cell])[selected_feature, ] + 1)

    plot_embedding(
        embedding = embedding[, c(x_column, y_column)],
        color_values = values,
        label = paste(
            EMBEDDING_TITLE_PREFIX,
            selected_feature |> stringr::str_remove(pattern = "^E.+_"),
            sep = "; "
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE * 1.5,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        scale_color_viridis_c(
            na.value = "grey80"
        ) +
        theme_customized(
            y = CB_POSITION[2],
            legend_key_size = 2,
            legend_text_size = 5
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 3, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )
## ENSG00000204531_POU5F1 
## ENSG00000111704_NANOG 
## ENSG00000171872_KLF17 
## ENSG00000186103_ARGFX 
## ENSG00000164736_SOX17 
## ENSG00000125798_FOXA2 
## ENSG00000136574_GATA4 
## ENSG00000134853_PDGFRA 
## ENSG00000179348_GATA2 
## ENSG00000070915_SLC12A3 
## ENSG00000165556_CDX2 
## ENSG00000007866_TEAD3



R session info

devtools::session_info()$platform
##  setting  value                       
##  version  R version 4.1.1 (2021-08-10)
##  os       macOS Big Sur 11.6          
##  system   x86_64, darwin20.4.0        
##  ui       unknown                     
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2021-09-24
devtools::session_info()$pack |>
    as_tibble() |>
    dplyr::select(
        package,
        loadedversion,
        date,
        `source`
    ) |>
    # print(n = nrow(.))
    gt::gt() |>
    gt::tab_options(table.font.size = "median")
package loadedversion date source
assertthat 0.2.1 2019-03-21 CRAN (R 4.1.1)
backports 1.2.1 2020-12-09 CRAN (R 4.1.1)
beeswarm 0.4.0 2021-06-01 CRAN (R 4.1.1)
bit 4.0.4 2020-08-04 CRAN (R 4.1.1)
bit64 4.0.5 2020-08-30 CRAN (R 4.1.1)
broom 0.7.9 2021-07-27 CRAN (R 4.1.1)
bslib 0.3.0 2021-09-02 CRAN (R 4.1.1)
cachem 1.0.6 2021-08-19 CRAN (R 4.1.1)
callr 3.7.0 2021-04-20 CRAN (R 4.1.1)
cellranger 1.1.0 2016-07-27 CRAN (R 4.1.1)
checkmate 2.0.0 2020-02-06 CRAN (R 4.1.1)
cli 3.0.1 2021-07-17 CRAN (R 4.1.1)
colorspace 2.0-2 2021-06-24 CRAN (R 4.1.1)
crayon 1.4.1 2021-02-08 CRAN (R 4.1.1)
DBI 1.1.1 2021-01-15 CRAN (R 4.1.1)
dbplyr 2.1.1 2021-04-06 CRAN (R 4.1.1)
desc 1.3.0 2021-03-05 CRAN (R 4.1.1)
devtools 2.4.2 2021-08-19 Github (r-lib/devtools@e10658f)
digest 0.6.27 2020-10-24 CRAN (R 4.1.1)
dplyr 1.0.7.9000 2021-09-23 Github (tidyverse/dplyr@cab38bf)
ellipsis 0.3.2 2021-04-29 CRAN (R 4.1.1)
evaluate 0.14 2019-05-28 CRAN (R 4.1.1)
extrafont 0.17 2014-12-08 CRAN (R 4.1.1)
extrafontdb 1.0 2012-06-11 CRAN (R 4.1.1)
fansi 0.5.0 2021-05-25 CRAN (R 4.1.1)
farver 2.1.0 2021-02-28 CRAN (R 4.1.1)
fastmap 1.1.0 2021-01-25 CRAN (R 4.1.1)
forcats 0.5.1.9000 2021-08-15 Github (tidyverse/forcats@b5fce89)
fs 1.5.0.9000 2021-08-15 Github (r-lib/fs@10e38dd)
generics 0.1.0 2020-10-31 CRAN (R 4.1.1)
ggbeeswarm 0.6.0 2017-08-07 CRAN (R 4.1.1)
ggplot2 3.3.5.9000 2021-09-21 Github (tidyverse/ggplot2@759c63c)
ggrastr 0.2.3 2021-08-15 Github (VPetukhov/ggrastr@1ef0ff5)
glue 1.4.2 2020-08-27 CRAN (R 4.1.1)
gt 0.3.1 2021-08-07 CRAN (R 4.1.1)
gtable 0.3.0.9000 2021-08-15 Github (r-lib/gtable@0fc53e0)
haven 2.4.3 2021-08-04 CRAN (R 4.1.1)
highr 0.9 2021-04-16 CRAN (R 4.1.1)
hms 1.1.0 2021-05-17 CRAN (R 4.1.1)
htmltools 0.5.2 2021-08-25 CRAN (R 4.1.1)
httr 1.4.2 2020-07-20 CRAN (R 4.1.1)
jquerylib 0.1.4 2021-04-26 CRAN (R 4.1.1)
jsonlite 1.7.2 2020-12-09 CRAN (R 4.1.1)
knitr 1.34 2021-09-09 CRAN (R 4.1.1)
labeling 0.4.2 2020-10-20 CRAN (R 4.1.1)
lattice 0.20-45 2021-09-22 CRAN (R 4.1.1)
lifecycle 1.0.0 2021-02-15 CRAN (R 4.1.1)
lubridate 1.7.10 2021-02-26 CRAN (R 4.1.1)
magrittr 2.0.1 2020-11-17 CRAN (R 4.1.1)
Matrix 1.3-4 2021-06-01 CRAN (R 4.1.1)
memoise 2.0.0 2021-01-26 CRAN (R 4.1.1)
modelr 0.1.8 2020-05-19 CRAN (R 4.1.1)
munsell 0.5.0 2018-06-12 CRAN (R 4.1.1)
patchwork 1.1.0.9000 2021-08-15 Github (thomasp85/patchwork@79223d3)
pillar 1.6.2 2021-08-26 Github (r-lib/pillar@1058bda)
pkgbuild 1.2.0 2020-12-15 CRAN (R 4.1.1)
pkgconfig 2.0.3 2019-09-22 CRAN (R 4.1.1)
pkgload 1.2.2 2021-09-11 CRAN (R 4.1.1)
png 0.1-7 2013-12-03 CRAN (R 4.1.1)
prettyunits 1.1.1.9000 2021-08-15 Github (r-lib/prettyunits@8706d89)
processx 3.5.2 2021-04-30 CRAN (R 4.1.1)
ps 1.6.0 2021-02-28 CRAN (R 4.1.1)
purrr 0.3.4.9000 2021-08-15 Github (tidyverse/purrr@5aca9df)
R6 2.5.0.9000 2021-08-15 Github (r-lib/R6@1d70936)
ragg 1.1.3.9000 2021-08-15 Github (r-lib/ragg@adf5f06)
Rcpp 1.0.7 2021-07-07 CRAN (R 4.1.1)
readr 2.0.1.9000 2021-09-22 Github (tidyverse/readr@0f37e2b)
readxl 1.3.1.9000 2021-08-19 Github (tidyverse/readxl@649982a)
remotes 2.4.0.9001 2021-09-02 Github (r-lib/remotes@6a0e560)
reprex 2.0.1 2021-08-05 CRAN (R 4.1.1)
reticulate 1.22 2021-09-17 CRAN (R 4.1.1)
rlang 0.4.11.9001 2021-09-23 Github (r-lib/rlang@217ef27)
rmarkdown 2.11.2 2021-09-23 Github (rstudio/rmarkdown@71ae829)
rprojroot 2.0.2 2020-11-15 CRAN (R 4.1.1)
rstudioapi 0.13.0-9000 2021-08-15 Github (rstudio/rstudioapi@96fad1d)
Rttf2pt1 1.3.9 2021-07-22 CRAN (R 4.1.1)
rvest 1.0.1 2021-07-26 CRAN (R 4.1.1)
sass 0.4.0 2021-05-12 CRAN (R 4.1.1)
scales 1.1.1 2020-05-11 CRAN (R 4.1.1)
sessioninfo 1.1.1 2018-11-05 CRAN (R 4.1.1)
stringi 1.7.4 2021-08-25 CRAN (R 4.1.1)
stringr 1.4.0 2019-02-10 CRAN (R 4.1.1)
styler 1.6.1.9000 2021-09-17 Github (r-lib/styler@0e41d88)
systemfonts 1.0.2 2021-05-11 CRAN (R 4.1.1)
testthat 3.0.4.9000 2021-09-23 Github (r-lib/testthat@74a0fee)
textshaping 0.3.5 2021-06-09 CRAN (R 4.1.1)
tibble 3.1.4.9000 2021-08-26 Github (tidyverse/tibble@e3e40af)
tidyr 1.1.3.9000 2021-09-23 Github (tidyverse/tidyr@a39f8de)
tidyselect 1.1.1 2021-04-30 CRAN (R 4.1.1)
tidyverse 1.3.1.9000 2021-08-16 Github (tidyverse/tidyverse@195d8a4)
tzdb 0.1.2 2021-07-20 CRAN (R 4.1.1)
usethis 2.0.1.9000 2021-08-30 Github (r-lib/usethis@3385e14)
utf8 1.2.2 2021-07-24 CRAN (R 4.1.1)
vctrs 0.3.8 2021-04-29 CRAN (R 4.1.1)
vipor 0.4.5 2017-03-22 CRAN (R 4.1.1)
viridisLite 0.4.0 2021-04-13 CRAN (R 4.1.1)
vroom 1.5.5.9000 2021-09-19 Github (r-lib/vroom@32c2ba6)
withr 2.4.2 2021-04-18 CRAN (R 4.1.1)
xfun 0.26 2021-09-14 CRAN (R 4.1.1)
xml2 1.3.2 2020-04-23 CRAN (R 4.1.1)
yaml 2.2.1 2020-02-01 CRAN (R 4.1.1)