Tyser, R.C.V., Mahammadov, E., Nakanoh, S., Vallier, L., Scialdone, A., and Srinivas, S. (2021). Single-cell transcriptomic characterization of a gastrulating human embryo. Nature 600, 285–289.

  • BioProject Accession: PRJEB40781
  • ArrayExpress Accession: E-MTAB-9388



Load required packages.

library(tidyverse)
library(Matrix)
library(patchwork)
library(extrafont)
Sys.time()
## [1] "2022-01-19 00:46:30 CST"

Data preparation

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)
#' Draw one highlight panel per level of a metadata column.
#'
#' For every level of `x[[y]]` the embedding is plotted with the cells of that
#' level coded 1, the other cells that carry a value in that column coded 0,
#' and cells without a value (e.g. no match in `x`) left as `NA`, which the
#' colour scale draws with its `na.value`. All panels are combined into one
#' patchwork.
#'
#' Besides its arguments the function looks up `x_column`, `y_column`,
#' `GEOM_POINT_SIZE` and `RASTERISED` outside its own body, and it calls
#' `plot_embedding()` and `theme_customized()`, which are not defined in this
#' block (presumably they come from utilities.R -- TODO confirm).
#'
#' @param embedding Data frame with the embedding coordinates; it is joined to
#'   `x` on the columns the two tables have in common.
#' @param x Data frame of cell metadata.
#' @param y Name (string) of the column of `x` whose levels are highlighted.
#'   Non-factor columns are accepted and treated like `as.factor(x[[y]])`.
#' @param label Prefix of every panel title.
#' @param n_cols Number of columns of the patchwork layout.
#' @return A patchwork with one panel per level of `x[[y]]`.
plot_embedding_highlight <- function(embedding, x, y, label, n_cols = 3) {
    cell_metadata_selected <- x
    selected_column <- y

    if (!selected_column %in% colnames(cell_metadata_selected)) {
        stop(
            "Column '", selected_column, "' not found in the cell metadata.",
            call. = FALSE
        )
    }

    # as.factor() returns factors unchanged, so existing level order is kept;
    # character columns get their sorted unique values as levels.
    group_levels <- levels(
        as.factor(cell_metadata_selected[[selected_column]])
    )
    if (length(group_levels) == 0) {
        stop(
            "Column '", selected_column, "' has no levels to highlight.",
            call. = FALSE
        )
    }

    # The join does not depend on the highlighted level: run it once instead
    # of once per panel.
    group_of_cell <- dplyr::left_join(
        embedding,
        cell_metadata_selected
    )[[selected_column]]

    panels <- purrr::map(group_levels, \(level) {
        # TRUE: highlighted group; FALSE: other group; NA: no value available.
        is_highlighted <- group_of_cell == level
        n_highlighted <- sum(is_highlighted, na.rm = TRUE)

        plot_embedding(
            embedding = embedding[, c(x_column, y_column)],
            color_values = as.factor(as.integer(is_highlighted)),
            label = glue::glue("{label}; {level}: {n_highlighted}"),
            label_position = NULL,
            show_color_value_labels = FALSE,
            show_color_legend = FALSE,
            geom_point_size = GEOM_POINT_SIZE,
            sort_values = TRUE,
            shuffle_values = FALSE,
            rasterise = RASTERISED,
            legend_size = 2
        ) +
            theme_customized(
                legend_key_size = 2,
                legend_text_size = 5
            ) +
            ggplot2::scale_color_manual(
                na.translate = TRUE,
                values = c("grey70", "salmon"),
                na.value = "#7F7F7F"
            ) +
            # Number of highlighted cells in the top-right corner of the panel.
            ggplot2::annotate(
                geom = "text",
                x = Inf,
                y = Inf,
                label = n_highlighted,
                size = 5 / ggplot2::.pt,
                hjust = 1,
                vjust = 1,
                na.rm = FALSE
            )
    })

    purrr::reduce(panels, `+`) +
        patchwork::plot_layout(ncol = n_cols) +
        patchwork::plot_annotation(
            theme = ggplot2::theme(plot.margin = ggplot2::margin())
        )
}

Data loading

PROJECT_DIR <- "/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation"

Matrix

ad <- reticulate::import(module = "anndata", convert = TRUE)
print(ad$`__version__`)
## [1] "0.7.6"
# One adata.h5ad path per BioProject. file.path() is vectorised over the
# accession vector; as.list() keeps the result a list with one path per entry.
adata_files <- as.list(
    file.path(
        PROJECT_DIR,
        "raw",
        "public",
        c("PRJEB40781", "PRJEB11202", "PRJNA555602"),
        "matrix",
        "adata.h5ad"
    )
)
purrr::map_lgl(adata_files, file.exists)
## [1] TRUE TRUE TRUE
# `backed` value handed to anndata's read_h5ad(); NULL presumably means the
# file is loaded fully into memory -- TODO confirm against the anndata docs.
BACKED <- NULL
# Read every .h5ad file, convert it with convert_adata() (not defined in this
# block) and column-bind the per-file results into one matrix.
# NOTE(review): cbind() does not align rows by name, so this assumes all files
# list their features in the same order -- confirm.
matrix_readcount_use <- purrr::map(adata_files, function(x) {
    ad$read_h5ad(
        filename = x, backed = BACKED
    ) |>
        convert_adata()
}) |>
    purrr::reduce(cbind)

matrix_readcount_use |> dim()
## [1] 33538 12690

Metadata

# Re-open the same files with backed = "r" (presumably read-only backed mode,
# so the count matrix is not loaded again -- TODO confirm); only `obs` is used.
BACKED <- "r"
cell_metadata <- purrr::map(adata_files, function(x) {
    ad$read_h5ad(
        filename = x, backed = BACKED
    )$obs |>
        # Turn the row names into an explicit `cell` column, placed first.
        tibble::rownames_to_column(var = "cell") |>
        dplyr::select(cell, everything())
}) |>
    # Stack the per-file tables into one and drop the `batch` column.
    dplyr::bind_rows() |>
    dplyr::select(-batch)

cell_metadata |> head()


Check memory usage.

# Print the in-memory size of the count matrix and of the metadata table,
# using SI units chosen automatically.
invisible(
    lapply(
        list(matrix_readcount_use, cell_metadata),
        \(object_to_measure) {
            print(
                object.size(object_to_measure),
                units = "auto",
                standard = "SI"
            )
        }
    )
)
## 828.5 MB
## 1.3 MB

Clustering of a CS7 human gastrula

Cell metadata & embedding

cell_metadata_PRJEB40781 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJEB40781",
        "matrix",
        "cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        lineage = factor(lineage)
    )
## Rows: 1195 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): cell, run, source_name, developmental_stage, individual, sex, sampl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
EMBEDDING_FILE <- "embedding_ncomponents15_ccc1_seed20210719.csv.gz"

embedding_1195 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw/public/PRJEB40781",
        "clustering/PRJEB40781/star/exploring",
        "Scanpy_Harmony",
        EMBEDDING_FILE
    )
)

embedding_1195 |> head()

Visualization

x_column <- "x_umap_min_dist=0.1"
y_column <- "y_umap_min_dist=0.1"

GEOM_POINT_SIZE <- 1.25
EMBEDDING_TITLE_PREFIX <- "UMAP"
RASTERISED <- TRUE

Clustering

p_embedding_leiden <- plot_embedding(
    embedding = embedding_1195[, c(x_column, y_column)],
    color_values = embedding_1195$leiden |> as.factor(),
    label = paste(EMBEDDING_TITLE_PREFIX, "Leiden", sep = "; "),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED
) +
    theme_customized()

# CB_POSITION <- c(0.8, 0.995)
# Embedding coloured by log10 of the per-cell `num_umis` value.
p_embedding_UMI <- plot_embedding(
    embedding = embedding_1195[, c(x_column, y_column)],
    # `num_umis` lives in cell_metadata; the left join attaches it to the
    # rows of embedding_1195 before it is pulled out and log-transformed.
    color_values = embedding_1195 |>
        dplyr::left_join(cell_metadata) |>
        dplyr::pull(num_umis) |>
        log10(),
    label = paste0(EMBEDDING_TITLE_PREFIX, "; ", "UMI"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE / 1.5,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(legend_key_size = 2, legend_text_size = 5)

p_embedding_MT <- plot_embedding(
    embedding = embedding_1195[, c(x_column, y_column)],
    color_values = embedding_1195 |>
        dplyr::left_join(cell_metadata) |>
        dplyr::pull(mt_percentage),
    label = paste(EMBEDDING_TITLE_PREFIX, "MT %", sep = "; "),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_sampling_site <- plot_embedding(
    embedding = embedding_1195[, c(x_column, y_column)],
    color_values = embedding_1195 |>
        dplyr::left_join(
            cell_metadata_PRJEB40781
        ) |>
        dplyr::pull(sampling_site) |>
        as.factor(),
    label = paste(EMBEDDING_TITLE_PREFIX, "Sampling site", sep = "; "),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE / 1.5,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED
) +
    theme_customized()
# Per-cluster QC table: cell count plus the medians of num_umis, num_features
# and mt_percentage for every Leiden cluster of embedding_1195.
embedding_1195 |>
    dplyr::left_join(
        cell_metadata |>
            dplyr::select(cell, num_umis:mt_percentage)
    ) |>
    dplyr::left_join(
        cell_metadata_PRJEB40781
    ) |>
    dplyr::group_by(
        leiden
    ) |>
    dplyr::summarise(
        num_cells = dplyr::n(),
        median_umis = median(num_umis),
        median_features = median(num_features),
        median_mt_percentage = median(mt_percentage)
    ) |>
    gt::gt() |>
    # "medium" is the CSS font-size keyword; the former value "median" is not
    # a valid size.
    gt::tab_options(table.font.size = "medium") |>
    # Summary row 1: total number of cells.
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    ) |>
    # Summary row 2: median of the per-cluster medians.
    gt::summary_rows(
        columns = c("median_umis", "median_features", "median_mt_percentage"),
        fns = list(
            Median = ~ median(.)
        ),
        decimals = 2
    )
leiden num_cells median_umis median_features median_mt_percentage
0 189 883508.0 2581.0 0.008930657
1 138 996283.5 4297.5 0.006288449
2 134 836131.0 4346.0 0.010772693
3 115 887650.0 4906.0 0.007057741
4 96 916361.0 4618.5 0.014087410
5 85 1110656.0 3667.0 0.005815362
6 81 763755.0 5965.0 0.008891377
7 79 938071.0 4924.0 0.006874891
8 79 746953.0 5211.0 0.008341437
9 72 1010636.5 4530.0 0.004510133
10 60 940026.0 3409.5 0.009846117
11 34 1028500.0 4321.5 0.005531307
12 33 728519.0 4344.0 0.014426759
Sum — 1,195 — — —
Median — — 916,361.00 4,346.00 0.01


purrr::reduce(
    list(
        p_embedding_leiden,
        p_embedding_sampling_site,
        p_embedding_UMI,
        p_embedding_MT
    ),
    `+`
) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )

Lineage

# Per-lineage QC table: cell count plus the medians of num_umis, num_features
# and mt_percentage for every lineage annotated in cell_metadata_PRJEB40781.
embedding_1195 |>
    dplyr::left_join(
        cell_metadata
    ) |>
    dplyr::left_join(cell_metadata_PRJEB40781) |>
    dplyr::group_by(lineage) |>
    dplyr::summarise(
        num_cells = dplyr::n(),
        median_umis = median(num_umis),
        median_features = median(num_features),
        median_mt_percentage = median(mt_percentage)
    ) |>
    gt::gt() |>
    # "medium" is the CSS font-size keyword; the former value "median" is not
    # a valid size.
    gt::tab_options(table.font.size = "medium") |>
    # Summary row 1: total number of cells.
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    ) |>
    # Summary row 2: median of the per-lineage medians.
    gt::summary_rows(
        columns = c("median_umis", "median_features", "median_mt_percentage"),
        fns = list(
            Median = ~ median(.)
        ),
        decimals = 2
    )
lineage num_cells median_umis median_features median_mt_percentage
advanced mesoderm 164 993741.0 4164.5 0.006218430
axial mesoderm 23 853681.0 4181.0 0.007745421
ectodermal cell 29 996476.0 4314.0 0.005143689
emergent mesoderm 185 911736.0 4184.0 0.007838085
endodermal cell 135 989383.0 4837.0 0.005348267
epiblast cell 133 790584.0 5607.0 0.009357085
erythrocyte 32 716602.0 4345.5 0.014388710
hemogenic endothelial progenitor 111 925287.0 4498.0 0.013978306
nascent mesoderm 98 787231.5 4479.0 0.008292312
primitive streak 202 826282.5 3781.0 0.010169492
yolk sac mesoderm 83 1124901.0 3778.0 0.006017290
Sum — 1,195 — — —
Median — — 911,736.00 4,314.00 0.01


p_embedding_lineage <- plot_embedding(
    embedding = embedding_1195[, c(x_column, y_column)],
    color_values = embedding_1195 |>
        dplyr::left_join(
            cell_metadata_PRJEB40781
        ) |>
        dplyr::pull(lineage) |>
        as.factor(),
    label = paste(EMBEDDING_TITLE_PREFIX, "Lineage", sep = "; "),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE / 1.5,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED
) +
    theme_customized()

# Lineage of every row of embedding_1195. The join is the same for all panels,
# so it is done once here rather than once per lineage.
lineage_of_cell_1195 <- embedding_1195 |>
    dplyr::left_join(cell_metadata_PRJEB40781) |>
    dplyr::pull(lineage)

# Overview panel followed by one panel per lineage in which the cells of that
# lineage are coded 1 (salmon) and all remaining cells 0 (grey).
list(
    p_embedding_lineage,
    purrr::map(levels(cell_metadata_PRJEB40781$lineage), \(lineage_level) {
        # %in% yields FALSE for cells without a lineage, so they are coded 0.
        is_selected <- lineage_of_cell_1195 %in% lineage_level
        n_selected <- sum(is_selected)

        plot_embedding(
            embedding = embedding_1195[, c(x_column, y_column)],
            color_values = factor(as.integer(is_selected)),
            label = glue::glue(
                "{EMBEDDING_TITLE_PREFIX}; {lineage_level}: {n_selected}"
            ),
            label_position = NULL,
            show_color_value_labels = FALSE,
            show_color_legend = FALSE,
            geom_point_size = GEOM_POINT_SIZE,
            sort_values = TRUE,
            rasterise = RASTERISED,
            legend_size = 2
        ) +
            theme_customized(
                legend_key_size = 2,
                legend_text_size = 5
            ) +
            scale_color_manual(
                values = c("grey70", "salmon")
            )
    })
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )

Composition

p_barplot_composition_sampling_site <- calc_group_composition(
    data = embedding_1195 |>
        dplyr::left_join(
            cell_metadata_PRJEB40781
        ),
    x = "leiden",
    group = "sampling_site"
) |>
    dplyr::mutate(
        leiden = factor(leiden)
    ) |>
    plot_barplot(
        x = "leiden",
        y = "percentage",
        z = "sampling_site",
        legend_ncol = 1
    )

p_barplot_composition_lineage <- calc_group_composition(
    data = embedding_1195 |>
        dplyr::left_join(
            cell_metadata_PRJEB40781
        ),
    x = "leiden",
    group = "lineage"
) |>
    dplyr::mutate(
        leiden = factor(leiden)
    ) |>
    plot_barplot(
        x = "leiden",
        y = "percentage",
        z = "lineage",
        legend_ncol = 1
    )

list(
    p_barplot_composition_sampling_site,
    p_barplot_composition_lineage
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 1, guides = "collect") +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )

Expression

FEATURES_SELECTED <- c(
    "ENSG00000204531_POU5F1",
    "ENSG00000111704_NANOG",
    "ENSG00000171872_KLF17",
    "ENSG00000186103_ARGFX",
    #
    "ENSG00000164736_SOX17",
    "ENSG00000125798_FOXA2",
    "ENSG00000136574_GATA4",
    "ENSG00000134853_PDGFRA",
    #
    "ENSG00000179348_GATA2",
    "ENSG00000070915_SLC12A3",
    "ENSG00000165556_CDX2",
    "ENSG00000007866_TEAD3"
)
# Normalise the cells of embedding_1195 once. The result does not depend on
# the feature being plotted, so recomputing it for each feature was pure
# overhead.
# NOTE(review): assumes calc_cpm() (not defined in this block) depends only on
# its input matrix -- confirm.
matrix_cpm_1195 <- calc_cpm(matrix_readcount_use[, embedding_1195$cell])

# One embedding panel per selected feature, coloured by log10(CPM + 1).
purrr::map(FEATURES_SELECTED, \(selected_feature) {
    cat(selected_feature, "\n")
    values <- log10(matrix_cpm_1195[selected_feature, ] + 1)

    plot_embedding(
        embedding = embedding_1195[, c(x_column, y_column)],
        color_values = values,
        label = paste(
            EMBEDDING_TITLE_PREFIX,
            # Strip the leading "E..._" part of the feature id for the title.
            selected_feature |> stringr::str_remove(pattern = "^E.+_"),
            sep = "; "
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        scale_color_viridis_c(
            na.value = "grey80"
        ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 3, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )
## ENSG00000204531_POU5F1 
## ENSG00000111704_NANOG 
## ENSG00000171872_KLF17 
## ENSG00000186103_ARGFX 
## ENSG00000164736_SOX17 
## ENSG00000125798_FOXA2 
## ENSG00000136574_GATA4 
## ENSG00000134853_PDGFRA 
## ENSG00000179348_GATA2 
## ENSG00000070915_SLC12A3 
## ENSG00000165556_CDX2 
## ENSG00000007866_TEAD3

Integration with other datasets

EMBEDDING_FILE <- glue::glue(
    "embedding_ncomponents18_ccc1_seed20210719.csv.gz"
)

embedding <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJEB40781",
        "clustering",
        "PRJEB40781_PRJEB11202_PRJNA555602",
        "exploring",
        "Scanpy_Harmony",
        EMBEDDING_FILE
    )
) |>
    dplyr::mutate(
        study = dplyr::case_when(
            batch %in% c("GSM3956280", "GSM3956281") ~ "PRJNA555602",
            TRUE ~ batch
        ),
        study = factor(study)
    )

embedding |> head()
# Lookup table from BioProject accession to a short citation. Each accession
# appears exactly once so that the names of the vector built below are unique.
studies <- tibble::tribble(
    ~bioproject, ~citation,
    "PRJEB11202", "Petropoulos et al. 2016",
    "PRJEB40781", "Tyser et al. 2021",
    "PRJNA555602", "Zheng et al. 2019",
    "PRJNA431392", "Zhou et al. 2019",
    "PRJNA737139", "Kagawa et al. 2021",
    "PRJNA632839", "Yu et al. 2021",
    "PRJNA658478", "Liu et al. 2021",
    "PRJNA720968", "Yanagida et al. 2021",
    "PRJNA667174", "Fan et al. 2021",
    "PRJNA738498", "Sozen et al. 2021"
)

# Named character vector: names are accessions, values are citations.
# Index it with character keys; a factor index is interpreted as integer
# codes, i.e. as positions in this vector.
studies <- setNames(
    object = studies$citation,
    nm = studies$bioproject
)
# Number of cells contributed by each study, labelled with its citation.
embedding |>
    dplyr::count(study, name = "num_cells") |>
    dplyr::mutate(
        # `study` is a factor: convert it to character so the citation is
        # looked up by accession name rather than by the factor's integer code.
        study = studies[as.character(study)]
    ) |>
    gt::gt() |>
    # "medium" is the CSS font-size keyword; the former value "median" is not
    # a valid size.
    gt::tab_options(table.font.size = "medium") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )
study num_cells
Petropoulos et al. 2016 1529
Tyser et al. 2021 1195
Zheng et al. 2019 9966
Sum — 12,690

Metadata

# PRJEB11202; Petropoulos et al. 2016
# Load the per-cell metadata and reduce it to three columns:
# cell, lineage and developmental_stage.
cell_metadata_PRJEB11202 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJEB11202",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        # Stage label = `individual` with the first "." and everything after
        # it removed.
        developmental_stage = stringr::str_remove(
            string = individual,
            pattern = "\\..+$"
        ),
        # Order the stage levels with numeric-aware sorting.
        developmental_stage = factor(
            developmental_stage,
            levels = stringr::str_sort(
                unique(developmental_stage),
                numeric = TRUE
            )
        ),
        #
        # Fixed level order for the lineage; values of `inferred_lineage`
        # outside these four levels become NA.
        lineage = factor(
            inferred_lineage,
            levels = c(
                "epiblast",
                "primitive_endoderm",
                "trophectoderm",
                "not_applicable"
            )
        )
    ) |>
    dplyr::select(
        cell, lineage, developmental_stage
    )

# PRJNA555602; Zheng et al. 2019
# Load the per-cell metadata and add two factor columns: `lineage` (derived
# from the cluster id in `rna_snn_res_0.3`) and `source` (derived from
# `orig_ident`). All original columns are kept.
cell_metadata_PRJNA555602 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA555602",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        # Cluster id -> lineage label. There is no fallback branch, so any
        # other cluster id yields NA.
        lineage = dplyr::case_when(
            # amniotic ectoderm-like cells
            rna_snn_res_0.3 == 0 ~ "Transwell-AMLC",
            # mesoderm-like cell 2
            rna_snn_res_0.3 == 1 ~ "MeLC2",
            rna_snn_res_0.3 == 2 ~ "Human ES cell",
            # mesoderm-like cell 1
            rna_snn_res_0.3 == 3 ~ "MeLC1",
            # human PGC-like cells
            rna_snn_res_0.3 == 4 ~ "hPGCLC",
            rna_snn_res_0.3 == 5 ~ "AMLC"
        ),
        # Explicit level order for the lineage factor.
        lineage = factor(
            lineage,
            levels = c(
                "Human ES cell",
                "hPGCLC",
                "MeLC1",
                "MeLC2",
                "AMLC",
                "Transwell-AMLC"
            )
        ),
        # Shorter sample labels; any other `orig_ident` value yields NA.
        source = dplyr::case_when(
            orig_ident == "10X_Embryoid" ~ "Embryoid",
            orig_ident == "10X_H9_Amnion" ~ "H9_Amnion"
        ),
        source = factor(source)
    )

Visualization

Clustering

x_column <- "x_pacmap"
y_column <- "y_pacmap"
EMBEDDING_TITLE_PREFIX <- "PaCMAP"
GEOM_POINT_SIZE <- 0.4

p_embedding_leiden <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$leiden |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Leiden"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

p_embedding_UMI <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = log10(Matrix::colSums(matrix_readcount_use[, embedding$cell])),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_MT <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(
            cell_metadata # |> dplyr::select(-batch)
        ) |>
        dplyr::pull(mt_percentage),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; MT %"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

# Integrated embedding coloured by study of origin.
p_embedding_study <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$study,
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Study"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE / 2,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        # One colour per study. The palette was previously sized by `batch`,
        # which is not the variable being coloured.
        values = scales::hue_pal()(n = nlevels(embedding$study)),
        # Named vector: legend entries show the citation for each accession.
        labels = studies
    )
# Per-cluster QC table of the integrated embedding: cell count plus the
# medians of num_umis, num_features and mt_percentage per Leiden cluster.
embedding |>
    dplyr::left_join(
        cell_metadata |>
            dplyr::select(cell, num_umis:mt_percentage)
    ) |>
    dplyr::group_by(
        leiden
    ) |>
    dplyr::summarise(
        num_cells = dplyr::n(),
        median_umis = median(num_umis),
        median_features = median(num_features),
        median_mt_percentage = median(mt_percentage)
    ) |>
    gt::gt() |>
    # "medium" is the CSS font-size keyword; the former value "median" is not
    # a valid size.
    gt::tab_options(table.font.size = "medium") |>
    # Summary row 1: total number of cells.
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    ) |>
    # Summary row 2: median of the per-cluster medians.
    gt::summary_rows(
        columns = c("median_umis", "median_features", "median_mt_percentage"),
        fns = list(
            Median = ~ median(.)
        ),
        decimals = 2
    )
leiden num_cells median_umis median_features median_mt_percentage
0 1179 27676.0 5089.0 0.025830026
1 1156 21609.5 4597.0 0.033986369
2 1141 22175.0 4243.0 0.043475207
3 1124 28028.0 5113.0 0.043416238
4 969 23013.0 4643.0 0.026599176
5 892 28762.0 4966.5 0.034163936
6 868 29193.5 5040.0 0.025966430
7 826 23003.0 4499.5 0.030224185
8 797 22893.0 4362.0 0.041594258
9 647 1348781.0 9991.0 0.008139240
10 602 861400.5 4880.5 0.008097055
11 472 1787005.5 10772.5 0.006137011
12 434 24493.5 4661.0 0.035983814
13 345 31267.0 5475.0 0.039519924
14 324 1607461.5 10278.0 0.007017669
15 235 26117.0 4854.0 0.047443806
16 228 881789.5 2682.5 0.009192114
17 178 1092848.0 3854.0 0.005902158
18 131 883583.0 4508.0 0.014213556
19 84 1570985.0 10117.5 0.007988063
20 58 958118.0 4789.5 0.004154967
Sum — 12,690 — — —
Median — — 29,193.50 4,854.00 0.03


purrr::reduce(
    list(
        p_embedding_leiden,
        p_embedding_study,
        p_embedding_UMI,
        p_embedding_MT
    ),
    `+`
) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )

# One panel per study with the cells of that study coded 1 (salmon) and all
# other cells coded 0 (grey).
# The study ids are converted to character first: iterating over the factor
# itself and using its elements to index `studies` would look citations up by
# integer code (position) instead of by accession name.
purrr::map(as.character(unique(embedding$study)), \(study_id) {
    is_selected <- embedding$study == study_id
    n_selected <- sum(is_selected, na.rm = TRUE)

    plot_embedding(
        embedding = embedding[, c(x_column, y_column)],
        color_values = as.factor(as.integer(is_selected)),
        label = glue::glue(
            "{EMBEDDING_TITLE_PREFIX}; {studies[study_id]}, {n_selected}"
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = FALSE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        ) +
        scale_color_manual(
            values = c("grey70", "salmon")
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

# Per-study QC table: platform, cell count and the medians of num_umis,
# num_features and mt_percentage.
embedding |>
    dplyr::left_join(
        cell_metadata |>
            dplyr::select(cell, num_umis:mt_percentage)
    ) |>
    dplyr::group_by(
        study
    ) |>
    dplyr::summarise(
        num_cells = dplyr::n(),
        median_umis = median(num_umis),
        median_features = median(num_features),
        median_mt_percentage = median(mt_percentage)
    ) |>
    dplyr::mutate(
        # Look the platform up by accession so the assignment does not depend
        # on the row order produced by summarise().
        platform = unname(
            c(
                PRJEB11202 = "Smart-Seq2",
                PRJEB40781 = "Smart-Seq2",
                PRJNA555602 = "10x Genomics"
            )[as.character(study)]
        ),
        # `study` is a factor: index the citation lookup by name, not by the
        # factor's integer code.
        study = studies[as.character(study)]
    ) |>
    dplyr::select(
        study, platform, everything()
    ) |>
    gt::gt() |>
    # "medium" is the CSS font-size keyword; the former value "median" is not
    # a valid size.
    gt::tab_options(table.font.size = "medium") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c("median_umis", "median_features", "median_mt_percentage"),
        fns = list(
            Median = ~ median(.)
        ),
        decimals = 2
    )
study platform num_cells median_umis median_features median_mt_percentage
Petropoulos et al. 2016 Smart-Seq2 1529 1551093 10305 0.007144702
Tyser et al. 2021 Smart-Seq2 1195 899241 4379 0.008164051
Zheng et al. 2019 10x Genomics 9966 25264 4772 0.034485263
Sum — — 12,690 — — —
Median — — — 899,241.00 4,772.00 0.01


Cell annotation

Petropoulos et al. 2016

bioproject <- "PRJEB11202"
cell_metadata_selected <- cell_metadata_PRJEB11202

# The panels of this section differ only in the metadata column used for
# colouring, so the shared plot_embedding() call lives in one helper.
# The helper reads `embedding`, `cell_metadata_selected`, `bioproject`,
# `studies`, `x_column`, `y_column`, `EMBEDDING_TITLE_PREFIX`,
# `GEOM_POINT_SIZE` and `RASTERISED` from the global environment at call time.
#
# column:         name (string) of the column of cell_metadata_selected.
# sort_values,
# shuffle_values: passed on to plot_embedding().
# na_value:       colour of cells without a value in `column`.
plot_embedding_annotation <- function(column,
                                      sort_values,
                                      shuffle_values,
                                      na_value = "#7F7F7F") {
    # Panel title suffix, e.g. "developmental_stage" -> "Developmental stage".
    column_title <- column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = "_", replacement = " ")

    plot_embedding(
        embedding = embedding[, c(x_column, y_column)],
        color_values = embedding |>
            dplyr::left_join(cell_metadata_selected) |>
            dplyr::pull(.data[[column]]),
        label = glue::glue(
            "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; {column_title}"
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = sort_values,
        shuffle_values = shuffle_values,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        ) +
        scale_color_manual(
            na.translate = TRUE,
            values = scales::hue_pal()(
                n = length(unique(cell_metadata_selected[[column]]))
            ),
            na.value = na_value
        )
}

selected_column <- "developmental_stage"
p_embedding_developmental_stage <- plot_embedding_annotation(
    column = selected_column,
    sort_values = TRUE,
    shuffle_values = FALSE
)

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding_annotation(
    column = selected_column,
    sort_values = TRUE,
    shuffle_values = FALSE
)

list(
    p_embedding_lineage,
    p_embedding_developmental_stage
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )


Salmon: highlighted group of cells; Light grey: cells belonging to this dataset but not the highlighted group; Dark grey: cells belonging to other datasets.

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "lineage",
    label = studies[bioproject],
    n_cols = 2
)

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "developmental_stage",
    label = studies[bioproject],
    n_cols = 2
)

Zheng et al. 2019

bioproject <- "PRJNA555602"
cell_metadata_selected <- cell_metadata_PRJNA555602

# The panels of this section differ only in the metadata column used for
# colouring, so the shared plot_embedding() call lives in one helper.
# The helper reads `embedding`, `cell_metadata_selected`, `bioproject`,
# `studies`, `x_column`, `y_column`, `EMBEDDING_TITLE_PREFIX`,
# `GEOM_POINT_SIZE` and `RASTERISED` from the global environment at call time.
#
# column:         name (string) of the column of cell_metadata_selected.
# sort_values,
# shuffle_values: passed on to plot_embedding().
# na_value:       colour of cells without a value in `column`.
plot_embedding_annotation <- function(column,
                                      sort_values,
                                      shuffle_values,
                                      na_value = "#7F7F7F") {
    # Panel title suffix, e.g. "source" -> "Source".
    column_title <- column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = "_", replacement = " ")

    plot_embedding(
        embedding = embedding[, c(x_column, y_column)],
        color_values = embedding |>
            dplyr::left_join(cell_metadata_selected) |>
            dplyr::pull(.data[[column]]),
        label = glue::glue(
            "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; {column_title}"
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = sort_values,
        shuffle_values = shuffle_values,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        ) +
        scale_color_manual(
            na.translate = TRUE,
            values = scales::hue_pal()(
                n = length(unique(cell_metadata_selected[[column]]))
            ),
            na.value = na_value
        )
}

# NOTE: the variable keeps its original name, but in this section it holds
# the panel coloured by `source`, not by developmental stage.
selected_column <- "source"
p_embedding_developmental_stage <- plot_embedding_annotation(
    column = selected_column,
    sort_values = FALSE,
    shuffle_values = TRUE
)

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding_annotation(
    column = selected_column,
    sort_values = FALSE,
    shuffle_values = TRUE
)

list(
    p_embedding_lineage,
    p_embedding_developmental_stage
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Tyser et al. 2021

bioproject <- "PRJEB40781"
cell_metadata_selected <- cell_metadata_PRJEB40781

# The panels of this section differ only in the metadata column used for
# colouring, so the shared plot_embedding() call lives in one helper.
# The helper reads `embedding`, `cell_metadata_selected`, `bioproject`,
# `studies`, `x_column`, `y_column`, `EMBEDDING_TITLE_PREFIX`,
# `GEOM_POINT_SIZE` and `RASTERISED` from the global environment at call time.
#
# column:         name (string) of the column of cell_metadata_selected.
# sort_values,
# shuffle_values: passed on to plot_embedding().
# na_value:       colour of cells without a value in `column`.
plot_embedding_annotation <- function(column,
                                      sort_values,
                                      shuffle_values,
                                      na_value = "#7F7F7F") {
    # Panel title suffix, e.g. "sampling_site" -> "Sampling site".
    column_title <- column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = "_", replacement = " ")

    plot_embedding(
        embedding = embedding[, c(x_column, y_column)],
        color_values = embedding |>
            dplyr::left_join(cell_metadata_selected) |>
            dplyr::pull(.data[[column]]),
        label = glue::glue(
            "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; {column_title}"
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = sort_values,
        shuffle_values = shuffle_values,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        ) +
        scale_color_manual(
            na.translate = TRUE,
            values = scales::hue_pal()(
                n = length(unique(cell_metadata_selected[[column]]))
            ),
            na.value = na_value
        )
}

# NOTE(review): this panel draws unannotated cells in "grey70", whereas the
# lineage panel uses "#7F7F7F". Kept as is -- confirm whether the difference
# is intended.
selected_column <- "sampling_site"
p_embedding_sampling_site <- plot_embedding_annotation(
    column = selected_column,
    sort_values = TRUE,
    shuffle_values = FALSE,
    na_value = "grey70"
)

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding_annotation(
    column = selected_column,
    sort_values = TRUE,
    shuffle_values = FALSE
)

list(
    p_embedding_lineage,
    p_embedding_sampling_site
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "lineage",
    label = studies[bioproject],
    n_cols = 2
)


R session info

devtools::session_info()$platform
##  setting  value
##  version  R version 4.1.2 (2021-11-01)
##  os       macOS Monterey 12.1
##  system   aarch64, darwin20.6.0
##  ui       unknown
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/Chicago
##  date     2022-01-19
##  pandoc   2.14.0.3 @ /Applications/RStudio.app/Contents/MacOS/pandoc/ (via rmarkdown)
# Table of the packages reported by session_info(): name, loaded version,
# date and source.
# `$packages` is spelled out in full; `$pack` only worked through partial
# matching of list element names.
devtools::session_info()$packages |>
    as_tibble() |>
    dplyr::select(
        package,
        loadedversion,
        date,
        `source`
    ) |>
    # print(n = nrow(.))
    gt::gt() |>
    # "medium" is the CSS font-size keyword; the former value "median" is not
    # a valid size.
    gt::tab_options(table.font.size = "medium")
package loadedversion date source
assertthat 0.2.1 2019-03-21 CRAN (R 4.1.1)
backports 1.4.1 2021-12-13 CRAN (R 4.1.2)
beeswarm 0.4.0 2021-06-01 CRAN (R 4.1.2)
bit 4.0.4 2020-08-04 CRAN (R 4.1.1)
bit64 4.0.5 2020-08-30 CRAN (R 4.1.1)
brio 1.1.3 2021-11-30 CRAN (R 4.1.2)
broom 0.7.11 2022-01-03 CRAN (R 4.1.2)
bslib 0.3.1 2021-10-06 CRAN (R 4.1.1)
cachem 1.0.6 2021-08-19 CRAN (R 4.1.1)
callr 3.7.0 2021-04-20 CRAN (R 4.1.1)
cellranger 1.1.0 2016-07-27 CRAN (R 4.1.1)
checkmate 2.0.0 2020-02-06 CRAN (R 4.1.1)
cli 3.1.0 2021-10-27 CRAN (R 4.1.1)
codetools 0.2-18 2020-11-04 CRAN (R 4.1.2)
colorspace 2.0-2 2021-06-24 CRAN (R 4.1.1)
crayon 1.4.2 2021-10-29 CRAN (R 4.1.1)
data.table 1.14.2 2021-09-27 CRAN (R 4.1.1)
DBI 1.1.2 2021-12-20 CRAN (R 4.1.2)
dbplyr 2.1.1 2021-04-06 CRAN (R 4.1.1)
desc 1.4.0 2021-09-28 CRAN (R 4.1.1)
devtools 2.4.3.9000 2022-01-15 Github (r-lib/devtools@e2f25cd69031c8d2099106baed894df4109cb7a4)
digest 0.6.29 2021-12-01 CRAN (R 4.1.2)
dplyr 1.0.7.9000 2022-01-12 Github (tidyverse/dplyr@05013358ace44fe17a51395d49d384232d18d6c1)
dtplyr 1.2.0 2021-12-05 CRAN (R 4.1.2)
ellipsis 0.3.2 2021-04-29 CRAN (R 4.1.1)
evaluate 0.14 2019-05-28 CRAN (R 4.1.1)
extrafont 0.17 2014-12-08 CRAN (R 4.1.1)
extrafontdb 1.0 2012-06-11 CRAN (R 4.1.1)
fansi 1.0.2 2022-01-14 CRAN (R 4.1.2)
farver 2.1.0 2021-02-28 CRAN (R 4.1.1)
fastmap 1.1.0 2021-01-25 CRAN (R 4.1.1)
forcats 0.5.1.9000 2021-11-29 Github (tidyverse/forcats@b4dade0636a46543c30b0b647d97c3ce697c0ada)
fs 1.5.2.9000 2021-12-09 Github (r-lib/fs@6d1182fea7e1c1ddbef3b0bba37c0b0a2e09749c)
gargle 1.2.0 2021-07-02 CRAN (R 4.1.1)
generics 0.1.1 2021-10-25 CRAN (R 4.1.1)
ggbeeswarm 0.6.0 2017-08-07 CRAN (R 4.1.2)
ggplot2 3.3.5 2021-06-25 CRAN (R 4.1.1)
ggrastr 1.0.1 2021-12-08 Github (VPetukhov/ggrastr@7aed9af2b9cffabda86e6d2af2fa10d4e60cc63d)
glue 1.6.0.9000 2021-12-21 Github (tidyverse/glue@76793ef2c376140350c0e1909e66fd404a52b1ef)
googledrive 2.0.0 2021-07-08 CRAN (R 4.1.1)
googlesheets4 1.0.0 2021-07-21 CRAN (R 4.1.1)
gt 0.3.1.9000 2022-01-17 Github (rstudio/gt@fcabb414c55b70c9e445fbedfb24d52fe394ba61)
gtable 0.3.0.9000 2021-10-28 Github (r-lib/gtable@a0bd2721a0a31c8b4391b84aabe98f8c85881140)
haven 2.4.3 2021-08-04 CRAN (R 4.1.1)
highr 0.9 2021-04-16 CRAN (R 4.1.1)
hms 1.1.1 2021-09-26 CRAN (R 4.1.1)
htmltools 0.5.2 2021-08-25 CRAN (R 4.1.1)
httr 1.4.2 2020-07-20 CRAN (R 4.1.1)
jquerylib 0.1.4 2021-04-26 CRAN (R 4.1.1)
jsonlite 1.7.3 2022-01-17 CRAN (R 4.1.2)
knitr 1.37.1 2021-12-21 https://yihui.r-universe.dev (R 4.1.2)
labeling 0.4.2 2020-10-20 CRAN (R 4.1.1)
lattice 0.20-45 2021-09-22 CRAN (R 4.1.2)
lifecycle 1.0.1 2021-09-24 CRAN (R 4.1.1)
lubridate 1.8.0 2022-01-15 Github (tidyverse/lubridate@53e5892a548b3425d6c3bf887542aa105341ab73)
magrittr 2.0.1 2020-11-17 CRAN (R 4.1.1)
Matrix 1.4-0 2021-12-08 CRAN (R 4.1.2)
memoise 2.0.1 2021-11-26 CRAN (R 4.1.2)
modelr 0.1.8.9000 2021-10-27 Github (tidyverse/modelr@16168e0624215d9d1a008f3a85de30aeb75302f6)
munsell 0.5.0 2018-06-12 CRAN (R 4.1.1)
patchwork 1.1.0.9000 2021-10-27 Github (thomasp85/patchwork@79223d3002e7bd7e715a270685c6507d684b2622)
pillar 1.6.4 2021-10-18 CRAN (R 4.1.1)
pkgbuild 1.3.1 2021-12-20 CRAN (R 4.1.2)
pkgconfig 2.0.3 2019-09-22 CRAN (R 4.1.1)
pkgload 1.2.4 2021-11-30 CRAN (R 4.1.2)
png 0.1-7 2013-12-03 CRAN (R 4.1.1)
prettyunits 1.1.1 2020-01-24 CRAN (R 4.1.1)
processx 3.5.2 2021-04-30 CRAN (R 4.1.1)
ps 1.6.0 2021-02-28 CRAN (R 4.1.1)
purrr 0.3.4 2020-04-17 CRAN (R 4.1.1)
R.cache 0.15.0 2021-04-30 CRAN (R 4.1.1)
R.methodsS3 1.8.1 2020-08-26 CRAN (R 4.1.1)
R.oo 1.24.0 2020-08-26 CRAN (R 4.1.1)
R.utils 2.11.0 2021-09-26 CRAN (R 4.1.1)
R6 2.5.1.9000 2021-12-09 Github (r-lib/R6@1b05b89f30fe6713cb9ff51d91fc56bd3016e4b2)
ragg 1.2.1.9000 2021-12-08 Github (r-lib/ragg@c68c6665ef894f16c006333658b32bf25d2e9d19)
Rcpp 1.0.8 2022-01-13 CRAN (R 4.1.2)
readr 2.1.1 2021-11-30 CRAN (R 4.1.2)
readxl 1.3.1.9000 2022-01-18 Github (tidyverse/readxl@03258a3b2341ce600ee0af56851c80c35d6245ef)
remotes 2.4.2 2021-12-02 Github (r-lib/remotes@fcad17b68b7a19d5363d64adfb0a0426a3a5b3db)
reprex 2.0.1 2021-08-05 CRAN (R 4.1.1)
reticulate 1.23 2022-01-14 CRAN (R 4.1.2)
rlang 0.99.0.9003 2022-01-18 Github (r-lib/rlang@d79ab3a1ab1ce8ca5bb0ebc6ab0454cb10fa4dd1)
rmarkdown 2.11.9 2022-01-18 Github (rstudio/rmarkdown@d0d3b08bf78b6cd900d0505fb7141037e117c6b2)
rprojroot 2.0.2 2020-11-15 CRAN (R 4.1.1)
rstudioapi 0.13.0-9000 2022-01-15 Github (rstudio/rstudioapi@5d0f0873dc160779c71bf4b00d8b016b898f6fb5)
Rttf2pt1 1.3.9 2021-07-22 CRAN (R 4.1.1)
rvest 1.0.2 2021-10-16 CRAN (R 4.1.1)
sass 0.4.0 2021-05-12 CRAN (R 4.1.1)
scales 1.1.1 2020-05-11 CRAN (R 4.1.1)
sessioninfo 1.2.2 2021-12-06 CRAN (R 4.1.2)
stringi 1.7.6 2021-11-29 CRAN (R 4.1.2)
stringr 1.4.0.9000 2022-01-17 Github (tidyverse/stringr@3848cd70b1e331e6c20401e4da518ff4c3725324)
styler 1.6.2.9000 2022-01-17 Github (r-lib/styler@9274aed613282eca01909ae8c341224055d9c928)
systemfonts 1.0.3.9000 2021-12-07 Github (r-lib/systemfonts@414114e645efb316def3d8de1056d855f92d588e)
testthat 3.1.1.9000 2022-01-13 Github (r-lib/testthat@f09df60dd881530332b252474e9f35c97f8640be)
textshaping 0.3.6 2021-10-13 CRAN (R 4.1.1)
tibble 3.1.6.9000 2022-01-18 Github (tidyverse/tibble@7aa54e67d6ceb31c81172c7d18d28ea9ce088888)
tidyr 1.1.4 2021-09-27 CRAN (R 4.1.1)
tidyselect 1.1.1 2021-04-30 CRAN (R 4.1.1)
tidyverse 1.3.1.9000 2021-12-08 Github (tidyverse/tidyverse@6186fbf09bf359110f8800ff989cbbdd40485eb0)
tzdb 0.2.0 2021-10-27 CRAN (R 4.1.1)
usethis 2.1.5.9000 2022-01-18 Github (r-lib/usethis@3c4ab669481ab4a11b6426dbc583f05077a4c6db)
utf8 1.2.2 2021-07-24 CRAN (R 4.1.1)
vctrs 0.3.8 2021-04-29 CRAN (R 4.1.1)
vipor 0.4.5 2017-03-22 CRAN (R 4.1.2)
viridisLite 0.4.0 2021-04-13 CRAN (R 4.1.1)
vroom 1.5.7 2021-11-30 CRAN (R 4.1.2)
withr 2.4.3 2021-11-30 CRAN (R 4.1.2)
xfun 0.29 2021-12-14 CRAN (R 4.1.2)
xml2 1.3.3 2021-11-30 CRAN (R 4.1.2)
yaml 2.2.1 2020-02-01 CRAN (R 4.1.1)