Single-cell transcriptomes of human blastoids generated by Kagawa et al., 2021

Jialei Duan

2022-01-24



Kagawa, H., Javali, A., Khoei, H.H., Sommer, T.M., Sestini, G., Novatchkova, M., Scholte Op Reimer, Y., Castel, G., Bruneau, A., Maenhoudt, N., et al. (2021). Human blastoids model blastocyst development and implantation. Nature 1–9.

  • BioProject Accession: PRJNA737139
  • GEO Accession: GSE177689



Load required packages.

[1] "2022-01-24 22:16:17 CST"

Data preparation

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)
plot_embedding_highlight <- function(embedding, x, y, label, n_cols = 3) {
    cell_metadata_selected <- x
    selected_column <- y

    purrr::map(levels(cell_metadata_selected[[selected_column]]), \(x) {
        values <- embedding |>
            dplyr::left_join(cell_metadata_selected) |>
            dplyr::mutate(
                value = case_when(
                    .data[[selected_column]] == x ~ "1",
                    .data[[selected_column]] != x ~ "0"
                )
            ) |>
            dplyr::pull(value) |>
            as.integer() |>
            as.factor()

        plot_embedding(
            embedding = embedding[, c(x_column, y_column)],
            color_values = values,
            label = glue::glue(
                "{label}; ",
                "{x}: {sum(as.integer(as.character(values)), na.rm = TRUE)}"
            ),
            label_position = NULL,
            show_color_value_labels = FALSE,
            show_color_legend = FALSE,
            geom_point_size = GEOM_POINT_SIZE,
            sort_values = TRUE,
            shuffle_values = FALSE,
            rasterise = RASTERISED,
            legend_size = 2
        ) +
            theme_customized(
                legend_key_size = 2,
                legend_text_size = 5
            ) +
            scale_color_manual(
                na.translate = TRUE,
                values = c("grey70", "salmon"),
                na.value = "#7F7F7F"
            ) +
            ggplot2::annotate(
                geom = "text",
                x = Inf,
                y = Inf,
                label = sum(as.integer(as.character(values)), na.rm = TRUE),
                size = 5 / ggplot2::.pt,
                hjust = 1,
                vjust = 1,
                na.rm = FALSE
            )
    }) |>
        purrr::reduce(`+`) +
        patchwork::plot_layout(ncol = n_cols) +
        patchwork::plot_annotation(
            theme = ggplot2::theme(plot.margin = ggplot2::margin())
        )
}

Data loading

PROJECT_DIR <- "/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation"

Matrix


Raw fastq files of these datasets were downloaded and re-processed as described in Yu et al. 2021 to minimize platform and processing differences.

ad <- reticulate::import(module = "anndata", convert = TRUE)
print(ad$`__version__`)
[1] "0.7.6"
adata_files <- purrr::map(c("PRJNA737139", 
                            "PRJEB11202", 
                            "PRJNA431392", 
                            "PRJEB40781"), \(x) {
    file.path(
        PROJECT_DIR,
        "raw",
        "public",
        x,
        "matrix",
        "adata.h5ad"
    )
})
purrr::map_lgl(adata_files, file.exists)
[1] TRUE TRUE TRUE TRUE
BACKED <- NULL
matrix_readcount_use <- purrr::map(adata_files, function(x) {
    ad$read_h5ad(
        filename = x, backed = BACKED
    ) |>
        convert_adata()
}) |>
    purrr::reduce(cbind)
dim(matrix_readcount_use)
[1] 33538 10702

Metadata

BACKED <- "r"
cell_metadata <- purrr::map(adata_files, function(x) {
    ad$read_h5ad(
        filename = x, backed = BACKED
    )$obs |>
        tibble::rownames_to_column(var = "cell") |>
        dplyr::select(cell, everything())
}) |>
    dplyr::bind_rows()

cell_metadata |> head() |> knitr::kable()
cell batch num_umis num_features mt_percentage
GSM5375527 PRJNA737139 2550015 7220 0.0446268
GSM5375528 PRJNA737139 1571718 7112 0.0408521
GSM5375529 PRJNA737139 1508237 5800 0.0472021
GSM5375530 PRJNA737139 2536507 7031 0.0456332
GSM5375531 PRJNA737139 2302943 5056 0.0269316
GSM5375532 PRJNA737139 2260431 5769 0.0238485

Check memory usage.

purrr::walk(list(matrix_readcount_use, cell_metadata), \(x) {
    print(object.size(x), units = "auto", standard = "SI")
})
922.6 MB
1.1 MB

Clustering w/ PSCs and TSCs

Cell metadata

# Kagawa et al. 2021
cell_metadata_PRJNA737139 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA737139",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        lineage = factor(
            lineage,
            levels = c(
                "Naive H9",
                "Primed H9",
                "TSCs",
                "Blastoid_24h",
                "Blastoid_60h",
                "Blastoid_96h"
            )
        ),
        source_name = factor(
            source_name
        )
    )

cell_metadata_PRJNA737139 |> head() |> knitr::kable()
cell run sampleid developmental_stage source_name nFeature_RNA percent.mt samplename treatment blastoid.Fig2b.lowres blastoid.FigS3a.highres blastoid.FigS2g.PZT.integration blastoid.FigS3g.T.integration lineage
GSM5375527 SRR14795911 150182 E8 stem cells 7858 6.808364 primed_H9-150182 E8 4 9 5 2 Primed H9
GSM5375528 SRR14795912 150183 E8 stem cells 7607 6.181098 primed_H9-150183 E8 4 9 5 2 Primed H9
GSM5375529 SRR14795913 150185 E8 stem cells 6161 7.011326 primed_H9-150185 E8 4 9 5 2 Primed H9
GSM5375530 SRR14795914 150187 E8 stem cells 7574 7.557354 primed_H9-150187 E8 4 9 5 2 Primed H9
GSM5375531 SRR14795915 150188 E8 stem cells 5386 4.021456 primed_H9-150188 E8 4 9 2 8 Primed H9
GSM5375532 SRR14795916 150189 E8 stem cells 6260 3.559934 primed_H9-150189 E8 4 9 5 2 Primed H9


The expression matrix was re-generated from raw fastq files. To make this analysis comparable with the original publication, we retrieved the cell ids from the deposited dataset.


Retrieve the cell metadata from Gene Expression Omnibus.

cell_metadata_PRJNA737139_author <- vroom::vroom(
    file = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE177nnn/GSE177689/suppl/GSE177689_blastoid.H9.okae_bts5__scRNAseq.unfiltered__metadata.tsv.gz"
)
Rows: 2715 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): samplename, treatment
dbl (7): sampleid, nFeature_RNA, percent.mt, blastoid.Fig2b.lowres, blastoid...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cell_metadata_PRJNA737139_author |> head() |> knitr::kable()
sampleid nFeature_RNA percent.mt samplename treatment blastoid.Fig2b.lowres blastoid.FigS3a.highres blastoid.FigS2g.PZT.integration blastoid.FigS3g.T.integration
150567 6697 5.502513 blastoid_24h-150567 PALLY 0 3 NA NA
150568 5773 2.771147 blastoid_24h-150568 PALLY 0 1 NA NA
150569 3996 1.933010 blastoid_24h-150569 PALLY 0 1 NA NA
150570 0 NA blastoid_24h-150570 PALLY NA NA NA NA
150571 6380 5.795510 blastoid_24h-150571 PALLY 0 1 NA NA
150572 5089 2.590785 blastoid_24h-150572 PALLY 0 1 NA NA
num_cells_used_in_Fig2b <- cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        !is.na(blastoid.Fig2b.lowres)
    ) |>
    nrow()

print(
    glue::glue(
        "Total number of cells deposited: ",
        "{nrow(cell_metadata_PRJNA737139_author)}"
    ),
    glue::glue(
        "Number of cells used in Fig. 2b: ",
        "{num_cells_used_in_Fig2b}"
    )
)
Total number of cells deposited: 2715
Number of cells used in Fig. 2b: 2067
num_cells_deposited_blastoid <- cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        stringr::str_starts(string = treatment, pattern = "PALLY"),
    ) |>
    nrow()

num_cells_used_in_Fig2b_blastoid <- cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        stringr::str_starts(string = treatment, pattern = "PALLY"),
        !is.na(blastoid.Fig2b.lowres)
    ) |>
    nrow()

print(
    glue::glue(
        "Total number of blastoid cells deposited: ",
        "{num_cells_deposited_blastoid}"
    ),
    glue::glue(
        "Number of blastoid cells used in Fig. 2b: ",
        "{num_cells_used_in_Fig2b_blastoid}"
    )
)
Total number of blastoid cells deposited: 2311
Number of blastoid cells used in Fig. 2b: 1672

2067 out of total 2715 deposited cells (73.1%), and 1672 out of 2311 deposited blastoid cells (72.3%) are used in Fig. 2a-d.

Visualization

Embedding

EMBEDDING_FILE <- glue::glue(
    "embedding_ncomponents10_ccc1_seed20210719.csv.gz"
)

embedding_2067 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA737139",
        "clustering",
        "PRJNA737139",
        "exploring",
        "Scanpy_Harmony",
        EMBEDDING_FILE
    )
)
Rows: 2067 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): cell, batch
dbl (14): louvain, leiden, x_tsne, y_tsne, x_fitsne, y_fitsne, x_umap_min_di...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clustering

GEOM_POINT_SIZE <- 0.6
RASTERISED <- TRUE
x_column <- "x_umap_min_dist=0.1"
y_column <- "y_umap_min_dist=0.1"

EMBEDDING_TITLE_PREFIX <- "UMAP"

p_embedding_leiden <- plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = embedding_2067$leiden |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Leiden"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

p_embedding_UMI <- plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = log10(
        Matrix::colSums(matrix_readcount_use[, embedding_2067$cell])
    ),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_MT <- plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = embedding_2067 |>
        dplyr::left_join(
            cell_metadata # |> dplyr::select(-batch)
        ) |>
        dplyr::pull(mt_percentage),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; MT %"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

selected_feature <- "ENSG00000204531_POU5F1"
p_embedding_POU5F1 <- plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = log10(
        calc_cpm(matrix_readcount_use)[
            selected_feature,
            embedding_2067$cell
        ] + 1
    ),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; {selected_feature}"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = FALSE,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )
purrr::reduce(
    list(
        p_embedding_leiden,
        p_embedding_UMI,
        p_embedding_MT,
        p_embedding_POU5F1
    ), `+`
) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Summary

embedding_2067 |>
    dplyr::left_join(
        cell_metadata
    ) |>
    dplyr::mutate(
        num_umis = Matrix::colSums(matrix_readcount_use[, cell]),
        num_genes = Matrix::colSums(matrix_readcount_use[, cell] > 0),
    ) |>
    dplyr::group_by(leiden) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        median_genes = median(num_genes),
        meidan_MT = median(mt_percentage)
    ) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median")
leiden num_cells median_umis median_genes meidan_MT
0 205 2140756 5387.0 0.0529720842
1 182 2311908 5423.0 0.0404300979
2 173 1631691 4577.0 0.0488606864
3 156 2319072 6379.5 0.0413304999
4 152 1390714 3764.5 0.0543443623
5 149 855797 3103.0 0.0364863153
6 126 2646474 6842.5 0.0501413970
7 124 2202163 6414.5 0.0413531842
8 119 1673032 4618.0 0.0522041437
9 115 1849883 5760.0 0.0506782652
10 115 1738237 3665.0 0.0002466655
11 114 1201448 3621.0 0.0016033615
12 109 1327329 3660.0 0.0361322014
13 103 1507763 4685.0 0.0376887867
14 79 1519467 4506.0 0.0003062544
15 46 1677120 4610.0 0.0469316175


embedding_2067 |>
    dplyr::left_join(
        cell_metadata,
    ) |>
    dplyr::left_join(
        cell_metadata_PRJNA737139
    ) |>
    dplyr::mutate(
        num_umis = Matrix::colSums(matrix_readcount_use[, cell]),
        num_genes = Matrix::colSums(matrix_readcount_use[, cell] > 0),
    ) |>
    dplyr::group_by(lineage) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        median_genes = median(num_genes),
        meidan_MT = median(mt_percentage)
    ) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median")
lineage num_cells median_umis median_genes meidan_MT
Naive H9 143 2322872 6187.0 0.03970047
Primed H9 124 2202163 6414.5 0.04135318
TSCs 128 2642517 6831.0 0.05028367
Blastoid_24h 223 2036212 5169.0 0.04553170
Blastoid_60h 460 1964516 4855.5 0.03979052
Blastoid_96h 989 1465353 4264.0 0.04480325

Cell type

plot_embedding_highlight(
    embedding = embedding_2067,
    x = cell_metadata_PRJNA737139,
    y = "lineage",
    label = "Kagawa et al. 2021",
    n_cols = 2
)

Expression

##| column: page

FEATURES_SELECTED <- c(
    "ENSG00000171872_KLF17",
    "ENSG00000204531_POU5F1",
    "ENSG00000111704_NANOG",
    "ENSG00000186103_ARGFX",
    #
    "ENSG00000164736_SOX17",
    "ENSG00000125798_FOXA2",
    "ENSG00000136574_GATA4",
    "ENSG00000134853_PDGFRA",
    #
    "ENSG00000179348_GATA2",
    "ENSG00000070915_SLC12A3",
    "ENSG00000165556_CDX2",
    "ENSG00000007866_TEAD3"
)

purrr::map(FEATURES_SELECTED, \(x) {
    selected_feature <- x

    cat(selected_feature, "\n")
    values <- log10(
        calc_cpm(
            matrix_readcount_use[, embedding_2067$cell]
        )[selected_feature, ] + 1
    )

    plot_embedding(
        embedding = embedding_2067[, c(x_column, y_column)],
        color_values = values,
        label = paste(
            EMBEDDING_TITLE_PREFIX,
            selected_feature |> stringr::str_remove(pattern = "^E.+_"),
            sep = "; "
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        scale_color_viridis_c(
            na.value = "grey80"
        ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 3, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )
ENSG00000171872_KLF17 
ENSG00000204531_POU5F1 
ENSG00000111704_NANOG 
ENSG00000186103_ARGFX 
ENSG00000164736_SOX17 
ENSG00000125798_FOXA2 
ENSG00000136574_GATA4 
ENSG00000134853_PDGFRA 
ENSG00000179348_GATA2 
ENSG00000070915_SLC12A3 
ENSG00000165556_CDX2 
ENSG00000007866_TEAD3 

Author-defined clusters


Visualize author-defined clusters on this embedding (Low resolution; Fig. 2b). Note: 1672 out of total 2311 GEO-deposited blastoid cells (72.3%) are used in Fig. 2a-d.


GEO-deposited cell metadata.

cell_metadata_PRJNA737139_author |>
    dplyr::count(treatment, name = "num_cells") |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    # gt::grand_summary_rows(
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )
treatment num_cells
2D_TSCM 128
E8 125
PALLY 248
PALLY_48_hours_LY_12_hours 744
PALLY_48_hours_LY_48_hours 1319
PXGL 151
Sum 2,715


Cells included in Fig. 2a.

cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        !is.na(blastoid.Fig2b.lowres)
    ) |>
    dplyr::count(treatment, name = "num_cells") |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )
treatment num_cells
2D_TSCM 128
E8 124
PALLY 223
PALLY_48_hours_LY_12_hours 460
PALLY_48_hours_LY_48_hours 989
PXGL 143
Sum 2,067


Match the names used in Fig. 2a.

cell_metadata_PRJNA737139 |>
    dplyr::filter(
        !is.na(blastoid.Fig2b.lowres)
    ) |>
    dplyr::count(lineage, name = "num_cells") |>
    dplyr::rename(group = lineage) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )
group num_cells
Naive H9 143
Primed H9 124
TSCs 128
Blastoid_24h 223
Blastoid_60h 460
Blastoid_96h 989
Sum 2,067

Low resolution


Visualize author-defined clusters on this embedding (Low resolution; Fig. 2b).

plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ) |>
        dplyr::pull(blastoid.Fig2b.lowres)
        |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Fig. 2b clusters"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

calc_group_composition(
    data = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ),
    x = "blastoid.Fig2b.lowres",
    group = "lineage"
) |>
    dplyr::mutate(
        blastoid.Fig2b.lowres = as.factor(blastoid.Fig2b.lowres)
    ) |>
    plot_barplot(
        x = "blastoid.Fig2b.lowres",
        y = "percentage",
        z = "lineage",
        legend_ncol = 1
    )

Visualize selected four non-blastocyst genes on this embedding (Extended Data Fig. 3b).

FEATURES_SELECTED <- c(
    "ENSG00000094755_GABRP",
    "ENSG00000134817_APLNR",
    "ENSG00000016082_ISL1",
    "ENSG00000143320_CRABP2"
)

purrr::map(FEATURES_SELECTED, \(x) {
    selected_feature <- x

    cat(selected_feature, "\n")
    values <- log10(
        calc_cpm(matrix_readcount_use[, embedding_2067$cell])
        [selected_feature, ] + 1
    )

    plot_embedding(
        embedding = embedding_2067[, c(x_column, y_column)],
        color_values = values,
        label = paste(
            EMBEDDING_TITLE_PREFIX,
            selected_feature |> stringr::str_remove(pattern = "^E.+_"),
            sep = "; "
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE * 1.5,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        scale_color_viridis_c(
            na.value = "grey80"
        ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )
ENSG00000094755_GABRP 
ENSG00000134817_APLNR 
ENSG00000016082_ISL1 
ENSG00000143320_CRABP2 

High resolution


Visualize author-defined clusters on this embedding (High resolution; Extended Data Fig. 3a).

plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ) |>
        dplyr::pull(blastoid.FigS3a.highres)
        |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Fig. S3a clusters"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

p_barplot_highres_1 <- calc_group_composition(
    data = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ),
    x = "blastoid.FigS3a.highres",
    group = "lineage"
) |>
    dplyr::mutate(
        lineage = stringr::str_remove(string = lineage, pattern = "_.+$"),
        lineage = as.factor(
            lineage
        ),
        blastoid.FigS3a.highres = as.factor(blastoid.FigS3a.highres)
    ) |>
    plot_barplot(
        x = "blastoid.FigS3a.highres",
        y = "percentage",
        z = "lineage",
        legend_ncol = 1
    )

p_barplot_highres_2 <- calc_group_composition(
    data = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ),
    x = "blastoid.FigS3a.highres",
    group = "lineage"
) |>
    dplyr::mutate(
        blastoid.FigS3a.highres = as.factor(blastoid.FigS3a.highres)
    ) |>
    plot_barplot(
        x = "blastoid.FigS3a.highres",
        y = "percentage",
        z = "lineage",
        legend_ncol = 1
    )

list(p_barplot_highres_1, p_barplot_highres_2) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 1, byrow = FALSE, guides = "collect") +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Subset of blastoid cells


According to the cell metadata deposited at GEO, only 96h-blastoid cells were included for comparison with other datasets.

cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        !is.na(blastoid.FigS2g.PZT.integration)
    ) |>
    dplyr::count(treatment, name = "num_cells") |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )
treatment num_cells
E8 124
PALLY_48_hours_LY_48_hours 989
PXGL 143
Sum 1,256


Match the names used in Fig. 2a.

cell_metadata_PRJNA737139 |>
    dplyr::filter(
        !is.na(blastoid.FigS2g.PZT.integration)
    ) |>
    dplyr::count(lineage, name = "num_cells") |>
    dplyr::rename(group = lineage) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )
group num_cells
Naive H9 143
Primed H9 124
Blastoid_96h 989
Sum 1,256


Highlight blastoid cells used in Fig. 2e-f.

plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column, "cell")] |>
        dplyr::left_join(cell_metadata_PRJNA737139),
    color_values = embedding_2067[, c(x_column, y_column, "cell")] |>
        dplyr::left_join(cell_metadata_PRJNA737139) |>
        dplyr::mutate(
            value = case_when(
                (
                    stringr::str_detect(
                        string = lineage, pattern = "Blast"
                    ) & is.na(blastoid.FigS2g.PZT.integration)
                ) ~ "1",
                TRUE ~ "0"
            )
        ) |>
        dplyr::pull(value) |>
        as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; 96h blastoid cells"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized() +
    ggplot2::scale_color_manual(
        values = c("grey70", "salmon")
    )

Integration w/ other datasets


Instead of only including a subset of each dataset, this embedding includes all the cells mentioned in their respective publications to cover the full spectrum of cellular diversity.

Included cells and their lineage annotations are the same as in their respective publications. See here for details.


EMBEDDING_FILE <- glue::glue(
    "embedding_ncomponents18_ccc1_seed20210719.csv.gz"
)

embedding <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA737139",
        "clustering",
        "PRJNA737139_PRJEB11202_PRJNA431392_PRJEB40781",
        "exploring",
        "Scanpy_Harmony_variable",
        EMBEDDING_FILE
    )
)

embedding |> head() |> knitr::kable()
cell batch louvain leiden x_tsne y_tsne x_fitsne y_fitsne x_umap_min_dist=0.5 y_umap_min_dist=0.5 x_umap_min_dist=0.1 y_umap_min_dist=0.1 x_phate y_phate x_pacmap y_pacmap
D10_IVC4_E1_B1_1 PRJNA431392 0 3 -7.556968 53.075813 31.59123 -41.64920 12.654197 -3.6284332 11.933653 -3.4647641 0.0166832 -0.0070260 7.790154 -11.322366
D10_IVC4_E1_B1_10 PRJNA431392 1 6 -116.844467 9.400366 72.41945 21.89626 -4.288216 4.7477736 -4.915794 5.2280717 0.0143929 -0.0097040 -11.047858 -28.940121
D10_IVC4_E1_B1_11 PRJNA431392 2 11 -21.472546 -11.853150 -45.40295 -15.77693 8.047470 -0.0075634 7.928275 0.9275900 0.0155106 0.0026800 -11.631661 -5.534178
D10_IVC4_E1_B1_12 PRJNA431392 3 1 -10.749787 -9.368344 -20.57556 -28.79359 9.341379 0.1016088 9.088892 1.1644886 0.0158193 -0.0085744 -10.191150 -6.816082
D10_IVC4_E1_B1_13 PRJNA431392 4 7 -23.630573 54.296280 39.70749 -59.28475 11.469043 -5.4803004 10.867358 -4.4557619 0.0265396 -0.0007893 9.937391 -12.877551
D10_IVC4_E1_B1_14 PRJNA431392 2 11 -22.174698 -12.000992 -46.05968 -15.38952 7.953345 0.0190667 7.878169 0.9776146 0.0184427 0.0036553 -12.004497 -5.443837
studies <- tibble::tribble(
    ~bioproject, ~citation,
    "PRJEB11202", "Petropoulos et al. 2016",
    "PRJEB40781", "Tyser et al. 2021",
    "PRJNA555602", "Zheng et al. 2019",
    "PRJNA431392", "Zhou et al. 2019",
    "PRJNA737139", "Kagawa et al. 2021",
    "PRJNA632839", "Yu et al. 2021",
    "PRJNA658478", "Liu et al. 2021",
    "PRJNA720968", "Yanagida et al. 2021",
    "PRJNA667174", "Fan et al. 2021",
    "PRJNA738498", "Sozen et al. 2021",
    "PRJNA737139", "Kagawa et al. 2021"
)

studies <- setNames(
    object = studies$citation,
    nm = studies$bioproject
)
embedding |>
    dplyr::count(batch, name = "num_cells") |>
    dplyr::mutate(
        study = studies[batch]
    ) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )
batch num_cells study
PRJEB11202 1529 Petropoulos et al. 2016
PRJEB40781 1195 Tyser et al. 2021
PRJNA431392 5911 Zhou et al. 2019
PRJNA737139 2067 Kagawa et al. 2021
Sum 10,702


Metadata

# PRJNA431392; Zhou et al. 2019
embryos_selected_PRJNA431392 <- c(
    "ha_D6_E2",
    "hm_D6_E1",
    "hm_D6_E2",
    "hm_D8_E2",
    "hm_D8_E3",
    "hm_D8_E5",
    "ha_D8_E1",
    "hm_D8_E1",
    "hv_D8_E1",
    "hv_D8_E2",
    "hv_D8_E3",
    "hv_D10_E6",
    "ha_D10_E1",
    "ha_D10_E2",
    "hm_D10_E4",
    "hm_D10_E9",
    "hv_D10_E7",
    "hv_D10_E8",
    "ha_D12_E1",
    "hv_D12_E1",
    "hv_D12_E2"
)

cell_metadata_PRJNA431392 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA431392",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        developmental_stage = factor(
            Day,
            levels = stringr::str_sort(
                unique(Day),
                numeric = TRUE
            )
        ),
        lineage = factor(
            Lineage,
            levels = c(
                "EPI",
                "PE",
                "TE",
                "MIX"
            )
        ),
        `3184` = case_when(
            Ori_Day_Emb %in% embryos_selected_PRJNA431392 ~ "1",
            TRUE ~ "0"
        )
    ) |>
    dplyr::rename(cell = Sample)


# PRJEB40781; Tyser et al. 2021
cell_metadata_PRJEB40781 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJEB40781",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        lineage = factor(lineage),
        lineage_ontology = factor(lineage_ontology),
        sampling_site = factor(sampling_site)
    )


# PRJEB11202; Petropoulos et al. 2016
cell_metadata_PRJEB11202 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJEB11202",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        developmental_stage = stringr::str_remove(
            string = individual,
            pattern = "\\..+$"
        ),
        developmental_stage = factor(
            developmental_stage,
            levels = stringr::str_sort(
                unique(developmental_stage),
                numeric = TRUE
            )
        ),
        #
        lineage = factor(
            inferred_lineage,
            levels = c(
                "epiblast",
                "primitive_endoderm",
                "trophectoderm",
                "not_applicable"
            )
        )
    ) |>
    dplyr::select(
        cell, lineage, developmental_stage
    )

Visualization

Clustering

p_embedding_leiden <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$leiden |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Leiden"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

p_embedding_UMI <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = log10(
        Matrix::colSums(matrix_readcount_use[, embedding$cell])
    ),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_MT <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(
            cell_metadata # |> dplyr::select(-batch)
        ) |>
        dplyr::pull(mt_percentage),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; MT %"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_batch <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$batch |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Batch"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE / 2,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        values = scales::hue_pal()(n = length(unique(embedding$batch))),
        labels = studies
    )
purrr::reduce(
    list(
        p_embedding_leiden,
        p_embedding_batch,
        p_embedding_UMI,
        p_embedding_MT
    ), `+`
) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

purrr::map(unique(embedding$batch), \(x) {
    plot_embedding(
        embedding = embedding[, c(x_column, y_column)],
        color_values = as.integer(embedding$batch == x) |> as.factor(),
        label = glue::glue(
            "{EMBEDDING_TITLE_PREFIX}; {studies[x]}, ",
            "{sum(as.integer(embedding$batch == x), na.rm = TRUE)}"
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = FALSE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        ) +
        scale_color_manual(
            values = c("grey70", "salmon")
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Cell annotation

Petropoulos et al. 2016

# Petropoulos et al. 2016
bioproject <- "PRJEB11202"
cell_metadata_selected <- cell_metadata_PRJEB11202

selected_column <- "developmental_stage"
p_embedding_developmental_stage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

list(
    p_embedding_lineage,
    p_embedding_developmental_stage
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )


Salmon: highlighted group of cells; Light grey: cells belonging to this dataset but not the highlighted group; Dark grey: cells belonging to other datasets.

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "lineage",
    label = studies[bioproject],
    n_cols = 2
)

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "developmental_stage",
    label = studies[bioproject],
    n_cols = 2
)

Zhou et al. 2019

# Zhou et al. 2019
bioproject <- "PRJNA431392"
cell_metadata_selected <- cell_metadata_PRJNA431392
selected_column <- "developmental_stage"

p_embedding_developmental_stage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

selected_column <- "3184"
p_embedding_chosen <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = c("grey70", "salmon"),
        na.value = "#7F7F7F"
    )

list(
    p_embedding_lineage,
    p_embedding_developmental_stage,
    p_embedding_chosen
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "developmental_stage",
    label = studies[bioproject],
    n_cols = 2
)

Tyser et al. 2021

# Tyser et al. 2021
bioproject <- "PRJEB40781"
cell_metadata_selected <- cell_metadata_PRJEB40781

selected_column <- "sampling_site"
p_embedding_sampling_site <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "grey70"
    )

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

list(
    p_embedding_lineage,
    p_embedding_sampling_site
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "lineage",
    label = studies[bioproject],
    n_cols = 2
)

Kagawa et al. 2021

# Kagawa et al. 2021
bioproject <- "PRJNA737139"
cell_metadata_selected <- cell_metadata_PRJNA737139

selected_column <- "source_name"
p_embedding_source <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

list(
    p_embedding_source,
    p_embedding_lineage
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "lineage",
    label = studies[bioproject],
    n_cols = 2
)

R session info

devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.1.2 (2021-11-01)
 os       macOS Monterey 12.1
 system   aarch64, darwin20.6.0
 ui       unknown
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Chicago
 date     2022-01-24
 pandoc   2.17.0.1 @ /opt/homebrew/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package       * version     date (UTC) lib source
 assertthat      0.2.1       2019-03-21 [1] CRAN (R 4.1.1)
 backports       1.4.1       2021-12-13 [1] CRAN (R 4.1.2)
 beeswarm        0.4.0       2021-06-01 [1] CRAN (R 4.1.2)
 bit             4.0.4       2020-08-04 [1] CRAN (R 4.1.1)
 bit64           4.0.5       2020-08-30 [1] CRAN (R 4.1.1)
 brio            1.1.3       2021-11-30 [1] CRAN (R 4.1.2)
 broom           0.7.11      2022-01-03 [1] CRAN (R 4.1.2)
 cachem          1.0.6       2021-08-19 [1] CRAN (R 4.1.1)
 callr           3.7.0       2021-04-20 [1] CRAN (R 4.1.1)
 cellranger      1.1.0       2016-07-27 [1] CRAN (R 4.1.1)
 checkmate       2.0.0       2020-02-06 [1] CRAN (R 4.1.1)
 cli             3.1.1       2022-01-20 [1] CRAN (R 4.1.2)
 colorspace      2.0-2       2021-06-24 [1] CRAN (R 4.1.1)
 crayon          1.4.2       2021-10-29 [1] CRAN (R 4.1.1)
 curl            4.3.2       2021-06-23 [1] CRAN (R 4.1.1)
 data.table      1.14.2      2021-09-27 [1] CRAN (R 4.1.1)
 DBI             1.1.2       2021-12-20 [1] CRAN (R 4.1.2)
 dbplyr          2.1.1       2021-04-06 [1] CRAN (R 4.1.1)
 desc            1.4.0       2021-09-28 [1] CRAN (R 4.1.1)
 devtools        2.4.3.9000  2022-01-22 [1] Github (r-lib/devtools@41280ac)
 digest          0.6.29      2021-12-01 [1] CRAN (R 4.1.2)
 dplyr         * 1.0.7.9000  2022-01-12 [1] Github (tidyverse/dplyr@0501335)
 dtplyr          1.2.1       2022-01-19 [1] CRAN (R 4.1.2)
 ellipsis        0.3.2       2021-04-29 [1] CRAN (R 4.1.1)
 evaluate        0.14        2019-05-28 [1] CRAN (R 4.1.1)
 extrafont     * 0.17        2014-12-08 [1] CRAN (R 4.1.1)
 extrafontdb     1.0         2012-06-11 [1] CRAN (R 4.1.1)
 fansi           1.0.2       2022-01-14 [1] CRAN (R 4.1.2)
 farver          2.1.0       2021-02-28 [1] CRAN (R 4.1.1)
 fastmap         1.1.0       2021-01-25 [1] CRAN (R 4.1.1)
 forcats       * 0.5.1.9000  2021-11-29 [1] Github (tidyverse/forcats@b4dade0)
 fs              1.5.2.9000  2021-12-09 [1] Github (r-lib/fs@6d1182f)
 gargle          1.2.0       2021-07-02 [1] CRAN (R 4.1.1)
 generics        0.1.1       2021-10-25 [1] CRAN (R 4.1.1)
 ggbeeswarm      0.6.0       2017-08-07 [1] CRAN (R 4.1.2)
 ggplot2       * 3.3.5       2021-06-25 [1] CRAN (R 4.1.1)
 ggrastr         1.0.1       2021-12-08 [1] Github (VPetukhov/ggrastr@7aed9af)
 glue            1.6.1.9000  2022-01-23 [1] Github (tidyverse/glue@3da70df)
 googledrive     2.0.0       2021-07-08 [1] CRAN (R 4.1.1)
 googlesheets4   1.0.0       2021-07-21 [1] CRAN (R 4.1.1)
 gt              0.3.1.9000  2022-01-17 [1] Github (rstudio/gt@fcabb41)
 gtable          0.3.0.9000  2021-10-28 [1] Github (r-lib/gtable@a0bd272)
 haven           2.4.3       2021-08-04 [1] CRAN (R 4.1.1)
 highr           0.9         2021-04-16 [1] CRAN (R 4.1.1)
 hms             1.1.1       2021-09-26 [1] CRAN (R 4.1.1)
 htmltools       0.5.2       2021-08-25 [1] CRAN (R 4.1.1)
 htmlwidgets     1.5.4       2021-09-08 [1] CRAN (R 4.1.1)
 httr            1.4.2       2020-07-20 [1] CRAN (R 4.1.1)
 jsonlite        1.7.3       2022-01-17 [1] CRAN (R 4.1.2)
 knitr           1.37.1      2021-12-21 [1] https://yihui.r-universe.dev (R 4.1.2)
 labeling        0.4.2       2020-10-20 [1] CRAN (R 4.1.1)
 lattice         0.20-45     2021-09-22 [2] CRAN (R 4.1.2)
 lifecycle       1.0.1       2021-09-24 [1] CRAN (R 4.1.1)
 lubridate       1.8.0       2022-01-20 [1] Github (tidyverse/lubridate@566590f)
 magrittr        2.0.1       2020-11-17 [1] CRAN (R 4.1.1)
 Matrix        * 1.4-0       2021-12-08 [2] CRAN (R 4.1.2)
 memoise         2.0.1       2021-11-26 [1] CRAN (R 4.1.2)
 modelr          0.1.8.9000  2021-10-27 [1] Github (tidyverse/modelr@16168e0)
 munsell         0.5.0       2018-06-12 [1] CRAN (R 4.1.1)
 patchwork     * 1.1.0.9000  2021-10-27 [1] Github (thomasp85/patchwork@79223d3)
 pillar          1.6.4       2021-10-18 [1] CRAN (R 4.1.1)
 pkgbuild        1.3.1       2021-12-20 [1] CRAN (R 4.1.2)
 pkgconfig       2.0.3       2019-09-22 [1] CRAN (R 4.1.1)
 pkgload         1.2.4       2021-11-30 [1] CRAN (R 4.1.2)
 png             0.1-7       2013-12-03 [1] CRAN (R 4.1.1)
 prettyunits     1.1.1       2020-01-24 [1] CRAN (R 4.1.1)
 processx        3.5.2       2021-04-30 [1] CRAN (R 4.1.1)
 ps              1.6.0       2021-02-28 [1] CRAN (R 4.1.1)
 purrr         * 0.3.4       2020-04-17 [1] CRAN (R 4.1.1)
 R.cache         0.15.0      2021-04-30 [1] CRAN (R 4.1.1)
 R.methodsS3     1.8.1       2020-08-26 [1] CRAN (R 4.1.1)
 R.oo            1.24.0      2020-08-26 [1] CRAN (R 4.1.1)
 R.utils         2.11.0      2021-09-26 [1] CRAN (R 4.1.1)
 R6              2.5.1.9000  2021-12-09 [1] Github (r-lib/R6@1b05b89)
 ragg            1.2.1.9000  2021-12-08 [1] Github (r-lib/ragg@c68c666)
 Rcpp            1.0.8       2022-01-13 [1] CRAN (R 4.1.2)
 readr         * 2.1.1       2021-11-30 [1] CRAN (R 4.1.2)
 readxl          1.3.1.9000  2022-01-20 [1] Github (tidyverse/readxl@2ccb82c)
 remotes         2.4.2       2022-01-24 [1] Github (r-lib/remotes@7b0ee01)
 reprex          2.0.1       2021-08-05 [1] CRAN (R 4.1.1)
 reticulate      1.23        2022-01-14 [1] CRAN (R 4.1.2)
 rlang           1.0.0       2022-01-20 [1] Github (r-lib/rlang@f2fbaad)
 rmarkdown       2.11.12     2022-01-24 [1] Github (rstudio/rmarkdown@b53a7ce)
 rprojroot       2.0.2       2020-11-15 [1] CRAN (R 4.1.1)
 rstudioapi      0.13.0-9000 2022-01-15 [1] Github (rstudio/rstudioapi@5d0f087)
 Rttf2pt1        1.3.9       2021-07-22 [1] CRAN (R 4.1.1)
 rvest           1.0.2       2021-10-16 [1] CRAN (R 4.1.1)
 sass            0.4.0       2021-05-12 [1] CRAN (R 4.1.1)
 scales          1.1.1       2020-05-11 [1] CRAN (R 4.1.1)
 sessioninfo     1.2.2       2021-12-06 [1] CRAN (R 4.1.2)
 stringi         1.7.6       2021-11-29 [1] CRAN (R 4.1.2)
 stringr       * 1.4.0.9000  2022-01-24 [1] Github (tidyverse/stringr@85f6140)
 styler        * 1.6.2.9000  2022-01-17 [1] Github (r-lib/styler@9274aed)
 systemfonts     1.0.3.9000  2021-12-07 [1] Github (r-lib/systemfonts@414114e)
 testthat        3.1.2.9000  2022-01-21 [1] Github (r-lib/testthat@54b9db2)
 textshaping     0.3.6       2021-10-13 [1] CRAN (R 4.1.1)
 tibble        * 3.1.6.9000  2022-01-18 [1] Github (tidyverse/tibble@7aa54e6)
 tidyr         * 1.1.4       2021-09-27 [1] CRAN (R 4.1.1)
 tidyselect      1.1.1       2021-04-30 [1] CRAN (R 4.1.1)
 tidyverse     * 1.3.1.9000  2021-12-08 [1] Github (tidyverse/tidyverse@6186fbf)
 tzdb            0.2.0       2021-10-27 [1] CRAN (R 4.1.1)
 usethis         2.1.5.9000  2022-01-20 [1] Github (r-lib/usethis@57b109a)
 utf8            1.2.2       2021-07-24 [1] CRAN (R 4.1.1)
 vctrs           0.3.8       2021-04-29 [1] CRAN (R 4.1.1)
 vipor           0.4.5       2017-03-22 [1] CRAN (R 4.1.2)
 viridisLite     0.4.0       2021-04-13 [1] CRAN (R 4.1.1)
 vroom           1.5.7       2021-11-30 [1] CRAN (R 4.1.2)
 withr           2.4.3       2021-11-30 [1] CRAN (R 4.1.2)
 xfun            0.29        2021-12-14 [1] CRAN (R 4.1.2)
 xml2            1.3.3       2021-11-30 [1] CRAN (R 4.1.2)
 yaml            2.2.1       2020-02-01 [1] CRAN (R 4.1.1)

 [1] /opt/homebrew/lib/R/4.1/site-library
 [2] /opt/homebrew/Cellar/r/4.1.2/lib/R/library

─ Python configuration ───────────────────────────────────────────────────────
 python:         /Users/jialei/.pyenv/shims/python
 libpython:      /Users/jialei/.pyenv/versions/miniforge3-4.10.1-5/lib/libpython3.9.dylib
 pythonhome:     /Users/jialei/.pyenv/versions/miniforge3-4.10.1-5:/Users/jialei/.pyenv/versions/miniforge3-4.10.1-5
 version:        3.9.5 | packaged by conda-forge | (default, Jun 19 2021, 00:24:55)  [Clang 11.1.0 ]
 numpy:          /Users/jialei/.pyenv/versions/miniforge3-4.10.1-5/lib/python3.9/site-packages/numpy
 numpy_version:  1.20.3
 anndata:        /Users/jialei/.pyenv/versions/miniforge3-4.10.1-5/lib/python3.9/site-packages/anndata
 
 NOTE: Python version was forced by RETICULATE_PYTHON

──────────────────────────────────────────────────────────────────────────────