Single-cell transcriptomes of human blastoids generated by Kagawa et al., 2021

Jialei Duan

2022-01-24

Kagawa, H., Javali, A., Khoei, H.H., Sommer, T.M., Sestini, G., Novatchkova, M., Scholte Op Reimer, Y., Castel, G., Bruneau, A., Maenhoudt, N., et al. (2021). Human blastoids model blastocyst development and implantation. Nature 1–9.

BioProject Accession: PRJNA737139
GEO Accession: GSE177689

Load required packages.

library(tidyverse)
library(Matrix)
library(patchwork)
library(extrafont)

Sys.time()

[1] "2022-01-24 22:16:17 CST"

Data preparation

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)

plot_embedding_highlight <- function(embedding, x, y, label, n_cols = 3) {
    cell_metadata_selected <- x
    selected_column <- y

    purrr::map(levels(cell_metadata_selected[[selected_column]]), \(x) {
        values <- embedding |>
            dplyr::left_join(cell_metadata_selected) |>
            dplyr::mutate(
                value = case_when(
                    .data[[selected_column]] == x ~ "1",
                    .data[[selected_column]] != x ~ "0"
                )
            ) |>
            dplyr::pull(value) |>
            as.integer() |>
            as.factor()

        plot_embedding(
            embedding = embedding[, c(x_column, y_column)],
            color_values = values,
            label = glue::glue(
                "{label}; ",
                "{x}: {sum(as.integer(as.character(values)), na.rm = TRUE)}"
            ),
            label_position = NULL,
            show_color_value_labels = FALSE,
            show_color_legend = FALSE,
            geom_point_size = GEOM_POINT_SIZE,
            sort_values = TRUE,
            shuffle_values = FALSE,
            rasterise = RASTERISED,
            legend_size = 2
        ) +
            theme_customized(
                legend_key_size = 2,
                legend_text_size = 5
            ) +
            scale_color_manual(
                na.translate = TRUE,
                values = c("grey70", "salmon"),
                na.value = "#7F7F7F"
            ) +
            ggplot2::annotate(
                geom = "text",
                x = Inf,
                y = Inf,
                label = sum(as.integer(as.character(values)), na.rm = TRUE),
                size = 5 / ggplot2::.pt,
                hjust = 1,
                vjust = 1,
                na.rm = FALSE
            )
    }) |>
        purrr::reduce(`+`) +
        patchwork::plot_layout(ncol = n_cols) +
        patchwork::plot_annotation(
            theme = ggplot2::theme(plot.margin = ggplot2::margin())
        )
}

Data loading

PROJECT_DIR <- "/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation"

Matrix

Raw fastq files of these datasets were downloaded and re-processed as described in Yu et al. 2021 to minimize platform and processing differences.

ad <- reticulate::import(module = "anndata", convert = TRUE)
print(ad$`__version__`)

[1] "0.7.6"

adata_files <- purrr::map(c("PRJNA737139", 
                            "PRJEB11202", 
                            "PRJNA431392", 
                            "PRJEB40781"), \(x) {
    file.path(
        PROJECT_DIR,
        "raw",
        "public",
        x,
        "matrix",
        "adata.h5ad"
    )
})
purrr::map_lgl(adata_files, file.exists)

[1] TRUE TRUE TRUE TRUE

BACKED <- NULL
matrix_readcount_use <- purrr::map(adata_files, function(x) {
    ad$read_h5ad(
        filename = x, backed = BACKED
    ) |>
        convert_adata()
}) |>
    purrr::reduce(cbind)

dim(matrix_readcount_use)

[1] 33538 10702

Metadata

BACKED <- "r"
cell_metadata <- purrr::map(adata_files, function(x) {
    ad$read_h5ad(
        filename = x, backed = BACKED
    )$obs |>
        tibble::rownames_to_column(var = "cell") |>
        dplyr::select(cell, everything())
}) |>
    dplyr::bind_rows()

cell_metadata |> head() |> knitr::kable()

cell	batch	num_umis	num_features	mt_percentage
GSM5375527	PRJNA737139	2550015	7220	0.0446268
GSM5375528	PRJNA737139	1571718	7112	0.0408521
GSM5375529	PRJNA737139	1508237	5800	0.0472021
GSM5375530	PRJNA737139	2536507	7031	0.0456332
GSM5375531	PRJNA737139	2302943	5056	0.0269316
GSM5375532	PRJNA737139	2260431	5769	0.0238485

Check memory usage.

purrr::walk(list(matrix_readcount_use, cell_metadata), \(x) {
    print(object.size(x), units = "auto", standard = "SI")
})

922.6 MB
1.1 MB

Clustering w/ PSCs and TSCs

Cell metadata

# Kagawa et al. 2021
cell_metadata_PRJNA737139 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA737139",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        lineage = factor(
            lineage,
            levels = c(
                "Naive H9",
                "Primed H9",
                "TSCs",
                "Blastoid_24h",
                "Blastoid_60h",
                "Blastoid_96h"
            )
        ),
        source_name = factor(
            source_name
        )
    )

cell_metadata_PRJNA737139 |> head() |> knitr::kable()

cell	run	sampleid	developmental_stage	source_name	nFeature_RNA	percent.mt	samplename	treatment	blastoid.Fig2b.lowres	blastoid.FigS3a.highres	blastoid.FigS2g.PZT.integration	blastoid.FigS3g.T.integration	lineage
GSM5375527	SRR14795911	150182	E8	stem cells	7858	6.808364	primed_H9-150182	E8	4	9	5	2	Primed H9
GSM5375528	SRR14795912	150183	E8	stem cells	7607	6.181098	primed_H9-150183	E8	4	9	5	2	Primed H9
GSM5375529	SRR14795913	150185	E8	stem cells	6161	7.011326	primed_H9-150185	E8	4	9	5	2	Primed H9
GSM5375530	SRR14795914	150187	E8	stem cells	7574	7.557354	primed_H9-150187	E8	4	9	5	2	Primed H9
GSM5375531	SRR14795915	150188	E8	stem cells	5386	4.021456	primed_H9-150188	E8	4	9	2	8	Primed H9
GSM5375532	SRR14795916	150189	E8	stem cells	6260	3.559934	primed_H9-150189	E8	4	9	5	2	Primed H9

The expression matrix was re-generated from raw fastq files. To make this analysis comparable with the original publication, we retrieved the cell ids from the deposited dataset.

Retrieve the cell metadata from Gene Expression Omnibus.

cell_metadata_PRJNA737139_author <- vroom::vroom(
    file = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE177nnn/GSE177689/suppl/GSE177689_blastoid.H9.okae_bts5__scRNAseq.unfiltered__metadata.tsv.gz"
)

Rows: 2715 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): samplename, treatment
dbl (7): sampleid, nFeature_RNA, percent.mt, blastoid.Fig2b.lowres, blastoid...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

cell_metadata_PRJNA737139_author |> head() |> knitr::kable()

sampleid	nFeature_RNA	percent.mt	samplename	treatment	blastoid.Fig2b.lowres	blastoid.FigS3a.highres	blastoid.FigS2g.PZT.integration	blastoid.FigS3g.T.integration
150567	6697	5.502513	blastoid_24h-150567	PALLY	0	3	NA	NA
150568	5773	2.771147	blastoid_24h-150568	PALLY	0	1	NA	NA
150569	3996	1.933010	blastoid_24h-150569	PALLY	0	1	NA	NA
150570	0	NA	blastoid_24h-150570	PALLY	NA	NA	NA	NA
150571	6380	5.795510	blastoid_24h-150571	PALLY	0	1	NA	NA
150572	5089	2.590785	blastoid_24h-150572	PALLY	0	1	NA	NA

num_cells_used_in_Fig2b <- cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        !is.na(blastoid.Fig2b.lowres)
    ) |>
    nrow()

print(
    glue::glue(
        "Total number of cells deposited: ",
        "{nrow(cell_metadata_PRJNA737139_author)}"
    ),
    glue::glue(
        "Number of cells used in Fig. 2b: ",
        "{num_cells_used_in_Fig2b}"
    )
)

Total number of cells deposited: 2715
Number of cells used in Fig. 2b: 2067

num_cells_deposited_blastoid <- cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        stringr::str_starts(string = treatment, pattern = "PALLY"),
    ) |>
    nrow()

num_cells_used_in_Fig2b_blastoid <- cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        stringr::str_starts(string = treatment, pattern = "PALLY"),
        !is.na(blastoid.Fig2b.lowres)
    ) |>
    nrow()

print(
    glue::glue(
        "Total number of blastoid cells deposited: ",
        "{num_cells_deposited_blastoid}"
    ),
    glue::glue(
        "Number of blastoid cells used in Fig. 2b: ",
        "{num_cells_used_in_Fig2b_blastoid}"
    )
)

Total number of blastoid cells deposited: 2311
Number of blastoid cells used in Fig. 2b: 1672

2067 out of total 2715 deposited cells (73.1%), and 1672 out of 2311 deposited blastoid cells (72.3%) are used in Fig. 2a-d.

Visualization

Embedding

EMBEDDING_FILE <- glue::glue(
    "embedding_ncomponents10_ccc1_seed20210719.csv.gz"
)

embedding_2067 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA737139",
        "clustering",
        "PRJNA737139",
        "exploring",
        "Scanpy_Harmony",
        EMBEDDING_FILE
    )
)

Rows: 2067 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): cell, batch
dbl (14): louvain, leiden, x_tsne, y_tsne, x_fitsne, y_fitsne, x_umap_min_di...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clustering

GEOM_POINT_SIZE <- 0.6
RASTERISED <- TRUE
x_column <- "x_umap_min_dist=0.1"
y_column <- "y_umap_min_dist=0.1"

EMBEDDING_TITLE_PREFIX <- "UMAP"

p_embedding_leiden <- plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = embedding_2067$leiden |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Leiden"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

p_embedding_UMI <- plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = log10(
        Matrix::colSums(matrix_readcount_use[, embedding_2067$cell])
    ),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_MT <- plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = embedding_2067 |>
        dplyr::left_join(
            cell_metadata # |> dplyr::select(-batch)
        ) |>
        dplyr::pull(mt_percentage),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; MT %"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

selected_feature <- "ENSG00000204531_POU5F1"
p_embedding_POU5F1 <- plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = log10(
        calc_cpm(matrix_readcount_use)[
            selected_feature,
            embedding_2067$cell
        ] + 1
    ),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; {selected_feature}"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = FALSE,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

purrr::reduce(
    list(
        p_embedding_leiden,
        p_embedding_UMI,
        p_embedding_MT,
        p_embedding_POU5F1
    ), `+`
) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Summary

embedding_2067 |>
    dplyr::left_join(
        cell_metadata
    ) |>
    dplyr::mutate(
        num_umis = Matrix::colSums(matrix_readcount_use[, cell]),
        num_genes = Matrix::colSums(matrix_readcount_use[, cell] > 0),
    ) |>
    dplyr::group_by(leiden) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        median_genes = median(num_genes),
        meidan_MT = median(mt_percentage)
    ) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median")

leiden	num_cells	median_umis	median_genes	meidan_MT
0	205	2140756	5387.0	0.0529720842
1	182	2311908	5423.0	0.0404300979
2	173	1631691	4577.0	0.0488606864
3	156	2319072	6379.5	0.0413304999
4	152	1390714	3764.5	0.0543443623
5	149	855797	3103.0	0.0364863153
6	126	2646474	6842.5	0.0501413970
7	124	2202163	6414.5	0.0413531842
8	119	1673032	4618.0	0.0522041437
9	115	1849883	5760.0	0.0506782652
10	115	1738237	3665.0	0.0002466655
11	114	1201448	3621.0	0.0016033615
12	109	1327329	3660.0	0.0361322014
13	103	1507763	4685.0	0.0376887867
14	79	1519467	4506.0	0.0003062544
15	46	1677120	4610.0	0.0469316175

embedding_2067 |>
    dplyr::left_join(
        cell_metadata,
    ) |>
    dplyr::left_join(
        cell_metadata_PRJNA737139
    ) |>
    dplyr::mutate(
        num_umis = Matrix::colSums(matrix_readcount_use[, cell]),
        num_genes = Matrix::colSums(matrix_readcount_use[, cell] > 0),
    ) |>
    dplyr::group_by(lineage) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        median_genes = median(num_genes),
        meidan_MT = median(mt_percentage)
    ) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median")

lineage	num_cells	median_umis	median_genes	meidan_MT
Naive H9	143	2322872	6187.0	0.03970047
Primed H9	124	2202163	6414.5	0.04135318
TSCs	128	2642517	6831.0	0.05028367
Blastoid_24h	223	2036212	5169.0	0.04553170
Blastoid_60h	460	1964516	4855.5	0.03979052
Blastoid_96h	989	1465353	4264.0	0.04480325

Cell type

plot_embedding_highlight(
    embedding = embedding_2067,
    x = cell_metadata_PRJNA737139,
    y = "lineage",
    label = "Kagawa et al. 2021",
    n_cols = 2
)

Expression

##| column: page

FEATURES_SELECTED <- c(
    "ENSG00000171872_KLF17",
    "ENSG00000204531_POU5F1",
    "ENSG00000111704_NANOG",
    "ENSG00000186103_ARGFX",
    #
    "ENSG00000164736_SOX17",
    "ENSG00000125798_FOXA2",
    "ENSG00000136574_GATA4",
    "ENSG00000134853_PDGFRA",
    #
    "ENSG00000179348_GATA2",
    "ENSG00000070915_SLC12A3",
    "ENSG00000165556_CDX2",
    "ENSG00000007866_TEAD3"
)

purrr::map(FEATURES_SELECTED, \(x) {
    selected_feature <- x

    cat(selected_feature, "\n")
    values <- log10(
        calc_cpm(
            matrix_readcount_use[, embedding_2067$cell]
        )[selected_feature, ] + 1
    )

    plot_embedding(
        embedding = embedding_2067[, c(x_column, y_column)],
        color_values = values,
        label = paste(
            EMBEDDING_TITLE_PREFIX,
            selected_feature |> stringr::str_remove(pattern = "^E.+_"),
            sep = "; "
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        scale_color_viridis_c(
            na.value = "grey80"
        ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 3, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

ENSG00000171872_KLF17 
ENSG00000204531_POU5F1 
ENSG00000111704_NANOG 
ENSG00000186103_ARGFX 
ENSG00000164736_SOX17 
ENSG00000125798_FOXA2 
ENSG00000136574_GATA4 
ENSG00000134853_PDGFRA 
ENSG00000179348_GATA2 
ENSG00000070915_SLC12A3 
ENSG00000165556_CDX2 
ENSG00000007866_TEAD3

Author-defined clusters

Visualize author-defined clusters on this embedding (Low resolution; Fig. 2b). Note: 1672 out of total 2311 GEO-deposited blastoid cells (72.3%) are used in Fig. 2a-d.

GEO-deposited cell metadata.

cell_metadata_PRJNA737139_author |>
    dplyr::count(treatment, name = "num_cells") |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    # gt::grand_summary_rows(
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )

	treatment	num_cells
	2D_TSCM	128
	E8	125
	PALLY	248
	PALLY_48_hours_LY_12_hours	744
	PALLY_48_hours_LY_48_hours	1319
	PXGL	151
Sum	—	2,715

Cells included in Fig. 2a.

cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        !is.na(blastoid.Fig2b.lowres)
    ) |>
    dplyr::count(treatment, name = "num_cells") |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )

	treatment	num_cells
	2D_TSCM	128
	E8	124
	PALLY	223
	PALLY_48_hours_LY_12_hours	460
	PALLY_48_hours_LY_48_hours	989
	PXGL	143
Sum	—	2,067

Match the names used in Fig. 2a.

cell_metadata_PRJNA737139 |>
    dplyr::filter(
        !is.na(blastoid.Fig2b.lowres)
    ) |>
    dplyr::count(lineage, name = "num_cells") |>
    dplyr::rename(group = lineage) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )

	group	num_cells
	Naive H9	143
	Primed H9	124
	TSCs	128
	Blastoid_24h	223
	Blastoid_60h	460
	Blastoid_96h	989
Sum	—	2,067

Low resolution

Visualize author-defined clusters on this embedding (Low resolution; Fig. 2b).

plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ) |>
        dplyr::pull(blastoid.Fig2b.lowres)
        |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Fig. 2b clusters"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

calc_group_composition(
    data = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ),
    x = "blastoid.Fig2b.lowres",
    group = "lineage"
) |>
    dplyr::mutate(
        blastoid.Fig2b.lowres = as.factor(blastoid.Fig2b.lowres)
    ) |>
    plot_barplot(
        x = "blastoid.Fig2b.lowres",
        y = "percentage",
        z = "lineage",
        legend_ncol = 1
    )

Visualize selected four non-blastocyst genes on this embedding (Extended Data Fig. 3b).

FEATURES_SELECTED <- c(
    "ENSG00000094755_GABRP",
    "ENSG00000134817_APLNR",
    "ENSG00000016082_ISL1",
    "ENSG00000143320_CRABP2"
)

purrr::map(FEATURES_SELECTED, \(x) {
    selected_feature <- x

    cat(selected_feature, "\n")
    values <- log10(
        calc_cpm(matrix_readcount_use[, embedding_2067$cell])
        [selected_feature, ] + 1
    )

    plot_embedding(
        embedding = embedding_2067[, c(x_column, y_column)],
        color_values = values,
        label = paste(
            EMBEDDING_TITLE_PREFIX,
            selected_feature |> stringr::str_remove(pattern = "^E.+_"),
            sep = "; "
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = TRUE,
        geom_point_size = GEOM_POINT_SIZE * 1.5,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        scale_color_viridis_c(
            na.value = "grey80"
        ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

ENSG00000094755_GABRP 
ENSG00000134817_APLNR 
ENSG00000016082_ISL1 
ENSG00000143320_CRABP2

High resolution

Visualize author-defined clusters on this embedding (High resolution; Extended Data Fig. 3a).

plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column)],
    color_values = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ) |>
        dplyr::pull(blastoid.FigS3a.highres)
        |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Fig. S3a clusters"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

p_barplot_highres_1 <- calc_group_composition(
    data = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ),
    x = "blastoid.FigS3a.highres",
    group = "lineage"
) |>
    dplyr::mutate(
        lineage = stringr::str_remove(string = lineage, pattern = "_.+$"),
        lineage = as.factor(
            lineage
        ),
        blastoid.FigS3a.highres = as.factor(blastoid.FigS3a.highres)
    ) |>
    plot_barplot(
        x = "blastoid.FigS3a.highres",
        y = "percentage",
        z = "lineage",
        legend_ncol = 1
    )

p_barplot_highres_2 <- calc_group_composition(
    data = embedding_2067 |>
        dplyr::left_join(
            cell_metadata_PRJNA737139
        ),
    x = "blastoid.FigS3a.highres",
    group = "lineage"
) |>
    dplyr::mutate(
        blastoid.FigS3a.highres = as.factor(blastoid.FigS3a.highres)
    ) |>
    plot_barplot(
        x = "blastoid.FigS3a.highres",
        y = "percentage",
        z = "lineage",
        legend_ncol = 1
    )

list(p_barplot_highres_1, p_barplot_highres_2) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 1, byrow = FALSE, guides = "collect") +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Subset of blastoid cells

According to the cell metadata deposited at GEO, only 96h-blastoid cells were included for comparison with other datasets.

cell_metadata_PRJNA737139_author |>
    dplyr::filter(
        !is.na(blastoid.FigS2g.PZT.integration)
    ) |>
    dplyr::count(treatment, name = "num_cells") |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )

	treatment	num_cells
	E8	124
	PALLY_48_hours_LY_48_hours	989
	PXGL	143
Sum	—	1,256

Match the names used in Fig. 2a.

cell_metadata_PRJNA737139 |>
    dplyr::filter(
        !is.na(blastoid.FigS2g.PZT.integration)
    ) |>
    dplyr::count(lineage, name = "num_cells") |>
    dplyr::rename(group = lineage) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )

	group	num_cells
	Naive H9	143
	Primed H9	124
	Blastoid_96h	989
Sum	—	1,256

Highlight blastoid cells used in Fig. 2e-f.

plot_embedding(
    embedding = embedding_2067[, c(x_column, y_column, "cell")] |>
        dplyr::left_join(cell_metadata_PRJNA737139),
    color_values = embedding_2067[, c(x_column, y_column, "cell")] |>
        dplyr::left_join(cell_metadata_PRJNA737139) |>
        dplyr::mutate(
            value = case_when(
                (
                    stringr::str_detect(
                        string = lineage, pattern = "Blast"
                    ) & is.na(blastoid.FigS2g.PZT.integration)
                ) ~ "1",
                TRUE ~ "0"
            )
        ) |>
        dplyr::pull(value) |>
        as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; 96h blastoid cells"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized() +
    ggplot2::scale_color_manual(
        values = c("grey70", "salmon")
    )

Integration w/ other datasets

Instead of only including a subset of each dataset, this embedding includes all the cells mentioned in their respective publications to cover the full spectrum of cellular diversity.

Included cells and their lineage annotations are the same as in their respective publications. See here for details.

EMBEDDING_FILE <- glue::glue(
    "embedding_ncomponents18_ccc1_seed20210719.csv.gz"
)

embedding <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA737139",
        "clustering",
        "PRJNA737139_PRJEB11202_PRJNA431392_PRJEB40781",
        "exploring",
        "Scanpy_Harmony_variable",
        EMBEDDING_FILE
    )
)

embedding |> head() |> knitr::kable()

cell	batch	louvain	leiden	x_tsne	y_tsne	x_fitsne	y_fitsne	x_umap_min_dist=0.5	y_umap_min_dist=0.5	x_umap_min_dist=0.1	y_umap_min_dist=0.1	x_phate	y_phate	x_pacmap	y_pacmap
D10_IVC4_E1_B1_1	PRJNA431392	0	3	-7.556968	53.075813	31.59123	-41.64920	12.654197	-3.6284332	11.933653	-3.4647641	0.0166832	-0.0070260	7.790154	-11.322366
D10_IVC4_E1_B1_10	PRJNA431392	1	6	-116.844467	9.400366	72.41945	21.89626	-4.288216	4.7477736	-4.915794	5.2280717	0.0143929	-0.0097040	-11.047858	-28.940121
D10_IVC4_E1_B1_11	PRJNA431392	2	11	-21.472546	-11.853150	-45.40295	-15.77693	8.047470	-0.0075634	7.928275	0.9275900	0.0155106	0.0026800	-11.631661	-5.534178
D10_IVC4_E1_B1_12	PRJNA431392	3	1	-10.749787	-9.368344	-20.57556	-28.79359	9.341379	0.1016088	9.088892	1.1644886	0.0158193	-0.0085744	-10.191150	-6.816082
D10_IVC4_E1_B1_13	PRJNA431392	4	7	-23.630573	54.296280	39.70749	-59.28475	11.469043	-5.4803004	10.867358	-4.4557619	0.0265396	-0.0007893	9.937391	-12.877551
D10_IVC4_E1_B1_14	PRJNA431392	2	11	-22.174698	-12.000992	-46.05968	-15.38952	7.953345	0.0190667	7.878169	0.9776146	0.0184427	0.0036553	-12.004497	-5.443837

studies <- tibble::tribble(
    ~bioproject, ~citation,
    "PRJEB11202", "Petropoulos et al. 2016",
    "PRJEB40781", "Tyser et al. 2021",
    "PRJNA555602", "Zheng et al. 2019",
    "PRJNA431392", "Zhou et al. 2019",
    "PRJNA737139", "Kagawa et al. 2021",
    "PRJNA632839", "Yu et al. 2021",
    "PRJNA658478", "Liu et al. 2021",
    "PRJNA720968", "Yanagida et al. 2021",
    "PRJNA667174", "Fan et al. 2021",
    "PRJNA738498", "Sozen et al. 2021",
    "PRJNA737139", "Kagawa et al. 2021"
)

studies <- setNames(
    object = studies$citation,
    nm = studies$bioproject
)

embedding |>
    dplyr::count(batch, name = "num_cells") |>
    dplyr::mutate(
        study = studies[batch]
    ) |>
    gt::gt() |>
    gt::tab_options(table.font.size = "median") |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    )

	batch	num_cells	study
	PRJEB11202	1529	Petropoulos et al. 2016
	PRJEB40781	1195	Tyser et al. 2021
	PRJNA431392	5911	Zhou et al. 2019
	PRJNA737139	2067	Kagawa et al. 2021
Sum	—	10,702	—

Metadata

# PRJNA431392; Zhou et al. 2019
embryos_selected_PRJNA431392 <- c(
    "ha_D6_E2",
    "hm_D6_E1",
    "hm_D6_E2",
    "hm_D8_E2",
    "hm_D8_E3",
    "hm_D8_E5",
    "ha_D8_E1",
    "hm_D8_E1",
    "hv_D8_E1",
    "hv_D8_E2",
    "hv_D8_E3",
    "hv_D10_E6",
    "ha_D10_E1",
    "ha_D10_E2",
    "hm_D10_E4",
    "hm_D10_E9",
    "hv_D10_E7",
    "hv_D10_E8",
    "ha_D12_E1",
    "hv_D12_E1",
    "hv_D12_E2"
)

cell_metadata_PRJNA431392 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJNA431392",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        developmental_stage = factor(
            Day,
            levels = stringr::str_sort(
                unique(Day),
                numeric = TRUE
            )
        ),
        lineage = factor(
            Lineage,
            levels = c(
                "EPI",
                "PE",
                "TE",
                "MIX"
            )
        ),
        `3184` = case_when(
            Ori_Day_Emb %in% embryos_selected_PRJNA431392 ~ "1",
            TRUE ~ "0"
        )
    ) |>
    dplyr::rename(cell = Sample)


# PRJEB40781; Tyser et al. 2021
cell_metadata_PRJEB40781 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJEB40781",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        lineage = factor(lineage),
        lineage_ontology = factor(lineage_ontology),
        sampling_site = factor(sampling_site)
    )


# PRJEB11202; Petropoulos et al. 2016
cell_metadata_PRJEB11202 <- vroom::vroom(
    file = file.path(
        PROJECT_DIR,
        "raw",
        "public",
        "PRJEB11202",
        "matrix/cell_metadata.csv"
    )
) |>
    dplyr::mutate(
        developmental_stage = stringr::str_remove(
            string = individual,
            pattern = "\\..+$"
        ),
        developmental_stage = factor(
            developmental_stage,
            levels = stringr::str_sort(
                unique(developmental_stage),
                numeric = TRUE
            )
        ),
        #
        lineage = factor(
            inferred_lineage,
            levels = c(
                "epiblast",
                "primitive_endoderm",
                "trophectoderm",
                "not_applicable"
            )
        )
    ) |>
    dplyr::select(
        cell, lineage, developmental_stage
    )

Visualization

Clustering

p_embedding_leiden <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$leiden |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Leiden"),
    label_position = NULL,
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = FALSE,
    shuffle_values = FALSE,
    rasterise = RASTERISED
) +
    theme_customized()

p_embedding_UMI <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = log10(
        Matrix::colSums(matrix_readcount_use[, embedding$cell])
    ),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_MT <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(
            cell_metadata # |> dplyr::select(-batch)
        ) |>
        dplyr::pull(mt_percentage),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; MT %"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE * 1,
    geom_point_alpha = 1,
    sort_values = TRUE,
    shuffle_values = FALSE,
    label_size = 2.5,
    label_hjust = 0,
    label_vjust = 0,
    rasterise = RASTERISED,
    legend_size = 2,
    legend_ncol = 1
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    )

p_embedding_batch <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding$batch |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Batch"),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE / 2,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        values = scales::hue_pal()(n = length(unique(embedding$batch))),
        labels = studies
    )

purrr::reduce(
    list(
        p_embedding_leiden,
        p_embedding_batch,
        p_embedding_UMI,
        p_embedding_MT
    ), `+`
) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

purrr::map(unique(embedding$batch), \(x) {
    plot_embedding(
        embedding = embedding[, c(x_column, y_column)],
        color_values = as.integer(embedding$batch == x) |> as.factor(),
        label = glue::glue(
            "{EMBEDDING_TITLE_PREFIX}; {studies[x]}, ",
            "{sum(as.integer(embedding$batch == x), na.rm = TRUE)}"
        ),
        label_position = NULL,
        show_color_value_labels = FALSE,
        show_color_legend = FALSE,
        geom_point_size = GEOM_POINT_SIZE,
        sort_values = TRUE,
        shuffle_values = FALSE,
        rasterise = RASTERISED,
        legend_size = 2
    ) +
        theme_customized(
            legend_key_size = 2,
            legend_text_size = 5
        ) +
        scale_color_manual(
            values = c("grey70", "salmon")
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Cell annotation

Petropoulos et al. 2016

# Petropoulos et al. 2016
bioproject <- "PRJEB11202"
cell_metadata_selected <- cell_metadata_PRJEB11202

selected_column <- "developmental_stage"
p_embedding_developmental_stage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

list(
    p_embedding_lineage,
    p_embedding_developmental_stage
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Salmon: highlighted group of cells; Light grey: cells belonging to this dataset but not the highlighted group; Dark grey: cells belonging to other datasets.

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "lineage",
    label = studies[bioproject],
    n_cols = 2
)

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "developmental_stage",
    label = studies[bioproject],
    n_cols = 2
)

Zhou et al. 2019

# Zhou et al. 2019
bioproject <- "PRJNA431392"
cell_metadata_selected <- cell_metadata_PRJNA431392
selected_column <- "developmental_stage"

p_embedding_developmental_stage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

selected_column <- "3184"
p_embedding_chosen <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = c("grey70", "salmon"),
        na.value = "#7F7F7F"
    )

list(
    p_embedding_lineage,
    p_embedding_developmental_stage,
    p_embedding_chosen
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "developmental_stage",
    label = studies[bioproject],
    n_cols = 2
)

Tyser et al. 2021

# Tyser et al. 2021
bioproject <- "PRJEB40781"
cell_metadata_selected <- cell_metadata_PRJEB40781

selected_column <- "sampling_site"
p_embedding_sampling_site <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "grey70"
    )

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

list(
    p_embedding_lineage,
    p_embedding_sampling_site
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "lineage",
    label = studies[bioproject],
    n_cols = 2
)

Kagawa et al. 2021

# Kagawa et al. 2021
bioproject <- "PRJNA737139"
cell_metadata_selected <- cell_metadata_PRJNA737139

selected_column <- "source_name"
p_embedding_source <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

selected_column <- "lineage"
p_embedding_lineage <- plot_embedding(
    embedding = embedding[, c(x_column, y_column)],
    color_values = embedding |>
        dplyr::left_join(cell_metadata_selected) |>
        dplyr::pull(.data[[selected_column]]),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; {studies[bioproject]}; ",
        "{selected_column |>
        stringr::str_to_title() |>
        stringr::str_replace(pattern = \"_\", replacement = \" \")}"
    ),
    label_position = NULL,
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = GEOM_POINT_SIZE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    legend_size = 2
) +
    theme_customized(
        legend_key_size = 2,
        legend_text_size = 5
    ) +
    scale_color_manual(
        na.translate = TRUE,
        values = scales::hue_pal()(
            n = length(unique(cell_metadata_selected[[selected_column]]))
        ),
        na.value = "#7F7F7F"
    )

list(
    p_embedding_source,
    p_embedding_lineage
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

plot_embedding_highlight(
    embedding = embedding,
    x = cell_metadata_selected,
    y = "lineage",
    label = studies[bioproject],
    n_cols = 2
)

R session info

devtools::session_info()

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.1.2 (2021-11-01)
 os       macOS Monterey 12.1
 system   aarch64, darwin20.6.0
 ui       unknown
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Chicago
 date     2022-01-24
 pandoc   2.17.0.1 @ /opt/homebrew/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package       * version     date (UTC) lib source
 assertthat      0.2.1       2019-03-21 [1] CRAN (R 4.1.1)
 backports       1.4.1       2021-12-13 [1] CRAN (R 4.1.2)
 beeswarm        0.4.0       2021-06-01 [1] CRAN (R 4.1.2)
 bit             4.0.4       2020-08-04 [1] CRAN (R 4.1.1)
 bit64           4.0.5       2020-08-30 [1] CRAN (R 4.1.1)
 brio            1.1.3       2021-11-30 [1] CRAN (R 4.1.2)
 broom           0.7.11      2022-01-03 [1] CRAN (R 4.1.2)
 cachem          1.0.6       2021-08-19 [1] CRAN (R 4.1.1)
 callr           3.7.0       2021-04-20 [1] CRAN (R 4.1.1)
 cellranger      1.1.0       2016-07-27 [1] CRAN (R 4.1.1)
 checkmate       2.0.0       2020-02-06 [1] CRAN (R 4.1.1)
 cli             3.1.1       2022-01-20 [1] CRAN (R 4.1.2)
 colorspace      2.0-2       2021-06-24 [1] CRAN (R 4.1.1)
 crayon          1.4.2       2021-10-29 [1] CRAN (R 4.1.1)
 curl            4.3.2       2021-06-23 [1] CRAN (R 4.1.1)
 data.table      1.14.2      2021-09-27 [1] CRAN (R 4.1.1)
 DBI             1.1.2       2021-12-20 [1] CRAN (R 4.1.2)
 dbplyr          2.1.1       2021-04-06 [1] CRAN (R 4.1.1)
 desc            1.4.0       2021-09-28 [1] CRAN (R 4.1.1)
 devtools        2.4.3.9000  2022-01-22 [1] Github (r-lib/devtools@41280ac)
 digest          0.6.29      2021-12-01 [1] CRAN (R 4.1.2)
 dplyr         * 1.0.7.9000  2022-01-12 [1] Github (tidyverse/dplyr@0501335)
 dtplyr          1.2.1       2022-01-19 [1] CRAN (R 4.1.2)
 ellipsis        0.3.2       2021-04-29 [1] CRAN (R 4.1.1)
 evaluate        0.14        2019-05-28 [1] CRAN (R 4.1.1)
 extrafont     * 0.17        2014-12-08 [1] CRAN (R 4.1.1)
 extrafontdb     1.0         2012-06-11 [1] CRAN (R 4.1.1)
 fansi           1.0.2       2022-01-14 [1] CRAN (R 4.1.2)
 farver          2.1.0       2021-02-28 [1] CRAN (R 4.1.1)
 fastmap         1.1.0       2021-01-25 [1] CRAN (R 4.1.1)
 forcats       * 0.5.1.9000  2021-11-29 [1] Github (tidyverse/forcats@b4dade0)
 fs              1.5.2.9000  2021-12-09 [1] Github (r-lib/fs@6d1182f)
 gargle          1.2.0       2021-07-02 [1] CRAN (R 4.1.1)
 generics        0.1.1       2021-10-25 [1] CRAN (R 4.1.1)
 ggbeeswarm      0.6.0       2017-08-07 [1] CRAN (R 4.1.2)
 ggplot2       * 3.3.5       2021-06-25 [1] CRAN (R 4.1.1)
 ggrastr         1.0.1       2021-12-08 [1] Github (VPetukhov/ggrastr@7aed9af)
 glue            1.6.1.9000  2022-01-23 [1] Github (tidyverse/glue@3da70df)
 googledrive     2.0.0       2021-07-08 [1] CRAN (R 4.1.1)
 googlesheets4   1.0.0       2021-07-21 [1] CRAN (R 4.1.1)
 gt              0.3.1.9000  2022-01-17 [1] Github (rstudio/gt@fcabb41)
 gtable          0.3.0.9000  2021-10-28 [1] Github (r-lib/gtable@a0bd272)
 haven           2.4.3       2021-08-04 [1] CRAN (R 4.1.1)
 highr           0.9         2021-04-16 [1] CRAN (R 4.1.1)
 hms             1.1.1       2021-09-26 [1] CRAN (R 4.1.1)
 htmltools       0.5.2       2021-08-25 [1] CRAN (R 4.1.1)
 htmlwidgets     1.5.4       2021-09-08 [1] CRAN (R 4.1.1)
 httr            1.4.2       2020-07-20 [1] CRAN (R 4.1.1)
 jsonlite        1.7.3       2022-01-17 [1] CRAN (R 4.1.2)
 knitr           1.37.1      2021-12-21 [1] https://yihui.r-universe.dev (R 4.1.2)
 labeling        0.4.2       2020-10-20 [1] CRAN (R 4.1.1)
 lattice         0.20-45     2021-09-22 [2] CRAN (R 4.1.2)
 lifecycle       1.0.1       2021-09-24 [1] CRAN (R 4.1.1)
 lubridate       1.8.0       2022-01-20 [1] Github (tidyverse/lubridate@566590f)
 magrittr        2.0.1       2020-11-17 [1] CRAN (R 4.1.1)
 Matrix        * 1.4-0       2021-12-08 [2] CRAN (R 4.1.2)
 memoise         2.0.1       2021-11-26 [1] CRAN (R 4.1.2)
 modelr          0.1.8.9000  2021-10-27 [1] Github (tidyverse/modelr@16168e0)
 munsell         0.5.0       2018-06-12 [1] CRAN (R 4.1.1)
 patchwork     * 1.1.0.9000  2021-10-27 [1] Github (thomasp85/patchwork@79223d3)
 pillar          1.6.4       2021-10-18 [1] CRAN (R 4.1.1)
 pkgbuild        1.3.1       2021-12-20 [1] CRAN (R 4.1.2)
 pkgconfig       2.0.3       2019-09-22 [1] CRAN (R 4.1.1)
 pkgload         1.2.4       2021-11-30 [1] CRAN (R 4.1.2)
 png             0.1-7       2013-12-03 [1] CRAN (R 4.1.1)
 prettyunits     1.1.1       2020-01-24 [1] CRAN (R 4.1.1)
 processx        3.5.2       2021-04-30 [1] CRAN (R 4.1.1)
 ps              1.6.0       2021-02-28 [1] CRAN (R 4.1.1)
 purrr         * 0.3.4       2020-04-17 [1] CRAN (R 4.1.1)
 R.cache         0.15.0      2021-04-30 [1] CRAN (R 4.1.1)
 R.methodsS3     1.8.1       2020-08-26 [1] CRAN (R 4.1.1)
 R.oo            1.24.0      2020-08-26 [1] CRAN (R 4.1.1)
 R.utils         2.11.0      2021-09-26 [1] CRAN (R 4.1.1)
 R6              2.5.1.9000  2021-12-09 [1] Github (r-lib/R6@1b05b89)
 ragg            1.2.1.9000  2021-12-08 [1] Github (r-lib/ragg@c68c666)
 Rcpp            1.0.8       2022-01-13 [1] CRAN (R 4.1.2)
 readr         * 2.1.1       2021-11-30 [1] CRAN (R 4.1.2)
 readxl          1.3.1.9000  2022-01-20 [1] Github (tidyverse/readxl@2ccb82c)
 remotes         2.4.2       2022-01-24 [1] Github (r-lib/remotes@7b0ee01)
 reprex          2.0.1       2021-08-05 [1] CRAN (R 4.1.1)
 reticulate      1.23        2022-01-14 [1] CRAN (R 4.1.2)
 rlang           1.0.0       2022-01-20 [1] Github (r-lib/rlang@f2fbaad)
 rmarkdown       2.11.12     2022-01-24 [1] Github (rstudio/rmarkdown@b53a7ce)
 rprojroot       2.0.2       2020-11-15 [1] CRAN (R 4.1.1)
 rstudioapi      0.13.0-9000 2022-01-15 [1] Github (rstudio/rstudioapi@5d0f087)
 Rttf2pt1        1.3.9       2021-07-22 [1] CRAN (R 4.1.1)
 rvest           1.0.2       2021-10-16 [1] CRAN (R 4.1.1)
 sass            0.4.0       2021-05-12 [1] CRAN (R 4.1.1)
 scales          1.1.1       2020-05-11 [1] CRAN (R 4.1.1)
 sessioninfo     1.2.2       2021-12-06 [1] CRAN (R 4.1.2)
 stringi         1.7.6       2021-11-29 [1] CRAN (R 4.1.2)
 stringr       * 1.4.0.9000  2022-01-24 [1] Github (tidyverse/stringr@85f6140)
 styler        * 1.6.2.9000  2022-01-17 [1] Github (r-lib/styler@9274aed)
 systemfonts     1.0.3.9000  2021-12-07 [1] Github (r-lib/systemfonts@414114e)
 testthat        3.1.2.9000  2022-01-21 [1] Github (r-lib/testthat@54b9db2)
 textshaping     0.3.6       2021-10-13 [1] CRAN (R 4.1.1)
 tibble        * 3.1.6.9000  2022-01-18 [1] Github (tidyverse/tibble@7aa54e6)
 tidyr         * 1.1.4       2021-09-27 [1] CRAN (R 4.1.1)
 tidyselect      1.1.1       2021-04-30 [1] CRAN (R 4.1.1)
 tidyverse     * 1.3.1.9000  2021-12-08 [1] Github (tidyverse/tidyverse@6186fbf)
 tzdb            0.2.0       2021-10-27 [1] CRAN (R 4.1.1)
 usethis         2.1.5.9000  2022-01-20 [1] Github (r-lib/usethis@57b109a)
 utf8            1.2.2       2021-07-24 [1] CRAN (R 4.1.1)
 vctrs           0.3.8       2021-04-29 [1] CRAN (R 4.1.1)
 vipor           0.4.5       2017-03-22 [1] CRAN (R 4.1.2)
 viridisLite     0.4.0       2021-04-13 [1] CRAN (R 4.1.1)
 vroom           1.5.7       2021-11-30 [1] CRAN (R 4.1.2)
 withr           2.4.3       2021-11-30 [1] CRAN (R 4.1.2)
 xfun            0.29        2021-12-14 [1] CRAN (R 4.1.2)
 xml2            1.3.3       2021-11-30 [1] CRAN (R 4.1.2)
 yaml            2.2.1       2020-02-01 [1] CRAN (R 4.1.1)

 [1] /opt/homebrew/lib/R/4.1/site-library
 [2] /opt/homebrew/Cellar/r/4.1.2/lib/R/library

─ Python configuration ───────────────────────────────────────────────────────
 python:         /Users/jialei/.pyenv/shims/python
 libpython:      /Users/jialei/.pyenv/versions/miniforge3-4.10.1-5/lib/libpython3.9.dylib
 pythonhome:     /Users/jialei/.pyenv/versions/miniforge3-4.10.1-5:/Users/jialei/.pyenv/versions/miniforge3-4.10.1-5
 version:        3.9.5 | packaged by conda-forge | (default, Jun 19 2021, 00:24:55)  [Clang 11.1.0 ]
 numpy:          /Users/jialei/.pyenv/versions/miniforge3-4.10.1-5/lib/python3.9/site-packages/numpy
 numpy_version:  1.20.3
 anndata:        /Users/jialei/.pyenv/versions/miniforge3-4.10.1-5/lib/python3.9/site-packages/anndata
 
 NOTE: Python version was forced by RETICULATE_PYTHON

──────────────────────────────────────────────────────────────────────────────