Single Cell Transcriptomes of Human Blastoids

Author

Jialei Duan

Published

Sun Sep 25, 2022 00:35:14-05:00

Doi
Abstract

Human blastoids provide a readily accessible, scalable, versatile and perturbable alternative to blastocysts for studying early human development, understanding early pregnancy loss and gaining insights into early developmental defects.


[1] "2022-09-25 00:33:35 CDT"
[1] "America/Chicago"

Preparation

Functions

Load required packages.

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)

load_matrix <- function(x) {
    matrix_readcount_use <- scipy.sparse$load_npz(
        file.path(x, "matrix_readcount.npz")
    )
    colnames(matrix_readcount_use) <- np$load(
        file.path(x, "matrix_readcount_barcodes.npy")
    )
    rownames(matrix_readcount_use) <- np$load(
        file.path(x, "matrix_readcount_features.npy")
    )

    return(matrix_readcount_use)
}

Python env

np <- reticulate::import("numpy", convert = TRUE)
scipy.sparse <- reticulate::import(module = "scipy.sparse", convert = TRUE)
reticulate::py_config()
python:         /Users/jialei/.pyenv/shims/python
libpython:      /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/libpython3.9.dylib
pythonhome:     /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10:/Users/jialei/.pyenv/versions/mambaforge-4.10.3-10
version:        3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33)  [Clang 13.0.1 ]
numpy:          /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
numpy_version:  1.22.4
numpy:          /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy

NOTE: Python version was forced by RETICULATE_PYTHON

Matrix

PROJECT_DIR <- "/Users/jialei/Dropbox/Data/Projects/UTSW/Human_blastoid"
MATRIX_DIR <- list(
    "github/data/matrices/LW36",
    "github/data/matrices/LW49_LW50_LW51_LW52",
    "github/data/matrices/LW58_LW59",
    "github/data/matrices/LW60_LW61",
    "raw/public/PRJEB11202/reformatted_matrix"
)

matrix_readcount_use <- purrr::map(MATRIX_DIR, \(x) {
    load_matrix(file.path(PROJECT_DIR, x))
}) |>
    purrr::reduce(cbind)

# clean up
rm(MATRIX_DIR)

Embedding

EMBEDDING_FILE <- "embedding.csv.gz"

embedding <- read_csv(
    file = file.path(
        PROJECT_DIR,
        "github",
        "data",
        EMBEDDING_FILE
    )
)

# clean up
rm(EMBEDDING_FILE)

Metadata

cell_metadata_PRJEB11202 <- read_delim(
    file.path(
        PROJECT_DIR,
        "raw/public/PRJEB11202/",
        "E-MTAB-3929.sdrf.tsv"
    ),
    delim = "\t"
) |>
    dplyr::select(
        `Comment[ENA_SAMPLE]`,
        `Comment[ENA_RUN]`,
        `Characteristics[developmental stage]`,
        `Characteristics[inferred lineage]`
    ) |>
    dplyr::rename(
        cell = `Comment[ENA_SAMPLE]`,
        run = `Comment[ENA_RUN]`,
        developmental_stage = `Characteristics[developmental stage]`,
        lineage = `Characteristics[inferred lineage]`
    ) |>
    dplyr::mutate(
        developmental_stage = str_replace(
            string = developmental_stage,
            pattern = "embryonic day ",
            replacement = "E"
        ),
        lineage = case_when(
            lineage == "epiblast" ~ "EPI",
            lineage == "primitive endoderm" ~ "HYP",
            lineage == "trophectoderm" ~ "TE",
            lineage == "not applicable" ~ "Pre-lineage"
        )
    )

embedding <- embedding |>
    dplyr::left_join(
        cell_metadata_PRJEB11202
    ) |>
    dplyr::mutate(
        developmental_stage = factor(developmental_stage),
        lineage = factor(lineage)
    )


Check memory usage.

walk(list(matrix_readcount_use, embedding), \(x) {
    print(object.size(x), units = "auto", standard = "SI")
})
496.5 MB
1.6 MB

Blastoids globally resemble blastocysts

Embedding visualization

EMBEDDING_TITLE_PREFIX <- "UMAP"

x_column <- "x_umap"
y_column <- "y_umap"

GEOM_POINT_SIZE <- 0.6
RASTERISED <- FALSE

Clustering & batch & UMI

p_embedding_louvain <- plot_embedding(
    data = embedding[, c(x_column, y_column)],
    color = embedding$louvain |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Cluster"),
    color_labels = TRUE,
    color_legend = FALSE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding()

p_embedding_batch <- plot_embedding(
    data = embedding[, c(x_column, y_column)],
    color = embedding$batch |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Batch"),
    color_labels = FALSE,
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding()

p_embedding_UMI <- plot_embedding(
    data = embedding[, c(x_column, y_column)],
    color = log10(Matrix::colSums(matrix_readcount_use[, embedding$cell])),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding()

selected_feature <- "ENSG00000204531_POU5F1"
p_embedding_POU5F1 <- plot_embedding(
    data = embedding[, c(x_column, y_column)],
    color = log10(
        calc_cpm(matrix_readcount_use[, embedding$cell])
        [selected_feature, ] + 1
    ),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; ",
        "{selected_feature |> stringr::str_remove(pattern = \"^E.+_\")}"
    ),
    color_legend = TRUE,
    sort_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE,
    na_value = "grey80"
) +
    theme_customized_embedding()
embedding |>
    dplyr::mutate(
        num_umis = colSums(matrix_readcount_use[, cell]),
        num_features = colSums(matrix_readcount_use[, cell] > 0),
    ) |>
    dplyr::group_by(louvain) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        median_features = median(num_features)
    ) |>
    gt::gt() |>
    gt::data_color(
        columns = c(median_umis),
        colors = scales::col_numeric(
            palette = c(
                "green", "orange", "red"
            ),
            domain = c(1000, 32000)
        )
    ) |>
    gt::summary_rows(
        columns = c(louvain),
        fns = list(
            Count = ~ n()
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c(median_umis:median_features),
        fns = list(
            Mean = ~ mean(.)
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    ) |>
    gt::tab_header(
        title = gt::md("**Blastoid Clustering**; Cluster")
    )
Blastoid Clustering; Cluster
louvain num_cells median_umis median_features
0 1533 11534.0 3074.0
1 1113 18092.0 3612.0
2 1079 11308.0 2940.0
3 996 14659.0 3220.5
4 924 1108.5 627.0
5 725 6915.0 1981.0
6 710 9711.5 2158.0
7 672 10544.0 2791.5
8 649 11961.0 2905.0
9 570 19201.0 3656.0
10 410 23705.5 4208.5
11 366 13910.5 3355.5
12 365 16942.0 3437.0
13 275 8010.0 2455.0
14 249 31950.0 4954.0
15 202 8217.0 2452.0
16 138 10242.5 2852.5
17 120 17426.0 3400.0
18 86 18158.0 3853.5
Count 19
Mean 13,873 3,049
Sum 11,182
embedding |>
    dplyr::mutate(
        num_umis = colSums(matrix_readcount_use[, cell]),
        num_features = colSums(matrix_readcount_use[, cell] > 0),
    ) |>
    dplyr::group_by(batch) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        median_features = median(num_features)
    ) |>
    dplyr::mutate(
        sample = dplyr::case_when(
            batch == "PRJEB11202" ~ "Petropoulos et al., 2016",
            batch == "LW60" ~ "Blastoid, D9; HT; 5i/L/A",
            batch == "LW61" ~ "Blastoid, D9; HT; 5i/L/A"
        )
    ) |>
    dplyr::select(
        sample, dplyr::everything()
    ) |>
    gt::gt() |>
    gt::data_color(
        columns = c(median_umis),
        colors = scales::col_numeric(
            palette = c(
                "green", "orange", "red"
            ),
            domain = c(7000, 1600000)
        )
    ) |>
    gt::summary_rows(
        columns = c(sample:batch),
        fns = list(
            Count = ~ n()
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c(median_umis:median_features),
        fns = list(
            Mean = ~ mean(.)
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    ) |>
    gt::tab_header(
        title = gt::md("**Blastoid Clustering**; Batch")
    )
Blastoid Clustering; Batch
sample batch num_cells median_umis median_features
Blastoid, D9; HT; 5i/L/A LW60 4497 14421 3337
Blastoid, D9; HT; 5i/L/A LW61 5156 7625 2185
Petropoulos et al., 2016 PRJEB11202 1529 1551093 10305
Count 3 3
Mean 524,380 5,276
Sum 11,182
purrr::reduce(list(
    p_embedding_louvain,
    p_embedding_batch,
    p_embedding_UMI,
    p_embedding_POU5F1
), `+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Developmental stages


Salmon: highlighted group of cells; Light grey: cells belonging to this dataset but not the highlighted group; Dark grey: cells belonging to other datasets.

purrr::map(levels(embedding$developmental_stage), \(x) {
    plot_embedding(
        data = embedding[, c(x_column, y_column)],
        color = as.integer(embedding$developmental_stage == x) |> as.factor(),
        label = glue::glue(
            "{EMBEDDING_TITLE_PREFIX}; Petropoulos et al., 2016, {x}"
        ),
        color_legend = FALSE,
        sort_values = TRUE,
        geom_point_size = GEOM_POINT_SIZE,
    ) +
        theme_customized_embedding() +
        ggplot2::scale_color_manual(values = c("grey70", "salmon"))
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 3, byrow = TRUE) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Polishing

color_palette_cluster <- c(
    "0" = "#8DD3C7",
    "1" = "#9EDAE5FF",
    "2" = "#BEBADA",
    "3" = "#FB8072",
    "4" = "#80B1D3",
    "5" = "#FDB462",
    "6" = "#B3DE69",
    "7" = "#FCCDE5",
    "8" = "#DC863B",
    "9" = "#BC80BD",
    "10" = "#11c638",
    "11" = "#BCBD22FF",
    "12" = "#17BECFFF",
    "13" = "#AEC7E8FF",
    "14" = "#EAD3C6",
    "15" = "#98DF8AFF",
    "16" = "#FF9896FF",
    "17" = "#C5B0D5FF",
    "18" = "#C49C94FF",
    "19" = "#F7B6D2FF",
    "20" = "#D33F6A",
    "21" = "#8E063B",
    "22" = "#023FA5"
)

cluster_labels <- embedding |>
    dplyr::group_by(.data[["louvain"]]) |>
    dplyr::summarise(
        x = quantile(.data[[x_column]], 0.5),
        y = quantile(.data[[y_column]], 0.5),
        .groups = "drop"
    ) |>
    as.data.frame()
cluster_labels[cluster_labels[["louvain"]] == 14, c("x", "y")] <- c(6.8, -1.8)

clusters_unidentified <- c(2, 3, 4, 5, 6, 7, 13, 15, 16)
cluster_labels <- cluster_labels |>
    dplyr::mutate(
        label = dplyr::case_when(
            louvain %in% clusters_unidentified ~ paste0(
                "U",
                as.character(louvain)
            ),
            TRUE ~ as.character(louvain)
        )
    )

# cluster
p_embedding_cluster <- embedding |>
    arrange(desc(louvain)) |>
    ggplot(
        aes(
            x = .data[[x_column]],
            y = .data[[y_column]],
            color = .data[["louvain"]] |> as.factor()
        )
    ) +
    geom_point(
        size = 0.45,
        alpha = 1,
        stroke = 0,
        shape = 16
    ) +
    scale_color_manual(
        values = color_palette_cluster
    ) +
    theme_void() +
    ggplot2::guides(color = "none") +
    ggplot2::annotate(
        "text",
        family = "Arial",
        x = cluster_labels[, "x"],
        y = cluster_labels[, "y"],
        label = cluster_labels[, "label"],
        parse = TRUE,
        size = 2,
        color = c("black")
    )

# lineage
p_embedding_lineage <- plot_embedding(
    data = embedding[, c(x_column, y_column)],
    color = embedding |>
        dplyr::mutate(
            value = factor(lineage,
                levels = c("Blastoid", "TE", "HYP", "EPI", "Pre-lineage")
            )
        ) |>
        dplyr::pull(value),
    label = NULL,
    color_legend = TRUE,
    sort_values = TRUE,
    geom_point_size = 0.45,
) +
    theme_customized_embedding(void = TRUE) +
    ggplot2::scale_color_manual(
        name = NULL,
        values = c("grey70", "#8ace7e", "#ff684c", "#9467bd", "#ffd8b1"),
        breaks = c("Blastoid", "EPI", "HYP", "TE", "Pre-lineage"),
        labels = c(
            "Blastoid cells", "Epiblast (EPI)", "Hypoblast (HYP)",
            "Trophectoderm (TE)", "Pre-lineage"
        ),
        na.value = "grey70"
    )

# developmental stage
p_embedding_developmental_stage <- plot_embedding(
    data = embedding[, c(x_column, y_column)],
    color = embedding$developmental_stage,
    label = NULL,
    color_legend = TRUE,
    sort_values = TRUE,
    geom_point_size = 0.45,
) +
    theme_customized_embedding(void = TRUE) +
    ggplot2::scale_color_manual(
        name = NULL,
        values = c(
            "grey70",
            ggthemes::tableau_color_pal("Tableau 10")(
                length(unique(embedding$developmental_stage))
            )
        ),
        labels = c("Blastoid cells", "E3", "E4", "E5", "E6", "E7"),
        na.value = "grey70"
    )
features_selected <- c(
    "ENSG00000181449_SOX2",
    "ENSG00000187498_COL4A1",
    "ENSG00000179348_GATA2"
)

lineage_labels <- c(
    "(Epiblast)",
    "(Hypoblast)",
    "(Trophectoderm)"
)

CB_POSITION <- c(0.875, 0.32)
x_label <- ggplot_build(
    p_embedding_POU5F1
)$layout$panel_params[[1]][c("x.range")] |>
    unlist() |>
    quantile(0.575)
y_label <- ggplot_build(
    p_embedding_POU5F1
)$layout$panel_params[[1]][c("y.range")] |>
    unlist() |>
    quantile(0.8)

p_embedding_SOX2_COL4A1_GATA2 <- purrr::map2(
    features_selected, lineage_labels, \(x, y) {
        selected_feature <- x
        values <- log10(calc_cpm(matrix_readcount_use)[x, embedding$cell] + 1)
        values[embedding$batch == "PRJEB11202"] <- NA

        plot_embedding(
            data = embedding[, c(x_column, y_column)],
            color = log10(
                calc_cpm(matrix_readcount_use[, embedding$cell])
                [selected_feature, ] + 1
            ),
            label = NULL,
            color_legend = TRUE,
            sort_values = TRUE,
            rasterise = RASTERISED,
            geom_point_size = 0.5,
            na_value = "grey70"
        ) +
            theme_customized_embedding(
                x = CB_POSITION[1],
                y = CB_POSITION[2],
                void = TRUE,
                legend_key_size = c(1.5, 1.5)
            ) +
            ggplot2::annotate(
                geom = "text",
                x = x_label,
                y = y_label,
                label = stringr::str_c(
                    x |> stringr::str_remove(pattern = "^E.+_"),
                    y,
                    sep = "\n"
                ),
                family = "Arial",
                color = "black",
                size = 5 / ggplot2::.pt,
                hjust = 0.5,
                vjust = 0
                # parse = TRUE
            )
    }
)
c(
    list(
        p_embedding_cluster,
        p_embedding_lineage,
        p_embedding_developmental_stage
    ),
    p_embedding_SOX2_COL4A1_GATA2
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Expression

Embedding

features_selected <- c(
    "ENSG00000204531_POU5F1",
    "ENSG00000203909_DPPA5",
    "ENSG00000184344_GDF3",
    "ENSG00000156574_NODAL",
    "ENSG00000111704_NANOG",
    "ENSG00000075388_FGF4",
    #
    "ENSG00000164736_SOX17",
    "ENSG00000125798_FOXA2",
    "ENSG00000136574_GATA4",
    "ENSG00000141448_GATA6",
    "ENSG00000134853_PDGFRA",
    "ENSG00000115414_FN1",
    #
    "ENSG00000107485_GATA3",
    "ENSG00000118777_ABCG2",
    "ENSG00000165556_CDX2",
    "ENSG00000137869_CYP19A1",
    "ENSG00000172818_OVOL1",
    "ENSG00000126353_CCR7"
)

purrr::map(features_selected, \(x) {
    selected_feature <- x
    values <- log10(calc_cpm(matrix_readcount_use)[x, embedding$cell] + 1)
    values[embedding$batch == "PRJEB11202"] <- NA

    plot_embedding(
        data = embedding[, c(x_column, y_column)],
        color = values,
        label = NULL,
        color_legend = TRUE,
        sort_values = TRUE,
        rasterise = RASTERISED,
        geom_point_size = 0.5,
        na_value = "grey70"
    ) +
        theme_customized_embedding(
            x = CB_POSITION[1],
            y = CB_POSITION[2],
            void = TRUE,
            legend_key_size = c(1.5, 1.5)
        ) +
        ggplot2::annotate(
            geom = "text",
            x = x_label,
            y = y_label,
            label = x |> stringr::str_remove(pattern = "^E.+_"),
            family = "Arial",
            color = "black",
            size = 5 / ggplot2::.pt,
            hjust = 0.5,
            vjust = 0
            # parse = TRUE
        )
}) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 3, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Lollipop

clusters_selected_lollipop <- c(
    11,
    18,
    0, 1, 8, 9, 12, 17
)

cells_selected_lollipop <- purrr::map(clusters_selected_lollipop, \(x) {
    embedding |>
        filter(
            louvain == x,
            batch != "PRJEB11202"
        ) |>
        pull(cell)
})
names(cells_selected_lollipop) <- clusters_selected_lollipop

features_selected <- c(
    "ENSG00000181449_SOX2",
    "ENSG00000156574_NODAL",
    "ENSG00000111704_NANOG",
    "ENSG00000147596_PRDM14",
    "ENSG00000075388_FGF4",
    #
    "ENSG00000164736_SOX17",
    "ENSG00000125798_FOXA2",
    "ENSG00000136574_GATA4",
    "ENSG00000141448_GATA6",
    "ENSG00000115414_FN1",
    "ENSG00000187498_COL4A1",
    #
    "ENSG00000179348_GATA2",
    "ENSG00000107485_GATA3",
    "ENSG00000118777_ABCG2",
    #
    "ENSG00000126353_CCR7",
    "ENSG00000137869_CYP19A1",
    "ENSG00000169550_MUC15",
    "ENSG00000172818_OVOL1"
)

plot_lollipop(
    cells = cells_selected_lollipop,
    features = features_selected,
    matrix_cpm = calc_cpm(matrix_readcount_use),
    size_range_limits = c(0, 4),
    dot_size = 3
) +
    ggplot2::theme(
        legend.box = "horizontal",
        axis.text.x.top = ggplot2::element_text(
            angle = 90, vjust = 0.5, hjust = 0
        )
    )

Violin

features_selected <- c(
    "ENSG00000204531_POU5F1",
    "ENSG00000203909_DPPA5",
    "ENSG00000125798_FOXA2",
    "ENSG00000134853_PDGFRA",
    "ENSG00000107485_GATA3",
    "ENSG00000118777_ABCG2"
)

clusters_selected_violin <- c(
    11,
    18,
    0, 1, 8, 9, 12, 17
)

# blastoid
cells_violin <- purrr::map(clusters_selected_violin, \(x) {
    embedding |>
        dplyr::filter(
            louvain == x,
            batch != "PRJEB11202"
        ) |>
        dplyr::pull(cell)
})
names(cells_violin) <- clusters_selected_violin

p_violin_blastoid <- plot_violin(
    cells = cells_violin,
    features = features_selected,
    matrix_cpm = calc_cpm(matrix_readcount_use),
    x_range_breaks = NULL
) +
    ggplot2::scale_fill_manual(
        name = NULL,
        values = color_palette_cluster
    ) +
    ggplot2::scale_color_manual(
        name = NULL,
        values = color_palette_cluster
    ) +
    ggplot2::labs(title = "Blastoids; 10x Genomics") +
    theme_customized_violin(panel_border_color = "black") +
    ggplot2::theme(
        plot.title = ggplot2::element_text(
            family = "Arial",
            size = 8,
            margin = ggplot2::margin(
                t = 0, r = 0, b = 1, l = 0,
                unit = "mm"
            ),
            hjust = 0.5
        )
    )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. Please use
`linewidth` instead.
ℹ The deprecated feature was likely used in the ggplot2 package.
  Please report the issue at <https://github.com/tidyverse/ggplot2/issues>.
# in vivo
cells_violin <- purrr::map(clusters_selected_violin, \(x) {
    embedding |>
        filter(
            louvain == x,
            batch == "PRJEB11202"
        ) |>
        pull(cell)
})
names(cells_violin) <- clusters_selected_violin

# re-aligned
p_violin_PRJEB11202 <- plot_violin(
    cells = cells_violin,
    features = features_selected,
    matrix_cpm = calc_cpm(matrix_readcount_use),
    x_range_breaks = NULL,
    y_title = NULL
) +
    ggplot2::scale_fill_manual(
        name = NULL,
        values = color_palette_cluster
    ) +
    ggplot2::scale_color_manual(
        name = NULL,
        values = color_palette_cluster
    ) +
    ggplot2::labs(title = "Human pre-implantation embryos; Smart-Seq2") +
    theme_customized_violin(panel_border_color = "black") +
    ggplot2::theme(
        plot.title = ggplot2::element_text(
            family = "Arial",
            size = 8,
            margin = ggplot2::margin(
                t = 0, r = 0, b = 1, l = 0,
                unit = "mm"
            ),
            hjust = 0.5
        )
    )

p_violin_PRJEB11202 <- p_violin_PRJEB11202 +
    theme(
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank()
    )

p_violin_blastoid_dims <- patchwork::get_dim(p_violin_blastoid)
p_violin_PRJEB11202_aligned <- patchwork::set_dim(
    p_violin_PRJEB11202,
    p_violin_blastoid_dims
)
gridExtra::grid.arrange(
    p_violin_blastoid, p_violin_PRJEB11202_aligned,
    ncol = 2,
    clip = FALSE
)

Cluster composition

clusters_selected <- c(
    11,
    18,
    0, 1, 8, 9, 12, 17,
    10, 14
)

cell_distribution_labels_right <- calc_group_composition(
    data = embedding,
    x = "louvain",
    group = "lineage"
) |>
    dplyr::filter(
        is.na(lineage),
        louvain %in% clusters_selected
    ) |>
    dplyr::mutate(
        louvain = factor(
            louvain,
            levels = rev(louvain)
        )
    )

cell_distribution_labels_left <- tibble::tribble(
    ~louvain, ~lineage, ~label_position, ~percentage,
    11L, "EPI", 0.068306011, 0.136612022,
    18L, "HYP", 0.20930232, 0.37209302,
    0L, "TE", 0.036855838, 0.063274625,
    1L, "TE", 0.20350404, 0.33513028,
    8L, "TE", 0.095531587, 0.181818182,
    9L, "TE", 0.19649123, 0.2877193,
    12L, "TE", 0.073972603, 0.126027397,
    17L, "TE", 0.09583335, 0.1916667,
    10L, "Pre-lineage", 0.284146342, 0.412195122,
    14L, "Pre-lineage", 0.24698795, 0.4939759
) |>
    dplyr::mutate(
        louvain = factor(
            louvain,
            levels = rev(clusters_selected)
        )
    )

calc_group_composition(
    data = embedding,
    x = "louvain",
    group = "lineage"
) |>
    dplyr::filter(louvain %in% clusters_selected) |>
    dplyr::mutate(
        louvain = factor(louvain, levels = rev(clusters_selected)),
        lineage = dplyr::case_when(
            is.na(lineage) ~ "Blastoid",
            TRUE ~ as.character(lineage)
        ),
        lineage = factor(lineage,
            levels = rev(c("EPI", "HYP", "TE", "Pre-lineage", "Blastoid"))
        )
    ) |>
    plot_barplot(
        x = "louvain",
        y = "percentage",
        z = "lineage"
    ) +
    ggplot2::theme(
        panel.grid.major.y = ggplot2::element_line(color = "grey80")
    ) +
    ggplot2::coord_flip() +
    ggplot2::scale_fill_manual(
        name = NULL,
        values = c("grey85", "#8ace7e", "#ff684c", "#9467bd", "#ffd8b1"),
        breaks = c("Blastoid", "EPI", "HYP", "TE", "Pre-lineage"),
        labels = c(
            "Blastoid cells", "Epiblast (EPI)", "Hypoblast (HYP)",
            "Trophectoderm (TE)", "Pre-lineage"
        )
    ) +
    ggplot2::geom_text(
        ggplot2::aes(
            y = label_position,
            x = louvain,
            label = scales::percent(percentage, accuracy = 1L),
            group = NULL
        ),
        size = 1.8,
        family = "Arial",
        color = "black",
        data = cell_distribution_labels_left,
        hjust = 0.5
    ) +
    ggplot2::geom_text(
        ggplot2::aes(
            y = 0.825,
            x = louvain,
            label = scales::percent(percentage, accuracy = 1L),
            group = NULL
        ),
        size = 1.8,
        family = "Arial",
        color = "black",
        data = cell_distribution_labels_right,
        hjust = 0
    )

Heatmap construction

# prepare features; cells; heatmap matrix;
features_heatmap <- c(
    "ENSG00000204531_POU5F1",
    "ENSG00000184344_GDF3",
    "ENSG00000203909_DPPA5",
    "ENSG00000181449_SOX2",
    "ENSG00000111704_NANOG",
    "ENSG00000241186_TDGF1",
    "ENSG00000156574_NODAL",
    "ENSG00000254339_AC064802.1",
    "ENSG00000283567_C19orf85",
    "ENSG00000151650_VENTX",
    "ENSG00000163032_VSNL1",
    "ENSG00000159231_CBR3",
    "ENSG00000145423_SFRP2",
    "ENSG00000006468_ETV1",
    "ENSG00000147596_PRDM14",
    #
    "ENSG00000136574_GATA4",
    "ENSG00000164266_SPINK1",
    "ENSG00000158966_CACHD1",
    "ENSG00000164093_PITX2",
    "ENSG00000087303_NID2",
    "ENSG00000174358_SLC6A19",
    "ENSG00000134962_KLB",
    "ENSG00000167780_SOAT2",
    "ENSG00000110245_APOC3",
    "ENSG00000170558_CDH2",
    "ENSG00000125848_FLRT3",
    "ENSG00000129538_RNASE1",
    "ENSG00000100079_LGALS2",
    "ENSG00000134853_PDGFRA",
    "ENSG00000164736_SOX17",
    "ENSG00000017427_IGF1",
    "ENSG00000275410_HNF1B",
    "ENSG00000118137_APOA1",
    "ENSG00000198336_MYL4",
    "ENSG00000171557_FGG",
    "ENSG00000125798_FOXA2",
    "ENSG00000146374_RSPO3",
    "ENSG00000115414_FN1",
    "ENSG00000164292_RHOBTB3",
    "ENSG00000141448_GATA6",
    "ENSG00000187498_COL4A1",
    #
    "ENSG00000196549_MME",
    "ENSG00000138814_PPP3CA",
    "ENSG00000082074_FYB1",
    "ENSG00000123191_ATP7B",
    "ENSG00000175318_GRAMD2A",
    "ENSG00000100593_ISM2",
    "ENSG00000265107_GJA5",
    "ENSG00000226887_ERVMER34-1",
    "ENSG00000183734_ASCL2",
    "ENSG00000143850_PLEKHA6",
    "ENSG00000103534_TMC5",
    "ENSG00000132470_ITGB4",
    "ENSG00000180999_C1orf105",
    "ENSG00000109610_SOD3",
    "ENSG00000143369_ECM1",
    "ENSG00000204632_HLA-G",
    "ENSG00000113594_LIFR",
    "ENSG00000168394_TAP1",
    "ENSG00000183287_CCBE1",
    "ENSG00000079393_DUSP13",
    "ENSG00000121769_FABP3",
    "ENSG00000182985_CADM1",
    "ENSG00000164692_COL1A2",
    "ENSG00000112655_PTK7",
    "ENSG00000134873_CLDN10",
    "ENSG00000108375_RNF43",
    "ENSG00000254726_MEX3A",
    "ENSG00000164120_HPGD",
    "ENSG00000081026_MAGI3",
    "ENSG00000120738_EGR1",
    "ENSG00000108960_MMD",
    "ENSG00000112137_PHACTR1",
    "ENSG00000166450_PRTG",
    "ENSG00000164099_PRSS12",
    "ENSG00000143320_CRABP2",
    "ENSG00000145681_HAPLN1",
    "ENSG00000113196_HAND1",
    "ENSG00000198300_PEG3",
    "ENSG00000101144_BMP7",
    "ENSG00000174498_IGDCC3",
    "ENSG00000132005_RFX1",
    "ENSG00000099814_CEP170B",
    "ENSG00000101986_ABCD1",
    "ENSG00000144648_ACKR2",
    "ENSG00000074410_CA12",
    "ENSG00000125740_FOSB",
    "ENSG00000183979_NPB",
    "ENSG00000214049_UCA1",
    "ENSG00000153071_DAB2",
    "ENSG00000126353_CCR7",
    "ENSG00000105880_DLX5",
    "ENSG00000124749_COL21A1",
    "ENSG00000112041_TULP1",
    "ENSG00000120211_INSL4",
    "ENSG00000223850_MYCNUT",
    "ENSG00000164707_SLC13A4",
    "ENSG00000173157_ADAMTS20",
    "ENSG00000196482_ESRRG",
    "ENSG00000180875_GREM2",
    "ENSG00000170255_MRGPRX1",
    "ENSG00000267943_AC010328.1",
    "ENSG00000172818_OVOL1",
    "ENSG00000137270_GCM1",
    "ENSG00000137869_CYP19A1",
    "ENSG00000135678_CPM",
    "ENSG00000171476_HOPX",
    "ENSG00000249861_LGALS16",
    "ENSG00000108244_KRT23",
    "ENSG00000169550_MUC15",
    "ENSG00000260034_LCMT1-AS2",
    "ENSG00000269526_ERVV-1",
    "ENSG00000028137_TNFRSF1B",
    "ENSG00000124731_TREM1",
    "ENSG00000072422_RHOBTB1",
    "ENSG00000185215_TNFAIP2",
    "ENSG00000164877_MICALL2",
    "ENSG00000244476_ERVFRD-1",
    "ENSG00000280109_PLAC4",
    "ENSG00000111057_KRT18",
    "ENSG00000170421_KRT8",
    "ENSG00000169583_CLIC3",
    "ENSG00000107485_GATA3",
    "ENSG00000179348_GATA2",
    "ENSG00000187186_AL162231.1",
    "ENSG00000176155_CCDC57",
    "ENSG00000133243_BTBD2",
    "ENSG00000179364_PACS2",
    "ENSG00000237651_C2orf74",
    "ENSG00000125726_CD70",
    "ENSG00000156587_UBE2L6",
    "ENSG00000182165_TP53TG1",
    "ENSG00000163017_ACTG2",
    "ENSG00000143632_ACTA1",
    "ENSG00000119632_IFI27L2",
    "ENSG00000185847_LINC01405",
    "ENSG00000116661_FBXO2",
    "ENSG00000196878_LAMB3",
    "ENSG00000122861_PLAU",
    "ENSG00000142227_EMP3",
    "ENSG00000011422_PLAUR",
    "ENSG00000100985_MMP9",
    "ENSG00000026508_CD44",
    "ENSG00000104368_PLAT",
    "ENSG00000164171_ITGA2",
    "ENSG00000101198_NKAIN4",
    "ENSG00000116774_OLFML3",
    "ENSG00000125398_SOX9",
    "ENSG00000124225_PMEPA1",
    "ENSG00000118785_SPP1",
    "ENSG00000163453_IGFBP7",
    "ENSG00000182871_COL18A1",
    "ENSG00000110925_CSRNP2",
    "ENSG00000165886_UBTD1",
    "ENSG00000115073_ACTR1B",
    "ENSG00000256288_AC022075.3",
    "ENSG00000228106_AL392172.1",
    "ENSG00000163577_EIF5A2"
)

# heatmap matrix
clusters_selected_heatmap <- c(
    11,
    18,
    0, 1, 8, 9, 12, 17
)

matrix_heatmap <- calc_cpm(matrix_readcount_use)[
    features_heatmap,
    embedding |>
        dplyr::filter(
            louvain %in% clusters_selected_heatmap,
            batch != "PRJEB11202"
        ) |>
        dplyr::pull(cell)
]

matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]

matrix_heatmap <- log10(matrix_heatmap + 1)
matrix_heatmap <- t(scale(t(matrix_heatmap)))

heatmap_limits <- quantile(matrix_heatmap, c(0.05, 0.95))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]
# sample cells
cells_heatmap_sampled <- purrr::map(clusters_selected_heatmap, \(x) {
    cells_in_group <- embedding |>
        filter(
            louvain == x,
            batch != "PRJEB11202"
        ) |>
        pull(cell)

    cat(length(cells_in_group), "\n")

    if (length(cells_in_group) >= 100) {
        c <- sample(cells_in_group, 100)
    } else {
        c <- cells_in_group
    }

    return(c)
})
## 312 
## 51 
## 1428 
## 682 
## 528 
## 376 
## 310 
## 97
names(cells_heatmap_sampled) <- clusters_selected_heatmap

# cells
clusters_EPI <- 11
clusters_HYP <- 18
clusters_TE <- c(0, 1, 8, 9, 12, 17)

cells_heatmap_EPI <- embedding |>
    dplyr::filter(
        batch != "PRJEB11202",
        louvain %in% clusters_EPI,
        cell %in% unlist(cells_heatmap_sampled)
    ) |>
    dplyr::pull(cell)

cells_heatmap_HYP <- embedding |>
    dplyr::filter(
        batch != "PRJEB11202",
        louvain %in% clusters_HYP,
        cell %in% unlist(cells_heatmap_sampled)
    ) |>
    dplyr::pull(cell)

cells_heatmap_TE <- cells_heatmap_sampled[as.character(clusters_TE)] |>
    unlist()

purrr::map_int(
    list(cells_heatmap_EPI, cells_heatmap_HYP, cells_heatmap_TE),
    length
)
## [1] 100  51 597
# column annotation
ha_cluster <- embedding[
    match(
        x = cells_heatmap_sampled |> unlist(),
        table = embedding$cell
    ), "louvain",
    drop = TRUE
]

ha_lineage <- rep("TE", length(ha_cluster))
ha_lineage[ha_cluster == "11"] <- "EPI"
ha_lineage[ha_cluster %in% c("18")] <- "HYP"
# map color
col_fun <- circlize::colorRamp2(
    quantile(
        c(
            min(matrix_heatmap),
            max(matrix_heatmap)
        ),
        seq(0, 1, 0.1)
    ),
    viridis::plasma(11)
)
# features to mark
features_to_mark_left <- c(
    "ENSG00000184344_GDF3",
    "ENSG00000241186_TDGF1",
    "ENSG00000156574_NODAL",
    "ENSG00000066468_FGFR2",
    "ENSG00000111704_NANOG",
    "ENSG00000204531_POU5F1",
    "ENSG00000203909_DPPA5",
    "ENSG00000181449_SOX2",
    "ENSG00000186103_ARGFX",
    "ENSG00000147596_PRDM14",
    "ENSG00000171872_KLF17",
    "ENSG00000086548_CEACAM6",
    "ENSG00000166073_GPR176",
    "ENSG00000125740_FOSB",
    "ENSG00000105880_DLX5",
    "ENSG00000169583_CLIC3",
    "ENSG00000244476_ERVFRD-1",
    "ENSG00000072422_RHOBTB1",
    "ENSG00000137869_CYP19A1",
    "ENSG00000169550_MUC15",
    "ENSG00000108244_KRT23",
    "ENSG00000260034_LCMT1-AS2"
)

features_to_mark_right <- c(
    "ENSG00000187498_COL4A1",
    "ENSG00000136574_GATA4",
    "ENSG00000115414_FN1",
    "ENSG00000125798_FOXA2",
    "ENSG00000164736_SOX17",
    "ENSG00000141448_GATA6",
    "ENSG00000275410_HNF1B",
    "ENSG00000153707_PTPRD",
    "ENSG00000101441_CST4",
    "ENSG00000146374_RSPO3",
    "ENSG00000111057_KRT18",
    "ENSG00000170421_KRT8",
    "ENSG00000153071_DAB2",
    "ENSG00000107485_GATA3",
    "ENSG00000179348_GATA2",
    "ENSG00000144648_ACKR2",
    "ENSG00000126353_CCR7",
    "ENSG00000137270_GCM1",
    "ENSG00000180875_GREM2",
    "ENSG00000172818_OVOL1",
    "ENSG00000265107_GJA5",
    "ENSG00000132470_ITGB4",
    "ENSG00000074410_CA12",
    "ENSG00000124749_COL21A1"
)

#
features_to_mark_left <- features_to_mark_left[
    features_to_mark_left %in% rownames(matrix_heatmap)
]

ha_left <- ComplexHeatmap::rowAnnotation(
    foo = ComplexHeatmap::anno_mark(
        at = which(rownames(matrix_heatmap) %in% features_to_mark_left),
        labels = features_to_mark_left |>
            stringr::str_remove(pattern = "E.+_"),
        which = "row",
        side = "left",
        lines_gp = grid::gpar(col = "grey50"),
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        )
    )
)

#
features_to_mark_right <- features_to_mark_right[
    features_to_mark_right %in% rownames(matrix_heatmap)
]

ha_right <- ComplexHeatmap::rowAnnotation(
    foo = ComplexHeatmap::anno_mark(
        at = which(rownames(matrix_heatmap) %in% features_to_mark_right),
        labels = features_to_mark_right |>
            stringr::str_remove(pattern = "E.+_"),
        which = "row",
        side = "right",
        lines_gp = grid::gpar(col = "grey50"),
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        )
    )
)
RASTERISED <- FALSE
# EPI
anno_labels_tbl_EPI <- table(
    ha_cluster[ha_cluster %in% clusters_EPI]
)[as.character(clusters_EPI)] |>
    tibble::as_tibble() |>
    dplyr::mutate(
        cum_sum = cumsum(value),
        position = cum_sum - value / 2
    )

anno_labels_cluster_EPI <- rep(
    NA, length(ha_cluster[ha_cluster %in% clusters_EPI])
)

if (nchar(as.character(clusters_EPI)) > 1) {
    cluster_label <- strsplit(as.character(clusters_EPI), "")[[1]]

    anno_labels_cluster_EPI[
        anno_labels_tbl_EPI$position - 5
    ] <- cluster_label[1]
    anno_labels_cluster_EPI[
        anno_labels_tbl_EPI$position + 5
    ] <- cluster_label[2]
} else {
    anno_labels_cluster_EPI[anno_labels_tbl_EPI$position] <- as.character(
        clusters_EPI
    )
}

ha_column_EPI <- ComplexHeatmap::HeatmapAnnotation(
    #
    lineage = ComplexHeatmap::anno_simple(
        ha_lineage[ha_lineage == "EPI"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = c("grey85", "#8ace7e", "#ff684c", "#9467bd", "#ffd8b1"),
            nm = c("Blastoid", "EPI", "HYP", "TE", "n/a")
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        simple_anno_size = unit(2, "mm")
    ),
    #
    cluster = ComplexHeatmap::anno_simple(
        ha_cluster[ha_cluster %in% clusters_EPI],
        pch = anno_labels_cluster_EPI,
        col = color_palette_cluster[as.character(clusters_selected_heatmap)],
        which = "column",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        simple_anno_size = unit(2, "mm")
    ),
    #
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Cluster"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)

ht_EPI <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap[, cells_heatmap_EPI] |> as.matrix(),
    rect_gp = grid::gpar(col = NA, lwd = 0),
    col = col_fun,
    #
    row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    row_title_rot = 0,
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = 0,
    #
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    cluster_columns = FALSE,
    show_column_dend = FALSE,
    #
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    top_annotation = ha_column_EPI,
    bottom_annotation = NULL,
    left_annotation = ha_left,
    right_annotation = NULL,
    #
    column_split = factor(
        ha_lineage[ha_lineage == "EPI"],
        levels = unique(ha_lineage[ha_lineage == "EPI"])
    ),
    column_gap = unit(0, "mm"),
    #
    show_heatmap_legend = FALSE,
    heatmap_legend_param = list(
        # title = expression(paste("Log"[10], " (CPM + 1)")),
        title = "Expr",
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        legend_height = unit(15, "mm"),
        legend_width = unit(5, "mm")
    ),
    #
    use_raster = RASTERISED,
    #
    width = unit(8, "mm")
)
# ComplexHeatmap::draw(ht_EPI)
# HYP
anno_labels_tbl_HYP <- table(
    ha_cluster[ha_cluster %in% clusters_HYP]
)[as.character(clusters_HYP)] |>
    as_tibble() |>
    mutate(
        cum_sum = cumsum(value),
        position = cum_sum - value / 2
    )

anno_labels_cluster_HYP <- rep(
    NA, length(ha_cluster[ha_cluster %in% clusters_HYP])
)

if (nchar(as.character(clusters_HYP)) > 1) {
    cluster_label <- strsplit(as.character(clusters_HYP), "")[[1]]

    anno_labels_cluster_HYP[
        anno_labels_tbl_HYP$position - 5
    ] <- cluster_label[1]
    anno_labels_cluster_HYP[
        anno_labels_tbl_HYP$position + 5
    ] <- cluster_label[2]
} else {
    anno_labels_cluster_HYP[
        anno_labels_tbl_HYP$position
    ] <- as.character(clusters_HYP)
}

ha_column_HYP <- ComplexHeatmap::HeatmapAnnotation(
    #
    lineage = ComplexHeatmap::anno_simple(
        ha_lineage[ha_lineage == "HYP"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = c("grey85", "#8ace7e", "#ff684c", "#9467bd", "#ffd8b1"),
            nm = c("Blastoid", "EPI", "HYP", "TE", "n/a")
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        simple_anno_size = unit(2, "mm")
    ),
    #
    cluster = ComplexHeatmap::anno_simple(
        ha_cluster[ha_cluster %in% clusters_HYP],
        pch = anno_labels_cluster_HYP,
        col = color_palette_cluster[as.character(clusters_selected_heatmap)],
        which = "column",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        simple_anno_size = unit(2, "mm")
    ),
    #
    show_annotation_name = FALSE,
    annotation_label = c(
        "Lineage",
        "Cluster"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)

ht_HYP <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap[, cells_heatmap_HYP] |> as.matrix(),
    rect_gp = grid::gpar(col = NA, lwd = 0),
    col = col_fun,
    #
    row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    row_title_rot = 0,
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = 0,
    #
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    cluster_columns = FALSE,
    show_column_dend = FALSE,
    #
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    top_annotation = ha_column_HYP,
    bottom_annotation = NULL,
    left_annotation = NULL,
    right_annotation = NULL,
    #
    # row_split = table_s2_sheet4$ClusterID,
    # row_gap = unit(0.3, "mm"),
    column_split = factor(
        ha_lineage[ha_lineage == "HYP"],
        levels = unique(ha_lineage[ha_lineage == "HYP"])
    ),
    column_gap = unit(0, "mm"),
    #
    show_heatmap_legend = FALSE,
    heatmap_legend_param = list(
        # title = expression(paste("Log"[10], " (CPM + 1)")),
        title = "Expr",
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        legend_height = unit(15, "mm"),
        legend_width = unit(5, "mm")
    ),
    #
    use_raster = RASTERISED,
    #
    width = unit(5, "mm")
)
# ComplexHeatmap::draw(ht_HYP)
# TE
anno_labels_tbl_TE <- table(
    ha_cluster[ha_cluster %in% clusters_TE]
)[as.character(clusters_TE)] |>
    tibble::enframe() |>
    dplyr::mutate(
        cum_sum = cumsum(value),
        position = cum_sum - value / 2
    )

anno_labels_tbl_TE <- purrr::map(anno_labels_tbl_TE$name, \(x) {
    # works but ugly
    a <- anno_labels_tbl_TE |>
        filter(name == x)

    if (nchar(a$name) > 1) {
        cluster_label <- strsplit(as.character(a$name), "")[[1]]

        a <- rbind(a, a)

        a[1, 1] <- cluster_label[1]
        a[2, 1] <- cluster_label[2]
        a <- a |> as.data.frame()
        a[1, 4] <- a[1, 4] - 5
        a[2, 4] <- a[2, 4] + 5
    }
    return(a)
}) |>
    purrr::reduce(rbind)

anno_labels_cluster_TE <- rep(
    NA, length(ha_cluster[ha_cluster %in% clusters_TE])
)
anno_labels_cluster_TE[anno_labels_tbl_TE$position] <- anno_labels_tbl_TE$name

ha_column_TE <- ComplexHeatmap::HeatmapAnnotation(
    #
    lineage = ComplexHeatmap::anno_simple(
        ha_lineage[ha_lineage == "TE"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = c("grey85", "#8ace7e", "#ff684c", "#9467bd", "#ffd8b1"),
            nm = c("Blastoid", "EPI", "HYP", "TE", "n/a")
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        simple_anno_size = unit(2, "mm")
    ),
    #
    cluster = ComplexHeatmap::anno_simple(
        ha_cluster[ha_cluster %in% clusters_TE],
        pch = anno_labels_cluster_TE,
        col = color_palette_cluster[as.character(clusters_selected_heatmap)],
        which = "column",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        simple_anno_size = unit(2, "mm")
    ),
    #
    show_annotation_name = FALSE,
    annotation_label = c(
        "Lineage",
        "Cluster"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)

ht_TE <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap[, cells_heatmap_TE] |> as.matrix(),
    rect_gp = grid::gpar(col = NA, lwd = 0),
    col = col_fun,
    #
    row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    row_title_rot = 0,
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = 0,
    #
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    cluster_columns = FALSE,
    show_column_dend = FALSE,
    #
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    top_annotation = ha_column_TE,
    bottom_annotation = NULL,
    left_annotation = NULL,
    right_annotation = ha_right,
    #
    column_split = factor(
        ha_lineage[ha_lineage == "TE"],
        levels = unique(ha_lineage[ha_lineage == "TE"])
    ),
    column_gap = unit(0, "mm"),
    #
    show_heatmap_legend = FALSE,
    heatmap_legend_param = list(
        title = "Expr",
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        legend_height = unit(15, "mm"),
        legend_width = unit(5, "mm")
    ),
    #
    use_raster = RASTERISED,
    #
    width = unit(50, "mm")
)
# legend
lgd_colorbar <- ComplexHeatmap::Legend(
    col_fun = col_fun,
    title = "Expr",
    grid_height = unit(1, "mm"),
    grid_width = unit(2, "mm"),
    legend_height = unit(10, "mm"),
    legend_width = unit(2, "mm"),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 5
    )
)

pd <- ComplexHeatmap::packLegend(
    lgd_colorbar,
    # gap = unit(8, "mm"),
    direction = "vertical"
)
ComplexHeatmap::draw(
    ht_EPI + ht_HYP + ht_TE,
    heatmap_legend_list = list(pd),
    ht_gap = unit(0, "mm")
)

Gene Ontology enrichment

enriched_go <- tibble::tribble(
    ~category, ~rank, ~go_id, ~term, ~p_value, ~p_value_log,
    "EPI", 1L, "GO:0019827", "stem cell population maintenance", 9e-07, 6.045757491,
    "EPI", 2L, "GO:0048368", "lateral mesoderm development", 0.00053, 3.27572413,
    "EPI", 3L, "GO:0009790", "embryo development", 0.00088, 3.055517328,
    "HYP", 1L, "GO:0048598", "embryonic morphogenesis", 3.9e-07, 6.408935393,
    "HYP", 2L, "GO:0007369", "gastrulation", 1.8e-05, 4.744727495,
    "HYP", 3L, "GO:0007492", "endoderm development", 0.00284, 2.54668166,
    "TE", 1L, "GO:0001890", "placenta development", 6e-11, 10.22184875,
    "TE", 2L, "GO:0001892", "embryonic placenta development", 2e-08, 7.698970004,
    "TE", 3L, "GO:0048513", "animal organ development", 2.7e-07, 6.568636236
)

enriched_go |>
    dplyr::mutate(
        category = factor(
            category,
            levels = c("EPI", "HYP", "TE") #  |> rev()
        ),
        rank = factor(
            rank,
            levels = c(3, 2, 1)
        )
    ) |>
    plot_barplot_go_enrichment(
        x = p_value_log,
        y = rank,
        z = category
    ) +
    ggplot2::facet_wrap(
        ~category,
        nrow = 1,
        strip.position = "left",
        scales = "free_x",
        labeller = ggplot2::labeller(
            category = c("EPI" = "EPI", "HYP" = "HYP", "TE" = "TE")
        )
    ) +
    ggplot2::geom_text(
        ggplot2::aes(
            x = 0,
            label = term,
            group = NULL
        ),
        size = 2.2,
        family = "Arial",
        color = "black",
        data = enriched_go |>
            dplyr::mutate(
                category = factor(
                    category,
                    levels = c("EPI", "HYP", "TE")
                ),
                rank = factor(
                    rank,
                    levels = c(3, 2, 1)
                ),
                term = paste(" ", term)
            ),
        hjust = 0
    ) +
    ggplot2::scale_fill_manual(
        values = c("#8ace7e", "#ff684c", "#9467bd")
    )

High concordance of blastoid replicates

Data loading

EMBEDDING_FILE <- "embedding_replicate.csv.gz"

embedding_replicate <- read_csv(
    file = file.path(PROJECT_DIR, "github", "data", EMBEDDING_FILE)
) |>
    dplyr::select(cell:y_fitsne)

# clean up
rm(EMBEDDING_FILE)
MATRIX_DIR <- list(
    "github/data/matrices/LW36",
    "github/data/matrices/LW49_LW50_LW51_LW52",
    "github/data/matrices/LW58_LW59",
    "github/data/matrices/LW60_LW61",
    "raw/public/PRJEB11202/reformatted_matrix"
)

matrix_readcount_use <- purrr::map(MATRIX_DIR, \(x) {
    load_matrix(file.path(PROJECT_DIR, x))
}) |>
    purrr::reduce(cbind)

matrix_readcount_use <- matrix_readcount_use[, embedding_replicate$cell]

Embedding visualization

EMBEDDING_TITLE_PREFIX <- "FIt-SNE"

x_column <- "x_fitsne"
y_column <- "y_fitsne"

Clustering & batch & UMI

p_embedding_replicate_louvain <- plot_embedding(
    data = embedding_replicate[, c(x_column, y_column)],
    color = embedding_replicate$louvain |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Cluster"),
    color_labels = TRUE,
    color_legend = FALSE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding()

p_embedding_replicate_batch <- plot_embedding(
    data = embedding_replicate[, c(x_column, y_column)],
    color = embedding_replicate$batch |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Batch"),
    color_labels = FALSE,
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding()

CB_POSITION <- c(0.875, 0.3)
p_embedding_replicate_UMI <- plot_embedding(
    data = embedding_replicate[, c(x_column, y_column)],
    color = log10(
        Matrix::colSums(matrix_readcount_use[, embedding_replicate$cell])
    ),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding(
        x = CB_POSITION[1],
        y = CB_POSITION[2]
    )
embedding_replicate |>
    dplyr::mutate(
        num_umis = colSums(matrix_readcount_use[, cell]),
        num_features = colSums(matrix_readcount_use[, cell] > 0),
    ) |>
    dplyr::group_by(batch) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        median_features = median(num_features)
    ) |>
    dplyr::mutate(
        sample = dplyr::case_when(
            batch == "LW36" ~ "Blastoid, D8; TH; 5i/L/A",
            batch == "LW60" ~ "Blastoid, D9; HT; 5i/L/A",
            batch == "LW61" ~ "Blastoid, D9; HT; PXGL"
        )
    ) |>
    dplyr::select(
        sample, dplyr::everything()
    ) |>
    gt::gt() |>
    gt::data_color(
        columns = c(median_umis),
        colors = scales::col_numeric(
            palette = c(
                "green", "orange", "red"
            ),
            domain = c(7500, 32000)
        )
    ) |>
    gt::summary_rows(
        columns = c(sample, batch),
        fns = list(
            Count = ~ n()
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c(median_umis:median_features),
        fns = list(
            Mean = ~ mean(.)
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    ) |>
    gt::tab_header(
        title = gt::md("**Clustering of TH- and HT-Blastoids**; Batch")
    )
Clustering of TH- and HT-Blastoids; Batch
sample batch num_cells median_umis median_features
Blastoid, D8; TH; 5i/L/A LW36 6854 31193 5014
Blastoid, D9; HT; 5i/L/A LW60 4497 14421 3337
Blastoid, D9; HT; PXGL LW61 5156 7625 2185
Count 3 3
Mean 17,746 3,512
Sum 16,507
purrr::reduce(
    list(
        p_embedding_replicate_louvain,
        p_embedding_replicate_batch,
        p_embedding_replicate_UMI
    ), `+`
) +
    patchwork::plot_layout(ncol = 3) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Cluster composition

calc_group_composition(
    data = embedding_replicate,
    x = "louvain",
    group = "batch"
) |>
    dplyr::mutate(
        louvain = as.factor(louvain)
    ) |>
    plot_barplot(
        x = "louvain",
        y = "percentage",
        z = "batch"
    ) +
    ggplot2::guides(fill = ggplot2::guide_legend(direction = "horizontal")) +
    ggplot2::theme(legend.position = "bottom")

Blastoids derived stem cell lines

Data loading

EMBEDDING_FILE <- "embedding_stem.csv.gz"

embedding_stem <- read_csv(
    file = file.path(
        PROJECT_DIR, "github", "data", EMBEDDING_FILE
    )
)

rm(EMBEDDING_FILE)
MATRIX_DIR <- list(
    "github/data/matrices/LW36",
    "github/data/matrices/LW49_LW50_LW51_LW52",
    "github/data/matrices/LW58_LW59",
    "github/data/matrices/LW60_LW61",
    "raw/public/PRJEB11202/reformatted_matrix"
)

matrix_readcount_use <- purrr::map(MATRIX_DIR, \(x) {
    load_matrix(file.path(PROJECT_DIR, x))
}) |>
    purrr::reduce(cbind)

matrix_readcount_use <- matrix_readcount_use[, embedding_stem$cell]

Embedding visualization

EMBEDDING_TITLE_PREFIX <- "FIt-SNE"

x_column <- "x_fitsne"
y_column <- "y_fitsne"

Clustering & batch & UMI

p_embedding_stem_batch <- plot_embedding(
    data = embedding_stem[, c(x_column, y_column)],
    color = embedding_stem$batch |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Batch"),
    color_labels = FALSE,
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding()

CB_POSITION <- c(0.875, 0.3)
p_embedding_stem_UMI <- plot_embedding(
    data = embedding_stem[, c(x_column, y_column)],
    color = log10(Matrix::colSums(matrix_readcount_use[, embedding_stem$cell])),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding(
        x = CB_POSITION[1],
        y = CB_POSITION[2]
    )

selected_feature <- "ENSG00000204531_POU5F1"
p_embedding_stem_POU5F1 <- plot_embedding(
    data = embedding_stem[, c(x_column, y_column)],
    color = log10(
        calc_cpm(matrix_readcount_use[, embedding_stem$cell])
        [selected_feature, ] + 1
    ),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; ",
        "{selected_feature |> stringr::str_remove(pattern = \"^E.+_\")}"
    ),
    color_legend = TRUE,
    sort_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE,
    na_value = "grey80"
) +
    theme_customized_embedding(
        x = CB_POSITION[1],
        y = CB_POSITION[2]
    )
embedding_stem |>
    dplyr::mutate(
        num_umis = colSums(matrix_readcount_use[, cell]),
        num_features = colSums(matrix_readcount_use[, cell] > 0),
    ) |>
    dplyr::group_by(batch) |>
    dplyr::summarise(
        num_cells = n(),
        median_umis = median(num_umis),
        median_features = median(num_features)
    ) |>
    dplyr::mutate(
        sample = dplyr::case_when(
            batch == "LW49" ~ "Naïve WIRB3; 5i/L/A",
            batch == "LW50" ~ "Blastoid naïve ES cells; 5i/L/A",
            batch == "LW51" ~ "Blastoid nEND; NACL",
            batch == "LW52" ~ "Blastoid TSCs; TSM"
        )
    ) |>
    dplyr::select(
        sample, dplyr::everything()
    ) |>
    gt::gt() |>
    gt::data_color(
        columns = c(median_umis),
        colors = scales::col_numeric(
            palette = c(
                "green", "orange", "red"
            ),
            domain = c(8500, 20000)
        )
    ) |>
    gt::summary_rows(
        columns = c(sample, batch),
        fns = list(
            Count = ~ n()
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c(median_umis:median_features),
        fns = list(
            Mean = ~ mean(.)
        ),
        decimals = 0
    ) |>
    gt::summary_rows(
        columns = c(num_cells),
        fns = list(
            Sum = ~ sum(.)
        ),
        decimals = 0
    ) |>
    gt::tab_header(
        title = gt::md("**Clustering of Blastoid Derived Cells**; Batch")
    )
Clustering of Blastoid Derived Cells; Batch
sample batch num_cells median_umis median_features
Naïve WIRB3; 5i/L/A LW49 1638 18363.0 4025.5
Blastoid naïve ES cells; 5i/L/A LW50 2663 9022.0 2734.0
Blastoid nEND; NACL LW51 2055 11767.0 3250.0
Blastoid TSCs; TSM LW52 4486 8585.5 2581.0
Count 4 4
Mean 11,934 3,148
Sum 10,842
purrr::reduce(list(
    p_embedding_stem_batch,
    p_embedding_stem_UMI,
    p_embedding_stem_POU5F1
), `+`) +
    patchwork::plot_layout(ncol = 3) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Polishing

p_embedding_stem_batch <- plot_embedding(
    data = embedding_stem[, c(x_column, y_column)],
    color = embedding_stem$batch |> as.factor(),
    label = NULL,
    color_labels = FALSE,
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding(void = TRUE) +
    scale_color_manual(
        values = scales::hue_pal()(n = length(unique(embedding_stem$batch))),
        labels = c(
            "Naïve ES cells",
            "Blastoid Naïve ES cells",
            "Blastoid nEND",
            "Blastoid TSCs"
        )
    )
x_label <- ggplot_build(
    p_embedding_stem_batch
)$layout$panel_params[[1]][c("x.range")] |>
    unlist() |>
    quantile(0.1)
y_label <- ggplot_build(
    p_embedding_stem_batch
)$layout$panel_params[[1]][c("y.range")] |>
    unlist() |>
    quantile(0.8)

features_selected <- c(
    "ENSG00000181449_SOX2",
    "ENSG00000141448_GATA6",
    "ENSG00000107485_GATA3"
)

p_embedding_stem_SOX2_GATA6_GATA3 <- purrr::map(features_selected, \(x) {
    selected_feature <- x

    plot_embedding(
        data = embedding_stem[, c(x_column, y_column)],
        color = log10(
            calc_cpm(matrix_readcount_use[, embedding_stem$cell])
            [selected_feature, ] + 1
        ),
        label = NULL,
        color_legend = TRUE,
        sort_values = TRUE,
        rasterise = RASTERISED,
        geom_point_size = 0.5,
        na_value = "grey80"
    ) +
        theme_customized_embedding(
            x = CB_POSITION[1],
            y = CB_POSITION[2],
            void = TRUE,
            legend_key_size = c(1.5, 1.5)
        ) +
        ggplot2::annotate(
            geom = "text",
            x = x_label,
            y = y_label,
            label = stringr::str_c(
                x |> stringr::str_remove(pattern = "^E.+_")
            ),
            family = "Arial",
            color = "black",
            size = 5 / ggplot2::.pt,
            hjust = 0,
            vjust = 0
            # parse = TRUE
        )
})
list(
    p_embedding_stem_batch,
    p_embedding_stem_SOX2_GATA6_GATA3
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(nrow = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Trajectory inference

Data loading

EMBEDDING_FILE <- "embedding_timecourse.csv.gz"

embedding_timecourse <- read_csv(
    file = file.path(PROJECT_DIR, "github", "data", EMBEDDING_FILE)
)

rm(EMBEDDING_FILE)
MATRIX_DIR <- list(
    "github/data/matrices/LW36",
    "github/data/matrices/LW49_LW50_LW51_LW52",
    "github/data/matrices/LW58_LW59",
    "github/data/matrices/LW60_LW61",
    "raw/public/PRJEB11202/reformatted_matrix"
)

matrix_readcount_use <- purrr::map(MATRIX_DIR, \(x) {
    load_matrix(file.path(PROJECT_DIR, x))
}) |>
    purrr::reduce(cbind)

matrix_readcount_use <- matrix_readcount_use[, embedding_timecourse$cell]

# clean up
rm(MATRIX_DIR)

Embedding visualization

EMBEDDING_TITLE_PREFIX <- "FIt-SNE"

x_column <- "x_fitsne"
y_column <- "y_fitsne"

Clustering & batch & UMI

p_embedding_timecourse_cluster <- plot_embedding(
    data = embedding_timecourse[, c(x_column, y_column)],
    color = letters[as.integer(embedding_timecourse$louvain + 1)] |>
        as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Cluster"),
    color_labels = TRUE,
    color_legend = FALSE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding()

p_embedding_timecourse_batch <- plot_embedding(
    data = embedding_timecourse[, c(x_column, y_column)],
    color = embedding_timecourse$batch |> as.factor(),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; Batch"),
    color_labels = FALSE,
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding()

CB_POSITION <- c(0.875, 0.3)
p_embedding_timecourse_UMI <- plot_embedding(
    data = embedding_timecourse[, c(x_column, y_column)],
    color = log10(
        Matrix::colSums(matrix_readcount_use[, embedding_timecourse$cell])
    ),
    label = glue::glue("{EMBEDDING_TITLE_PREFIX}; UMI"),
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding(
        x = CB_POSITION[1],
        y = CB_POSITION[2]
    )

selected_feature <- "ENSG00000204531_POU5F1"
p_embedding_timecourse_POU5F1 <- plot_embedding(
    data = embedding_timecourse[, c(x_column, y_column)],
    color = log10(
        calc_cpm(matrix_readcount_use[, embedding_timecourse$cell])
        [selected_feature, ] + 1
    ),
    label = glue::glue(
        "{EMBEDDING_TITLE_PREFIX}; ",
        "{selected_feature |> stringr::str_remove(pattern = \"^E.+_\")}"
    ),
    color_legend = TRUE,
    sort_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE,
    na_value = "grey80"
) +
    theme_customized_embedding(
        x = CB_POSITION[1],
        y = CB_POSITION[2]
    )
list(
    p_embedding_timecourse_cluster,
    p_embedding_timecourse_batch,
    p_embedding_timecourse_UMI,
    p_embedding_timecourse_POU5F1
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2) +
    patchwork::plot_annotation(
        theme = ggplot2::theme(plot.margin = ggplot2::margin())
    )

Polishing

p_embedding_timecourse_batch <- plot_embedding(
    data = embedding_timecourse[, c(x_column, y_column)],
    color = embedding_timecourse$batch |> as.factor(),
    label = NULL,
    color_labels = FALSE,
    color_legend = TRUE,
    sort_values = FALSE,
    shuffle_values = TRUE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE
) +
    theme_customized_embedding(void = TRUE) +
    ggplot2::scale_color_manual(
        values = yarrr::piratepal(palette = "google") |> as.character(),
        labels = c("D3", "D6", "D9", "D9; PXGL")
    )
embedding_timecourse <- embedding_timecourse |>
    dplyr::select(cell:y_fitsne) |>
    dplyr::left_join(
        embedding |> dplyr::select(cell, louvain_ = louvain),
        by = c("cell" = "cell")
    ) |>
    mutate(
        louvain_ = case_when(
            is.na(louvain_) ~ "NA",
            TRUE ~ as.character(louvain_)
        ),
        #
        lineage = case_when(
            louvain_ %in% c(11) ~ "EPI",
            louvain_ %in% c(18) ~ "HYP",
            louvain_ %in% c(0, 1, 8, 9, 12, 17) ~ "TE",
            louvain_ %in% c(10, 14) ~ "Pre-lineage",
            TRUE ~ "Other"
        ),
        lineage = factor(
            lineage,
            levels = c("Other", "TE", "Pre-lineage", "HYP", "EPI")
        ),
        #
        cluters_selected = case_when(
            louvain_ %in% c(2, 3, 4, 5, 6, 7, 13, 15, 16) ~ louvain_,
            TRUE ~ "Other"
        ),
        cluters_selected = factor(
            cluters_selected,
            levels = c(
                "Other", "2", "3", "4", "5", "6", "7", "13", "15", "16"
            )
        )
    )
p_embedding_timecourse_lineage <- plot_embedding(
    data = embedding_timecourse[, c(x_column, y_column)],
    color = embedding_timecourse$lineage |> as.factor(),
    label = NULL,
    color_labels = FALSE,
    color_legend = TRUE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE,
    na_value = "grey70"
) +
    ggplot2::scale_color_manual(
        name = NULL,
        limits = c("TE", "Pre-lineage", "HYP", "EPI"),
        values = c("grey70", "#8ace7e", "#ff684c", "#9467bd", "#ffd8b1"),
        breaks = c("Other", "EPI", "HYP", "TE", "Pre-lineage"),
        labels = c("Other", "ELCs", "HLCs", "TLCs", "Pre-lineage-like"),
        guide = ggplot2::guides(
            color = ggplot2::guide_legend(
                override.aes = list(
                    size = 2, alpha = 1
                ),
                ncol = 1,
                reverse = TRUE,
                order = 1
            )
        ),
        na.value = "grey70"
    ) +
    theme_customized_embedding(void = TRUE)
p_embedding_timecourse_cluster <- plot_embedding(
    data = embedding_timecourse[, c(x_column, y_column)],
    color = embedding_timecourse$cluters_selected,
    label = NULL,
    color_labels = FALSE,
    color_legend = TRUE,
    sort_values = TRUE,
    shuffle_values = FALSE,
    rasterise = RASTERISED,
    geom_point_size = GEOM_POINT_SIZE,
    geom_point_legend_ncol = 2
) +
    ggplot2::scale_color_manual(
        values = c("Other" = "grey70", color_palette_cluster),
        limits = embedding_timecourse |>
            dplyr::filter(cluters_selected != "Other") |>
            dplyr::pull(cluters_selected) |>
            as.character() |>
            unique() |>
            stringr::str_sort(numeric = TRUE),
        na.value = "grey70"
    ) +
    theme_customized_embedding(void = TRUE) +
    ggrepel::geom_text_repel(
        data = get_middle_points(
            data = embedding_timecourse |>
                dplyr::filter(cluters_selected != "Other"),
            x = x_column,
            y = y_column,
            group = "cluters_selected"
        ),
        ggplot2::aes(
            x = x_fitsne,
            y = y_fitsne,
            label = cluters_selected
        ),
        color = "black",
        size = 1.8,
        family = "Arial",
        #
        box.padding = 0.4,
        point.padding = 1e-06,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        nudge_x = 0,
        nudge_y = 10,
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.alpha = 1,
        # segment.inflect = TRUE,
        seed = 20201121
    )
features_selected <- c(
    "ENSG00000181449_SOX2",
    "ENSG00000141448_GATA6",
    "ENSG00000107485_GATA3"
)

p_embedding_timecourse_SOX2_GATA6_GATA3 <- purrr::map(features_selected, \(x) {
    selected_feature <- x

    plot_embedding(
        data = embedding_timecourse[, c(x_column, y_column)],
        color = log10(
            calc_cpm(matrix_readcount_use[, embedding_timecourse$cell])
            [selected_feature, ] + 1
        ),
        label = NULL,
        color_legend = TRUE,
        sort_values = TRUE,
        rasterise = RASTERISED,
        geom_point_size = 0.5,
        na_value = "grey80"
    ) +
        theme_customized_embedding(
            x = CB_POSITION[1],
            y = CB_POSITION[2],
            void = TRUE,
            legend_key_size = c(1.5, 1.5)
        ) +
        ggplot2::annotate(
            geom = "text",
            x = x_label,
            y = y_label,
            label = stringr::str_c(
                x |> stringr::str_remove(pattern = "^E.+_")
            ),
            family = "Arial",
            color = "black",
            size = 5 / ggplot2::.pt,
            hjust = 0.5,
            vjust = 0
            # parse = TRUE
        )
})
c(
    list(
        p_embedding_timecourse_batch,
        p_embedding_timecourse_lineage,
        p_embedding_timecourse_cluster
    ),
    p_embedding_timecourse_SOX2_GATA6_GATA3
) |>
    purrr::reduce(`+`) +
    patchwork::plot_layout(ncol = 2, byrow = FALSE) +
    patchwork::plot_annotation(
        theme = theme(plot.margin = margin())
    )

Cluster composition

calc_group_composition(
    data = embedding_timecourse,
    x = "louvain",
    group = "batch"
) |>
    dplyr::mutate(
        louvain = as.factor(louvain)
    ) |>
    plot_barplot(
        x = "louvain",
        y = "percentage",
        z = "batch"
    ) +
    ggplot2::scale_fill_manual(
        values = yarrr::piratepal(palette = "google") %>% as.character(),
        labels = c("D3", "D6", "D9", "D9; PXGL")
    ) +
    ggplot2::guides(fill = ggplot2::guide_legend(direction = "horizontal")) +
    ggplot2::theme(legend.position = "bottom")


R session info

devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.2.1 (2022-06-23)
 os       macOS Monterey 12.6
 system   aarch64, darwin21.6.0
 ui       unknown
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Chicago
 date     2022-09-25
 pandoc   2.19.2 @ /opt/homebrew/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package        * version       date (UTC) lib source
 BayesFactor      0.9.12-4.4    2022-07-05 [1] CRAN (R 4.2.1)
 BiocGenerics     0.42.0        2022-04-26 [1] Bioconductor
 bit              4.0.4         2020-08-04 [1] CRAN (R 4.2.0)
 bit64            4.0.5         2020-08-30 [1] CRAN (R 4.2.0)
 cachem           1.0.6         2021-08-19 [1] CRAN (R 4.2.0)
 Cairo            1.6-0         2022-07-05 [1] CRAN (R 4.2.1)
 callr            3.7.2         2022-08-22 [1] CRAN (R 4.2.1)
 circlize         0.4.15        2022-05-10 [1] CRAN (R 4.2.0)
 cli              3.4.1         2022-09-23 [1] CRAN (R 4.2.1)
 clue             0.3-61        2022-05-30 [1] CRAN (R 4.2.0)
 cluster          2.1.4         2022-08-22 [2] CRAN (R 4.2.1)
 coda             0.19-4        2020-09-30 [1] CRAN (R 4.2.0)
 codetools        0.2-18        2020-11-04 [2] CRAN (R 4.2.1)
 colorspace       2.0-3         2022-02-21 [1] CRAN (R 4.2.0)
 commonmark       1.8.0         2022-03-09 [1] CRAN (R 4.2.0)
 ComplexHeatmap   2.12.1        2022-08-09 [1] Bioconductor
 crayon           1.5.1         2022-03-26 [1] CRAN (R 4.2.0)
 devtools         2.4.4.9000    2022-09-23 [1] Github (r-lib/devtools@9e2793a)
 digest           0.6.29        2021-12-01 [1] CRAN (R 4.2.0)
 doParallel       1.0.17        2022-02-07 [1] CRAN (R 4.2.0)
 dplyr          * 1.0.99.9000   2022-09-23 [1] Github (tidyverse/dplyr@19c2be3)
 ellipsis         0.3.2         2021-04-29 [1] CRAN (R 4.2.0)
 evaluate         0.16          2022-08-09 [1] CRAN (R 4.2.1)
 extrafont      * 0.18          2022-04-12 [1] CRAN (R 4.2.0)
 extrafontdb      1.0           2012-06-11 [1] CRAN (R 4.2.0)
 fansi            1.0.3         2022-03-24 [1] CRAN (R 4.2.0)
 farver           2.1.1         2022-07-06 [1] CRAN (R 4.2.1)
 fastmap          1.1.0         2021-01-25 [1] CRAN (R 4.2.0)
 forcats        * 0.5.2.9000    2022-08-20 [1] Github (tidyverse/forcats@bd319e0)
 foreach          1.5.2         2022-02-02 [1] CRAN (R 4.2.0)
 fs               1.5.2.9000    2022-08-24 [1] Github (r-lib/fs@238032f)
 generics         0.1.3         2022-07-05 [1] CRAN (R 4.2.1)
 GetoptLong       1.0.5         2020-12-15 [1] CRAN (R 4.2.0)
 ggplot2        * 3.3.6.9000    2022-09-12 [1] Github (tidyverse/ggplot2@a58b48c)
 ggrepel          0.9.1         2021-01-15 [1] CRAN (R 4.2.0)
 ggthemes         4.2.4         2021-01-20 [1] CRAN (R 4.2.0)
 GlobalOptions    0.1.2         2020-06-10 [1] CRAN (R 4.2.0)
 glue             1.6.2.9000    2022-04-22 [1] Github (tidyverse/glue@d47d6c7)
 gridExtra        2.3           2017-09-09 [1] CRAN (R 4.2.0)
 gt               0.7.0.9000    2022-09-23 [1] Github (rstudio/gt@4030fb7)
 gtable           0.3.1.9000    2022-09-01 [1] Github (r-lib/gtable@c1a7a81)
 hms              1.1.2         2022-08-19 [1] CRAN (R 4.2.1)
 htmltools        0.5.3         2022-07-18 [1] CRAN (R 4.2.1)
 htmlwidgets      1.5.4         2022-08-23 [1] Github (ramnathv/htmlwidgets@400cf1a)
 httpuv           1.6.6         2022-09-08 [1] CRAN (R 4.2.1)
 IRanges          2.30.1        2022-08-18 [1] Bioconductor
 iterators        1.0.14        2022-02-05 [1] CRAN (R 4.2.0)
 jpeg             0.1-9         2021-07-24 [1] CRAN (R 4.2.0)
 jsonlite         1.8.0         2022-02-22 [1] CRAN (R 4.2.0)
 knitr            1.40          2022-08-24 [1] CRAN (R 4.2.1)
 labeling         0.4.2         2020-10-20 [1] CRAN (R 4.2.0)
 later            1.3.0         2021-08-18 [1] CRAN (R 4.2.0)
 lattice          0.20-45       2021-09-22 [2] CRAN (R 4.2.1)
 lifecycle        1.0.2.9000    2022-09-23 [1] Github (r-lib/lifecycle@0a6860a)
 lubridate      * 1.8.0.9000    2022-05-24 [1] Github (tidyverse/lubridate@0bb49b2)
 magick           2.7.3         2021-08-18 [1] CRAN (R 4.2.0)
 magrittr         2.0.3         2022-03-30 [1] CRAN (R 4.2.0)
 Matrix         * 1.5-1         2022-09-13 [1] CRAN (R 4.2.1)
 MatrixModels     0.5-1         2022-09-11 [1] CRAN (R 4.2.1)
 matrixStats      0.62.0        2022-04-19 [1] CRAN (R 4.2.0)
 memoise          2.0.1         2021-11-26 [1] CRAN (R 4.2.0)
 mime             0.12          2021-09-28 [1] CRAN (R 4.2.0)
 miniUI           0.1.1.1       2018-05-18 [1] CRAN (R 4.2.0)
 munsell          0.5.0         2018-06-12 [1] CRAN (R 4.2.0)
 mvtnorm          1.1-3         2021-10-08 [1] CRAN (R 4.2.1)
 patchwork      * 1.1.2.9000    2022-08-20 [1] Github (thomasp85/patchwork@c14c960)
 pbapply          1.5-0         2021-09-16 [1] CRAN (R 4.2.0)
 pillar           1.8.1         2022-08-19 [1] CRAN (R 4.2.1)
 pkgbuild         1.3.1         2021-12-20 [1] CRAN (R 4.2.0)
 pkgconfig        2.0.3         2019-09-22 [1] CRAN (R 4.2.0)
 pkgload          1.3.0         2022-06-27 [1] CRAN (R 4.2.1)
 png              0.1-7         2013-12-03 [1] CRAN (R 4.2.0)
 prettyunits      1.1.1.9000    2022-04-22 [1] Github (r-lib/prettyunits@8706d89)
 processx         3.7.0         2022-07-07 [1] CRAN (R 4.2.1)
 profvis          0.3.7         2020-11-02 [1] CRAN (R 4.2.0)
 promises         1.2.0.1       2021-02-11 [1] CRAN (R 4.2.0)
 ps               1.7.1         2022-06-18 [1] CRAN (R 4.2.0)
 purrr          * 0.9000.0.9000 2022-09-24 [1] Github (tidyverse/purrr@4ab13f5)
 R.cache          0.16.0        2022-07-21 [1] CRAN (R 4.2.1)
 R.methodsS3      1.8.2         2022-06-13 [1] CRAN (R 4.2.0)
 R.oo             1.25.0        2022-06-12 [1] CRAN (R 4.2.0)
 R.utils          2.12.0        2022-06-28 [1] CRAN (R 4.2.1)
 R6               2.5.1.9000    2022-08-04 [1] Github (r-lib/R6@87d5e45)
 ragg             1.2.2.9000    2022-09-12 [1] Github (r-lib/ragg@904e145)
 RColorBrewer     1.1-3         2022-04-03 [1] CRAN (R 4.2.0)
 Rcpp             1.0.9         2022-07-08 [1] CRAN (R 4.2.1)
 readr          * 2.1.2.9000    2022-09-20 [1] Github (tidyverse/readr@5cac6ed)
 remotes          2.4.2         2022-09-12 [1] Github (r-lib/remotes@bc0949d)
 reticulate       1.26          2022-08-31 [1] CRAN (R 4.2.1)
 rjson            0.2.21        2022-01-09 [1] CRAN (R 4.2.0)
 rlang            1.0.6         2022-09-24 [1] Github (r-lib/rlang@66454bd)
 rmarkdown        2.16.1        2022-09-24 [1] Github (rstudio/rmarkdown@9577707)
 Rttf2pt1         1.3.10        2022-02-07 [1] CRAN (R 4.2.0)
 S4Vectors        0.34.0        2022-04-26 [1] Bioconductor
 sass             0.4.2         2022-07-16 [1] CRAN (R 4.2.1)
 scales           1.2.1.9000    2022-08-20 [1] Github (r-lib/scales@b3df2fb)
 sessioninfo      1.2.2         2021-12-06 [1] CRAN (R 4.2.0)
 shape            1.4.6         2021-05-19 [1] CRAN (R 4.2.0)
 shiny            1.7.2         2022-07-19 [1] CRAN (R 4.2.1)
 stringi          1.7.8         2022-07-11 [1] CRAN (R 4.2.1)
 stringr        * 1.4.1.9000    2022-08-21 [1] Github (tidyverse/stringr@792bc92)
 styler         * 1.7.0.9002    2022-09-21 [1] Github (r-lib/styler@1f4437b)
 systemfonts      1.0.4         2022-02-11 [1] CRAN (R 4.2.0)
 textshaping      0.3.6         2021-10-13 [1] CRAN (R 4.2.0)
 tibble         * 3.1.8.9002    2022-09-24 [1] Github (tidyverse/tibble@e9db4f4)
 tidyr          * 1.2.1.9000    2022-09-09 [1] Github (tidyverse/tidyr@653def2)
 tidyselect       1.1.2.9000    2022-09-21 [1] Github (r-lib/tidyselect@edd0a3b)
 tidyverse      * 1.3.2.9000    2022-09-12 [1] Github (tidyverse/tidyverse@3be8283)
 tzdb             0.3.0         2022-03-28 [1] CRAN (R 4.2.0)
 urlchecker       1.0.1         2021-11-30 [1] CRAN (R 4.2.0)
 usethis          2.1.6.9000    2022-09-23 [1] Github (r-lib/usethis@8ecb7ab)
 utf8             1.2.2         2021-07-24 [1] CRAN (R 4.2.0)
 vctrs            0.4.1.9000    2022-09-19 [1] Github (r-lib/vctrs@0a219ba)
 viridis          0.6.2         2021-10-13 [1] CRAN (R 4.2.0)
 viridisLite      0.4.1         2022-08-22 [1] CRAN (R 4.2.1)
 vroom            1.5.7.9000    2022-09-09 [1] Github (r-lib/vroom@0c2423e)
 withr            2.5.0         2022-03-03 [1] CRAN (R 4.2.0)
 xfun             0.33          2022-09-12 [1] CRAN (R 4.2.1)
 xtable           1.8-4         2019-04-21 [1] CRAN (R 4.2.0)
 yaml             2.3.5         2022-02-21 [1] CRAN (R 4.2.0)
 yarrr            0.1.6         2022-04-22 [1] Github (ndphillips/yarrr@e2e4488)

 [1] /opt/homebrew/lib/R/4.2/site-library
 [2] /opt/homebrew/Cellar/r/4.2.1_4/lib/R/library

─ Python configuration ───────────────────────────────────────────────────────
 python:         /Users/jialei/.pyenv/shims/python
 libpython:      /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/libpython3.9.dylib
 pythonhome:     /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10:/Users/jialei/.pyenv/versions/mambaforge-4.10.3-10
 version:        3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33)  [Clang 13.0.1 ]
 numpy:          /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
 numpy_version:  1.22.4
 numpy:          /Users/jialei/.pyenv/versions/mambaforge-4.10.3-10/lib/python3.9/site-packages/numpy
 
 NOTE: Python version was forced by RETICULATE_PYTHON

──────────────────────────────────────────────────────────────────────────────



Citation

BibTeX citation:
@article{yu,
  author = {Leqian Yu and Yulei Wei and Jialei Duan and Daniel A.
    Schmitz and Masahiro Sakurai and Lei Wang and Kunhua Wang and Shuhua
    Zhao and Gary C. Hon and Jun Wu},
  editor = {},
  publisher = {Nature Publishing Group},
  title = {Blastocyst-Like Structures Generated from Human Pluripotent
    Stem Cells},
  journal = {Nature},
  volume = {591},
  number = {7851},
  pages = {620 - 626},
  date = {},
  url = {https://doi.org/10.1038/s41586-021-03356-y},
  doi = {10.1038/s41586-021-03356-y},
  langid = {en},
  abstract = {Human blastoids provide a readily accessible, scalable,
    versatile and perturbable alternative to blastocysts for studying
    early human development, understanding early pregnancy loss and
    gaining insights into early developmental defects.}
}
For attribution, please cite this work as:
Leqian Yu, Yulei Wei, Jialei Duan, Daniel A. Schmitz, Masahiro Sakurai, Lei Wang, Kunhua Wang, Shuhua Zhao, Gary C. Hon, and Jun Wu. n.d. “Blastocyst-Like Structures Generated from Human Pluripotent Stem Cells.” Nature 591 (7851): 620–26. https://doi.org/10.1038/s41586-021-03356-y.