Nakamura, T., Okamoto, I., Sasaki, K., Yabuta, Y., Iwatani, C., Tsuchiya, H., Seita, Y., Nakamura, S., Yamamoto, T., and Saitou, M. (2016). A developmental coordinate of pluripotency among mice, monkeys and humans. Nature 537, 57–62.

  • BioProject Accession: PRJNA301445
  • GEO Accession: GSE74767



Load required packages.

library(tidyverse)
library(magrittr)
library(Matrix)
library(extrafont)
library(ggrepel)
library(patchwork)
# library(tidylog)
Sys.time()
## [1] "2020-08-18 15:08:05 CDT"

Data preparation

SEED_USE <- 20200616
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE <- 60

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)

Data loading

Prepare metadata for single cells.

# https://www.ncbi.nlm.nih.gov/Traces/study/?query_key=1&WebEnv=MCID_5f3b627244419748985be4f3
cell_metadata <- read_delim(
    file = "../SraRunTable.txt",
    delim = ","
) %>%
    rename_all(. %>% tolower()) %>%
    `colnames<-`(str_replace(
        string = colnames(.),
        pattern = " ", replacement = "_"
    )) %>%
    dplyr::select(
        sample_name,
        # run,
        biosample,
        source_name,
        cell_line,
        organism
    ) %>%
    unique()
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   AvgSpotLen = col_double(),
##   Bases = col_double(),
##   Bytes = col_double(),
##   ReleaseDate = col_datetime(format = ""),
##   Strain = col_logical(),
##   Cell_type = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 558 parsing failures.
##  row    col           expected  actual                 file
## 2527 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## 2528 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## 2529 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## 2530 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## 2531 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## .... ...... .................. ....... ....................
## See problems(...) for more details.

cell_metadata
cell_metadata %>%
    dplyr::count(organism) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

organism num_cells
Macaca fascicularis 421
Mus musculus 71

Load SC3-seq sample information.

# Supplementary Table 1, Sheet 3
table_s1_sheet3 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM296_ESM.xlsx",
    sheet = "SC3-seq_SampleTable",
    skip = 1
)

table_s1_sheet3
cell_metadata <- cell_metadata %>%
    dplyr::left_join(
        table_s1_sheet3,
        by = c("sample_name" = "GSM_ID")
    ) %>%
    rename_all(. %>% tolower()) %>%
    dplyr::rename(
        developmental_stage = `embryonic day/ culture condition`
    ) %>%
    dplyr::select(
        sample_name:developmental_stage
    ) %>%
    mutate(
        developmental_stage = factor(
            developmental_stage,
            levels = c(
                "E5.5",
                "E6.5",
                "E06",
                "E07",
                "E08",
                "E09",
                "E13",
                "E14",
                "E16",
                "E17",
                "2iLESC",
                "EpiLC",
                "CMK6_FeederFree",
                "CMK6_onFeeder",
                "CMK9_onFeeder"
            )
        ),
        group = factor(
            group,
            levels = c(
                "ICM",
                "Pre-EPI",
                "Hypoblast",
                "PreE-TE",
                "PreL-TE",
                "PostE-EPI",
                "PostL-EPI",
                "Gast1",
                "Gast2a",
                "Gast2b",
                "Post-paTE",
                "VE/YE",
                "EXMC",
                "cyESCFF",
                "cyESCoF",
                "E5.5EPI",
                "E5.5VE",
                "E6.5EPI-T hi",
                "E6.5EPI-T lo",
                "E6.5VE",
                "EpiLC",
                "2i+L ESC"
            )
        ),
        source_name = factor(
            source_name,
            levels = c(
                "Cynomolgus monkey embryo E06",
                "Cynomolgus monkey embryo E07",
                "Cynomolgus monkey embryo E08",
                "Cynomolgus monkey embryo E09",
                "Cynomolgus monkey embryo E13",
                "Cynomolgus monkey embryo E14",
                "Cynomolgus monkey embryo E16",
                "Cynomolgus monkey embryo E17",
                "cyESC\\, feeder free",
                "cyESC\\, on feeder",
                "mouse embryo E5.5",
                "mouse embryo E6.5",
                "2i/LIF ESC",
                "EpiLC"
            )
        )
    )
cell_metadata
embedding_cy <- read_csv(
    file = "../preprocessed/cy/exploring/embedding_ncomponents44_seed20200317.csv"
) %>%
    dplyr::left_join(
        cell_metadata,
        by = c("cell" = "sampleid")
    ) %>%
    droplevels()
## Parsed with column specification:
## cols(
##   cell = col_character(),
##   batch = col_character(),
##   louvain = col_double(),
##   x_tsne = col_double(),
##   y_tsne = col_double(),
##   x_umap = col_double(),
##   y_umap = col_double(),
##   x_fitsne = col_double(),
##   y_fitsne = col_double(),
##   x_phate = col_double(),
##   y_phate = col_double(),
##   `x_min_dist=0.1` = col_double(),
##   `y_min_dist=0.1` = col_double(),
##   x_multicoretsne = col_double(),
##   y_multicoretsne = col_double()
## )
reticulate::py_discover_config()
## python:         /Users/jialei/.pyenv/shims/python
## libpython:      /Users/jialei/.pyenv/versions/3.8.2/lib/libpython3.8.dylib
## pythonhome:     /Users/jialei/.pyenv/versions/3.8.2:/Users/jialei/.pyenv/versions/3.8.2
## version:        3.8.2 (default, May 23 2020, 03:35:41)  [Clang 11.0.3 (clang-1103.0.32.62)]
## numpy:          /Users/jialei/.pyenv/versions/3.8.2/lib/python3.8/site-packages/numpy
## numpy_version:  1.19.0
## 
## NOTE: Python version was forced by RETICULATE_PYTHON
np <- reticulate::import("numpy", convert = TRUE)
scipy.sparse <- reticulate::import(module = "scipy.sparse", convert = TRUE)

matrix_readcount_use <- scipy.sparse$load_npz("../preprocessed/cy/matrix_readcount.npz")
matrix_readcount_use_features <- np$load("../preprocessed/cy/matrix_readcount_features.npy")
matrix_readcount_use_barcodes <- np$load("../preprocessed/cy/matrix_readcount_barcodes.npy")
colnames(matrix_readcount_use) <- matrix_readcount_use_barcodes
rownames(matrix_readcount_use) <- matrix_readcount_use_features
matrix_readcount_use <- matrix_readcount_use[, embedding_cy$cell]

# calculate CPM
# matrix_cpm_use <- calc_cpm(matrix_readcount_use)

# stopifnot(dim(matrix_readcount_use) == dim(matrix_cpm_use))
print(dim(matrix_readcount_use))
## [1] 29095   421
walk(list(embedding_cy, matrix_readcount_use), function(x) {
    print(object.size(x), units = "auto", standard = "SI")
})
## 196.1 kB
## 57.4 MB

Lineage delineation by transcriptome

Transcriptome clustering

embedding_cy %>%
    dplyr::count(organism) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

organism num_cells
Macaca fascicularis 421

embedding_cy %>%
    dplyr::mutate(
        num_genes = Matrix::colSums(matrix_readcount_use > 0)
    ) %>%
    dplyr::group_by(
        louvain
    ) %>%
    summarise(
        median_genes = median(num_genes),
        num_cells = n()
    ) %>%
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")
## `summarise()` ungrouping output (override with `.groups` argument)

louvain median_genes num_cells
0 10679.0 82
1 10880.5 64
2 10997.0 62
3 10898.0 55
4 11073.0 53
5 10649.0 39
6 11590.5 34
7 11974.5 32

embedding_cy %>%
    dplyr::count(source_name) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

source_name num_cells
Cynomolgus monkey embryo E06 43
Cynomolgus monkey embryo E07 35
Cynomolgus monkey embryo E08 95
Cynomolgus monkey embryo E09 20
Cynomolgus monkey embryo E13 46
Cynomolgus monkey embryo E14 62
Cynomolgus monkey embryo E16 36
Cynomolgus monkey embryo E17 53
cyESC\, feeder free 9
cyESC\, on feeder 22

embedding_cy %>%
    dplyr::count(group) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

group num_cells
ICM 30
Pre-EPI 34
Hypoblast 54
PreE-TE 23
PreL-TE 52
PostE-EPI 55
PostL-EPI 50
Gast1 18
Gast2a 13
Gast2b 13
Post-paTE 11
VE/YE 5
EXMC 32
cyESCFF 9
cyESCoF 22

embedding_cy %>%
    dplyr::count(developmental_stage) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

developmental_stage num_cells
E06 43
E07 35
E08 95
E09 20
E13 46
E14 62
E16 36
E17 53
CMK6_FeederFree 9
CMK6_onFeeder 14
CMK9_onFeeder 8

get_middle_points <- function(embedding, x, y, group) {
    embedding_middle <- embedding %>%
        dplyr::group_by({{ group }}) %>%
        dplyr::summarise(
            x_median = median({{ x }}),
            y_median = median({{ y }})
        )

    labels_group <- purrr::map(levels(embedding %>% dplyr::pull({{ group }})), function(i) {
        embedding %>%
            dplyr::filter(
                {{ group }} == i
            ) %>%
            dplyr::left_join(embedding_middle) %>%
            dplyr::mutate(
                distance = sqrt(
                    ({{ x }} - x_median)^2 + ({{ y }} - y_median)^2
                )
            ) %>%
            dplyr::filter(distance == min(distance))
    }) %>%
        dplyr::bind_rows() %>%
        dplyr::select({{ group }}, {{ x }}, {{ y }})

    return(labels_group)
}
p_embedding_cluster <- plot_embedding(
    embedding = embedding_cy[, c("x_tsne", "y_tsne")],
    color_values = embedding_cy$louvain %>% as.factor(),
    label = "t-SNE; Cluster",
    # label_position = c(label_x, label_y),
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding_cy$louvain))
        )
    )

p_embedding_lineage <- plot_embedding(
    embedding = embedding_cy[, c("x_tsne", "y_tsne")],
    color_values = embedding_cy$group,
    label = "t-SNE; Lineage",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = FALSE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding_cy$group))
        )
    ) +
    labs(color = NULL) +
    ggrepel::geom_text_repel(
        data = get_middle_points(embedding_cy, x_tsne, y_tsne, group),
        ggplot2::aes(
            x = x_tsne,
            y = y_tsne,
            label = group
        ),
        color = "black",
        size = 2,
        family = "Arial",
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.inflect = TRUE,
        #
        box.padding = 0.5,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        seed = SEED_USE
    )

p_embedding_developmental_stage <- plot_embedding(
    embedding = embedding_cy[, c("x_tsne", "y_tsne")],
    color_values = embedding_cy$developmental_stage,
    label = "t-SNE; Developmental stage",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = gg_color_hue(n = embedding_cy$developmental_stage %>%
            droplevels() %>%
            nlevels())
    ) +
    labs(color = NULL) +
    guides(colour = guide_legend(
        override.aes = list(size = 3),
        ncol = 2
    )) +
    customized_theme(x = 0.035, y = 0.42)

p_embedding_cell_line <- plot_embedding(
    embedding = embedding_cy[, c("x_tsne", "y_tsne")],
    color_values = embedding_cy$cell_line %>% replace_na("N/A"),
    label = "t-SNE; Cell line",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = c("green", "red", "grey70")
    ) +
    labs(color = NULL) +
    guides(colour = guide_legend(override.aes = list(size = 3))) +
    customized_theme(x = 0.035, y = 0.25)


Fig. 5a

purrr::reduce(list(
    p_embedding_cluster,
    p_embedding_lineage,
    p_embedding_developmental_stage,
    p_embedding_cell_line
), `+`) +
    plot_layout(ncol = 2) +
    plot_annotation(
        theme = theme(plot.margin = margin())
    )

Heatmap of selected marker genes

FEATURES_SELECTED <- c(
    "GAPDH",
    # "PPIA",
    "LOC101925040",
    "POU5F1",
    "NANOG",
    "SOX2",
    "PRDM14",
    "DPPA3",
    "GATA6",
    "PDGFRA",
    "GATA4",
    "TFAP2C",
    # "GATA2",
    "LOC101865311",
    "CDX2",
    "T",
    "MIXL1"
)

heatmap_labels <- c(
    "GAPDH",
    # "PPIA",
    "LOC101925040/PPIA",
    "POU5F1",
    "NANOG",
    "SOX2",
    "PRDM14",
    "DPPA3",
    "GATA6",
    "PDGFRA",
    "GATA4",
    "TFAP2C",
    # "GATA2",
    "LOC101865311/GATA2",
    "CDX2",
    "T",
    "MIXL1"
)

cells_heatmap <- embedding_cy %>%
    filter(str_detect(string = developmental_stage, pattern = "^E")) %>%
    pull(cell)


Scaled version

matrix_heatmap <- matrix_readcount_use[FEATURES_SELECTED, cells_heatmap]
matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
matrix_heatmap <- t(scale(t(matrix_heatmap)))
heatmap_limits <- quantile(matrix_heatmap, c(0.05, 0.95))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]
groups_selected <- c(
    "Post-paTE",
    "PreL-TE",
    "Hypoblast",
    "PreE-TE",
    "ICM",
    "Pre-EPI",
    "PostE-EPI",
    "PostL-EPI",
    "Gast1",
    "Gast2a",
    "Gast2b",
    "VE/YE",
    "EXMC"
)
cells_heatmap <- map(groups_selected, function(x) {
    cells_in_group <- embedding_cy %>%
        filter(group == x, cell %in% cells_heatmap) %>%
        pull(cell)
    cells_in_group[
        pvclust::pvclust(
            matrix_heatmap[, cells_in_group],
            nboot = 100,
            parallel = parallel::detectCores() - 2L
        )$hclust$order
    ]
}) %>%
    unlist()
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
matrix_heatmap <- matrix_heatmap[FEATURES_SELECTED, cells_heatmap]
ha_columns <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        embedding_cy[match(
            x = colnames(matrix_heatmap),
            table = embedding_cy$cell
        ), "group"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                embedding_cy$group %>%
                    # droplevels() %>%
                    nlevels()
            ),
            nm = levels(embedding_cy$group)
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    developmental_stage = ComplexHeatmap::anno_simple(
        embedding_cy[match(
            x = colnames(matrix_heatmap),
            table = embedding_cy$cell
        ), "developmental_stage"],
        # pch = anno_labels_lineage,
        col = setNames(
            object = gg_color_hue(n = embedding_cy$developmental_stage %>%
                # droplevels() %>%
                nlevels()),
            nm = embedding_cy$developmental_stage %>%
                # droplevels() %>%
                levels()
        ),
        which = "column",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Developmental stage"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)
lgd_lineage <- ComplexHeatmap::Legend(
    title = "Lineage",
    labels = levels(
        embedding_cy$group
    )[
        seq_len(nlevels(embedding_cy$group) - 2)
    ],
    legend_gp = grid::gpar(
        fill = ggthemes::tableau_color_pal("Tableau 20")(
            nlevels(embedding_cy$group) - 2
        )
    ),
    labels_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6)
)

lgd_developmental_stage <- ComplexHeatmap::Legend(
    title = "Develpmental stage",
    labels = levels(
        embedding_cy$developmental_stage
    )[
        seq_len(nlevels(embedding_cy$developmental_stage) - 3)
    ],
    legend_gp = grid::gpar(
        fill = gg_color_hue(
            n = embedding_cy$developmental_stage %>%
                # droplevels() %>%
                nlevels()
        )[seq_len(nlevels(embedding_cy$developmental_stage) - 3)]
    ),
    labels_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6)
)
ht <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap,
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    row_labels = heatmap_labels,
    row_names_side = c("left"),
    row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = TRUE,
    show_column_names = FALSE,
    #
    # column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = "Z score",
        title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
        legend_direction = "vertical",
        labels_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    column_split = embedding_cy[match(
        x = colnames(matrix_heatmap),
        table = embedding_cy$cell
    ), "group", drop = TRUE] %>%
        factor(levels = groups_selected),
    column_gap = unit(0.3, "mm"),
    #
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)
pd <- ComplexHeatmap::packLegend(
    lgd_lineage,
    lgd_developmental_stage,
    # gap = unit(8, "mm"),
    direction = "vertical"
)


Fig. 3

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)


Non scaled version

matrix_heatmap <- matrix_readcount_use[FEATURES_SELECTED, cells_heatmap]
matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
ht <- ComplexHeatmap::Heatmap(
    matrix = as.matrix(matrix_heatmap),
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    row_labels = heatmap_labels,
    row_names_side = c("left"),
    row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = TRUE,
    show_column_names = FALSE,
    #
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = expression(paste("Log"[10], " (RPM + 1)")), # "Log10(RPM + 1)",
        title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
        legend_direction = "vertical",
        labels_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    column_split = embedding_cy[match(
        x = colnames(matrix_heatmap),
        table = embedding_cy$cell
    ), "group", drop = TRUE] %>%
        factor(levels = groups_selected),
    column_gap = unit(0.3, "mm"),
    #
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)


Fig. 3

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Progressive transitions of cyEPI

cells_cyEPI <- embedding_cy %>%
    filter(group %in% c(
        "ICM",
        "Pre-EPI",
        "PostE-EPI",
        "PostL-EPI",
        "Gast1",
        "Gast2a",
        "Gast2b"
    )) %>%
    pull(cell) %>%
    sort()

Dimensionality reduction

Preprocessing

matrix_readcount_cyEPI <- matrix_readcount_use[, cells_cyEPI]
matrix_readcount_cyEPI <- matrix_readcount_cyEPI[
    Matrix::rowSums(
        matrix_readcount_cyEPI
    ) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]

dim(matrix_readcount_cyEPI)
## [1] 16831   213

PCA

matrix_readcount_cyEPI_log <- matrix_readcount_cyEPI
matrix_readcount_cyEPI_log@x <- log1p(matrix_readcount_cyEPI@x)

# z-score
matrix_readcount_cyEPI_log_scaled <- t(
    scale(
        t(
            matrix_readcount_cyEPI_log
        ),
        center = TRUE, scale = TRUE
    )
)
# features_var <- apply(matrix_readcount_norm_log_scaled, 1, var)
# features_use <- rownames(matrix_readcount_norm_log_scaled)[features_var > 0]

pca_out <- prcomp(
    t(matrix_readcount_cyEPI_log_scaled),
    center = FALSE,
    scale = FALSE
)
summary(pca_out)$imp %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column(var = "component") %>%
    mutate(
        rank = 1:n()
    ) %>%
    dplyr::slice(1:30) %>%
    ggplot(
        aes(
            x = rank,
            y = `Proportion of Variance`
        )
    ) +
    geom_point(size = 0.3) +
    theme_bw() +
    scale_x_continuous(
        name = "Component",
        breaks = c(1, seq(5, 30, 5))
    ) +
    scale_y_continuous(
        name = "Proportion of variance",
        labels = scales::percent
    ) +
    theme(
        axis.title = ggplot2::element_text(family = "Arial", size = 6),
        axis.text = ggplot2::element_text(family = "Arial", size = 6),
        panel.grid.minor = ggplot2::element_blank()
    )

embedding_cyEPI <- pca_out$x %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_name") %>%
    dplyr::select(sample_name:PC3) %>%
    left_join(
        embedding_cy %>%
            select(
                cell, batch, sample_name:developmental_stage
            ),
        by = c("sample_name" = "cell")
    ) %>%
    dplyr::rename(
        x_pca = PC1,
        y_pca = PC2,
        z_pca = PC3
    ) %>%
    relocate(x_pca, y_pca, z_pca, .after = last_col()) # %>% droplevels()
p_embedding_pca1 <- embedding_cyEPI %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = y_pca,
        y = x_pca,
        fill = group
    ) +
    scale_fill_manual(
        values = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                length(unique(embedding_cy$group))
            ),
            nm = embedding_cy$group %>% levels()
        )[embedding_cyEPI$group %>%
            droplevels() %>%
            levels()]
    ) +
    add_xy_label_pca(x = "PC2", y = "PC1") +
    ggrepel::geom_text_repel(
        data = get_middle_points(embedding_cyEPI, x_pca, y_pca, group),
        ggplot2::aes(
            x = y_pca,
            y = x_pca,
            label = group
        ),
        color = "red",
        size = 2,
        family = "Arial",
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.inflect = TRUE,
        #
        box.padding = 0.5,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        seed = SEED_USE
    )

p_embedding_pca2 <- embedding_cyEPI %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = z_pca,
        y = x_pca,
        fill = group
    ) +
    scale_fill_manual(
        values = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                length(unique(embedding_cy$group))
            ),
            nm = embedding_cy$group %>% levels()
        )[embedding_cyEPI$group %>%
            droplevels() %>%
            levels()]
    ) +
    add_xy_label_pca(x = "PC3", y = "PC1")


Fig. 4a

p_embedding_pca1 +
    p_embedding_pca2 +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(guides = "collect")

# Supplementary Table 2, Sheet 4
table_s2_sheet4 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM297_ESM.xlsx",
    sheet = "cyEPI ontogenic genes",
    skip = 0
)
## New names:
## * `` -> ...11
## * `` -> ...12
table_s2_sheet4


Fig. 4b

FEATURES_SELECTED <- c(
    "SFRP1",
    "FST",
    "HMGA2",
    "VIM",
    "T",
    "BMP4",
    "EOMES",
    "MIXL1",
    "HAND1",
    "PDGFRA",
    "GSC",
    "TBX3",
    "GATA6",
    "ESRRB",
    "KLF5",
    "TFCP2L1",
    "KHDC1L",
    "KLF4",
    "NLRP7",
    "OOEP",
    "DPPA3",
    "DNMT3L",
    "DPPA5",
    "DPPA2",
    "NANOG",
    "PRDM14",
    "SOX2",
    "SALL2",
    "SFRP2",
    "FGF2",
    "SOX11"
)

scaled_pc_loadings <- pca_out$rotation %>%
    scale() %>%
    as.data.frame() %>%
    rownames_to_column(var = "feature") %>%
    dplyr::select(feature, PC1, PC2) %>%
    mutate(
        group = feature %in% table_s2_sheet4$macFas5_gene_symbol,
        group2 = feature %in% FEATURES_SELECTED
    ) %>%
    arrange(group)

plot_pca_loading(
    data = scaled_pc_loadings,
    x = PC2,
    y = PC1,
    z1 = group,
    z2 = group2,
    x_label = "Z score of PC2 loading",
    y_label = "Z score of PC1 loading"
)

Heatmap

Read genes used in Fig. 4c from supplementary table 2.

features_heatmap <- table_s2_sheet4$macFas5_gene_symbol
features_heatmap %>% length()
## [1] 776
# Genes listed in the supplementary table, but not in the expression matrix
features_heatmap[!features_heatmap %in% rownames(matrix_readcount_use)]
##  [1] "CYP11A"       "LOC102139757" "BRDT"         "COX7B2"       "42253"       
##  [6] "42065"        "DYRK3"        "GSTA4"        "FXYD6"        "BCAR3"
# fix
features_heatmap <- features_heatmap %>%
    enframe(name = "order", value = "symbol_raw") %>%
    mutate(
        symbol = case_when(
            symbol_raw == "CYP11A" ~ "LOC102120640",
            symbol_raw == "LOC102139757" ~ "XIST",
            symbol_raw == "BRDT" ~ "LOC102123270",
            symbol_raw == "COX7B2" ~ "LOC101864947",
            symbol_raw == "42253" ~ "MARC2",
            symbol_raw == "42065" ~ "SEPT6",
            symbol_raw == "DYRK3" ~ "LOC101866219",
            symbol_raw == "GSTA4" ~ "LOC101865382",
            symbol_raw == "FXYD6" ~ "LOC101867245",
            symbol_raw == "BCAR3" ~ "LOC101867222",
            TRUE ~ .$symbol_raw
        )
    ) %>%
    pull(symbol)

features_heatmap[!features_heatmap %in% rownames(matrix_readcount_use)]
## character(0)
matrix_heatmap <- matrix_readcount_use[features_heatmap, cells_cyEPI]
matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
heatmap_limits <- quantile(matrix_heatmap, c(0.005, 0.995))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]
cells_heatmap <- map(embedding_cyEPI %>%
    pull(group) %>%
    droplevels() %>%
    levels(), function(x) {
    cells_in_group <- embedding_cyEPI %>%
        filter(group == x) %>%
        pull(sample_name)

    cells_in_group[
        pvclust::pvclust(
            matrix_heatmap[features_heatmap, cells_in_group] %>% as.matrix(),
            nboot = 100,
            parallel = parallel::detectCores() - 2L
        )$hclust$order
    ]
}) %>%
    unlist()
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
matrix_heatmap <- matrix_heatmap[features_heatmap, cells_heatmap]
ha_columns <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        embedding_cyEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI$sample_name
        ), "group"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                embedding_cyEPI$group %>%
                    # droplevels() %>%
                    nlevels()
            ),
            nm = levels(embedding_cyEPI$group)
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    developmental_stage = ComplexHeatmap::anno_simple(
        embedding_cyEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI$sample_name
        ), "developmental_stage"],
        # pch = anno_labels_lineage,
        col = setNames(
            object = gg_color_hue(n = embedding_cy$developmental_stage %>%
                nlevels()),
            nm = embedding_cy$developmental_stage %>%
                levels()
        ),
        which = "column",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Developmental stage"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)
lgd_lineage <- ComplexHeatmap::Legend(
    title = "Lineage",
    labels = embedding_cyEPI %>% pull(group) %>% droplevels() %>% levels(),
    legend_gp = grid::gpar(
        fill = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                length(unique(embedding_cy$group))
            ),
            nm = embedding_cy$group %>% levels()
        )[embedding_cyEPI$group %>%
            droplevels() %>%
            levels()]
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

lgd_developmental_stage <- ComplexHeatmap::Legend(
    title = "Develpmental stage",
    labels = levels(
        embedding_cy$developmental_stage
    )[
        seq_len(nlevels(embedding_cy$developmental_stage) - 3)
    ],
    legend_gp = grid::gpar(
        fill = gg_color_hue(
            n = embedding_cy$developmental_stage %>%
                droplevels() %>%
                nlevels()
        )[seq_len(nlevels(embedding_cy$developmental_stage) - 3)]
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)
# genes highlighted in Fig. 4c, fix gene symbols
features_to_mark_left <- c(
    "KLF5",
    "TBX3",
    "GATA6",
    "PDGFRA",
    "LAMA1",
    "LAMB1",
    "LAMC1",
    "TET2",
    #
    "ESRRB",
    "TFCP2L1",
    "LOC102137091", # "KHDC1L",
    "SOX15",
    "TFAP2C",
    "GDF15",
    #
    "DPPA3",
    "NLRP2",
    "DPPA2",
    "KHDC3L",
    #
    "PRDM14",
    "NANOG",
    #
    "LOC102143209", # "TDGF1",
    "SOX2",
    "CXCL12",
    "DNMT3B",
    #
    "ZSCAN10",
    "SALL2",
    "SEMA6A",
    "TCF7L1",
    "THY1",
    "COL1A1",
    "FGF19",
    "NOTCH3",
    #
    "SOX11",
    "TCF4",
    "FST",
    "ZIC3",
    "ZIC2",
    "FGF2",
    "VIM",
    "COL4A2",
    #
    "T",
    "MIXL1",
    "EOMES",
    "GSC",
    "BMP2",
    "HHEX",
    "LHX1",
    "CXCR4",
    "LOC101925698", # "SOX17",
    "BMP4"
)

features_to_mark_left[!features_to_mark_left %in% rownames(matrix_heatmap)]
## character(0)

ha_left <- ComplexHeatmap::rowAnnotation(
    foo = ComplexHeatmap::anno_mark(
        at = which(rownames(matrix_heatmap) %in% features_to_mark_left),
        labels = features_to_mark_left,
        which = "row",
        side = "right",
        lines_gp = grid::gpar(col = "grey50"),
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        )
    )
)
ht <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap %>% as.matrix(),
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    ## row_labels = str_remove(string = rownames(matrix_heatmap), pattern = "^E.+_"),
    # row_names_side = c("left"),
    # row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    right_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = expression(paste("Log"[10], " (RPM + 1)")),
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    row_split = table_s2_sheet4$ClusterID,
    row_gap = unit(0.3, "mm"),
    #
    row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    row_title_rot = 0,
    #
    # column_split = embedding_cy[match(x = colnames(matrix_heatmap),table = embedding_cy$cell), "group", drop = TRUE],
    column_split = factor(
        embedding_cyEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI$sample_name
        ), "group", drop = TRUE] %>%
            droplevels() %>%
            str_pad(
                width = map_int(levels(embedding_cyEPI$group), nchar) %>%
                    max(),
                side = "left"
            ),
        levels = levels(embedding_cyEPI$group %>% droplevels()) %>%
            str_pad(
                width = map_int(levels(embedding_cyEPI$group), nchar) %>%
                    max(),
                side = "left"
            )
    ),
    column_gap = unit(0.3, "mm"),
    #
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)
pd <- ComplexHeatmap::packLegend(
    lgd_lineage,
    lgd_developmental_stage,
    # gap = unit(8, "mm"),
    direction = "vertical"
)


Fig. 4c

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Mouse EPI development

# Supplementary Table 1, Sheet 3
table_s1_sheet3 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM296_ESM.xlsx",
    sheet = "SC3-seq_SampleTable",
    skip = 1
)

table_s1_sheet3 %>% head()
embedding_mm <- read_csv(
    file = "../preprocessed/mm/exploring/embedding_ncomponents18_seed20200317.csv"
) %>%
    dplyr::left_join(
        table_s1_sheet3 %>%
            filter(Species == "Mouse") %>%
            mutate(
                SampleID = case_when(
                    str_detect(string = SampleID, pattern = "^\\d") ~ str_c("X", .$SampleID),
                    TRUE ~ .$SampleID
                )
            ),
        by = c("cell" = "SampleID")
    ) %>%
    dplyr::select(
        cell:y_umap, GSM_ID:`Embryonic Day/ Culture Condition`
    ) %>%
    dplyr::rename_all(. %>% tolower()) %>%
    dplyr::rename("developmental_stage" = `embryonic day/ culture condition`) %>%
    mutate(
        developmental_stage = factor(
            developmental_stage,
            levels = c("E4.5", "E5.5", "E6.5", "EpiLC", "2iLESC")
        ),
        group = factor(
            group,
            levels = c(
                "E4.5EPI",
                "E4.5PE",
                "E4.5mTE",
                "E4.5pTE",
                "E5.5EPI",
                "E5.5VE",
                "E6.5EPI-T hi",
                "E6.5EPI-T lo",
                "E6.5VE",
                "EpiLC",
                "2i+L ESC"
            )
        )
    )
## Parsed with column specification:
## cols(
##   cell = col_character(),
##   batch = col_character(),
##   louvain = col_double(),
##   x_tsne = col_double(),
##   y_tsne = col_double(),
##   x_umap = col_double(),
##   y_umap = col_double(),
##   x_fitsne = col_double(),
##   y_fitsne = col_double(),
##   x_phate = col_double(),
##   y_phate = col_double(),
##   `x_min_dist=0.1` = col_double(),
##   `y_min_dist=0.1` = col_double(),
##   x_multicoretsne = col_double(),
##   y_multicoretsne = col_double()
## )
embedding_mm %>%
    dplyr::count(louvain) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

louvain num_cells
0 49
1 37
2 22

embedding_mm %>%
    dplyr::count(group) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

group num_cells
E4.5EPI 9
E4.5PE 9
E4.5mTE 9
E4.5pTE 10
E5.5EPI 16
E5.5VE 9
E6.5EPI-T hi 10
E6.5EPI-T lo 9
E6.5VE 5
EpiLC 12
2i+L ESC 10

embedding_mm %>%
    dplyr::count(developmental_stage) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

developmental_stage num_cells
E4.5 37
E5.5 25
E6.5 24
EpiLC 12
2iLESC 10

embedding_middle <- embedding_mm %>%
    group_by(group) %>%
    summarise(
        x_median = median(x_umap),
        y_median = median(y_umap)
    )
## `summarise()` ungrouping output (override with `.groups` argument)
labels_group <- map(levels(embedding_mm$group), function(x) {
    embedding_mm %>%
        dplyr::filter(
            group == x
        ) %>%
        dplyr::left_join(embedding_middle) %>%
        dplyr::mutate(
            distance = sqrt(
                (x_umap - x_median)^2 + (y_umap - y_median)^2
            )
        ) %>%
        dplyr::filter(distance == min(distance))
}) %>%
    dplyr::bind_rows() %>%
    dplyr::select(cell, x_umap, y_umap, group)
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
labels_group


p_embedding_lineage <- plot_embedding(
    embedding = embedding_mm[, c("x_umap", "y_umap")],
    color_values = embedding_mm$group,
    label = "UMAP; Lineage",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = FALSE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding_mm$group))
        )
    ) +
    labs(color = NULL) +
    # guides(colour = guide_legend(override.aes = list(size = 3), ncol = 1)) +
    # customized_theme(x = 0.035, y = 0.72) +
    ggrepel::geom_text_repel(
        data = get_middle_points(embedding_mm, x_umap, y_umap, group),
        ggplot2::aes(
            x = x_umap,
            y = y_umap,
            label = group
        ),
        color = "black",
        size = 2,
        family = "Arial",
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.inflect = TRUE,
        #
        box.padding = 0.5,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        seed = SEED_USE,
    )

p_embedding_developmental_stage <- plot_embedding(
    embedding = embedding_mm[, c("x_umap", "y_umap")],
    color_values = embedding_mm$developmental_stage,
    label = "UMAP; Developmental stage",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = gg_color_hue(n = embedding_mm$developmental_stage %>%
            droplevels() %>%
            nlevels())
    ) +
    labs(color = NULL) +
    guides(colour = guide_legend(override.aes = list(size = 3), ncol = 1)) +
    customized_theme(x = 0.035, y = 0.36)


Extended Data Figure 7d

purrr::reduce(list(
    p_embedding_lineage,
    p_embedding_developmental_stage
), `+`) +
    plot_layout(ncol = 2) +
    plot_annotation(
        theme = theme(plot.margin = margin())
    )

features_heatmap_mm <- c(
    "Gapdh",
    "Ppia",
    "Pou5f1",
    "Sox2",
    "Nanog",
    "Prdm14",
    "Tfcp2l1",
    "Dppa3",
    "Esrrb",
    "Zfp42",
    "Klf2",
    "Klf4",
    "Klf5",
    "Tbx3",
    "Nr5a2",
    "Fgf4",
    "Il6st",
    "Lifr",
    "Fgf5",
    "Otx2",
    "Pou3f1",
    "Sox3",
    "Dnmt3b",
    "T",
    "Mixl1",
    "Sp5",
    "Cdh2",
    "Snai1",
    "Gata6",
    "Pdgfra",
    "Sox17",
    "Gata4",
    "Afp",
    "Cdx2",
    "Gata2",
    "Gata3",
    "Tfap2c",
    "Krt18"
)

matrix_readcount_mm <- scipy.sparse$load_npz("../preprocessed/mm/matrix_readcount.npz")
matrix_readcount_mm_features <- np$load("../preprocessed/mm/matrix_readcount_features.npy")
matrix_readcount_mm_barcodes <- np$load("../preprocessed/mm/matrix_readcount_barcodes.npy")
colnames(matrix_readcount_mm) <- matrix_readcount_mm_barcodes
rownames(matrix_readcount_mm) <- matrix_readcount_mm_features
matrix_readcount_mm <- matrix_readcount_mm[, embedding_mm$cell]

features_heatmap_mm %in% rownames(matrix_readcount_mm)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
matrix_heatmap <- log10(matrix_readcount_mm[features_heatmap_mm, ] + 1)
cells_heatmap <- map(levels(embedding_mm$group), function(x) {
    cells_in_group <- embedding_mm %>%
        filter(group == x) %>%
        pull(cell)

    cells_in_group[
        pvclust::pvclust(
            matrix_heatmap[, cells_in_group] %>% as.matrix(),
            nboot = 100,
            parallel = parallel::detectCores() - 2L
        )$hclust$order
    ]
}) %>%
    unlist()
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.


Extended Data Figure 7c

matrix_heatmap[, cells_heatmap] %>%
    as.matrix() %>%
    as.data.frame() %>%
    rownames_to_column(var = "feature") %>%
    pivot_longer(
        -c("feature"),
        names_to = "sample"
    ) %>%
    left_join(
        embedding_mm %>%
            dplyr::select(cell, group),
        by = c("sample" = "cell")
    ) %>%
    mutate(
        feature = factor(
            feature,
            levels = features_heatmap_mm %>% rev(.)
        ),
        sample = factor(
            sample,
            levels = cells_heatmap
        )
    ) %>%
    plot_heatmap(
        x = sample,
        y = feature,
        z = value,
        y_order = rev(features_heatmap_mm)
    ) +
    facet_grid(~group, scales = "free", space = "free")

Cynomolgus vs mouse EPI development

Matching homologs

Number of genes of cy matrix: 29095
Number of genes of mm matrix: 26219

library(org.Hs.eg.db)
library(biomaRt)

org.Hs.eg.db version: 3.11.4
biomaRt version: 2.44.1

ensembl <- biomaRt::useMart("ensembl")
biomaRt::listDatasets(ensembl) %>%
    filter(dataset %in% "Cynomolgus")

biomaRt::listDatasets(ensembl)$dataset
##   [1] "acalliptera_gene_ensembl"       "acarolinensis_gene_ensembl"    
##   [3] "acchrysaetos_gene_ensembl"      "acitrinellus_gene_ensembl"     
##   [5] "amelanoleuca_gene_ensembl"      "amexicanus_gene_ensembl"       
##   [7] "ampachon_gene_ensembl"          "anancymaae_gene_ensembl"       
##   [9] "aplatyrhynchos_gene_ensembl"    "applatyrhynchos_gene_ensembl"  
##  [11] "atestudineus_gene_ensembl"      "bbbison_gene_ensembl"          
##  [13] "bgrunniens_gene_ensembl"        "bihybrid_gene_ensembl"         
##  [15] "bmutus_gene_ensembl"            "bsplendens_gene_ensembl"       
##  [17] "btaurus_gene_ensembl"           "bthybrid_gene_ensembl"         
##  [19] "cabingdonii_gene_ensembl"       "capalliatus_gene_ensembl"      
##  [21] "caperea_gene_ensembl"           "catys_gene_ensembl"            
##  [23] "ccanadensis_gene_ensembl"       "ccapucinus_gene_ensembl"       
##  [25] "cdromedarius_gene_ensembl"      "celegans_gene_ensembl"         
##  [27] "cgchok1gshd_gene_ensembl"       "cgcrigri_gene_ensembl"         
##  [29] "cgobio_gene_ensembl"            "cgpicr_gene_ensembl"           
##  [31] "charengus_gene_ensembl"         "chircus_gene_ensembl"          
##  [33] "choffmanni_gene_ensembl"        "cintestinalis_gene_ensembl"    
##  [35] "cjacchus_gene_ensembl"          "cjaponica_gene_ensembl"        
##  [37] "clanigera_gene_ensembl"         "cldingo_gene_ensembl"          
##  [39] "clfamiliaris_gene_ensembl"      "cmilii_gene_ensembl"           
##  [41] "cpbellii_gene_ensembl"          "cporcellus_gene_ensembl"       
##  [43] "cporosus_gene_ensembl"          "csabaeus_gene_ensembl"         
##  [45] "csavignyi_gene_ensembl"         "csemilaevis_gene_ensembl"      
##  [47] "csyrichta_gene_ensembl"         "cvariegatus_gene_ensembl"      
##  [49] "dclupeoides_gene_ensembl"       "dmelanogaster_gene_ensembl"    
##  [51] "dnovaehollandiae_gene_ensembl"  "dnovemcinctus_gene_ensembl"    
##  [53] "dordii_gene_ensembl"            "drerio_gene_ensembl"           
##  [55] "eaasinus_gene_ensembl"          "eburgeri_gene_ensembl"         
##  [57] "ecaballus_gene_ensembl"         "ecalabaricus_gene_ensembl"     
##  [59] "eelectricus_gene_ensembl"       "eeuropaeus_gene_ensembl"       
##  [61] "elucius_gene_ensembl"           "enaucrates_gene_ensembl"       
##  [63] "etelfairi_gene_ensembl"         "falbicollis_gene_ensembl"      
##  [65] "fcatus_gene_ensembl"            "fdamarensis_gene_ensembl"      
##  [67] "fheteroclitus_gene_ensembl"     "gaculeatus_gene_ensembl"       
##  [69] "gagassizii_gene_ensembl"        "ggallus_gene_ensembl"          
##  [71] "ggorilla_gene_ensembl"          "gmorhua_gene_ensembl"          
##  [73] "gwilldenowi_gene_ensembl"       "hburtoni_gene_ensembl"         
##  [75] "hcomes_gene_ensembl"            "hgfemale_gene_ensembl"         
##  [77] "hgmale_gene_ensembl"            "hhucho_gene_ensembl"           
##  [79] "hsapiens_gene_ensembl"          "ipunctatus_gene_ensembl"       
##  [81] "itridecemlineatus_gene_ensembl" "jhyemalis_gene_ensembl"        
##  [83] "jjaculus_gene_ensembl"          "lafricana_gene_ensembl"        
##  [85] "lbergylta_gene_ensembl"         "lcalcarifer_gene_ensembl"      
##  [87] "lcanadensis_gene_ensembl"       "lchalumnae_gene_ensembl"       
##  [89] "lcoronata_gene_ensembl"         "lcrocea_gene_ensembl"          
##  [91] "loculatus_gene_ensembl"         "lsdomestica_gene_ensembl"      
##  [93] "malbus_gene_ensembl"            "marmatus_gene_ensembl"         
##  [95] "mauratus_gene_ensembl"          "mcaroli_gene_ensembl"          
##  [97] "mdomestica_gene_ensembl"        "mfascicularis_gene_ensembl"    
##  [99] "mgallopavo_gene_ensembl"        "mleucophaeus_gene_ensembl"     
## [101] "mlucifugus_gene_ensembl"        "mmmarmota_gene_ensembl"        
## [103] "mmulatta_gene_ensembl"          "mmurdjan_gene_ensembl"         
## [105] "mmurinus_gene_ensembl"          "mmusculus_gene_ensembl"        
## [107] "mnemestrina_gene_ensembl"       "mochrogaster_gene_ensembl"     
## [109] "mpahari_gene_ensembl"           "mpfuro_gene_ensembl"           
## [111] "mspicilegus_gene_ensembl"       "mspretus_gene_ensembl"         
## [113] "mundulatus_gene_ensembl"        "munguiculatus_gene_ensembl"    
## [115] "mvitellinus_gene_ensembl"       "mzebra_gene_ensembl"           
## [117] "neugenii_gene_ensembl"          "ngalili_gene_ensembl"          
## [119] "nleucogenys_gene_ensembl"       "nvison_gene_ensembl"           
## [121] "oanatinus_gene_ensembl"         "oaries_gene_ensembl"           
## [123] "oaureus_gene_ensembl"           "ocuniculus_gene_ensembl"       
## [125] "odegus_gene_ensembl"            "ogarnettii_gene_ensembl"       
## [127] "olatipes_gene_ensembl"          "olhni_gene_ensembl"            
## [129] "olhsok_gene_ensembl"            "omelastigma_gene_ensembl"      
## [131] "oniloticus_gene_ensembl"        "oprinceps_gene_ensembl"        
## [133] "pabelii_gene_ensembl"           "panubis_gene_ensembl"          
## [135] "pcapensis_gene_ensembl"         "pcinereus_gene_ensembl"        
## [137] "pcoquereli_gene_ensembl"        "pformosa_gene_ensembl"         
## [139] "pkingsleyae_gene_ensembl"       "platipinna_gene_ensembl"       
## [141] "pmarinus_gene_ensembl"          "pmbairdii_gene_ensembl"        
## [143] "pmexicana_gene_ensembl"         "pnattereri_gene_ensembl"       
## [145] "pnyererei_gene_ensembl"         "ppaniscus_gene_ensembl"        
## [147] "ppardus_gene_ensembl"           "pranga_gene_ensembl"           
## [149] "preticulata_gene_ensembl"       "psimus_gene_ensembl"           
## [151] "psinensis_gene_ensembl"         "ptaltaica_gene_ensembl"        
## [153] "ptephrosceles_gene_ensembl"     "ptroglodytes_gene_ensembl"     
## [155] "pvampyrus_gene_ensembl"         "pvitticeps_gene_ensembl"       
## [157] "rbieti_gene_ensembl"            "rferrumequinum_gene_ensembl"   
## [159] "rnorvegicus_gene_ensembl"       "rroxellana_gene_ensembl"       
## [161] "saraneus_gene_ensembl"          "saurata_gene_ensembl"          
## [163] "sbboliviensis_gene_ensembl"     "scanaria_gene_ensembl"         
## [165] "scerevisiae_gene_ensembl"       "sdumerili_gene_ensembl"        
## [167] "sfasciatus_gene_ensembl"        "sformosus_gene_ensembl"        
## [169] "shabroptila_gene_ensembl"       "sharrisii_gene_ensembl"        
## [171] "sldorsalis_gene_ensembl"        "smaximus_gene_ensembl"         
## [173] "smerianae_gene_ensembl"         "sorbicularis_gene_ensembl"     
## [175] "spunctatus_gene_ensembl"        "ssalar_gene_ensembl"           
## [177] "ssbamei_gene_ensembl"           "ssberkshire_gene_ensembl"      
## [179] "sscrofa_gene_ensembl"           "sshampshire_gene_ensembl"      
## [181] "ssjinhua_gene_ensembl"          "sslandrace_gene_ensembl"       
## [183] "sslargewhite_gene_ensembl"      "ssmeishan_gene_ensembl"        
## [185] "sspietrain_gene_ensembl"        "ssrongchang_gene_ensembl"      
## [187] "sstibetan_gene_ensembl"         "ssusmarc_gene_ensembl"         
## [189] "sswuzhishan_gene_ensembl"       "strutta_gene_ensembl"          
## [191] "tbelangeri_gene_ensembl"        "tgelada_gene_ensembl"          
## [193] "tguttata_gene_ensembl"          "tnigroviridis_gene_ensembl"    
## [195] "trubripes_gene_ensembl"         "ttruncatus_gene_ensembl"       
## [197] "uamericanus_gene_ensembl"       "umaritimus_gene_ensembl"       
## [199] "vpacos_gene_ensembl"            "vursinus_gene_ensembl"         
## [201] "vvulpes_gene_ensembl"           "xmaculatus_gene_ensembl"       
## [203] "xtropicalis_gene_ensembl"
grep(
    pattern = "fascicularis",
    x = biomaRt::listDatasets(ensembl)$dataset,
    ignore.case = TRUE,
    value = TRUE
)
## [1] "mfascicularis_gene_ensembl"
# mfascicularis_gene_ensembl"

grep(
    "Human",
    listDatasets(ensembl)$description,
    ignore.case = TRUE,
    value = TRUE
)
## [1] "Human genes (GRCh38.p13)"
biomaRt::listDatasets(ensembl)[
    grep(
        "Human",
        biomaRt::listDatasets(ensembl)$description,
        ignore.case = TRUE
    ),
]
ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)

# biomaRt::listAttributes(ensembl) %>% head()
# biomaRt::listFilters(ensembl) %>% head()

Extract gene names from our human expression matrix.

ensembl_ids_human <- read_delim(
    file = "../../../LW36/filtered_feature_bc_matrix/features.tsv.gz",
    delim = "\t",
    col_names = FALSE
) %>%
    pull(X1)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character()
## )
gene_homologs <- biomaRt::getBM(
    attributes = c(
        "ensembl_gene_id",
        "ensembl_gene_id_version",
        "external_gene_name",
        "external_gene_source",
        #
        "mmusculus_homolog_ensembl_gene",
        "mmusculus_homolog_orthology_type",
        "mmusculus_homolog_associated_gene_name",
        "mmusculus_homolog_orthology_confidence",
        #
        "mfascicularis_homolog_ensembl_gene",
        "mfascicularis_homolog_orthology_type",
        "mfascicularis_homolog_associated_gene_name",
        "mfascicularis_homolog_orthology_confidence"
    ),
    filters = "ensembl_gene_id",
    values = ensembl_ids_human,
    mart = ensembl
)
## [1] "gene_homologs_20200818_151012_CDT.rds"


Keep one2one orthologs.

gene_homologs_filtered <- ensembl_ids_human %>%
    tibble::enframe(value = "ensembl_gene_id") %>%
    dplyr::left_join(
        gene_homologs %>%
            filter(
                mmusculus_homolog_orthology_type == "ortholog_one2one",
                mmusculus_homolog_orthology_confidence == 1,
                mfascicularis_homolog_orthology_type == "ortholog_one2one",
                mfascicularis_homolog_orthology_confidence == 1
            ),
        by = "ensembl_gene_id"
    ) %>%
    dplyr::select(
        -c(
            name,
            mmusculus_homolog_orthology_confidence,
            mfascicularis_homolog_orthology_confidence
        )
    ) %>%
    tidyr::drop_na()

gene_homologs_filtered %>% head()


Add ENTREZID.

library(org.Mm.eg.db)
## 
# keytypes(org.Mm.eg)

gene_homologs_filtered <- gene_homologs_filtered %>%
    mutate(
        ENTREZID_mm = AnnotationDbi::select(
            org.Mm.eg.db,
            keys = mmusculus_homolog_associated_gene_name,
            columns = c(
                "ENTREZID"
            ),
            keytype = "SYMBOL"
        ) %>%
            pull(ENTREZID)
    )
## 'select()' returned many:1 mapping between keys and columns

entrezgene_id_cy <- biomaRt::getBM(
    attributes = c("ensembl_gene_id", "entrezgene_id"),
    filters = "ensembl_gene_id",
    values = gene_homologs_filtered$mfascicularis_homolog_ensembl_gene,
    mart = biomaRt::useMart("ensembl", dataset = "mfascicularis_gene_ensembl")
)

# cy
entrezgene_id_cy_matrix <- read_delim(
    file = "../preprocessed/GSE74767_SC3seq_Cy_ProcessedData.txt.gz",
    delim = "\t"
) %>%
    pull(macFas5_entrez_id)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   macFas5_gene_symbol = col_character()
## )
## See spec(...) for full column specifications.

gene_homologs_filtered <- gene_homologs_filtered %>%
    left_join(
        entrezgene_id_cy[entrezgene_id_cy$entrezgene_id %in% entrezgene_id_cy_matrix, ],
        by = c("mfascicularis_homolog_ensembl_gene" = "ensembl_gene_id")
    ) %>%
    dplyr::rename("ENTREZID_cy" = entrezgene_id)

# mm
entrezgene_id_mm_matrix <- read_delim(
    file = "../preprocessed/GSE74767_SC3seq_Ms_ProcessedData.txt.gz",
    delim = "\t"
) %>%
    pull(mm10_entrez_id)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   mm10_gene_symbol = col_character()
## )
## See spec(...) for full column specifications.

# sum(gene_homologs_filtered$ENTREZID_mm %in% entrezgene_id_mm_matrix)
# sum(gene_homologs_filtered$ENTREZID_cy %in% entrezgene_id_cy_matrix)

gene_homologs_filtered <- gene_homologs_filtered %>%
    filter(
        ENTREZID_cy %in% entrezgene_id_cy_matrix,
        ENTREZID_mm %in% entrezgene_id_mm_matrix
    )

gene_homologs_filtered %>% dim()
## [1] 13411    12
gene_homologs_filtered %>% head()

Dimensionality reduction

Preprocessing

Combine cy and mm matrices.

matrix_readcount_cy_mm <- cbind(
    matrix_readcount_use[
        match(
            gene_homologs_filtered$ENTREZID_cy,
            entrezgene_id_cy_matrix
        ),
    ],
    matrix_readcount_mm[
        match(
            gene_homologs_filtered$ENTREZID_mm,
            entrezgene_id_mm_matrix
        ),
    ]
)

if (!file.exists("matrix_readcount_cy_mm.rds")) {
    saveRDS(
        object = matrix_readcount_cy_mm,
        file = "matrix_readcount_cy_mm.rds"
    )
}
matrix_readcount_cyEPI_mmEPI <- matrix_readcount_cy_mm[, c(
    embedding_cyEPI$sample_name,
    embedding_mm %>%
        filter(
            !developmental_stage %in% c("EpiLC", "2iLESC")
        ) %>%
        pull(cell)
)]

matrix_readcount_cyEPI_mmEPI <- matrix_readcount_cyEPI_mmEPI[
    Matrix::rowSums(
        matrix_readcount_cyEPI_mmEPI
    ) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]

dim(matrix_readcount_cyEPI_mmEPI)
## [1] 12038   299

PCA

matrix_readcount_cyEPI_mmEPI_log <- matrix_readcount_cyEPI_mmEPI
matrix_readcount_cyEPI_mmEPI_log@x <- log1p(matrix_readcount_cyEPI_mmEPI_log@x)

# z-score
matrix_readcount_cyEPI_mmEPI_log_scaled <- t(
    scale(
        t(
            matrix_readcount_cyEPI_mmEPI_log
        ),
        center = TRUE, scale = TRUE
    )
)

pca_out <- prcomp(
    t(matrix_readcount_cyEPI_mmEPI_log_scaled),
    center = FALSE,
    scale = FALSE
)
summary(pca_out)$imp %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column(var = "component") %>%
    mutate(
        rank = 1:n()
    ) %>%
    dplyr::slice(1:30) %>%
    ggplot(
        aes(
            x = rank,
            y = `Proportion of Variance`
        )
    ) +
    geom_point(size = 0.3) +
    theme_bw() +
    scale_x_continuous(
        name = "Component",
        breaks = c(1, seq(5, 30, 5))
    ) +
    scale_y_continuous(
        name = "Proportion of variance",
        labels = scales::percent
    ) +
    ggplot2::theme(
        axis.title = ggplot2::element_text(family = "Arial", size = 6),
        axis.text = ggplot2::element_text(family = "Arial", size = 6),
        panel.grid.minor = ggplot2::element_blank()
    )

embedding_cyEPI_mmEPI <- pca_out$x %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_name") %>%
    dplyr::select(sample_name:PC3) %>%
    dplyr::left_join(
        table_s1_sheet3 %>%
            # filter(Species == "Mouse") %>%
            mutate(
                SampleID = case_when(
                    str_detect(string = SampleID, pattern = "^\\d") ~ str_c("X", .$SampleID),
                    TRUE ~ .$SampleID
                )
            ),
        by = c("sample_name" = "SampleID")
    ) %>%
    dplyr::select(sample_name:`Embryonic Day/ Culture Condition`) %>%
    dplyr::rename_all(. %>% tolower()) %>%
    dplyr::rename(
        x_pca = pc1,
        y_pca = pc2,
        z_pca = pc3,
        developmental_stage = `embryonic day/ culture condition`
    ) %>%
    relocate(x_pca, y_pca, z_pca, .after = last_col()) %>%
    mutate(
        group = factor(
            group,
            levels = c(
                "ICM",
                "Pre-EPI",
                "PostE-EPI",
                "PostL-EPI",
                "Gast1",
                "Gast2a",
                "Gast2b",
                "E4.5EPI",
                "E4.5PE",
                "E4.5mTE",
                "E4.5pTE",
                "E5.5EPI",
                "E5.5VE",
                "E6.5EPI-T hi",
                "E6.5EPI-T lo",
                "E6.5VE"
            )
        ),
        species = factor(
            species,
            levels = c(
                "Cynomolgus Monkey",
                "Mouse"
            )
        )
    )
p_embedding_pca1 <- embedding_cyEPI_mmEPI %>%
    droplevels() %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_pca,
        y = y_pca,
        color = group,
        shape = species
    ) +
    add_xy_label_pca(x = "PC1", y = "PC2") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding_cyEPI_mmEPI$group))
        )
    ) +
    ggrepel::geom_text_repel(
        data = get_middle_points(embedding_cyEPI_mmEPI, x_pca, y_pca, group),
        aes(
            x = x_pca,
            y = y_pca,
            label = group,
            shape = NULL
        ),
        color = "black",
        size = 2,
        family = "Arial",
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.inflect = TRUE,
        #
        box.padding = 0.5,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        seed = SEED_USE,
    ) +
    guides(color = "none", shape = "none")
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
p_embedding_pca2 <- embedding_cyEPI_mmEPI %>%
    droplevels() %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = y_pca,
        y = z_pca,
        color = group,
        shape = species
    ) +
    add_xy_label_pca(x = "PC2", y = "PC3") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding_cyEPI_mmEPI$group))
        )
    ) +
    guides(color = "none") +
    theme(
        legend.text = element_text(family = "Arial", size = 6),
        legend.key.size = unit(3, "mm"),
        legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm"),
        legend.background = element_blank(),
        #
        legend.justification = c(0, 1),
        legend.position = c(0.035, 0.22)
    )


Extended Data Figure 8a

p_embedding_pca1 +
    p_embedding_pca2 +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(ncol = 2, guides = "keep")

Comparion of cyEPI and mEPI

Species specific

# Supplementary Table 2, Sheet 12
table_s2_sheet12 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM297_ESM.xlsx",
    sheet = "EDFig8b",
    skip = 1
)
## New names:
## * MF5ID -> MF5ID...2
## * MF5Symbol -> MF5Symbol...3
## * HsID -> HsID...4
## * HsSymbol -> HsSymbol...5
## * MsID -> MsID...6
## * ...
table_s2_sheet12 %>% head()

Use genes listed in Supplementary Table 2, Sheet 12.

features_heatmap <- c(
    table_s2_sheet12$MF5ID...22,
    table_s2_sheet12$MF5ID...2
)

features_heatmap <- features_heatmap[!is.na(features_heatmap)]
groups_selected <- c(
    "ICM", "Pre-EPI", "PostE-EPI", "PostL-EPI", "Gast1", "Gast2a", "Gast2b",
    "E4.5EPI", "E5.5EPI", "E6.5EPI-T hi", "E6.5EPI-T lo"
)

cells_heatmap <- embedding_cyEPI_mmEPI %>%
    filter(group %in% groups_selected) %>%
    pull(sample_name)
features_heatmap <- features_heatmap[
    features_heatmap %in% gene_homologs_filtered$ENTREZID_cy
]

matrix_heatmap <- matrix_readcount_cy_mm[
    match(features_heatmap, gene_homologs_filtered$ENTREZID_cy),
    cells_heatmap
]

matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
heatmap_limits <- quantile(matrix_heatmap, c(0.005, 0.995))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]


Number of genes used: 658.

cells_heatmap <- map(groups_selected, function(x) {
    cells_in_group <- embedding_cyEPI_mmEPI %>%
        filter(group == x) %>%
        pull(sample_name)

    cells_in_group[
        pvclust::pvclust(
            matrix_heatmap[, cells_in_group] %>% as.matrix(),
            nboot = 100,
            parallel = parallel::detectCores() - 2L
        )$hclust$order
    ]
}) %>%
    unlist()
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
matrix_heatmap <- matrix_heatmap[, cells_heatmap]
ha_columns <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                embedding_cyEPI_mmEPI$group %>%
                    # droplevels() %>%
                    nlevels()
            ),
            nm = levels(embedding_cyEPI_mmEPI$group)
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    Species = ComplexHeatmap::anno_simple(
        embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "species"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = as.character(
                yarrr::piratepal(palette = "google")
            )[seq_along(levels(embedding_cyEPI_mmEPI$species))],
            nm = levels(embedding_cyEPI_mmEPI$species)
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Species"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)
lgd_lineage <- ComplexHeatmap::Legend(
    title = "Lineage",
    labels = groups_selected, # embedding_cyEPI_mmEPI %>% pull(group) %>% droplevels() %>% levels(),
    legend_gp = grid::gpar(
        fill = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                length(unique(embedding_cyEPI_mmEPI$group))
            ),
            nm = embedding_cyEPI_mmEPI$group %>% levels()
        )[embedding_cyEPI_mmEPI$group %>%
            droplevels() %>%
            levels()][groups_selected]
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

lgd_species <- ComplexHeatmap::Legend(
    title = "Species",
    labels = levels(embedding_cyEPI_mmEPI$species),
    legend_gp = grid::gpar(
        fill = setNames(
            object = as.character(
                yarrr::piratepal(palette = "google")
            )[seq_along(levels(embedding_cyEPI_mmEPI$species))],
            nm = levels(embedding_cyEPI_mmEPI$species)
        )
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)
ht <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap %>% as.matrix(),
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    ## row_labels = str_remove(string = rownames(matrix_heatmap), pattern = "^E.+_"),
    # row_names_side = c("left"),
    # row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    ## right_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = expression(paste("Log"[10], " (RPM + 1)")),
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    # row_split = table_s2_sheet4$ClusterID,
    # row_gap = unit(0.3, "mm"),
    #
    # row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    # row_title_rot = 0,
    #
    column_split = factor(
        embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group", drop = TRUE] %>%
            # levels() %>%
            str_pad(
                width = max(nchar(levels(embedding_cyEPI_mmEPI$group))),
                side = "left"
            ),
        levels = embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group", drop = TRUE] %>%
            levels() %>%
            str_pad(
                width = max(nchar(levels(embedding_cyEPI_mmEPI$group))),
                side = "left"
            )
    ),
    column_gap = unit(0.3, "mm"),
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)
pd <- ComplexHeatmap::packLegend(
    lgd_lineage,
    lgd_species,
    # gap = unit(8, "mm"),
    direction = "vertical"
)


Extended Data Figure 8b

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Developmental stage

scaled_pc_loadings <- pca_out$rotation %>%
    scale() %>%
    as.data.frame() %>%
    rownames_to_column(var = "feature") %>%
    dplyr::select(feature, PC1, PC2, PC3) %>%
    mutate(
        # group = (abs(PC2) >= 2.5 | abs(PC3) >= 2.5) & abs(PC1 < 2),
        r23 = sqrt(PC2^2 + PC3^2) >= 2.75,
        r12 = sqrt(PC1^2 + PC2^2) < 3,
        r13 = sqrt(PC1^2 + PC3^2) < 3,
        # pc1 = abs(PC1 < 2),
        group = r23 & r12 & r13
    ) %>%
    arrange(group)

sum(scaled_pc_loadings$group)
## [1] 226
plot_pca_loading(
    data = scaled_pc_loadings,
    x = PC2,
    y = PC3,
    z1 = group,
    x_label = "Z score of PC2 loading",
    y_label = "Z score of PC3 loading"
) +
    coord_fixed()

features_heatmap <- rownames(
    matrix_readcount_cyEPI_mmEPI
)[rownames(matrix_readcount_cyEPI_mmEPI) %in%
    scaled_pc_loadings$feature[scaled_pc_loadings$group]]

matrix_heatmap <- matrix_readcount_cy_mm[
    rownames(matrix_readcount_cy_mm) %in% features_heatmap,
    cells_heatmap
]

matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
heatmap_limits <- quantile(matrix_heatmap, c(0.005, 0.995))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]


features_heatmap <- rownames(matrix_heatmap)[
    pvclust::pvclust(
        t(matrix_heatmap %>% as.matrix()),
        nboot = 100,
        parallel = parallel::detectCores() - 2L
    )$hclust$order
]
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
matrix_heatmap <- as.matrix(matrix_heatmap)[features_heatmap, ]
ht <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap,
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    ## row_labels = str_remove(string = rownames(matrix_heatmap), pattern = "^E.+_"),
    # row_names_side = c("left"),
    # row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    ## right_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = expression(paste("Log"[10], " (RPM + 1)")),
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    # row_split = table_s2_sheet4$ClusterID,
    # row_gap = unit(0.3, "mm"),
    #
    # row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    # row_title_rot = 0,
    #
    column_split = factor(
        embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group", drop = TRUE] %>%
            # levels() %>%
            str_pad(
                width = max(nchar(levels(embedding_cyEPI_mmEPI$group))),
                side = "left"
            ),
        levels = embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group", drop = TRUE] %>%
            levels() %>%
            str_pad(
                width = max(nchar(levels(embedding_cyEPI_mmEPI$group))),
                side = "left"
            )
    ),
    column_gap = unit(0.3, "mm"),
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)


Extended Data Figure 8c

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Correlation heatmap

# Supplementary Table 2, Sheet 5
table_s2_sheet5 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM297_ESM.xlsx",
    sheet = "cy_m common EPI genes",
    skip = 0
)

table_s2_sheet5 %>% head()
matrix_heatmap_corr <- map(groups_selected, function(x) {
    cells_in_group <- embedding_cyEPI_mmEPI %>%
        filter(group == x) %>%
        pull(sample_name)
    rowMeans(matrix_readcount_cy_mm[, cells_in_group])
}) %>%
    bind_cols() %>%
    as.matrix()
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## * NA -> ...5
## * ...
colnames(matrix_heatmap_corr) <- groups_selected
rownames(matrix_heatmap_corr) <- rownames(matrix_heatmap_corr)

matrix_heatmap_corr <- matrix_heatmap_corr[
    gene_homologs_filtered$ENTREZID_cy %in% table_s2_sheet5$macFas5_entrez_id,
]

head(matrix_heatmap_corr)
##              ICM   Pre-EPI  PostE-EPI  PostL-EPI       Gast1      Gast2a
## [1,]   0.3884385  3.066446  21.705624  56.140386  38.8517128  27.0977585
## [2,]  33.3950797  6.219453   3.198068   7.396065  11.6537856  66.8365023
## [3,]  47.4773044 45.381955 173.390106 319.768217 837.4882833 398.9513877
## [4,]  13.1562846 52.278984   3.787983   2.408792   0.8139348   0.9224277
## [5,] 217.7737733 53.153054   9.975591   8.359721   7.8830661   3.3246738
## [6,]   0.0000000  4.702147  18.555872  16.887636  13.6229839  11.6953331
##           Gast2b    E4.5EPI      E5.5EPI E6.5EPI-T hi E6.5EPI-T lo
## [1,]  17.2856300   1.456039  13.07441250   23.7086779    4.8825092
## [2,]  19.8709846   0.000000   0.13180738    8.2164376    8.2147970
## [3,] 319.1945685 165.748156 115.74261687  160.4587570  246.5546333
## [4,]   0.1858084   0.000000   0.02128681    0.0995788    1.0954902
## [5,]   6.3924846   5.338176   1.99523812    0.9123928    0.8186693
## [6,]  11.9185731  24.416767  15.68149437    3.3923033   18.0418240


Extended Data Figure 8d

corr_heatmap <- cor(log10(matrix_heatmap_corr + 1))
corr_heatmap[lower.tri(corr_heatmap)] <- NA

corr_heatmap %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_x") %>%
    pivot_longer(
        -c("sample_x"),
        names_to = "sample_y"
    ) %>%
    as.data.frame() %>%
    filter(!is.na(value)) %>%
    # round(value, digits = 2)
    mutate(
        sample_x = factor(
            sample_x,
            levels = groups_selected,
        ),
        sample_y = factor(
            sample_y,
            levels = groups_selected %>% rev(.)
        )
    ) %>%
    plot_corr_heatmap(
        x = sample_x,
        y = sample_y,
        z = value
    ) +
    scale_fill_gradientn(
        limits = c(-1, 1),
        colors = wesanderson::wes_palette(
            "Zissou1",
            21,
            type = "continuous"
        )
    )
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

cyEPI development vs hPSCs/cyPSCs

Load data in Ref. 36

matrix_readcount_PRJEB7132 <- read_delim(
    file = "../../PRJEB7132/matrix/xaa_Aligned.sortedByCoord.out_deduped_q10_gene_id_featureCounts.txt.gz",
    delim = "\t",
    col_names = TRUE,
    skip = 1
) %>%
    dplyr::select(-(2:6))
## Parsed with column specification:
## cols(
##   Geneid = col_character(),
##   Chr = col_character(),
##   Start = col_character(),
##   End = col_character(),
##   Strand = col_character(),
##   Length = col_double(),
##   `../aln/ERR590398_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590399_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590400_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590401_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590408_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590410_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double()
## )

colnames(matrix_readcount_PRJEB7132) <- colnames(matrix_readcount_PRJEB7132) %>%
    str_remove(
        pattern = "_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam"
    ) %>%
    str_remove(pattern = "../aln/")

cell_metadata_PRJEB7132 <- read_delim(
    file = "../../PRJEB7132/SraRunTable.txt",
    delim = ","
) %>%
    dplyr::select(-c("organism", "Sample Name")) %>%
    rename_all(. %>% tolower()) %>%
    `colnames<-`(str_replace(
        string = colnames(.),
        pattern = " ", replacement = "_"
    )) %>%
    filter(organism == "Homo sapiens") %>%
    dplyr::select(
        sample_name,
        run,
        biosample,
        alias,
        cell_line,
        organism,
        title
    ) %>%
    mutate(
        sample_description = str_remove(
            string = title,
            pattern = "_R\\d$"
        )
    )
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   AvgSpotLen = col_double(),
##   Bases = col_double(),
##   Bytes = col_double(),
##   `ENA-FIRST-PUBLIC (run)` = col_date(format = ""),
##   `ENA-LAST-UPDATE (run)` = col_date(format = ""),
##   INSDC_first_public = col_datetime(format = ""),
##   INSDC_last_update = col_datetime(format = ""),
##   ReleaseDate = col_datetime(format = "")
## )
## See spec(...) for full column specifications.

matrix_readcount_PRJEB7132 <- Matrix(
    data = as.matrix(matrix_readcount_PRJEB7132[, -1]),
    dimnames = list(
        matrix_readcount_PRJEB7132$Geneid,
        colnames(matrix_readcount_PRJEB7132)[-1]
    ),
    sparse = TRUE
)

colnames(matrix_readcount_PRJEB7132) <- colnames(matrix_readcount_PRJEB7132) %>%
    enframe(value = "run") %>%
    left_join(cell_metadata_PRJEB7132, by = c("run" = "run")) %>%
    pull("sample_name")

matrix_cpm_PRJEB7132 <- calc_cpm(
    matrix_readcount_PRJEB7132[
        ,
        c("ERS537888", "ERS537890", "ERS537878", "ERS537884", "ERS537881", "ERS537876")
    ]
)
# gene_homologs_filtered
# matrix_readcount_use[entrezgene_id_cy_matrix %in% table_s2_sheet4$macFas5_entrez_id]

matrix_readcount_use2 <- matrix_readcount_use[
    match(gene_homologs_filtered$ENTREZID_cy, entrezgene_id_cy_matrix),
]

matrix_cpm_PRJEB7132_ <- matrix_cpm_PRJEB7132[
    match(gene_homologs_filtered$ensembl_gene_id, rownames(matrix_cpm_PRJEB7132)),
]
groups_selected <- c(
    "ICM", "Pre-EPI", "PostE-EPI", "PostL-EPI",
    "Gast1", "Gast2a", "Gast2b",
    "cyESCFF", "cyESCoF"
)
matrix_heatmap_corr <- map(groups_selected, function(x) {
    cells_in_group <- embedding_cy %>%
        filter(group == x) %>%
        pull(cell)
    Matrix::rowMeans(matrix_readcount_use2[, cells_in_group])
}) %>%
    bind_cols() %>%
    as.matrix()
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## * NA -> ...5
## * ...
colnames(matrix_heatmap_corr) <- groups_selected
rownames(matrix_heatmap_corr) <- rownames(matrix_readcount_use2)

matrix_heatmap_corr <- cbind(
    matrix_heatmap_corr,
    H9 = Matrix::rowMeans(
        matrix_cpm_PRJEB7132_[, c("ERS537888", "ERS537890", "ERS537878")]
    ),
    H9_reset = Matrix::rowMeans(
        matrix_cpm_PRJEB7132_[, c("ERS537884", "ERS537881", "ERS537876")]
    )
)

# cleanup
rm(matrix_readcount_use2)
rm(matrix_cpm_PRJEB7132_)

matrix_heatmap_corr <- matrix_heatmap_corr[
    gene_homologs_filtered$ENTREZID_cy %in% table_s2_sheet4$macFas5_entrez_id,
]

matrix_heatmap_corr %>% head()
##                  ICM   Pre-EPI  PostE-EPI  PostL-EPI      Gast1    Gast2a
## SMIM1    151.1363033 93.115194   3.820487   7.118101   8.682972  10.24251
## FBXO2      1.4469895 13.184514 102.988591  54.911807  54.469006  38.52027
## MAD2L2    24.9603647 61.772933 279.058067 204.688162 114.454772 123.24133
## DRAXIN     0.6368063  1.544775  41.484816  57.921171  19.596015  16.83393
## FBLIM1     5.6753280 41.920067 109.251597  60.510816 107.413512 215.56966
## ARHGEF19   0.3884385  3.066446  21.705624  56.140386  38.851713  27.09776
##             Gast2b   cyESCFF    cyESCoF         H9  H9_reset
## SMIM1     5.740824  14.18091   7.773821   2.190192  2.617383
## FBXO2    16.046803  46.32920  70.165794  12.201141  0.702835
## MAD2L2   87.188329 169.93101 209.349509 131.947331 69.062950
## DRAXIN   52.722563  40.49054  31.154642  81.348664  7.799315
## FBLIM1   46.282181 154.37242 260.867845  67.786364 25.493542
## ARHGEF19 17.285630  56.76092  28.440670  12.810655  3.328117


Fig. 5c (only includes Ref. 36)

corr_heatmap <- cor(log10(matrix_heatmap_corr + 1))
corr_heatmap[lower.tri(corr_heatmap)] <- NA

corr_heatmap %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_x") %>%
    pivot_longer(
        -c("sample_x"),
        names_to = "sample_y"
    ) %>%
    as.data.frame() %>%
    filter(!is.na(value)) %>%
    # round(value, digits = 2)
    mutate(
        sample_x = factor(
            sample_x,
            levels = c(groups_selected, "H9", "H9_reset"),
        ),
        sample_y = factor(
            sample_y,
            levels = c(groups_selected, "H9", "H9_reset") %>% rev(.)
        )
    ) %>%
    plot_corr_heatmap(
        x = sample_x,
        y = sample_y,
        z = value
    ) +
    scale_fill_gradientn(
        # limits = c(-1, 1),
        colors = wesanderson::wes_palette(
            "Zissou1",
            21,
            type = "continuous"
        )
    )
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

corr_heatmap <- cor(log10(matrix_heatmap_corr + 1))
# colnames(corr_heatmap) == rownames(corr_heatmap)
corr_heatmap <- corr_heatmap[!rownames(corr_heatmap) %in% c("H9", "H9_reset"), ]

ha_columns <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        c(groups_selected, "H9", "H9_reset"),
        # pch = anno_labels_cluster,
        col = c(
            setNames(
                object = ggthemes::tableau_color_pal("Tableau 20")(
                    embedding_cy$group %>%
                        nlevels()
                ),
                nm = levels(embedding_cy$group)
            )[groups_selected],
            "H9" = "pink",
            "H9_reset" = "purple"
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    Species = ComplexHeatmap::anno_simple(
        c(
            rep("Cynomolgus Monkey", length(groups_selected)),
            rep("Human", 2)
        ),
        # pch = anno_labels_cluster,
        col = c("Cynomolgus Monkey" = "#3D79F3FF", "Human" = "#34A74BFF"),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Species"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)

ha_left <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        groups_selected,
        col = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                embedding_cy$group %>%
                    nlevels()
            ),
            nm = levels(embedding_cy$group)
        )[groups_selected],
        which = "row",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        simple_anno_size = unit(3, "mm")
    ),
    which = "row",
    show_annotation_name = FALSE,
    annotation_label = c(
        "Lineage"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6)
)

ht <- ComplexHeatmap::Heatmap(
    corr_heatmap,
    # name = "heatmap",
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    border = NA,
    rect_gp = grid::gpar(col = "white", lwd = 1),
    cell_fun = function(j, i, x, y, width, height, fill) {
        # grid::grid.text(sprintf("%.2f", corr_heatmap[i, j]), x, y, gp = grid::gpar(fontsize = 6))
        grid::grid.text(round(corr_heatmap[i, j], 2), x, y, gp = grid::gpar(fontsize = 6))
    },
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    ## row_labels = str_remove(string = rownames(matrix_heatmap), pattern = "^E.+_"),
    row_names_side = c("left"),
    row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = TRUE,
    #
    column_names_side = c("top"),
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_names_rot = -90,
    show_column_names = TRUE,
    #
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    top_annotation = ha_columns,
    left_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = "Corr",
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    column_split = c(
        rep("Cynomolgus Monkey", length(groups_selected)),
        rep("Human", 2)
    ),
    column_gap = unit(2, "mm"),
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    column_title = NULL
)

lgd_lineage <- ComplexHeatmap::Legend(
    title = "Lineage",
    labels = groups_selected, # embedding_cyEPI_mmEPI %>% pull(group) %>% droplevels() %>% levels(),
    legend_gp = grid::gpar(
        fill = c(
            setNames(
                object = ggthemes::tableau_color_pal("Tableau 20")(
                    embedding_cy$group %>%
                        nlevels()
                ),
                nm = levels(embedding_cy$group)
            )[groups_selected],
            "H9" = "pink",
            "H9_reset" = "purple"
        )
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

lgd_species <- ComplexHeatmap::Legend(
    title = "Species",
    labels = c("Cynomolgus Monkey", "Human"),
    legend_gp = grid::gpar(
        fill = c("Cynomolgus Monkey" = "#3D79F3FF", "Human" = "#34A74BFF")
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

pd <- ComplexHeatmap::packLegend(
    # lgd_lineage,
    lgd_species,
    # gap = unit(8, "mm"),
    direction = "vertical"
)


Fig. 5c (only includes Ref. 36)

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

R session info

devtools::session_info()$platform
##  setting  value                       
##  version  R version 4.0.2 (2020-06-22)
##  os       macOS Catalina 10.15.6      
##  system   x86_64, darwin19.5.0        
##  ui       unknown                     
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2020-08-18
devtools::session_info()$pack %>%
    as_tibble() %>%
    dplyr::select(
        package,
        loadedversion,
        date,
        `source`
    ) %>%
    # print(n = nrow(.))
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")
package loadedversion date source
AnnotationDbi 1.50.3 2020-07-25 Bioconductor
askpass 1.1 2019-01-13 CRAN (R 4.0.0)
assertthat 0.2.1 2019-03-21 CRAN (R 4.0.0)
backports 1.1.8 2020-06-17 CRAN (R 4.0.1)
BayesFactor 0.9.12-4.2 2018-05-19 CRAN (R 4.0.2)
Biobase 2.48.0 2020-04-27 Bioconductor
BiocFileCache 1.12.1 2020-08-04 Bioconductor
BiocGenerics 0.34.0 2020-04-27 Bioconductor
biomaRt 2.44.1 2020-06-17 Bioconductor
bit 4.0.4 2020-08-04 CRAN (R 4.0.2)
bit64 4.0.2 2020-07-30 CRAN (R 4.0.2)
blob 1.2.1 2020-01-20 CRAN (R 4.0.0)
broom 0.7.0.9001 2020-08-17 Github (tidymodels/broom@591f87e)
callr 3.4.3.9000 2020-07-31 Github (r-lib/callr@b96da8f)
cellranger 1.1.0 2016-07-27 CRAN (R 4.0.0)
checkmate 2.0.0 2020-02-06 CRAN (R 4.0.0)
circlize 0.4.10 2020-06-15 CRAN (R 4.0.1)
cli 2.0.2 2020-02-28 CRAN (R 4.0.0)
clue 0.3-57 2019-02-25 CRAN (R 4.0.0)
cluster 2.1.0 2019-06-19 CRAN (R 4.0.2)
coda 0.19-3 2019-07-05 CRAN (R 4.0.0)
codetools 0.2-16 2018-12-24 CRAN (R 4.0.2)
colorspace 1.4-1 2019-03-18 CRAN (R 4.0.0)
ComplexHeatmap 2.4.3 2020-07-25 Bioconductor
crayon 1.3.4 2017-09-16 CRAN (R 4.0.0)
curl 4.3 2019-12-02 CRAN (R 4.0.0)
DBI 1.1.0 2019-12-15 CRAN (R 4.0.0)
dbplyr 1.4.4.9000 2020-07-28 Github (tidyverse/dbplyr@a6ed629)
desc 1.2.0 2018-05-01 CRAN (R 4.0.0)
devtools 2.3.1.9000 2020-07-31 Github (r-lib/devtools@df619ce)
digest 0.6.25 2020-02-23 CRAN (R 4.0.0)
dplyr 1.0.2 2020-08-12 Github (tidyverse/dplyr@0bea3e8)
ellipsis 0.3.1.9000 2020-07-18 Github (r-lib/ellipsis@57a5071)
evaluate 0.14 2019-05-28 CRAN (R 4.0.0)
extrafont 0.17 2014-12-08 CRAN (R 4.0.2)
extrafontdb 1.0 2012-06-11 CRAN (R 4.0.0)
fansi 0.4.1 2020-01-08 CRAN (R 4.0.0)
farver 2.0.3 2020-01-16 CRAN (R 4.0.0)
forcats 0.5.0.9000 2020-05-28 Github (tidyverse/forcats@ab81d1b)
fs 1.5.0.9000 2020-08-03 Github (r-lib/fs@93e70a9)
generics 0.0.2 2018-11-29 CRAN (R 4.0.0)
GetoptLong 1.0.2 2020-07-06 CRAN (R 4.0.2)
ggplot2 3.3.2.9000 2020-08-14 Github (tidyverse/ggplot2@3be0acc)
ggrepel 0.9.0 2020-07-24 Github (slowkow/ggrepel@4d0ef50)
ggthemes 4.2.0 2019-05-13 CRAN (R 4.0.0)
GlobalOptions 0.1.2 2020-06-10 CRAN (R 4.0.0)
glue 1.4.1.9000 2020-07-07 Github (tidyverse/glue@205f18b)
gt 0.2.2 2020-08-06 Github (rstudio/gt@c97cb4c)
gtable 0.3.0 2019-03-25 CRAN (R 4.0.0)
gtools 3.8.2 2020-03-31 CRAN (R 4.0.0)
haven 2.3.1 2020-06-01 CRAN (R 4.0.0)
hms 0.5.3 2020-01-08 CRAN (R 4.0.0)
htmltools 0.5.0 2020-06-16 CRAN (R 4.0.1)
httr 1.4.2 2020-07-20 CRAN (R 4.0.2)
IRanges 2.22.2 2020-05-21 Bioconductor
jpeg 0.1-8.1 2019-10-24 CRAN (R 4.0.0)
jsonlite 1.7.0 2020-06-25 CRAN (R 4.0.2)
knitr 1.29 2020-06-23 CRAN (R 4.0.2)
labeling 0.3 2014-08-23 CRAN (R 4.0.0)
lattice 0.20-41 2020-04-02 CRAN (R 4.0.2)
lifecycle 0.2.0 2020-03-06 CRAN (R 4.0.0)
lubridate 1.7.9 2020-07-11 Github (tidyverse/lubridate@de2ee09)
magrittr 1.5.0.9000 2020-08-04 Github (tidyverse/magrittr@ba52096)
Matrix 1.2-18 2019-11-27 CRAN (R 4.0.2)
MatrixModels 0.4-1 2015-08-22 CRAN (R 4.0.0)
memoise 1.1.0 2017-04-21 CRAN (R 4.0.0)
modelr 0.1.8.9000 2020-05-19 Github (tidyverse/modelr@16168e0)
munsell 0.5.0 2018-06-12 CRAN (R 4.0.0)
mvtnorm 1.1-1 2020-06-09 CRAN (R 4.0.2)
openssl 1.4.2 2020-06-27 CRAN (R 4.0.2)
org.Hs.eg.db 3.11.4 2020-05-27 Bioconductor
org.Mm.eg.db 3.11.4 2020-06-07 Bioconductor
patchwork 1.0.1.9000 2020-06-22 Github (thomasp85/patchwork@82a5e03)
pbapply 1.4-2 2019-08-31 CRAN (R 4.0.0)
pillar 1.4.6.9000 2020-07-21 Github (r-lib/pillar@8aef8f2)
pkgbuild 1.1.0.9000 2020-07-14 Github (r-lib/pkgbuild@3a87bd9)
pkgconfig 2.0.3 2019-09-22 CRAN (R 4.0.0)
pkgload 1.1.0 2020-05-29 CRAN (R 4.0.0)
png 0.1-7 2013-12-03 CRAN (R 4.0.0)
prettyunits 1.1.1.9000 2020-08-10 Github (r-lib/prettyunits@b1cdad8)
processx 3.4.3 2020-07-05 CRAN (R 4.0.2)
progress 1.2.2 2019-05-16 CRAN (R 4.0.0)
ps 1.3.4 2020-08-11 CRAN (R 4.0.2)
purrr 0.3.4.9000 2020-08-10 Github (tidyverse/purrr@5de5ad2)
pvclust 2.2-0 2020-04-25 Github (shimo-lab/pvclust@e734d05)
R6 2.4.1.9000 2020-07-18 Github (r-lib/R6@1415d11)
rappdirs 0.3.1 2016-03-28 CRAN (R 4.0.0)
RColorBrewer 1.1-2 2014-12-07 CRAN (R 4.0.0)
Rcpp 1.0.5 2020-07-06 CRAN (R 4.0.2)
readr 1.3.1.9000 2020-07-16 Github (tidyverse/readr@2ab51b2)
readxl 1.3.1.9000 2020-05-28 Github (tidyverse/readxl@3815961)
remotes 2.2.0.9000 2020-08-06 Github (r-lib/remotes@5a546ad)
reprex 0.3.0 2019-05-16 CRAN (R 4.0.0)
reticulate 1.16 2020-05-27 CRAN (R 4.0.2)
rjson 0.2.20 2018-06-08 CRAN (R 4.0.0)
rlang 0.4.7.9000 2020-08-12 Github (r-lib/rlang@de0c176)
rmarkdown 2.3.3 2020-07-25 Github (rstudio/rmarkdown@204aa41)
rprojroot 1.3-2 2018-01-03 CRAN (R 4.0.0)
RSQLite 2.2.0 2020-01-07 CRAN (R 4.0.0)
rstudioapi 0.11 2020-02-07 CRAN (R 4.0.2)
Rttf2pt1 1.3.8 2020-01-10 CRAN (R 4.0.0)
rvest 0.3.6 2020-07-25 CRAN (R 4.0.2)
S4Vectors 0.26.1 2020-05-16 Bioconductor
sass 0.2.0 2020-03-18 CRAN (R 4.0.2)
scales 1.1.1.9000 2020-07-24 Github (r-lib/scales@9ff4757)
sessioninfo 1.1.1.9000 2020-07-18 Github (r-lib/sessioninfo@791705b)
shape 1.4.4 2018-02-07 CRAN (R 4.0.0)
stringi 1.4.6 2020-02-17 CRAN (R 4.0.0)
stringr 1.4.0.9000 2020-06-01 Github (tidyverse/stringr@f70c4ba)
styler 1.3.2.9000 2020-08-13 Github (r-lib/styler@7376cf4)
testthat 2.99.0.9000 2020-08-12 Github (r-lib/testthat@9e643d8)
tibble 3.0.3.9000 2020-07-21 Github (tidyverse/tibble@b4eec19)
tidyr 1.1.1.9000 2020-08-18 Github (tidyverse/tidyr@4d0dd69)
tidyselect 1.1.0.9000 2020-07-11 Github (tidyverse/tidyselect@69fdc96)
tidyverse 1.3.0.9000 2020-06-01 Github (hadley/tidyverse@8a0bb99)
usethis 1.6.1.9001 2020-08-17 Github (r-lib/usethis@860c1ea)
vctrs 0.3.2.9000 2020-08-18 Github (r-lib/vctrs@c0dcd3f)
viridisLite 0.3.0 2018-02-01 CRAN (R 4.0.0)
wesanderson 0.3.6.9000 2020-06-05 Github (karthik/wesanderson@d90700a)
withr 2.2.0 2020-04-20 CRAN (R 4.0.0)
xfun 0.16 2020-07-24 CRAN (R 4.0.2)
XML 3.99-0.5 2020-07-23 CRAN (R 4.0.2)
xml2 1.3.2 2020-04-23 CRAN (R 4.0.0)
yaml 2.2.1 2020-02-01 CRAN (R 4.0.0)
yarrr 0.1.6 2020-07-23 Github (ndphillips/yarrr@e2e4488)