Nakamura, T., Okamoto, I., Sasaki, K., Yabuta, Y., Iwatani, C., Tsuchiya, H., Seita, Y., Nakamura, S., Yamamoto, T., and Saitou, M. (2016). A developmental coordinate of pluripotency among mice, monkeys and humans. Nature 537, 57–62.

BioProject Accession: PRJNA301445
GEO Accession: GSE74767

Load required packages.

library(tidyverse)
library(magrittr)
library(Matrix)
library(extrafont)
library(ggrepel)
library(patchwork)
# library(tidylog)

Sys.time()

## [1] "2020-08-18 15:08:05 CDT"

Data preparation

SEED_USE <- 20200616
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE <- 60

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)

Data loading

Prepare metadata for single cells.

# https://www.ncbi.nlm.nih.gov/Traces/study/?query_key=1&WebEnv=MCID_5f3b627244419748985be4f3
cell_metadata <- read_delim(
    file = "../SraRunTable.txt",
    delim = ","
) %>%
    rename_all(. %>% tolower()) %>%
    `colnames<-`(str_replace(
        string = colnames(.),
        pattern = " ", replacement = "_"
    )) %>%
    dplyr::select(
        sample_name,
        # run,
        biosample,
        source_name,
        cell_line,
        organism
    ) %>%
    unique()
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   AvgSpotLen = col_double(),
##   Bases = col_double(),
##   Bytes = col_double(),
##   ReleaseDate = col_datetime(format = ""),
##   Strain = col_logical(),
##   Cell_type = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 558 parsing failures.
##  row    col           expected  actual                 file
## 2527 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## 2528 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## 2529 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## 2530 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## 2531 Strain 1/0/T/F/TRUE/FALSE C57BL/6 '../SraRunTable.txt'
## .... ...... .................. ....... ....................
## See problems(...) for more details.

cell_metadata

cell_metadata %>%
    dplyr::count(organism) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

organism	num_cells
Macaca fascicularis	421
Mus musculus	71

Load SC3-seq sample information.

# Supplementary Table 1, Sheet 3
table_s1_sheet3 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM296_ESM.xlsx",
    sheet = "SC3-seq_SampleTable",
    skip = 1
)

table_s1_sheet3

cell_metadata <- cell_metadata %>%
    dplyr::left_join(
        table_s1_sheet3,
        by = c("sample_name" = "GSM_ID")
    ) %>%
    rename_all(. %>% tolower()) %>%
    dplyr::rename(
        developmental_stage = `embryonic day/ culture condition`
    ) %>%
    dplyr::select(
        sample_name:developmental_stage
    ) %>%
    mutate(
        developmental_stage = factor(
            developmental_stage,
            levels = c(
                "E5.5",
                "E6.5",
                "E06",
                "E07",
                "E08",
                "E09",
                "E13",
                "E14",
                "E16",
                "E17",
                "2iLESC",
                "EpiLC",
                "CMK6_FeederFree",
                "CMK6_onFeeder",
                "CMK9_onFeeder"
            )
        ),
        group = factor(
            group,
            levels = c(
                "ICM",
                "Pre-EPI",
                "Hypoblast",
                "PreE-TE",
                "PreL-TE",
                "PostE-EPI",
                "PostL-EPI",
                "Gast1",
                "Gast2a",
                "Gast2b",
                "Post-paTE",
                "VE/YE",
                "EXMC",
                "cyESCFF",
                "cyESCoF",
                "E5.5EPI",
                "E5.5VE",
                "E6.5EPI-T hi",
                "E6.5EPI-T lo",
                "E6.5VE",
                "EpiLC",
                "2i+L ESC"
            )
        ),
        source_name = factor(
            source_name,
            levels = c(
                "Cynomolgus monkey embryo E06",
                "Cynomolgus monkey embryo E07",
                "Cynomolgus monkey embryo E08",
                "Cynomolgus monkey embryo E09",
                "Cynomolgus monkey embryo E13",
                "Cynomolgus monkey embryo E14",
                "Cynomolgus monkey embryo E16",
                "Cynomolgus monkey embryo E17",
                "cyESC\\, feeder free",
                "cyESC\\, on feeder",
                "mouse embryo E5.5",
                "mouse embryo E6.5",
                "2i/LIF ESC",
                "EpiLC"
            )
        )
    )

cell_metadata

embedding_cy <- read_csv(
    file = "../preprocessed/cy/exploring/embedding_ncomponents44_seed20200317.csv"
) %>%
    dplyr::left_join(
        cell_metadata,
        by = c("cell" = "sampleid")
    ) %>%
    droplevels()

## Parsed with column specification:
## cols(
##   cell = col_character(),
##   batch = col_character(),
##   louvain = col_double(),
##   x_tsne = col_double(),
##   y_tsne = col_double(),
##   x_umap = col_double(),
##   y_umap = col_double(),
##   x_fitsne = col_double(),
##   y_fitsne = col_double(),
##   x_phate = col_double(),
##   y_phate = col_double(),
##   `x_min_dist=0.1` = col_double(),
##   `y_min_dist=0.1` = col_double(),
##   x_multicoretsne = col_double(),
##   y_multicoretsne = col_double()
## )

reticulate::py_discover_config()

## python:         /Users/jialei/.pyenv/shims/python
## libpython:      /Users/jialei/.pyenv/versions/3.8.2/lib/libpython3.8.dylib
## pythonhome:     /Users/jialei/.pyenv/versions/3.8.2:/Users/jialei/.pyenv/versions/3.8.2
## version:        3.8.2 (default, May 23 2020, 03:35:41)  [Clang 11.0.3 (clang-1103.0.32.62)]
## numpy:          /Users/jialei/.pyenv/versions/3.8.2/lib/python3.8/site-packages/numpy
## numpy_version:  1.19.0
## 
## NOTE: Python version was forced by RETICULATE_PYTHON

np <- reticulate::import("numpy", convert = TRUE)
scipy.sparse <- reticulate::import(module = "scipy.sparse", convert = TRUE)

matrix_readcount_use <- scipy.sparse$load_npz("../preprocessed/cy/matrix_readcount.npz")
matrix_readcount_use_features <- np$load("../preprocessed/cy/matrix_readcount_features.npy")
matrix_readcount_use_barcodes <- np$load("../preprocessed/cy/matrix_readcount_barcodes.npy")
colnames(matrix_readcount_use) <- matrix_readcount_use_barcodes
rownames(matrix_readcount_use) <- matrix_readcount_use_features
matrix_readcount_use <- matrix_readcount_use[, embedding_cy$cell]

# calculate CPM
# matrix_cpm_use <- calc_cpm(matrix_readcount_use)

# stopifnot(dim(matrix_readcount_use) == dim(matrix_cpm_use))
print(dim(matrix_readcount_use))

## [1] 29095   421

walk(list(embedding_cy, matrix_readcount_use), function(x) {
    print(object.size(x), units = "auto", standard = "SI")
})

## 196.1 kB
## 57.4 MB

Lineage delineation by transcriptome

Transcriptome clustering

embedding_cy %>%
    dplyr::count(organism) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

organism	num_cells
Macaca fascicularis	421

embedding_cy %>%
    dplyr::mutate(
        num_genes = Matrix::colSums(matrix_readcount_use > 0)
    ) %>%
    dplyr::group_by(
        louvain
    ) %>%
    summarise(
        median_genes = median(num_genes),
        num_cells = n()
    ) %>%
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")

## `summarise()` ungrouping output (override with `.groups` argument)

louvain	median_genes	num_cells
0	10679.0	82
1	10880.5	64
2	10997.0	62
3	10898.0	55
4	11073.0	53
5	10649.0	39
6	11590.5	34
7	11974.5	32

embedding_cy %>%
    dplyr::count(source_name) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

source_name	num_cells
Cynomolgus monkey embryo E06	43
Cynomolgus monkey embryo E07	35
Cynomolgus monkey embryo E08	95
Cynomolgus monkey embryo E09	20
Cynomolgus monkey embryo E13	46
Cynomolgus monkey embryo E14	62
Cynomolgus monkey embryo E16	36
Cynomolgus monkey embryo E17	53
cyESC\, feeder free	9
cyESC\, on feeder	22

embedding_cy %>%
    dplyr::count(group) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

group	num_cells
ICM	30
Pre-EPI	34
Hypoblast	54
PreE-TE	23
PreL-TE	52
PostE-EPI	55
PostL-EPI	50
Gast1	18
Gast2a	13
Gast2b	13
Post-paTE	11
VE/YE	5
EXMC	32
cyESCFF	9
cyESCoF	22

embedding_cy %>%
    dplyr::count(developmental_stage) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

developmental_stage	num_cells
E06	43
E07	35
E08	95
E09	20
E13	46
E14	62
E16	36
E17	53
CMK6_FeederFree	9
CMK6_onFeeder	14
CMK9_onFeeder	8

get_middle_points <- function(embedding, x, y, group) {
    embedding_middle <- embedding %>%
        dplyr::group_by({{ group }}) %>%
        dplyr::summarise(
            x_median = median({{ x }}),
            y_median = median({{ y }})
        )

    labels_group <- purrr::map(levels(embedding %>% dplyr::pull({{ group }})), function(i) {
        embedding %>%
            dplyr::filter(
                {{ group }} == i
            ) %>%
            dplyr::left_join(embedding_middle) %>%
            dplyr::mutate(
                distance = sqrt(
                    ({{ x }} - x_median)^2 + ({{ y }} - y_median)^2
                )
            ) %>%
            dplyr::filter(distance == min(distance))
    }) %>%
        dplyr::bind_rows() %>%
        dplyr::select({{ group }}, {{ x }}, {{ y }})

    return(labels_group)
}

p_embedding_cluster <- plot_embedding(
    embedding = embedding_cy[, c("x_tsne", "y_tsne")],
    color_values = embedding_cy$louvain %>% as.factor(),
    label = "t-SNE; Cluster",
    # label_position = c(label_x, label_y),
    show_color_value_labels = TRUE,
    show_color_legend = FALSE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding_cy$louvain))
        )
    )

p_embedding_lineage <- plot_embedding(
    embedding = embedding_cy[, c("x_tsne", "y_tsne")],
    color_values = embedding_cy$group,
    label = "t-SNE; Lineage",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = FALSE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding_cy$group))
        )
    ) +
    labs(color = NULL) +
    ggrepel::geom_text_repel(
        data = get_middle_points(embedding_cy, x_tsne, y_tsne, group),
        ggplot2::aes(
            x = x_tsne,
            y = y_tsne,
            label = group
        ),
        color = "black",
        size = 2,
        family = "Arial",
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.inflect = TRUE,
        #
        box.padding = 0.5,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        seed = SEED_USE
    )

p_embedding_developmental_stage <- plot_embedding(
    embedding = embedding_cy[, c("x_tsne", "y_tsne")],
    color_values = embedding_cy$developmental_stage,
    label = "t-SNE; Developmental stage",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = gg_color_hue(n = embedding_cy$developmental_stage %>%
            droplevels() %>%
            nlevels())
    ) +
    labs(color = NULL) +
    guides(colour = guide_legend(
        override.aes = list(size = 3),
        ncol = 2
    )) +
    customized_theme(x = 0.035, y = 0.42)

p_embedding_cell_line <- plot_embedding(
    embedding = embedding_cy[, c("x_tsne", "y_tsne")],
    color_values = embedding_cy$cell_line %>% replace_na("N/A"),
    label = "t-SNE; Cell line",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = c("green", "red", "grey70")
    ) +
    labs(color = NULL) +
    guides(colour = guide_legend(override.aes = list(size = 3))) +
    customized_theme(x = 0.035, y = 0.25)

Fig. 5a

purrr::reduce(list(
    p_embedding_cluster,
    p_embedding_lineage,
    p_embedding_developmental_stage,
    p_embedding_cell_line
), `+`) +
    plot_layout(ncol = 2) +
    plot_annotation(
        theme = theme(plot.margin = margin())
    )

Heatmap of selected marker genes

FEATURES_SELECTED <- c(
    "GAPDH",
    # "PPIA",
    "LOC101925040",
    "POU5F1",
    "NANOG",
    "SOX2",
    "PRDM14",
    "DPPA3",
    "GATA6",
    "PDGFRA",
    "GATA4",
    "TFAP2C",
    # "GATA2",
    "LOC101865311",
    "CDX2",
    "T",
    "MIXL1"
)

heatmap_labels <- c(
    "GAPDH",
    # "PPIA",
    "LOC101925040/PPIA",
    "POU5F1",
    "NANOG",
    "SOX2",
    "PRDM14",
    "DPPA3",
    "GATA6",
    "PDGFRA",
    "GATA4",
    "TFAP2C",
    # "GATA2",
    "LOC101865311/GATA2",
    "CDX2",
    "T",
    "MIXL1"
)

cells_heatmap <- embedding_cy %>%
    filter(str_detect(string = developmental_stage, pattern = "^E")) %>%
    pull(cell)

Scaled version

matrix_heatmap <- matrix_readcount_use[FEATURES_SELECTED, cells_heatmap]
matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
matrix_heatmap <- t(scale(t(matrix_heatmap)))
heatmap_limits <- quantile(matrix_heatmap, c(0.05, 0.95))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]

groups_selected <- c(
    "Post-paTE",
    "PreL-TE",
    "Hypoblast",
    "PreE-TE",
    "ICM",
    "Pre-EPI",
    "PostE-EPI",
    "PostL-EPI",
    "Gast1",
    "Gast2a",
    "Gast2b",
    "VE/YE",
    "EXMC"
)

cells_heatmap <- map(groups_selected, function(x) {
    cells_in_group <- embedding_cy %>%
        filter(group == x, cell %in% cells_heatmap) %>%
        pull(cell)
    cells_in_group[
        pvclust::pvclust(
            matrix_heatmap[, cells_in_group],
            nboot = 100,
            parallel = parallel::detectCores() - 2L
        )$hclust$order
    ]
}) %>%
    unlist()

## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.

matrix_heatmap <- matrix_heatmap[FEATURES_SELECTED, cells_heatmap]

ha_columns <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        embedding_cy[match(
            x = colnames(matrix_heatmap),
            table = embedding_cy$cell
        ), "group"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                embedding_cy$group %>%
                    # droplevels() %>%
                    nlevels()
            ),
            nm = levels(embedding_cy$group)
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    developmental_stage = ComplexHeatmap::anno_simple(
        embedding_cy[match(
            x = colnames(matrix_heatmap),
            table = embedding_cy$cell
        ), "developmental_stage"],
        # pch = anno_labels_lineage,
        col = setNames(
            object = gg_color_hue(n = embedding_cy$developmental_stage %>%
                # droplevels() %>%
                nlevels()),
            nm = embedding_cy$developmental_stage %>%
                # droplevels() %>%
                levels()
        ),
        which = "column",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Developmental stage"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)

lgd_lineage <- ComplexHeatmap::Legend(
    title = "Lineage",
    labels = levels(
        embedding_cy$group
    )[
        seq_len(nlevels(embedding_cy$group) - 2)
    ],
    legend_gp = grid::gpar(
        fill = ggthemes::tableau_color_pal("Tableau 20")(
            nlevels(embedding_cy$group) - 2
        )
    ),
    labels_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6)
)

lgd_developmental_stage <- ComplexHeatmap::Legend(
    title = "Develpmental stage",
    labels = levels(
        embedding_cy$developmental_stage
    )[
        seq_len(nlevels(embedding_cy$developmental_stage) - 3)
    ],
    legend_gp = grid::gpar(
        fill = gg_color_hue(
            n = embedding_cy$developmental_stage %>%
                # droplevels() %>%
                nlevels()
        )[seq_len(nlevels(embedding_cy$developmental_stage) - 3)]
    ),
    labels_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6)
)

ht <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap,
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    row_labels = heatmap_labels,
    row_names_side = c("left"),
    row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = TRUE,
    show_column_names = FALSE,
    #
    # column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = "Z score",
        title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
        legend_direction = "vertical",
        labels_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    column_split = embedding_cy[match(
        x = colnames(matrix_heatmap),
        table = embedding_cy$cell
    ), "group", drop = TRUE] %>%
        factor(levels = groups_selected),
    column_gap = unit(0.3, "mm"),
    #
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)

pd <- ComplexHeatmap::packLegend(
    lgd_lineage,
    lgd_developmental_stage,
    # gap = unit(8, "mm"),
    direction = "vertical"
)

Fig. 3

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Non scaled version

matrix_heatmap <- matrix_readcount_use[FEATURES_SELECTED, cells_heatmap]
matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)

ht <- ComplexHeatmap::Heatmap(
    matrix = as.matrix(matrix_heatmap),
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    row_labels = heatmap_labels,
    row_names_side = c("left"),
    row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = TRUE,
    show_column_names = FALSE,
    #
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = expression(paste("Log"[10], " (RPM + 1)")), # "Log10(RPM + 1)",
        title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
        legend_direction = "vertical",
        labels_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    column_split = embedding_cy[match(
        x = colnames(matrix_heatmap),
        table = embedding_cy$cell
    ), "group", drop = TRUE] %>%
        factor(levels = groups_selected),
    column_gap = unit(0.3, "mm"),
    #
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)

Fig. 3

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Progressive transitions of cyEPI

cells_cyEPI <- embedding_cy %>%
    filter(group %in% c(
        "ICM",
        "Pre-EPI",
        "PostE-EPI",
        "PostL-EPI",
        "Gast1",
        "Gast2a",
        "Gast2b"
    )) %>%
    pull(cell) %>%
    sort()

Dimensionality reduction

Preprocessing

matrix_readcount_cyEPI <- matrix_readcount_use[, cells_cyEPI]
matrix_readcount_cyEPI <- matrix_readcount_cyEPI[
    Matrix::rowSums(
        matrix_readcount_cyEPI
    ) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]

dim(matrix_readcount_cyEPI)
## [1] 16831   213

PCA

matrix_readcount_cyEPI_log <- matrix_readcount_cyEPI
matrix_readcount_cyEPI_log@x <- log1p(matrix_readcount_cyEPI@x)

# z-score
matrix_readcount_cyEPI_log_scaled <- t(
    scale(
        t(
            matrix_readcount_cyEPI_log
        ),
        center = TRUE, scale = TRUE
    )
)
# features_var <- apply(matrix_readcount_norm_log_scaled, 1, var)
# features_use <- rownames(matrix_readcount_norm_log_scaled)[features_var > 0]

pca_out <- prcomp(
    t(matrix_readcount_cyEPI_log_scaled),
    center = FALSE,
    scale = FALSE
)

summary(pca_out)$imp %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column(var = "component") %>%
    mutate(
        rank = 1:n()
    ) %>%
    dplyr::slice(1:30) %>%
    ggplot(
        aes(
            x = rank,
            y = `Proportion of Variance`
        )
    ) +
    geom_point(size = 0.3) +
    theme_bw() +
    scale_x_continuous(
        name = "Component",
        breaks = c(1, seq(5, 30, 5))
    ) +
    scale_y_continuous(
        name = "Proportion of variance",
        labels = scales::percent
    ) +
    theme(
        axis.title = ggplot2::element_text(family = "Arial", size = 6),
        axis.text = ggplot2::element_text(family = "Arial", size = 6),
        panel.grid.minor = ggplot2::element_blank()
    )

embedding_cyEPI <- pca_out$x %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_name") %>%
    dplyr::select(sample_name:PC3) %>%
    left_join(
        embedding_cy %>%
            select(
                cell, batch, sample_name:developmental_stage
            ),
        by = c("sample_name" = "cell")
    ) %>%
    dplyr::rename(
        x_pca = PC1,
        y_pca = PC2,
        z_pca = PC3
    ) %>%
    relocate(x_pca, y_pca, z_pca, .after = last_col()) # %>% droplevels()

p_embedding_pca1 <- embedding_cyEPI %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = y_pca,
        y = x_pca,
        fill = group
    ) +
    scale_fill_manual(
        values = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                length(unique(embedding_cy$group))
            ),
            nm = embedding_cy$group %>% levels()
        )[embedding_cyEPI$group %>%
            droplevels() %>%
            levels()]
    ) +
    add_xy_label_pca(x = "PC2", y = "PC1") +
    ggrepel::geom_text_repel(
        data = get_middle_points(embedding_cyEPI, x_pca, y_pca, group),
        ggplot2::aes(
            x = y_pca,
            y = x_pca,
            label = group
        ),
        color = "red",
        size = 2,
        family = "Arial",
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.inflect = TRUE,
        #
        box.padding = 0.5,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        seed = SEED_USE
    )

p_embedding_pca2 <- embedding_cyEPI %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = z_pca,
        y = x_pca,
        fill = group
    ) +
    scale_fill_manual(
        values = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                length(unique(embedding_cy$group))
            ),
            nm = embedding_cy$group %>% levels()
        )[embedding_cyEPI$group %>%
            droplevels() %>%
            levels()]
    ) +
    add_xy_label_pca(x = "PC3", y = "PC1")

Fig. 4a

p_embedding_pca1 +
    p_embedding_pca2 +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(guides = "collect")

# Supplementary Table 2, Sheet 4
table_s2_sheet4 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM297_ESM.xlsx",
    sheet = "cyEPI ontogenic genes",
    skip = 0
)

## New names:
## * `` -> ...11
## * `` -> ...12

table_s2_sheet4

Fig. 4b

FEATURES_SELECTED <- c(
    "SFRP1",
    "FST",
    "HMGA2",
    "VIM",
    "T",
    "BMP4",
    "EOMES",
    "MIXL1",
    "HAND1",
    "PDGFRA",
    "GSC",
    "TBX3",
    "GATA6",
    "ESRRB",
    "KLF5",
    "TFCP2L1",
    "KHDC1L",
    "KLF4",
    "NLRP7",
    "OOEP",
    "DPPA3",
    "DNMT3L",
    "DPPA5",
    "DPPA2",
    "NANOG",
    "PRDM14",
    "SOX2",
    "SALL2",
    "SFRP2",
    "FGF2",
    "SOX11"
)

scaled_pc_loadings <- pca_out$rotation %>%
    scale() %>%
    as.data.frame() %>%
    rownames_to_column(var = "feature") %>%
    dplyr::select(feature, PC1, PC2) %>%
    mutate(
        group = feature %in% table_s2_sheet4$macFas5_gene_symbol,
        group2 = feature %in% FEATURES_SELECTED
    ) %>%
    arrange(group)

plot_pca_loading(
    data = scaled_pc_loadings,
    x = PC2,
    y = PC1,
    z1 = group,
    z2 = group2,
    x_label = "Z score of PC2 loading",
    y_label = "Z score of PC1 loading"
)

Heatmap

Read genes used in Fig. 4c from supplementary table 2.

features_heatmap <- table_s2_sheet4$macFas5_gene_symbol
features_heatmap %>% length()

## [1] 776

# Genes listed in the supplementary table, but not in the expression matrix
features_heatmap[!features_heatmap %in% rownames(matrix_readcount_use)]

##  [1] "CYP11A"       "LOC102139757" "BRDT"         "COX7B2"       "42253"       
##  [6] "42065"        "DYRK3"        "GSTA4"        "FXYD6"        "BCAR3"

# fix
features_heatmap <- features_heatmap %>%
    enframe(name = "order", value = "symbol_raw") %>%
    mutate(
        symbol = case_when(
            symbol_raw == "CYP11A" ~ "LOC102120640",
            symbol_raw == "LOC102139757" ~ "XIST",
            symbol_raw == "BRDT" ~ "LOC102123270",
            symbol_raw == "COX7B2" ~ "LOC101864947",
            symbol_raw == "42253" ~ "MARC2",
            symbol_raw == "42065" ~ "SEPT6",
            symbol_raw == "DYRK3" ~ "LOC101866219",
            symbol_raw == "GSTA4" ~ "LOC101865382",
            symbol_raw == "FXYD6" ~ "LOC101867245",
            symbol_raw == "BCAR3" ~ "LOC101867222",
            TRUE ~ .$symbol_raw
        )
    ) %>%
    pull(symbol)

features_heatmap[!features_heatmap %in% rownames(matrix_readcount_use)]

## character(0)

matrix_heatmap <- matrix_readcount_use[features_heatmap, cells_cyEPI]
matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
heatmap_limits <- quantile(matrix_heatmap, c(0.005, 0.995))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]

cells_heatmap <- map(embedding_cyEPI %>%
    pull(group) %>%
    droplevels() %>%
    levels(), function(x) {
    cells_in_group <- embedding_cyEPI %>%
        filter(group == x) %>%
        pull(sample_name)

    cells_in_group[
        pvclust::pvclust(
            matrix_heatmap[features_heatmap, cells_in_group] %>% as.matrix(),
            nboot = 100,
            parallel = parallel::detectCores() - 2L
        )$hclust$order
    ]
}) %>%
    unlist()

## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.

matrix_heatmap <- matrix_heatmap[features_heatmap, cells_heatmap]

ha_columns <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        embedding_cyEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI$sample_name
        ), "group"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                embedding_cyEPI$group %>%
                    # droplevels() %>%
                    nlevels()
            ),
            nm = levels(embedding_cyEPI$group)
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    developmental_stage = ComplexHeatmap::anno_simple(
        embedding_cyEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI$sample_name
        ), "developmental_stage"],
        # pch = anno_labels_lineage,
        col = setNames(
            object = gg_color_hue(n = embedding_cy$developmental_stage %>%
                nlevels()),
            nm = embedding_cy$developmental_stage %>%
                levels()
        ),
        which = "column",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Developmental stage"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)

lgd_lineage <- ComplexHeatmap::Legend(
    title = "Lineage",
    labels = embedding_cyEPI %>% pull(group) %>% droplevels() %>% levels(),
    legend_gp = grid::gpar(
        fill = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                length(unique(embedding_cy$group))
            ),
            nm = embedding_cy$group %>% levels()
        )[embedding_cyEPI$group %>%
            droplevels() %>%
            levels()]
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

lgd_developmental_stage <- ComplexHeatmap::Legend(
    title = "Develpmental stage",
    labels = levels(
        embedding_cy$developmental_stage
    )[
        seq_len(nlevels(embedding_cy$developmental_stage) - 3)
    ],
    legend_gp = grid::gpar(
        fill = gg_color_hue(
            n = embedding_cy$developmental_stage %>%
                droplevels() %>%
                nlevels()
        )[seq_len(nlevels(embedding_cy$developmental_stage) - 3)]
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

# genes highlighted in Fig. 4c, fix gene symbols
features_to_mark_left <- c(
    "KLF5",
    "TBX3",
    "GATA6",
    "PDGFRA",
    "LAMA1",
    "LAMB1",
    "LAMC1",
    "TET2",
    #
    "ESRRB",
    "TFCP2L1",
    "LOC102137091", # "KHDC1L",
    "SOX15",
    "TFAP2C",
    "GDF15",
    #
    "DPPA3",
    "NLRP2",
    "DPPA2",
    "KHDC3L",
    #
    "PRDM14",
    "NANOG",
    #
    "LOC102143209", # "TDGF1",
    "SOX2",
    "CXCL12",
    "DNMT3B",
    #
    "ZSCAN10",
    "SALL2",
    "SEMA6A",
    "TCF7L1",
    "THY1",
    "COL1A1",
    "FGF19",
    "NOTCH3",
    #
    "SOX11",
    "TCF4",
    "FST",
    "ZIC3",
    "ZIC2",
    "FGF2",
    "VIM",
    "COL4A2",
    #
    "T",
    "MIXL1",
    "EOMES",
    "GSC",
    "BMP2",
    "HHEX",
    "LHX1",
    "CXCR4",
    "LOC101925698", # "SOX17",
    "BMP4"
)

features_to_mark_left[!features_to_mark_left %in% rownames(matrix_heatmap)]
## character(0)

ha_left <- ComplexHeatmap::rowAnnotation(
    foo = ComplexHeatmap::anno_mark(
        at = which(rownames(matrix_heatmap) %in% features_to_mark_left),
        labels = features_to_mark_left,
        which = "row",
        side = "right",
        lines_gp = grid::gpar(col = "grey50"),
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        )
    )
)

ht <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap %>% as.matrix(),
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    ## row_labels = str_remove(string = rownames(matrix_heatmap), pattern = "^E.+_"),
    # row_names_side = c("left"),
    # row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    right_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = expression(paste("Log"[10], " (RPM + 1)")),
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    row_split = table_s2_sheet4$ClusterID,
    row_gap = unit(0.3, "mm"),
    #
    row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    row_title_rot = 0,
    #
    # column_split = embedding_cy[match(x = colnames(matrix_heatmap),table = embedding_cy$cell), "group", drop = TRUE],
    column_split = factor(
        embedding_cyEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI$sample_name
        ), "group", drop = TRUE] %>%
            droplevels() %>%
            str_pad(
                width = map_int(levels(embedding_cyEPI$group), nchar) %>%
                    max(),
                side = "left"
            ),
        levels = levels(embedding_cyEPI$group %>% droplevels()) %>%
            str_pad(
                width = map_int(levels(embedding_cyEPI$group), nchar) %>%
                    max(),
                side = "left"
            )
    ),
    column_gap = unit(0.3, "mm"),
    #
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)

pd <- ComplexHeatmap::packLegend(
    lgd_lineage,
    lgd_developmental_stage,
    # gap = unit(8, "mm"),
    direction = "vertical"
)

Fig. 4c

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Mouse EPI development

# Supplementary Table 1, Sheet 3
table_s1_sheet3 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM296_ESM.xlsx",
    sheet = "SC3-seq_SampleTable",
    skip = 1
)

table_s1_sheet3 %>% head()

embedding_mm <- read_csv(
    file = "../preprocessed/mm/exploring/embedding_ncomponents18_seed20200317.csv"
) %>%
    dplyr::left_join(
        table_s1_sheet3 %>%
            filter(Species == "Mouse") %>%
            mutate(
                SampleID = case_when(
                    str_detect(string = SampleID, pattern = "^\\d") ~ str_c("X", .$SampleID),
                    TRUE ~ .$SampleID
                )
            ),
        by = c("cell" = "SampleID")
    ) %>%
    dplyr::select(
        cell:y_umap, GSM_ID:`Embryonic Day/ Culture Condition`
    ) %>%
    dplyr::rename_all(. %>% tolower()) %>%
    dplyr::rename("developmental_stage" = `embryonic day/ culture condition`) %>%
    mutate(
        developmental_stage = factor(
            developmental_stage,
            levels = c("E4.5", "E5.5", "E6.5", "EpiLC", "2iLESC")
        ),
        group = factor(
            group,
            levels = c(
                "E4.5EPI",
                "E4.5PE",
                "E4.5mTE",
                "E4.5pTE",
                "E5.5EPI",
                "E5.5VE",
                "E6.5EPI-T hi",
                "E6.5EPI-T lo",
                "E6.5VE",
                "EpiLC",
                "2i+L ESC"
            )
        )
    )

## Parsed with column specification:
## cols(
##   cell = col_character(),
##   batch = col_character(),
##   louvain = col_double(),
##   x_tsne = col_double(),
##   y_tsne = col_double(),
##   x_umap = col_double(),
##   y_umap = col_double(),
##   x_fitsne = col_double(),
##   y_fitsne = col_double(),
##   x_phate = col_double(),
##   y_phate = col_double(),
##   `x_min_dist=0.1` = col_double(),
##   `y_min_dist=0.1` = col_double(),
##   x_multicoretsne = col_double(),
##   y_multicoretsne = col_double()
## )

embedding_mm %>%
    dplyr::count(louvain) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

louvain	num_cells
0	49
1	37
2	22

embedding_mm %>%
    dplyr::count(group) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

group	num_cells
E4.5EPI	9
E4.5PE	9
E4.5mTE	9
E4.5pTE	10
E5.5EPI	16
E5.5VE	9
E6.5EPI-T hi	10
E6.5EPI-T lo	9
E6.5VE	5
EpiLC	12
2i+L ESC	10

embedding_mm %>%
    dplyr::count(developmental_stage) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

developmental_stage	num_cells
E4.5	37
E5.5	25
E6.5	24
EpiLC	12
2iLESC	10

embedding_middle <- embedding_mm %>%
    group_by(group) %>%
    summarise(
        x_median = median(x_umap),
        y_median = median(y_umap)
    )

## `summarise()` ungrouping output (override with `.groups` argument)

labels_group <- map(levels(embedding_mm$group), function(x) {
    embedding_mm %>%
        dplyr::filter(
            group == x
        ) %>%
        dplyr::left_join(embedding_middle) %>%
        dplyr::mutate(
            distance = sqrt(
                (x_umap - x_median)^2 + (y_umap - y_median)^2
            )
        ) %>%
        dplyr::filter(distance == min(distance))
}) %>%
    dplyr::bind_rows() %>%
    dplyr::select(cell, x_umap, y_umap, group)

## Joining, by = "group"

## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"

labels_group

p_embedding_lineage <- plot_embedding(
    embedding = embedding_mm[, c("x_umap", "y_umap")],
    color_values = embedding_mm$group,
    label = "UMAP; Lineage",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = FALSE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding_mm$group))
        )
    ) +
    labs(color = NULL) +
    # guides(colour = guide_legend(override.aes = list(size = 3), ncol = 1)) +
    # customized_theme(x = 0.035, y = 0.72) +
    ggrepel::geom_text_repel(
        data = get_middle_points(embedding_mm, x_umap, y_umap, group),
        ggplot2::aes(
            x = x_umap,
            y = y_umap,
            label = group
        ),
        color = "black",
        size = 2,
        family = "Arial",
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.inflect = TRUE,
        #
        box.padding = 0.5,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        seed = SEED_USE,
    )

p_embedding_developmental_stage <- plot_embedding(
    embedding = embedding_mm[, c("x_umap", "y_umap")],
    color_values = embedding_mm$developmental_stage,
    label = "UMAP; Developmental stage",
    # label_position = c(label_x, label_y),
    show_color_value_labels = FALSE,
    show_color_legend = TRUE,
    geom_point_size = 1,
    sort_values = FALSE
) +
    scale_color_manual(
        values = gg_color_hue(n = embedding_mm$developmental_stage %>%
            droplevels() %>%
            nlevels())
    ) +
    labs(color = NULL) +
    guides(colour = guide_legend(override.aes = list(size = 3), ncol = 1)) +
    customized_theme(x = 0.035, y = 0.36)

Extended Data Figure 7d

purrr::reduce(list(
    p_embedding_lineage,
    p_embedding_developmental_stage
), `+`) +
    plot_layout(ncol = 2) +
    plot_annotation(
        theme = theme(plot.margin = margin())
    )

features_heatmap_mm <- c(
    "Gapdh",
    "Ppia",
    "Pou5f1",
    "Sox2",
    "Nanog",
    "Prdm14",
    "Tfcp2l1",
    "Dppa3",
    "Esrrb",
    "Zfp42",
    "Klf2",
    "Klf4",
    "Klf5",
    "Tbx3",
    "Nr5a2",
    "Fgf4",
    "Il6st",
    "Lifr",
    "Fgf5",
    "Otx2",
    "Pou3f1",
    "Sox3",
    "Dnmt3b",
    "T",
    "Mixl1",
    "Sp5",
    "Cdh2",
    "Snai1",
    "Gata6",
    "Pdgfra",
    "Sox17",
    "Gata4",
    "Afp",
    "Cdx2",
    "Gata2",
    "Gata3",
    "Tfap2c",
    "Krt18"
)

matrix_readcount_mm <- scipy.sparse$load_npz("../preprocessed/mm/matrix_readcount.npz")
matrix_readcount_mm_features <- np$load("../preprocessed/mm/matrix_readcount_features.npy")
matrix_readcount_mm_barcodes <- np$load("../preprocessed/mm/matrix_readcount_barcodes.npy")
colnames(matrix_readcount_mm) <- matrix_readcount_mm_barcodes
rownames(matrix_readcount_mm) <- matrix_readcount_mm_features
matrix_readcount_mm <- matrix_readcount_mm[, embedding_mm$cell]

features_heatmap_mm %in% rownames(matrix_readcount_mm)

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

matrix_heatmap <- log10(matrix_readcount_mm[features_heatmap_mm, ] + 1)

cells_heatmap <- map(levels(embedding_mm$group), function(x) {
    cells_in_group <- embedding_mm %>%
        filter(group == x) %>%
        pull(cell)

    cells_in_group[
        pvclust::pvclust(
            matrix_heatmap[, cells_in_group] %>% as.matrix(),
            nboot = 100,
            parallel = parallel::detectCores() - 2L
        )$hclust$order
    ]
}) %>%
    unlist()

## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.

Extended Data Figure 7c

matrix_heatmap[, cells_heatmap] %>%
    as.matrix() %>%
    as.data.frame() %>%
    rownames_to_column(var = "feature") %>%
    pivot_longer(
        -c("feature"),
        names_to = "sample"
    ) %>%
    left_join(
        embedding_mm %>%
            dplyr::select(cell, group),
        by = c("sample" = "cell")
    ) %>%
    mutate(
        feature = factor(
            feature,
            levels = features_heatmap_mm %>% rev(.)
        ),
        sample = factor(
            sample,
            levels = cells_heatmap
        )
    ) %>%
    plot_heatmap(
        x = sample,
        y = feature,
        z = value,
        y_order = rev(features_heatmap_mm)
    ) +
    facet_grid(~group, scales = "free", space = "free")

Cynomolgus vs mouse EPI development

Matching homologs

Number of genes of cy matrix: 29095
Number of genes of mm matrix: 26219

library(org.Hs.eg.db)
library(biomaRt)

org.Hs.eg.db version: 3.11.4
biomaRt version: 2.44.1

ensembl <- biomaRt::useMart("ensembl")
biomaRt::listDatasets(ensembl) %>%
    filter(dataset %in% "Cynomolgus")


biomaRt::listDatasets(ensembl)$dataset
##   [1] "acalliptera_gene_ensembl"       "acarolinensis_gene_ensembl"    
##   [3] "acchrysaetos_gene_ensembl"      "acitrinellus_gene_ensembl"     
##   [5] "amelanoleuca_gene_ensembl"      "amexicanus_gene_ensembl"       
##   [7] "ampachon_gene_ensembl"          "anancymaae_gene_ensembl"       
##   [9] "aplatyrhynchos_gene_ensembl"    "applatyrhynchos_gene_ensembl"  
##  [11] "atestudineus_gene_ensembl"      "bbbison_gene_ensembl"          
##  [13] "bgrunniens_gene_ensembl"        "bihybrid_gene_ensembl"         
##  [15] "bmutus_gene_ensembl"            "bsplendens_gene_ensembl"       
##  [17] "btaurus_gene_ensembl"           "bthybrid_gene_ensembl"         
##  [19] "cabingdonii_gene_ensembl"       "capalliatus_gene_ensembl"      
##  [21] "caperea_gene_ensembl"           "catys_gene_ensembl"            
##  [23] "ccanadensis_gene_ensembl"       "ccapucinus_gene_ensembl"       
##  [25] "cdromedarius_gene_ensembl"      "celegans_gene_ensembl"         
##  [27] "cgchok1gshd_gene_ensembl"       "cgcrigri_gene_ensembl"         
##  [29] "cgobio_gene_ensembl"            "cgpicr_gene_ensembl"           
##  [31] "charengus_gene_ensembl"         "chircus_gene_ensembl"          
##  [33] "choffmanni_gene_ensembl"        "cintestinalis_gene_ensembl"    
##  [35] "cjacchus_gene_ensembl"          "cjaponica_gene_ensembl"        
##  [37] "clanigera_gene_ensembl"         "cldingo_gene_ensembl"          
##  [39] "clfamiliaris_gene_ensembl"      "cmilii_gene_ensembl"           
##  [41] "cpbellii_gene_ensembl"          "cporcellus_gene_ensembl"       
##  [43] "cporosus_gene_ensembl"          "csabaeus_gene_ensembl"         
##  [45] "csavignyi_gene_ensembl"         "csemilaevis_gene_ensembl"      
##  [47] "csyrichta_gene_ensembl"         "cvariegatus_gene_ensembl"      
##  [49] "dclupeoides_gene_ensembl"       "dmelanogaster_gene_ensembl"    
##  [51] "dnovaehollandiae_gene_ensembl"  "dnovemcinctus_gene_ensembl"    
##  [53] "dordii_gene_ensembl"            "drerio_gene_ensembl"           
##  [55] "eaasinus_gene_ensembl"          "eburgeri_gene_ensembl"         
##  [57] "ecaballus_gene_ensembl"         "ecalabaricus_gene_ensembl"     
##  [59] "eelectricus_gene_ensembl"       "eeuropaeus_gene_ensembl"       
##  [61] "elucius_gene_ensembl"           "enaucrates_gene_ensembl"       
##  [63] "etelfairi_gene_ensembl"         "falbicollis_gene_ensembl"      
##  [65] "fcatus_gene_ensembl"            "fdamarensis_gene_ensembl"      
##  [67] "fheteroclitus_gene_ensembl"     "gaculeatus_gene_ensembl"       
##  [69] "gagassizii_gene_ensembl"        "ggallus_gene_ensembl"          
##  [71] "ggorilla_gene_ensembl"          "gmorhua_gene_ensembl"          
##  [73] "gwilldenowi_gene_ensembl"       "hburtoni_gene_ensembl"         
##  [75] "hcomes_gene_ensembl"            "hgfemale_gene_ensembl"         
##  [77] "hgmale_gene_ensembl"            "hhucho_gene_ensembl"           
##  [79] "hsapiens_gene_ensembl"          "ipunctatus_gene_ensembl"       
##  [81] "itridecemlineatus_gene_ensembl" "jhyemalis_gene_ensembl"        
##  [83] "jjaculus_gene_ensembl"          "lafricana_gene_ensembl"        
##  [85] "lbergylta_gene_ensembl"         "lcalcarifer_gene_ensembl"      
##  [87] "lcanadensis_gene_ensembl"       "lchalumnae_gene_ensembl"       
##  [89] "lcoronata_gene_ensembl"         "lcrocea_gene_ensembl"          
##  [91] "loculatus_gene_ensembl"         "lsdomestica_gene_ensembl"      
##  [93] "malbus_gene_ensembl"            "marmatus_gene_ensembl"         
##  [95] "mauratus_gene_ensembl"          "mcaroli_gene_ensembl"          
##  [97] "mdomestica_gene_ensembl"        "mfascicularis_gene_ensembl"    
##  [99] "mgallopavo_gene_ensembl"        "mleucophaeus_gene_ensembl"     
## [101] "mlucifugus_gene_ensembl"        "mmmarmota_gene_ensembl"        
## [103] "mmulatta_gene_ensembl"          "mmurdjan_gene_ensembl"         
## [105] "mmurinus_gene_ensembl"          "mmusculus_gene_ensembl"        
## [107] "mnemestrina_gene_ensembl"       "mochrogaster_gene_ensembl"     
## [109] "mpahari_gene_ensembl"           "mpfuro_gene_ensembl"           
## [111] "mspicilegus_gene_ensembl"       "mspretus_gene_ensembl"         
## [113] "mundulatus_gene_ensembl"        "munguiculatus_gene_ensembl"    
## [115] "mvitellinus_gene_ensembl"       "mzebra_gene_ensembl"           
## [117] "neugenii_gene_ensembl"          "ngalili_gene_ensembl"          
## [119] "nleucogenys_gene_ensembl"       "nvison_gene_ensembl"           
## [121] "oanatinus_gene_ensembl"         "oaries_gene_ensembl"           
## [123] "oaureus_gene_ensembl"           "ocuniculus_gene_ensembl"       
## [125] "odegus_gene_ensembl"            "ogarnettii_gene_ensembl"       
## [127] "olatipes_gene_ensembl"          "olhni_gene_ensembl"            
## [129] "olhsok_gene_ensembl"            "omelastigma_gene_ensembl"      
## [131] "oniloticus_gene_ensembl"        "oprinceps_gene_ensembl"        
## [133] "pabelii_gene_ensembl"           "panubis_gene_ensembl"          
## [135] "pcapensis_gene_ensembl"         "pcinereus_gene_ensembl"        
## [137] "pcoquereli_gene_ensembl"        "pformosa_gene_ensembl"         
## [139] "pkingsleyae_gene_ensembl"       "platipinna_gene_ensembl"       
## [141] "pmarinus_gene_ensembl"          "pmbairdii_gene_ensembl"        
## [143] "pmexicana_gene_ensembl"         "pnattereri_gene_ensembl"       
## [145] "pnyererei_gene_ensembl"         "ppaniscus_gene_ensembl"        
## [147] "ppardus_gene_ensembl"           "pranga_gene_ensembl"           
## [149] "preticulata_gene_ensembl"       "psimus_gene_ensembl"           
## [151] "psinensis_gene_ensembl"         "ptaltaica_gene_ensembl"        
## [153] "ptephrosceles_gene_ensembl"     "ptroglodytes_gene_ensembl"     
## [155] "pvampyrus_gene_ensembl"         "pvitticeps_gene_ensembl"       
## [157] "rbieti_gene_ensembl"            "rferrumequinum_gene_ensembl"   
## [159] "rnorvegicus_gene_ensembl"       "rroxellana_gene_ensembl"       
## [161] "saraneus_gene_ensembl"          "saurata_gene_ensembl"          
## [163] "sbboliviensis_gene_ensembl"     "scanaria_gene_ensembl"         
## [165] "scerevisiae_gene_ensembl"       "sdumerili_gene_ensembl"        
## [167] "sfasciatus_gene_ensembl"        "sformosus_gene_ensembl"        
## [169] "shabroptila_gene_ensembl"       "sharrisii_gene_ensembl"        
## [171] "sldorsalis_gene_ensembl"        "smaximus_gene_ensembl"         
## [173] "smerianae_gene_ensembl"         "sorbicularis_gene_ensembl"     
## [175] "spunctatus_gene_ensembl"        "ssalar_gene_ensembl"           
## [177] "ssbamei_gene_ensembl"           "ssberkshire_gene_ensembl"      
## [179] "sscrofa_gene_ensembl"           "sshampshire_gene_ensembl"      
## [181] "ssjinhua_gene_ensembl"          "sslandrace_gene_ensembl"       
## [183] "sslargewhite_gene_ensembl"      "ssmeishan_gene_ensembl"        
## [185] "sspietrain_gene_ensembl"        "ssrongchang_gene_ensembl"      
## [187] "sstibetan_gene_ensembl"         "ssusmarc_gene_ensembl"         
## [189] "sswuzhishan_gene_ensembl"       "strutta_gene_ensembl"          
## [191] "tbelangeri_gene_ensembl"        "tgelada_gene_ensembl"          
## [193] "tguttata_gene_ensembl"          "tnigroviridis_gene_ensembl"    
## [195] "trubripes_gene_ensembl"         "ttruncatus_gene_ensembl"       
## [197] "uamericanus_gene_ensembl"       "umaritimus_gene_ensembl"       
## [199] "vpacos_gene_ensembl"            "vursinus_gene_ensembl"         
## [201] "vvulpes_gene_ensembl"           "xmaculatus_gene_ensembl"       
## [203] "xtropicalis_gene_ensembl"
grep(
    pattern = "fascicularis",
    x = biomaRt::listDatasets(ensembl)$dataset,
    ignore.case = TRUE,
    value = TRUE
)
## [1] "mfascicularis_gene_ensembl"
# mfascicularis_gene_ensembl"

grep(
    "Human",
    listDatasets(ensembl)$description,
    ignore.case = TRUE,
    value = TRUE
)
## [1] "Human genes (GRCh38.p13)"
biomaRt::listDatasets(ensembl)[
    grep(
        "Human",
        biomaRt::listDatasets(ensembl)$description,
        ignore.case = TRUE
    ),
]

ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)

# biomaRt::listAttributes(ensembl) %>% head()
# biomaRt::listFilters(ensembl) %>% head()

Extract gene names from our human expression matrix.

ensembl_ids_human <- read_delim(
    file = "../../../LW36/filtered_feature_bc_matrix/features.tsv.gz",
    delim = "\t",
    col_names = FALSE
) %>%
    pull(X1)

## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character()
## )

gene_homologs <- biomaRt::getBM(
    attributes = c(
        "ensembl_gene_id",
        "ensembl_gene_id_version",
        "external_gene_name",
        "external_gene_source",
        #
        "mmusculus_homolog_ensembl_gene",
        "mmusculus_homolog_orthology_type",
        "mmusculus_homolog_associated_gene_name",
        "mmusculus_homolog_orthology_confidence",
        #
        "mfascicularis_homolog_ensembl_gene",
        "mfascicularis_homolog_orthology_type",
        "mfascicularis_homolog_associated_gene_name",
        "mfascicularis_homolog_orthology_confidence"
    ),
    filters = "ensembl_gene_id",
    values = ensembl_ids_human,
    mart = ensembl
)

## [1] "gene_homologs_20200818_151012_CDT.rds"

Keep one2one orthologs.

gene_homologs_filtered <- ensembl_ids_human %>%
    tibble::enframe(value = "ensembl_gene_id") %>%
    dplyr::left_join(
        gene_homologs %>%
            filter(
                mmusculus_homolog_orthology_type == "ortholog_one2one",
                mmusculus_homolog_orthology_confidence == 1,
                mfascicularis_homolog_orthology_type == "ortholog_one2one",
                mfascicularis_homolog_orthology_confidence == 1
            ),
        by = "ensembl_gene_id"
    ) %>%
    dplyr::select(
        -c(
            name,
            mmusculus_homolog_orthology_confidence,
            mfascicularis_homolog_orthology_confidence
        )
    ) %>%
    tidyr::drop_na()

gene_homologs_filtered %>% head()

Add ENTREZID.

library(org.Mm.eg.db)
## 
# keytypes(org.Mm.eg)

gene_homologs_filtered <- gene_homologs_filtered %>%
    mutate(
        ENTREZID_mm = AnnotationDbi::select(
            org.Mm.eg.db,
            keys = mmusculus_homolog_associated_gene_name,
            columns = c(
                "ENTREZID"
            ),
            keytype = "SYMBOL"
        ) %>%
            pull(ENTREZID)
    )
## 'select()' returned many:1 mapping between keys and columns

entrezgene_id_cy <- biomaRt::getBM(
    attributes = c("ensembl_gene_id", "entrezgene_id"),
    filters = "ensembl_gene_id",
    values = gene_homologs_filtered$mfascicularis_homolog_ensembl_gene,
    mart = biomaRt::useMart("ensembl", dataset = "mfascicularis_gene_ensembl")
)

# cy
entrezgene_id_cy_matrix <- read_delim(
    file = "../preprocessed/GSE74767_SC3seq_Cy_ProcessedData.txt.gz",
    delim = "\t"
) %>%
    pull(macFas5_entrez_id)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   macFas5_gene_symbol = col_character()
## )
## See spec(...) for full column specifications.

gene_homologs_filtered <- gene_homologs_filtered %>%
    left_join(
        entrezgene_id_cy[entrezgene_id_cy$entrezgene_id %in% entrezgene_id_cy_matrix, ],
        by = c("mfascicularis_homolog_ensembl_gene" = "ensembl_gene_id")
    ) %>%
    dplyr::rename("ENTREZID_cy" = entrezgene_id)

# mm
entrezgene_id_mm_matrix <- read_delim(
    file = "../preprocessed/GSE74767_SC3seq_Ms_ProcessedData.txt.gz",
    delim = "\t"
) %>%
    pull(mm10_entrez_id)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   mm10_gene_symbol = col_character()
## )
## See spec(...) for full column specifications.

# sum(gene_homologs_filtered$ENTREZID_mm %in% entrezgene_id_mm_matrix)
# sum(gene_homologs_filtered$ENTREZID_cy %in% entrezgene_id_cy_matrix)

gene_homologs_filtered <- gene_homologs_filtered %>%
    filter(
        ENTREZID_cy %in% entrezgene_id_cy_matrix,
        ENTREZID_mm %in% entrezgene_id_mm_matrix
    )

gene_homologs_filtered %>% dim()
## [1] 13411    12

gene_homologs_filtered %>% head()

Dimensionality reduction

Preprocessing

Combine cy and mm matrices.

matrix_readcount_cy_mm <- cbind(
    matrix_readcount_use[
        match(
            gene_homologs_filtered$ENTREZID_cy,
            entrezgene_id_cy_matrix
        ),
    ],
    matrix_readcount_mm[
        match(
            gene_homologs_filtered$ENTREZID_mm,
            entrezgene_id_mm_matrix
        ),
    ]
)

if (!file.exists("matrix_readcount_cy_mm.rds")) {
    saveRDS(
        object = matrix_readcount_cy_mm,
        file = "matrix_readcount_cy_mm.rds"
    )
}

matrix_readcount_cyEPI_mmEPI <- matrix_readcount_cy_mm[, c(
    embedding_cyEPI$sample_name,
    embedding_mm %>%
        filter(
            !developmental_stage %in% c("EpiLC", "2iLESC")
        ) %>%
        pull(cell)
)]

matrix_readcount_cyEPI_mmEPI <- matrix_readcount_cyEPI_mmEPI[
    Matrix::rowSums(
        matrix_readcount_cyEPI_mmEPI
    ) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]

dim(matrix_readcount_cyEPI_mmEPI)

## [1] 12038   299

PCA

matrix_readcount_cyEPI_mmEPI_log <- matrix_readcount_cyEPI_mmEPI
matrix_readcount_cyEPI_mmEPI_log@x <- log1p(matrix_readcount_cyEPI_mmEPI_log@x)

# z-score
matrix_readcount_cyEPI_mmEPI_log_scaled <- t(
    scale(
        t(
            matrix_readcount_cyEPI_mmEPI_log
        ),
        center = TRUE, scale = TRUE
    )
)

pca_out <- prcomp(
    t(matrix_readcount_cyEPI_mmEPI_log_scaled),
    center = FALSE,
    scale = FALSE
)

summary(pca_out)$imp %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column(var = "component") %>%
    mutate(
        rank = 1:n()
    ) %>%
    dplyr::slice(1:30) %>%
    ggplot(
        aes(
            x = rank,
            y = `Proportion of Variance`
        )
    ) +
    geom_point(size = 0.3) +
    theme_bw() +
    scale_x_continuous(
        name = "Component",
        breaks = c(1, seq(5, 30, 5))
    ) +
    scale_y_continuous(
        name = "Proportion of variance",
        labels = scales::percent
    ) +
    ggplot2::theme(
        axis.title = ggplot2::element_text(family = "Arial", size = 6),
        axis.text = ggplot2::element_text(family = "Arial", size = 6),
        panel.grid.minor = ggplot2::element_blank()
    )

embedding_cyEPI_mmEPI <- pca_out$x %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_name") %>%
    dplyr::select(sample_name:PC3) %>%
    dplyr::left_join(
        table_s1_sheet3 %>%
            # filter(Species == "Mouse") %>%
            mutate(
                SampleID = case_when(
                    str_detect(string = SampleID, pattern = "^\\d") ~ str_c("X", .$SampleID),
                    TRUE ~ .$SampleID
                )
            ),
        by = c("sample_name" = "SampleID")
    ) %>%
    dplyr::select(sample_name:`Embryonic Day/ Culture Condition`) %>%
    dplyr::rename_all(. %>% tolower()) %>%
    dplyr::rename(
        x_pca = pc1,
        y_pca = pc2,
        z_pca = pc3,
        developmental_stage = `embryonic day/ culture condition`
    ) %>%
    relocate(x_pca, y_pca, z_pca, .after = last_col()) %>%
    mutate(
        group = factor(
            group,
            levels = c(
                "ICM",
                "Pre-EPI",
                "PostE-EPI",
                "PostL-EPI",
                "Gast1",
                "Gast2a",
                "Gast2b",
                "E4.5EPI",
                "E4.5PE",
                "E4.5mTE",
                "E4.5pTE",
                "E5.5EPI",
                "E5.5VE",
                "E6.5EPI-T hi",
                "E6.5EPI-T lo",
                "E6.5VE"
            )
        ),
        species = factor(
            species,
            levels = c(
                "Cynomolgus Monkey",
                "Mouse"
            )
        )
    )

p_embedding_pca1 <- embedding_cyEPI_mmEPI %>%
    droplevels() %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_pca,
        y = y_pca,
        color = group,
        shape = species
    ) +
    add_xy_label_pca(x = "PC1", y = "PC2") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding_cyEPI_mmEPI$group))
        )
    ) +
    ggrepel::geom_text_repel(
        data = get_middle_points(embedding_cyEPI_mmEPI, x_pca, y_pca, group),
        aes(
            x = x_pca,
            y = y_pca,
            label = group,
            shape = NULL
        ),
        color = "black",
        size = 2,
        family = "Arial",
        #
        segment.color = "grey35",
        segment.size = 0.25,
        segment.inflect = TRUE,
        #
        box.padding = 0.5,
        min.segment.length = 0,
        arrow = ggplot2::arrow(length = unit(0.015, "npc")),
        max.overlaps = Inf,
        seed = SEED_USE,
    ) +
    guides(color = "none", shape = "none")

## `summarise()` ungrouping output (override with `.groups` argument)

## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"
## Joining, by = "group"

p_embedding_pca2 <- embedding_cyEPI_mmEPI %>%
    droplevels() %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = y_pca,
        y = z_pca,
        color = group,
        shape = species
    ) +
    add_xy_label_pca(x = "PC2", y = "PC3") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding_cyEPI_mmEPI$group))
        )
    ) +
    guides(color = "none") +
    theme(
        legend.text = element_text(family = "Arial", size = 6),
        legend.key.size = unit(3, "mm"),
        legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm"),
        legend.background = element_blank(),
        #
        legend.justification = c(0, 1),
        legend.position = c(0.035, 0.22)
    )

Extended Data Figure 8a

p_embedding_pca1 +
    p_embedding_pca2 +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(ncol = 2, guides = "keep")

Comparion of cyEPI and mEPI

Species specific

# Supplementary Table 2, Sheet 12
table_s2_sheet12 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM297_ESM.xlsx",
    sheet = "EDFig8b",
    skip = 1
)

## New names:
## * MF5ID -> MF5ID...2
## * MF5Symbol -> MF5Symbol...3
## * HsID -> HsID...4
## * HsSymbol -> HsSymbol...5
## * MsID -> MsID...6
## * ...

table_s2_sheet12 %>% head()

Use genes listed in Supplementary Table 2, Sheet 12.

features_heatmap <- c(
    table_s2_sheet12$MF5ID...22,
    table_s2_sheet12$MF5ID...2
)

features_heatmap <- features_heatmap[!is.na(features_heatmap)]

groups_selected <- c(
    "ICM", "Pre-EPI", "PostE-EPI", "PostL-EPI", "Gast1", "Gast2a", "Gast2b",
    "E4.5EPI", "E5.5EPI", "E6.5EPI-T hi", "E6.5EPI-T lo"
)

cells_heatmap <- embedding_cyEPI_mmEPI %>%
    filter(group %in% groups_selected) %>%
    pull(sample_name)

features_heatmap <- features_heatmap[
    features_heatmap %in% gene_homologs_filtered$ENTREZID_cy
]

matrix_heatmap <- matrix_readcount_cy_mm[
    match(features_heatmap, gene_homologs_filtered$ENTREZID_cy),
    cells_heatmap
]

matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
heatmap_limits <- quantile(matrix_heatmap, c(0.005, 0.995))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]

Number of genes used: 658.

cells_heatmap <- map(groups_selected, function(x) {
    cells_in_group <- embedding_cyEPI_mmEPI %>%
        filter(group == x) %>%
        pull(sample_name)

    cells_in_group[
        pvclust::pvclust(
            matrix_heatmap[, cells_in_group] %>% as.matrix(),
            nboot = 100,
            parallel = parallel::detectCores() - 2L
        )$hclust$order
    ]
}) %>%
    unlist()

## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.
## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.

matrix_heatmap <- matrix_heatmap[, cells_heatmap]

ha_columns <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                embedding_cyEPI_mmEPI$group %>%
                    # droplevels() %>%
                    nlevels()
            ),
            nm = levels(embedding_cyEPI_mmEPI$group)
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    Species = ComplexHeatmap::anno_simple(
        embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "species"],
        # pch = anno_labels_cluster,
        col = setNames(
            object = as.character(
                yarrr::piratepal(palette = "google")
            )[seq_along(levels(embedding_cyEPI_mmEPI$species))],
            nm = levels(embedding_cyEPI_mmEPI$species)
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Species"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)

lgd_lineage <- ComplexHeatmap::Legend(
    title = "Lineage",
    labels = groups_selected, # embedding_cyEPI_mmEPI %>% pull(group) %>% droplevels() %>% levels(),
    legend_gp = grid::gpar(
        fill = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                length(unique(embedding_cyEPI_mmEPI$group))
            ),
            nm = embedding_cyEPI_mmEPI$group %>% levels()
        )[embedding_cyEPI_mmEPI$group %>%
            droplevels() %>%
            levels()][groups_selected]
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

lgd_species <- ComplexHeatmap::Legend(
    title = "Species",
    labels = levels(embedding_cyEPI_mmEPI$species),
    legend_gp = grid::gpar(
        fill = setNames(
            object = as.character(
                yarrr::piratepal(palette = "google")
            )[seq_along(levels(embedding_cyEPI_mmEPI$species))],
            nm = levels(embedding_cyEPI_mmEPI$species)
        )
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

ht <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap %>% as.matrix(),
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    ## row_labels = str_remove(string = rownames(matrix_heatmap), pattern = "^E.+_"),
    # row_names_side = c("left"),
    # row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    ## right_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = expression(paste("Log"[10], " (RPM + 1)")),
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    # row_split = table_s2_sheet4$ClusterID,
    # row_gap = unit(0.3, "mm"),
    #
    # row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    # row_title_rot = 0,
    #
    column_split = factor(
        embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group", drop = TRUE] %>%
            # levels() %>%
            str_pad(
                width = max(nchar(levels(embedding_cyEPI_mmEPI$group))),
                side = "left"
            ),
        levels = embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group", drop = TRUE] %>%
            levels() %>%
            str_pad(
                width = max(nchar(levels(embedding_cyEPI_mmEPI$group))),
                side = "left"
            )
    ),
    column_gap = unit(0.3, "mm"),
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)

pd <- ComplexHeatmap::packLegend(
    lgd_lineage,
    lgd_species,
    # gap = unit(8, "mm"),
    direction = "vertical"
)

Extended Data Figure 8b

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Developmental stage

scaled_pc_loadings <- pca_out$rotation %>%
    scale() %>%
    as.data.frame() %>%
    rownames_to_column(var = "feature") %>%
    dplyr::select(feature, PC1, PC2, PC3) %>%
    mutate(
        # group = (abs(PC2) >= 2.5 | abs(PC3) >= 2.5) & abs(PC1 < 2),
        r23 = sqrt(PC2^2 + PC3^2) >= 2.75,
        r12 = sqrt(PC1^2 + PC2^2) < 3,
        r13 = sqrt(PC1^2 + PC3^2) < 3,
        # pc1 = abs(PC1 < 2),
        group = r23 & r12 & r13
    ) %>%
    arrange(group)

sum(scaled_pc_loadings$group)

## [1] 226

plot_pca_loading(
    data = scaled_pc_loadings,
    x = PC2,
    y = PC3,
    z1 = group,
    x_label = "Z score of PC2 loading",
    y_label = "Z score of PC3 loading"
) +
    coord_fixed()

features_heatmap <- rownames(
    matrix_readcount_cyEPI_mmEPI
)[rownames(matrix_readcount_cyEPI_mmEPI) %in%
    scaled_pc_loadings$feature[scaled_pc_loadings$group]]

matrix_heatmap <- matrix_readcount_cy_mm[
    rownames(matrix_readcount_cy_mm) %in% features_heatmap,
    cells_heatmap
]

matrix_heatmap <- matrix_heatmap[rowSums(matrix_heatmap) != 0, ]
matrix_heatmap <- log10(matrix_heatmap + 1)
heatmap_limits <- quantile(matrix_heatmap, c(0.005, 0.995))
matrix_heatmap[matrix_heatmap < heatmap_limits[1]] <- heatmap_limits[1]
matrix_heatmap[matrix_heatmap > heatmap_limits[2]] <- heatmap_limits[2]

features_heatmap <- rownames(matrix_heatmap)[
    pvclust::pvclust(
        t(matrix_heatmap %>% as.matrix()),
        nboot = 100,
        parallel = parallel::detectCores() - 2L
    )$hclust$order
]

## Creating a temporary cluster...done:
## socket cluster with 6 nodes on host 'localhost'
## Multiscale bootstrap... Done.

matrix_heatmap <- as.matrix(matrix_heatmap)[features_heatmap, ]

ht <- ComplexHeatmap::Heatmap(
    matrix = matrix_heatmap,
    # col =  viridis::plasma(n = 10),
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    ## row_labels = str_remove(string = rownames(matrix_heatmap), pattern = "^E.+_"),
    # row_names_side = c("left"),
    # row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = FALSE,
    show_column_names = FALSE,
    #
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_heatmap_legend = TRUE,
    top_annotation = ha_columns,
    ## left_annotation = ha_left,
    ## right_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = expression(paste("Log"[10], " (RPM + 1)")),
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    # row_split = table_s2_sheet4$ClusterID,
    # row_gap = unit(0.3, "mm"),
    #
    # row_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    # row_title_rot = 0,
    #
    column_split = factor(
        embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group", drop = TRUE] %>%
            # levels() %>%
            str_pad(
                width = max(nchar(levels(embedding_cyEPI_mmEPI$group))),
                side = "left"
            ),
        levels = embedding_cyEPI_mmEPI[match(
            x = colnames(matrix_heatmap),
            table = embedding_cyEPI_mmEPI$sample_name
        ), "group", drop = TRUE] %>%
            levels() %>%
            str_pad(
                width = max(nchar(levels(embedding_cyEPI_mmEPI$group))),
                side = "left"
            )
    ),
    column_gap = unit(0.3, "mm"),
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    #
    use_raster = FALSE
)

Extended Data Figure 8c

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

Correlation heatmap

# Supplementary Table 2, Sheet 5
table_s2_sheet5 <- readxl::read_excel(
    path = "../../../../docs/A_developmental_coordinate_of_pluripotency_among_mice,_monkeys_and_humans/41586_2016_BFnature19096_MOESM297_ESM.xlsx",
    sheet = "cy_m common EPI genes",
    skip = 0
)

table_s2_sheet5 %>% head()

matrix_heatmap_corr <- map(groups_selected, function(x) {
    cells_in_group <- embedding_cyEPI_mmEPI %>%
        filter(group == x) %>%
        pull(sample_name)
    rowMeans(matrix_readcount_cy_mm[, cells_in_group])
}) %>%
    bind_cols() %>%
    as.matrix()

## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## * NA -> ...5
## * ...

colnames(matrix_heatmap_corr) <- groups_selected
rownames(matrix_heatmap_corr) <- rownames(matrix_heatmap_corr)

matrix_heatmap_corr <- matrix_heatmap_corr[
    gene_homologs_filtered$ENTREZID_cy %in% table_s2_sheet5$macFas5_entrez_id,
]

head(matrix_heatmap_corr)

##              ICM   Pre-EPI  PostE-EPI  PostL-EPI       Gast1      Gast2a
## [1,]   0.3884385  3.066446  21.705624  56.140386  38.8517128  27.0977585
## [2,]  33.3950797  6.219453   3.198068   7.396065  11.6537856  66.8365023
## [3,]  47.4773044 45.381955 173.390106 319.768217 837.4882833 398.9513877
## [4,]  13.1562846 52.278984   3.787983   2.408792   0.8139348   0.9224277
## [5,] 217.7737733 53.153054   9.975591   8.359721   7.8830661   3.3246738
## [6,]   0.0000000  4.702147  18.555872  16.887636  13.6229839  11.6953331
##           Gast2b    E4.5EPI      E5.5EPI E6.5EPI-T hi E6.5EPI-T lo
## [1,]  17.2856300   1.456039  13.07441250   23.7086779    4.8825092
## [2,]  19.8709846   0.000000   0.13180738    8.2164376    8.2147970
## [3,] 319.1945685 165.748156 115.74261687  160.4587570  246.5546333
## [4,]   0.1858084   0.000000   0.02128681    0.0995788    1.0954902
## [5,]   6.3924846   5.338176   1.99523812    0.9123928    0.8186693
## [6,]  11.9185731  24.416767  15.68149437    3.3923033   18.0418240

Extended Data Figure 8d

corr_heatmap <- cor(log10(matrix_heatmap_corr + 1))
corr_heatmap[lower.tri(corr_heatmap)] <- NA

corr_heatmap %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_x") %>%
    pivot_longer(
        -c("sample_x"),
        names_to = "sample_y"
    ) %>%
    as.data.frame() %>%
    filter(!is.na(value)) %>%
    # round(value, digits = 2)
    mutate(
        sample_x = factor(
            sample_x,
            levels = groups_selected,
        ),
        sample_y = factor(
            sample_y,
            levels = groups_selected %>% rev(.)
        )
    ) %>%
    plot_corr_heatmap(
        x = sample_x,
        y = sample_y,
        z = value
    ) +
    scale_fill_gradientn(
        limits = c(-1, 1),
        colors = wesanderson::wes_palette(
            "Zissou1",
            21,
            type = "continuous"
        )
    )

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

cyEPI development vs hPSCs/cyPSCs

Load data in Ref. 36

matrix_readcount_PRJEB7132 <- read_delim(
    file = "../../PRJEB7132/matrix/xaa_Aligned.sortedByCoord.out_deduped_q10_gene_id_featureCounts.txt.gz",
    delim = "\t",
    col_names = TRUE,
    skip = 1
) %>%
    dplyr::select(-(2:6))
## Parsed with column specification:
## cols(
##   Geneid = col_character(),
##   Chr = col_character(),
##   Start = col_character(),
##   End = col_character(),
##   Strand = col_character(),
##   Length = col_double(),
##   `../aln/ERR590398_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590399_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590400_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590401_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590408_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double(),
##   `../aln/ERR590410_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam` = col_double()
## )

colnames(matrix_readcount_PRJEB7132) <- colnames(matrix_readcount_PRJEB7132) %>%
    str_remove(
        pattern = "_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam"
    ) %>%
    str_remove(pattern = "../aln/")

cell_metadata_PRJEB7132 <- read_delim(
    file = "../../PRJEB7132/SraRunTable.txt",
    delim = ","
) %>%
    dplyr::select(-c("organism", "Sample Name")) %>%
    rename_all(. %>% tolower()) %>%
    `colnames<-`(str_replace(
        string = colnames(.),
        pattern = " ", replacement = "_"
    )) %>%
    filter(organism == "Homo sapiens") %>%
    dplyr::select(
        sample_name,
        run,
        biosample,
        alias,
        cell_line,
        organism,
        title
    ) %>%
    mutate(
        sample_description = str_remove(
            string = title,
            pattern = "_R\\d$"
        )
    )
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   AvgSpotLen = col_double(),
##   Bases = col_double(),
##   Bytes = col_double(),
##   `ENA-FIRST-PUBLIC (run)` = col_date(format = ""),
##   `ENA-LAST-UPDATE (run)` = col_date(format = ""),
##   INSDC_first_public = col_datetime(format = ""),
##   INSDC_last_update = col_datetime(format = ""),
##   ReleaseDate = col_datetime(format = "")
## )
## See spec(...) for full column specifications.

matrix_readcount_PRJEB7132 <- Matrix(
    data = as.matrix(matrix_readcount_PRJEB7132[, -1]),
    dimnames = list(
        matrix_readcount_PRJEB7132$Geneid,
        colnames(matrix_readcount_PRJEB7132)[-1]
    ),
    sparse = TRUE
)

colnames(matrix_readcount_PRJEB7132) <- colnames(matrix_readcount_PRJEB7132) %>%
    enframe(value = "run") %>%
    left_join(cell_metadata_PRJEB7132, by = c("run" = "run")) %>%
    pull("sample_name")

matrix_cpm_PRJEB7132 <- calc_cpm(
    matrix_readcount_PRJEB7132[
        ,
        c("ERS537888", "ERS537890", "ERS537878", "ERS537884", "ERS537881", "ERS537876")
    ]
)

# gene_homologs_filtered
# matrix_readcount_use[entrezgene_id_cy_matrix %in% table_s2_sheet4$macFas5_entrez_id]

matrix_readcount_use2 <- matrix_readcount_use[
    match(gene_homologs_filtered$ENTREZID_cy, entrezgene_id_cy_matrix),
]

matrix_cpm_PRJEB7132_ <- matrix_cpm_PRJEB7132[
    match(gene_homologs_filtered$ensembl_gene_id, rownames(matrix_cpm_PRJEB7132)),
]

groups_selected <- c(
    "ICM", "Pre-EPI", "PostE-EPI", "PostL-EPI",
    "Gast1", "Gast2a", "Gast2b",
    "cyESCFF", "cyESCoF"
)

matrix_heatmap_corr <- map(groups_selected, function(x) {
    cells_in_group <- embedding_cy %>%
        filter(group == x) %>%
        pull(cell)
    Matrix::rowMeans(matrix_readcount_use2[, cells_in_group])
}) %>%
    bind_cols() %>%
    as.matrix()

## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## * NA -> ...5
## * ...

colnames(matrix_heatmap_corr) <- groups_selected
rownames(matrix_heatmap_corr) <- rownames(matrix_readcount_use2)

matrix_heatmap_corr <- cbind(
    matrix_heatmap_corr,
    H9 = Matrix::rowMeans(
        matrix_cpm_PRJEB7132_[, c("ERS537888", "ERS537890", "ERS537878")]
    ),
    H9_reset = Matrix::rowMeans(
        matrix_cpm_PRJEB7132_[, c("ERS537884", "ERS537881", "ERS537876")]
    )
)

# cleanup
rm(matrix_readcount_use2)
rm(matrix_cpm_PRJEB7132_)

matrix_heatmap_corr <- matrix_heatmap_corr[
    gene_homologs_filtered$ENTREZID_cy %in% table_s2_sheet4$macFas5_entrez_id,
]

matrix_heatmap_corr %>% head()

##                  ICM   Pre-EPI  PostE-EPI  PostL-EPI      Gast1    Gast2a
## SMIM1    151.1363033 93.115194   3.820487   7.118101   8.682972  10.24251
## FBXO2      1.4469895 13.184514 102.988591  54.911807  54.469006  38.52027
## MAD2L2    24.9603647 61.772933 279.058067 204.688162 114.454772 123.24133
## DRAXIN     0.6368063  1.544775  41.484816  57.921171  19.596015  16.83393
## FBLIM1     5.6753280 41.920067 109.251597  60.510816 107.413512 215.56966
## ARHGEF19   0.3884385  3.066446  21.705624  56.140386  38.851713  27.09776
##             Gast2b   cyESCFF    cyESCoF         H9  H9_reset
## SMIM1     5.740824  14.18091   7.773821   2.190192  2.617383
## FBXO2    16.046803  46.32920  70.165794  12.201141  0.702835
## MAD2L2   87.188329 169.93101 209.349509 131.947331 69.062950
## DRAXIN   52.722563  40.49054  31.154642  81.348664  7.799315
## FBLIM1   46.282181 154.37242 260.867845  67.786364 25.493542
## ARHGEF19 17.285630  56.76092  28.440670  12.810655  3.328117

Fig. 5c (only includes Ref. 36)

corr_heatmap <- cor(log10(matrix_heatmap_corr + 1))
corr_heatmap[lower.tri(corr_heatmap)] <- NA

corr_heatmap %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_x") %>%
    pivot_longer(
        -c("sample_x"),
        names_to = "sample_y"
    ) %>%
    as.data.frame() %>%
    filter(!is.na(value)) %>%
    # round(value, digits = 2)
    mutate(
        sample_x = factor(
            sample_x,
            levels = c(groups_selected, "H9", "H9_reset"),
        ),
        sample_y = factor(
            sample_y,
            levels = c(groups_selected, "H9", "H9_reset") %>% rev(.)
        )
    ) %>%
    plot_corr_heatmap(
        x = sample_x,
        y = sample_y,
        z = value
    ) +
    scale_fill_gradientn(
        # limits = c(-1, 1),
        colors = wesanderson::wes_palette(
            "Zissou1",
            21,
            type = "continuous"
        )
    )

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

corr_heatmap <- cor(log10(matrix_heatmap_corr + 1))
# colnames(corr_heatmap) == rownames(corr_heatmap)
corr_heatmap <- corr_heatmap[!rownames(corr_heatmap) %in% c("H9", "H9_reset"), ]

ha_columns <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        c(groups_selected, "H9", "H9_reset"),
        # pch = anno_labels_cluster,
        col = c(
            setNames(
                object = ggthemes::tableau_color_pal("Tableau 20")(
                    embedding_cy$group %>%
                        nlevels()
                ),
                nm = levels(embedding_cy$group)
            )[groups_selected],
            "H9" = "pink",
            "H9_reset" = "purple"
        ),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    #
    Species = ComplexHeatmap::anno_simple(
        c(
            rep("Cynomolgus Monkey", length(groups_selected)),
            rep("Human", 2)
        ),
        # pch = anno_labels_cluster,
        col = c("Cynomolgus Monkey" = "#3D79F3FF", "Human" = "#34A74BFF"),
        which = "column",
        pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        simple_anno_size = unit(3, "mm")
    ),
    show_annotation_name = TRUE,
    annotation_label = c(
        "Lineage",
        "Species"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    annotation_name_side = "left"
)

ha_left <- ComplexHeatmap::HeatmapAnnotation(
    lineage = ComplexHeatmap::anno_simple(
        groups_selected,
        col = setNames(
            object = ggthemes::tableau_color_pal("Tableau 20")(
                embedding_cy$group %>%
                    nlevels()
            ),
            nm = levels(embedding_cy$group)
        )[groups_selected],
        which = "row",
        # pt_size = unit(2, "mm"),
        pt_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 5
        ),
        simple_anno_size = unit(3, "mm")
    ),
    which = "row",
    show_annotation_name = FALSE,
    annotation_label = c(
        "Lineage"
    ),
    annotation_name_gp = grid::gpar(fontfamily = "Arial", fontsize = 6)
)

ht <- ComplexHeatmap::Heatmap(
    corr_heatmap,
    # name = "heatmap",
    col = wesanderson::wes_palette("Zissou1", 50, type = "continuous"),
    border = NA,
    rect_gp = grid::gpar(col = "white", lwd = 1),
    cell_fun = function(j, i, x, y, width, height, fill) {
        # grid::grid.text(sprintf("%.2f", corr_heatmap[i, j]), x, y, gp = grid::gpar(fontsize = 6))
        grid::grid.text(round(corr_heatmap[i, j], 2), x, y, gp = grid::gpar(fontsize = 6))
    },
    #
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    #
    ## row_labels = str_remove(string = rownames(matrix_heatmap), pattern = "^E.+_"),
    row_names_side = c("left"),
    row_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    show_row_names = TRUE,
    #
    column_names_side = c("top"),
    column_names_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_names_rot = -90,
    show_column_names = TRUE,
    #
    show_row_dend = FALSE,
    show_column_dend = FALSE,
    #
    top_annotation = ha_columns,
    left_annotation = ha_left,
    #
    heatmap_legend_param = list(
        title = "Corr",
        title_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_direction = "vertical",
        labels_gp = grid::gpar(
            fontfamily = "Arial",
            fontsize = 6
        ),
        legend_height = unit(25, "mm"),
        legend_width = unit(10, "mm")
    ),
    #
    column_split = c(
        rep("Cynomolgus Monkey", length(groups_selected)),
        rep("Human", 2)
    ),
    column_gap = unit(2, "mm"),
    column_title_gp = grid::gpar(fontfamily = "Arial", fontsize = 6),
    column_title_rot = -90,
    column_title = NULL
)

lgd_lineage <- ComplexHeatmap::Legend(
    title = "Lineage",
    labels = groups_selected, # embedding_cyEPI_mmEPI %>% pull(group) %>% droplevels() %>% levels(),
    legend_gp = grid::gpar(
        fill = c(
            setNames(
                object = ggthemes::tableau_color_pal("Tableau 20")(
                    embedding_cy$group %>%
                        nlevels()
                ),
                nm = levels(embedding_cy$group)
            )[groups_selected],
            "H9" = "pink",
            "H9_reset" = "purple"
        )
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

lgd_species <- ComplexHeatmap::Legend(
    title = "Species",
    labels = c("Cynomolgus Monkey", "Human"),
    legend_gp = grid::gpar(
        fill = c("Cynomolgus Monkey" = "#3D79F3FF", "Human" = "#34A74BFF")
    ),
    labels_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    ),
    title_gp = grid::gpar(
        fontfamily = "Arial",
        fontsize = 6
    )
)

pd <- ComplexHeatmap::packLegend(
    # lgd_lineage,
    lgd_species,
    # gap = unit(8, "mm"),
    direction = "vertical"
)

Fig. 5c (only includes Ref. 36)

ComplexHeatmap::draw(
    ht,
    heatmap_legend_list = list(pd)
)

R session info

devtools::session_info()$platform

##  setting  value                       
##  version  R version 4.0.2 (2020-06-22)
##  os       macOS Catalina 10.15.6      
##  system   x86_64, darwin19.5.0        
##  ui       unknown                     
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2020-08-18

devtools::session_info()$pack %>%
    as_tibble() %>%
    dplyr::select(
        package,
        loadedversion,
        date,
        `source`
    ) %>%
    # print(n = nrow(.))
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")

package	loadedversion	date	source
AnnotationDbi	1.50.3	2020-07-25	Bioconductor
askpass	1.1	2019-01-13	CRAN (R 4.0.0)
assertthat	0.2.1	2019-03-21	CRAN (R 4.0.0)
backports	1.1.8	2020-06-17	CRAN (R 4.0.1)
BayesFactor	0.9.12-4.2	2018-05-19	CRAN (R 4.0.2)
Biobase	2.48.0	2020-04-27	Bioconductor
BiocFileCache	1.12.1	2020-08-04	Bioconductor
BiocGenerics	0.34.0	2020-04-27	Bioconductor
biomaRt	2.44.1	2020-06-17	Bioconductor
bit	4.0.4	2020-08-04	CRAN (R 4.0.2)
bit64	4.0.2	2020-07-30	CRAN (R 4.0.2)
blob	1.2.1	2020-01-20	CRAN (R 4.0.0)
broom	0.7.0.9001	2020-08-17	Github (tidymodels/broom@591f87e)
callr	3.4.3.9000	2020-07-31	Github (r-lib/callr@b96da8f)
cellranger	1.1.0	2016-07-27	CRAN (R 4.0.0)
checkmate	2.0.0	2020-02-06	CRAN (R 4.0.0)
circlize	0.4.10	2020-06-15	CRAN (R 4.0.1)
cli	2.0.2	2020-02-28	CRAN (R 4.0.0)
clue	0.3-57	2019-02-25	CRAN (R 4.0.0)
cluster	2.1.0	2019-06-19	CRAN (R 4.0.2)
coda	0.19-3	2019-07-05	CRAN (R 4.0.0)
codetools	0.2-16	2018-12-24	CRAN (R 4.0.2)
colorspace	1.4-1	2019-03-18	CRAN (R 4.0.0)
ComplexHeatmap	2.4.3	2020-07-25	Bioconductor
crayon	1.3.4	2017-09-16	CRAN (R 4.0.0)
curl	4.3	2019-12-02	CRAN (R 4.0.0)
DBI	1.1.0	2019-12-15	CRAN (R 4.0.0)
dbplyr	1.4.4.9000	2020-07-28	Github (tidyverse/dbplyr@a6ed629)
desc	1.2.0	2018-05-01	CRAN (R 4.0.0)
devtools	2.3.1.9000	2020-07-31	Github (r-lib/devtools@df619ce)
digest	0.6.25	2020-02-23	CRAN (R 4.0.0)
dplyr	1.0.2	2020-08-12	Github (tidyverse/dplyr@0bea3e8)
ellipsis	0.3.1.9000	2020-07-18	Github (r-lib/ellipsis@57a5071)
evaluate	0.14	2019-05-28	CRAN (R 4.0.0)
extrafont	0.17	2014-12-08	CRAN (R 4.0.2)
extrafontdb	1.0	2012-06-11	CRAN (R 4.0.0)
fansi	0.4.1	2020-01-08	CRAN (R 4.0.0)
farver	2.0.3	2020-01-16	CRAN (R 4.0.0)
forcats	0.5.0.9000	2020-05-28	Github (tidyverse/forcats@ab81d1b)
fs	1.5.0.9000	2020-08-03	Github (r-lib/fs@93e70a9)
generics	0.0.2	2018-11-29	CRAN (R 4.0.0)
GetoptLong	1.0.2	2020-07-06	CRAN (R 4.0.2)
ggplot2	3.3.2.9000	2020-08-14	Github (tidyverse/ggplot2@3be0acc)
ggrepel	0.9.0	2020-07-24	Github (slowkow/ggrepel@4d0ef50)
ggthemes	4.2.0	2019-05-13	CRAN (R 4.0.0)
GlobalOptions	0.1.2	2020-06-10	CRAN (R 4.0.0)
glue	1.4.1.9000	2020-07-07	Github (tidyverse/glue@205f18b)
gt	0.2.2	2020-08-06	Github (rstudio/gt@c97cb4c)
gtable	0.3.0	2019-03-25	CRAN (R 4.0.0)
gtools	3.8.2	2020-03-31	CRAN (R 4.0.0)
haven	2.3.1	2020-06-01	CRAN (R 4.0.0)
hms	0.5.3	2020-01-08	CRAN (R 4.0.0)
htmltools	0.5.0	2020-06-16	CRAN (R 4.0.1)
httr	1.4.2	2020-07-20	CRAN (R 4.0.2)
IRanges	2.22.2	2020-05-21	Bioconductor
jpeg	0.1-8.1	2019-10-24	CRAN (R 4.0.0)
jsonlite	1.7.0	2020-06-25	CRAN (R 4.0.2)
knitr	1.29	2020-06-23	CRAN (R 4.0.2)
labeling	0.3	2014-08-23	CRAN (R 4.0.0)
lattice	0.20-41	2020-04-02	CRAN (R 4.0.2)
lifecycle	0.2.0	2020-03-06	CRAN (R 4.0.0)
lubridate	1.7.9	2020-07-11	Github (tidyverse/lubridate@de2ee09)
magrittr	1.5.0.9000	2020-08-04	Github (tidyverse/magrittr@ba52096)
Matrix	1.2-18	2019-11-27	CRAN (R 4.0.2)
MatrixModels	0.4-1	2015-08-22	CRAN (R 4.0.0)
memoise	1.1.0	2017-04-21	CRAN (R 4.0.0)
modelr	0.1.8.9000	2020-05-19	Github (tidyverse/modelr@16168e0)
munsell	0.5.0	2018-06-12	CRAN (R 4.0.0)
mvtnorm	1.1-1	2020-06-09	CRAN (R 4.0.2)
openssl	1.4.2	2020-06-27	CRAN (R 4.0.2)
org.Hs.eg.db	3.11.4	2020-05-27	Bioconductor
org.Mm.eg.db	3.11.4	2020-06-07	Bioconductor
patchwork	1.0.1.9000	2020-06-22	Github (thomasp85/patchwork@82a5e03)
pbapply	1.4-2	2019-08-31	CRAN (R 4.0.0)
pillar	1.4.6.9000	2020-07-21	Github (r-lib/pillar@8aef8f2)
pkgbuild	1.1.0.9000	2020-07-14	Github (r-lib/pkgbuild@3a87bd9)
pkgconfig	2.0.3	2019-09-22	CRAN (R 4.0.0)
pkgload	1.1.0	2020-05-29	CRAN (R 4.0.0)
png	0.1-7	2013-12-03	CRAN (R 4.0.0)
prettyunits	1.1.1.9000	2020-08-10	Github (r-lib/prettyunits@b1cdad8)
processx	3.4.3	2020-07-05	CRAN (R 4.0.2)
progress	1.2.2	2019-05-16	CRAN (R 4.0.0)
ps	1.3.4	2020-08-11	CRAN (R 4.0.2)
purrr	0.3.4.9000	2020-08-10	Github (tidyverse/purrr@5de5ad2)
pvclust	2.2-0	2020-04-25	Github (shimo-lab/pvclust@e734d05)
R6	2.4.1.9000	2020-07-18	Github (r-lib/R6@1415d11)
rappdirs	0.3.1	2016-03-28	CRAN (R 4.0.0)
RColorBrewer	1.1-2	2014-12-07	CRAN (R 4.0.0)
Rcpp	1.0.5	2020-07-06	CRAN (R 4.0.2)
readr	1.3.1.9000	2020-07-16	Github (tidyverse/readr@2ab51b2)
readxl	1.3.1.9000	2020-05-28	Github (tidyverse/readxl@3815961)
remotes	2.2.0.9000	2020-08-06	Github (r-lib/remotes@5a546ad)
reprex	0.3.0	2019-05-16	CRAN (R 4.0.0)
reticulate	1.16	2020-05-27	CRAN (R 4.0.2)
rjson	0.2.20	2018-06-08	CRAN (R 4.0.0)
rlang	0.4.7.9000	2020-08-12	Github (r-lib/rlang@de0c176)
rmarkdown	2.3.3	2020-07-25	Github (rstudio/rmarkdown@204aa41)
rprojroot	1.3-2	2018-01-03	CRAN (R 4.0.0)
RSQLite	2.2.0	2020-01-07	CRAN (R 4.0.0)
rstudioapi	0.11	2020-02-07	CRAN (R 4.0.2)
Rttf2pt1	1.3.8	2020-01-10	CRAN (R 4.0.0)
rvest	0.3.6	2020-07-25	CRAN (R 4.0.2)
S4Vectors	0.26.1	2020-05-16	Bioconductor
sass	0.2.0	2020-03-18	CRAN (R 4.0.2)
scales	1.1.1.9000	2020-07-24	Github (r-lib/scales@9ff4757)
sessioninfo	1.1.1.9000	2020-07-18	Github (r-lib/sessioninfo@791705b)
shape	1.4.4	2018-02-07	CRAN (R 4.0.0)
stringi	1.4.6	2020-02-17	CRAN (R 4.0.0)
stringr	1.4.0.9000	2020-06-01	Github (tidyverse/stringr@f70c4ba)
styler	1.3.2.9000	2020-08-13	Github (r-lib/styler@7376cf4)
testthat	2.99.0.9000	2020-08-12	Github (r-lib/testthat@9e643d8)
tibble	3.0.3.9000	2020-07-21	Github (tidyverse/tibble@b4eec19)
tidyr	1.1.1.9000	2020-08-18	Github (tidyverse/tidyr@4d0dd69)
tidyselect	1.1.0.9000	2020-07-11	Github (tidyverse/tidyselect@69fdc96)
tidyverse	1.3.0.9000	2020-06-01	Github (hadley/tidyverse@8a0bb99)
usethis	1.6.1.9001	2020-08-17	Github (r-lib/usethis@860c1ea)
vctrs	0.3.2.9000	2020-08-18	Github (r-lib/vctrs@c0dcd3f)
viridisLite	0.3.0	2018-02-01	CRAN (R 4.0.0)
wesanderson	0.3.6.9000	2020-06-05	Github (karthik/wesanderson@d90700a)
withr	2.2.0	2020-04-20	CRAN (R 4.0.0)
xfun	0.16	2020-07-24	CRAN (R 4.0.2)
XML	3.99-0.5	2020-07-23	CRAN (R 4.0.2)
xml2	1.3.2	2020-04-23	CRAN (R 4.0.0)
yaml	2.2.1	2020-02-01	CRAN (R 4.0.0)
yarrr	0.1.6	2020-07-23	Github (ndphillips/yarrr@e2e4488)

Transcriptomic analysis of developmental coordinate of pluripotency among mice, monkeys and humans

JD

2020-08-18

Data preparation

Functions loading

Data loading

Lineage delineation by transcriptome

Transcriptome clustering

Heatmap of selected marker genes

Progressive transitions of cyEPI

Dimensionality reduction

Preprocessing

PCA

Heatmap

Mouse EPI development

Cynomolgus vs mouse EPI development

Matching homologs

Dimensionality reduction

Preprocessing

PCA

Comparion of cyEPI and mEPI

Species specific

Developmental stage

Correlation heatmap

cyEPI development vs hPSCs/cyPSCs

R session info