Linneberg-Agerholm, M., Wong, Y.F., Romero Herrera, J.A., Monteiro, R.S., Anderson, K.G.V., and Brickman, J.M. (2019). Naïve human pluripotent stem cells respond to Wnt, Nodal and LIF signalling to produce expandable naïve extra-embryonic endoderm. Development 146.

  • BioProject Accession: PRJNA574150
  • GEO Accession: GSE138012



Load required packages.

library(tidyverse)
library(magrittr)
library(Matrix)
library(extrafont)
library(ggrepel)
library(patchwork)
# library(tidylog)
Sys.time()
## [1] "2020-08-10 23:46:59 CDT"

Data preparation

SEED_USE <- 20200616
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE <- 30

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)

Data loading

Prepare metadata for single cells.

# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE138012
geo_info <- tibble::tribble(
    ~sample_name, ~sample_description,
    "GSM4096610",         "H9_pES_R1",
    "GSM4096611",         "H9_pES_R2",
    "GSM4096612",  "H9_t2iLGo_nES_R2",
    "GSM4096613",  "H9_t2iLGo_nES_R1",
    "GSM4096614",  "H9_t2iLGo_nES_R3",
    "GSM4096615",    "H9_h2iL_nES_R1",
    "GSM4096616",    "H9_h2iL_nES_R2",
    "GSM4096617",    "H9_h2iL_nES_R3",
    "GSM4096618",  "H9_t2iLGo_PrE_R2",
    "GSM4096619",  "H9_t2iLGo_PrE_R3",
    "GSM4096620",  "H9_t2iLGo_PrE_R1",
    "GSM4096621",    "H9_h2iL_PrE_R2",
    "GSM4096622",    "H9_h2iL_PrE_R3",
    "GSM4096623",    "H9_h2iL_PrE_R1",
    "GSM4096624",      "HUES4_pES_R1",
    "GSM4096625",      "HUES4_pES_R2",
    "GSM4096626",         "H9_ADE_R1",
    "GSM4096627",         "H9_ADE_R2",
    "GSM4096628",      "HUES4_ADE_R2",
    "GSM4096629",      "HUES4_ADE_R1",
    "GSM4096630",    "H9_RSeT_nES_R1",
    "GSM4096631",    "H9_RSeT_nES_R2",
    "GSM4096632", "H9_t2iLGo_nEnd_R1",
    "GSM4096633", "H9_t2iLGo_nEnd_R2",
    "GSM4096634", "H9_t2iLGo_nEnd_R3",
    "GSM4096635",  "H9_RSeT_PrE_R1-1",
    "GSM4096636",  "H9_RSeT_PrE_R2-1",
    "GSM4096637",          "H9_DE_R1",
    "GSM4096638",          "H9_DE_R2",
    "GSM4096639",          "H9_DE_R3",
    "GSM4096640",  "H9_RSeT_PrE_R1-2",
    "GSM4096641",  "H9_RSeT_PrE_R2-2",
    "GSM4096642",   "H9_RSeT_nEnd_R1",
    "GSM4096643",   "H9_RSeT_nEnd_R2",
    "GSM4096644",   "H9_RSeT_nEnd_R3"
)
cell_metadata <- read_delim(
    file = "../SraRunTable.txt",
    delim = ","
) %>%
    rename_all(. %>% tolower()) %>%
    `colnames<-`(str_replace(
        string = colnames(.),
        pattern = " ", replacement = "_"
    )) %>%
    select(
        sample_name,
        run,
        biosample,
        cell_type,
        background,
        source_name,
        stage
    ) %>%
    left_join(geo_info, by = "sample_name") %>%
    mutate(
        sample_description2 = str_remove(
            string = sample_description,
            pattern = "_R\\d.*$"
        ),
        lineage = str_remove(
            string = sample_description2, pattern = ".+_"
        ),
        lineage = factor(
            lineage,
            levels = c("nES", "pES", "PrE", "nEnd", "DE", "ADE")
        )
    ) %>%
    dplyr::rename(cell_line = "background")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   AvgSpotLen = col_double(),
##   Bases = col_double(),
##   Bytes = col_double(),
##   ReleaseDate = col_datetime(format = "")
## )
## See spec(...) for full column specifications.
cell_metadata
cell_metadata %>%
    dplyr::count(lineage) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_samples") %>%
    gt::tab_options(table.font.size = "median")
lineage num_samples
nES 8
pES 4
PrE 10
nEnd 6
DE 3
ADE 4

Load featureCounts output.

matrix_readcount_use <- read_delim(
    file = "../matrix/xaa_Aligned.sortedByCoord.out_deduped_q10_gene_id_featureCounts.txt.gz",
    delim = "\t",
    col_names = TRUE,
    skip = 1
) %>%
    select(-(2:6))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Geneid = col_character(),
##   Chr = col_character(),
##   Start = col_character(),
##   End = col_character(),
##   Strand = col_character()
## )
## See spec(...) for full column specifications.
colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) %>%
    str_remove(
        pattern = "_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam"
    ) %>%
    str_remove(pattern = "../aln/")

matrix_readcount_use <- Matrix(
    data = as.matrix(matrix_readcount_use[, -1]),
    dimnames = list(
        matrix_readcount_use$Geneid,
        colnames(matrix_readcount_use)[-1]
    ),
    sparse = TRUE
)

matrix_readcount_use <- matrix_readcount_use[, colnames(matrix_readcount_use)]
matrix_readcount_use %>%
    dim()
## [1] 33538    35
stopifnot(
    rownames(matrix_readcount_use) == gene_symbo_info$X1
)
rownames(matrix_readcount_use) <- paste(
    rownames(matrix_readcount_use),
    gene_symbo_info$X2,
    sep = "_"
)

colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) %>%
    enframe(value = "run") %>%
    left_join(cell_metadata, by = c("run" = "run")) %>%
    pull("sample_name")


matrix_cpm_use <- calc_cpm(m = matrix_readcount_use)

Preprocessing

Summarize sequencing statistics.

cell_metadata %>%
    mutate(
        num_umis = matrix_readcount_use[, .$sample_name] %>% colSums(),
        num_genes = (matrix_readcount_use[, .$sample_name] > 0) %>% colSums()
    ) %>%
    group_by(cell_type) %>%
    summarise(
        num_samples = n(),
        num_umis = median(num_umis),
        num_genes = median(num_genes)
    ) %>%
    arrange(cell_type) %>%
    as.data.frame() %>%
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")
## `summarise()` ungrouping output (override with `.groups` argument)

cell_type num_samples num_umis num_genes
Conventional pluripotent stem cells 4 7886898 21352.0
Endoderm 7 10541488 21312.0
Extra-embryonic endoderm 16 10733904 21656.0
Naive pluripotent stem cells 8 10144388 20880.5

cell_metadata %>%
    mutate(
        num_umis = matrix_readcount_use[, .$sample_name] %>% colSums(),
        num_genes = (matrix_readcount_use[, .$sample_name] > 0) %>% colSums()
    ) %>%
    group_by(lineage) %>%
    summarise(
        num_samples = n(),
        num_umis = median(num_umis),
        num_genes = median(num_genes)
    ) %>%
    arrange(lineage) %>%
    as.data.frame() %>%
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")
## `summarise()` ungrouping output (override with `.groups` argument)

lineage num_samples num_umis num_genes
nES 8 10144388 20880.5
pES 4 7886898 21352.0
PrE 10 10912106 21923.5
nEnd 6 10372790 20912.5
DE 3 11025105 21312.0
ADE 4 9065590 21527.0

Filter uninformative genes.

matrix_readcount_norm <- matrix_readcount_use
matrix_readcount_norm <- matrix_readcount_norm[
    Matrix::rowSums(
        matrix_readcount_norm
    ) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]
dim(matrix_readcount_norm)
## [1] 22274    35

Median normalize matrix.

matrix_readcount_norm@x <- median(colSums(matrix_readcount_norm)) *
    (matrix_readcount_norm@x / rep.int(
        colSums(matrix_readcount_norm),
        diff(matrix_readcount_norm@p)
    ))

Dimensionality reduction

PCA

matrix_readcount_norm_log <- matrix_readcount_norm
matrix_readcount_norm_log@x <- log1p(matrix_readcount_norm_log@x)

# z-score
matrix_readcount_norm_log_scaled <- t(
    scale(
        t(
            matrix_readcount_norm_log
        ),
        center = TRUE, scale = TRUE
    )
)
# features_var <- apply(matrix_readcount_norm_log_scaled, 1, var)
# features_use <- rownames(matrix_readcount_norm_log_scaled)[features_var > 0]

pca_out <- prcomp(
    t(matrix_readcount_norm_log_scaled),
    center = FALSE,
    scale = FALSE
)
summary(pca_out)$imp %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column(var = "component") %>%
    mutate(
        rank = 1:n()
    ) %>%
    # slice(1:33) %>%
    ggplot(
        aes(
            x = rank,
            y = `Proportion of Variance`
        )
    ) +
    geom_point(size = 0.3) +
    theme_bw() +
    scale_x_continuous(
        name = "Component",
        breaks = c(1, seq(5, 30, 5))
    ) +
    scale_y_continuous(
        name = "Proportion of variance",
        labels = scales::percent
    ) +
    theme(
        axis.title = ggplot2::element_text(family = "Arial", size = 6),
        axis.text = ggplot2::element_text(family = "Arial", size = 6),
        panel.grid.minor = ggplot2::element_blank()
    )

file_name <- "Rplot_pca_variance_explained.pdf"
if (!file.exists(file_name)) {
    ggsave(
        filename = file_name,
        useDingbats = FALSE,
        plot = last_plot(),
        device = NULL,
        path = NULL,
        scale = 1,
        width = 55 * 1.5,
        height = 55,
        units = c("mm"),
    )
}
embedding <- pca_out$x %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_name") %>%
    select(sample_name:PC3) %>%
    left_join(cell_metadata, by = "sample_name") %>%
    dplyr::rename(
        x_pca = PC1,
        y_pca = PC2,
        z_pca = PC3
    ) %>%
    relocate(x_pca, y_pca, z_pca, .after = last_col())
p_embedding_pca1 <- embedding %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_pca,
        y = y_pca,
        color = lineage,
        shape = cell_line
    ) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding$lineage))
        )
    ) +
    scale_shape_manual(values = c(15, 16)) + # seq(length(unique(embedding$cell_line)))) +
    add_xy_label_pca()


p_embedding_pca2 <- embedding %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_pca,
        y = z_pca,
        color = lineage,
        shape = cell_line
    ) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding$lineage))
        )
    ) +
    scale_shape_manual(values = c(15, 16)) +
    add_xy_label_pca(y = "PC3")
p_embedding_pca1 +
    p_embedding_pca2 +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(guides = "collect")

file_name <- "Rplot_embedding_pca.pdf"
if (!file.exists(file_name)) {
    ggsave(
        filename = file_name,
        useDingbats = FALSE,
        plot = last_plot(),
        device = NULL,
        path = NULL,
        scale = 1,
        width = 55 * 2.5,
        height = 55,
        units = c("mm"),
    )
}

t-SNE

N_COMPONENTS <- 12

set.seed(seed = SEED_USE)
embedding_rtsne <- Rtsne::Rtsne(
    X = pca_out$x[, 1:N_COMPONENTS],
    perplexity = 5,
    check_duplicates = FALSE,
    pca = FALSE,
    max_iter = 3000,
    verbose = TRUE
)$Y
## Read the 35 x 12 data matrix successfully!
## Using no_dims = 2, perplexity = 5.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.556735)!
## Learning embedding...
## Iteration 50: error is 57.329608 (50 iterations in 0.00 seconds)
## Iteration 100: error is 57.746020 (50 iterations in 0.00 seconds)
## Iteration 150: error is 54.897412 (50 iterations in 0.00 seconds)
## Iteration 200: error is 59.825444 (50 iterations in 0.00 seconds)
## Iteration 250: error is 54.314094 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.980490 (50 iterations in 0.00 seconds)
## Iteration 350: error is 1.421003 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.850012 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.454588 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.284679 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.128779 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.114347 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.109010 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.084064 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.077202 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.076562 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.074529 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.078825 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.077370 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.272412 (50 iterations in 0.00 seconds)
## Iteration 1050: error is 0.079904 (50 iterations in 0.00 seconds)
## Iteration 1100: error is 0.073611 (50 iterations in 0.00 seconds)
## Iteration 1150: error is 0.074615 (50 iterations in 0.00 seconds)
## Iteration 1200: error is 0.074144 (50 iterations in 0.00 seconds)
## Iteration 1250: error is 0.154058 (50 iterations in 0.00 seconds)
## Iteration 1300: error is 0.078669 (50 iterations in 0.00 seconds)
## Iteration 1350: error is 0.077498 (50 iterations in 0.00 seconds)
## Iteration 1400: error is 0.076977 (50 iterations in 0.00 seconds)
## Iteration 1450: error is 0.076548 (50 iterations in 0.00 seconds)
## Iteration 1500: error is 0.077090 (50 iterations in 0.00 seconds)
## Iteration 1550: error is 0.074893 (50 iterations in 0.00 seconds)
## Iteration 1600: error is 0.073004 (50 iterations in 0.00 seconds)
## Iteration 1650: error is 0.072412 (50 iterations in 0.00 seconds)
## Iteration 1700: error is 0.077486 (50 iterations in 0.00 seconds)
## Iteration 1750: error is 0.078381 (50 iterations in 0.00 seconds)
## Iteration 1800: error is 0.077254 (50 iterations in 0.00 seconds)
## Iteration 1850: error is 0.077347 (50 iterations in 0.00 seconds)
## Iteration 1900: error is 0.078200 (50 iterations in 0.00 seconds)
## Iteration 1950: error is 0.075983 (50 iterations in 0.00 seconds)
## Iteration 2000: error is 0.076613 (50 iterations in 0.00 seconds)
## Iteration 2050: error is 0.078509 (50 iterations in 0.00 seconds)
## Iteration 2100: error is 0.077969 (50 iterations in 0.00 seconds)
## Iteration 2150: error is 0.077406 (50 iterations in 0.00 seconds)
## Iteration 2200: error is 0.078023 (50 iterations in 0.00 seconds)
## Iteration 2250: error is 0.076945 (50 iterations in 0.00 seconds)
## Iteration 2300: error is 0.077633 (50 iterations in 0.00 seconds)
## Iteration 2350: error is 0.076848 (50 iterations in 0.00 seconds)
## Iteration 2400: error is 0.076851 (50 iterations in 0.00 seconds)
## Iteration 2450: error is 0.076830 (50 iterations in 0.00 seconds)
## Iteration 2500: error is 0.076023 (50 iterations in 0.00 seconds)
## Iteration 2550: error is 0.073397 (50 iterations in 0.00 seconds)
## Iteration 2600: error is 0.077205 (50 iterations in 0.00 seconds)
## Iteration 2650: error is 0.076949 (50 iterations in 0.00 seconds)
## Iteration 2700: error is 0.078875 (50 iterations in 0.00 seconds)
## Iteration 2750: error is 0.077259 (50 iterations in 0.00 seconds)
## Iteration 2800: error is 0.077331 (50 iterations in 0.00 seconds)
## Iteration 2850: error is 0.076959 (50 iterations in 0.00 seconds)
## Iteration 2900: error is 0.076526 (50 iterations in 0.00 seconds)
## Iteration 2950: error is 0.072785 (50 iterations in 0.00 seconds)
## Iteration 3000: error is 0.077396 (50 iterations in 0.00 seconds)
## Fitting performed in 0.14 seconds.

embedding <- cbind(
    embedding,
    embedding_rtsne
) %>%
    dplyr::rename(
        x_tsne = "1",
        y_tsne = "2"
    )
p_embedding_tsne <- embedding %>%
    mutate(
        category = "t-SNE"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_tsne,
        y = y_tsne,
        color = lineage,
        shape = cell_line
    ) +
    labs(x = "Dim 1", y = "Dim 2") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding$lineage))
        )
    ) +
    scale_shape_manual(values = c(15, 16))
embedding %>%
    dplyr::count(lineage) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_samples") %>%
    gt::tab_options(table.font.size = "median")

lineage num_samples
nES 8
pES 4
PrE 10
nEnd 6
DE 3
ADE 4

UMAP

N_COMPONENTS <- 12

set.seed(seed = SEED_USE)
embedding_umap <- uwot::umap(
    X = pca_out$x[, 1:N_COMPONENTS],
    n_neighbors = 15,
    n_components = 2,
    metric = "cosine",
    spread = 1,
    min_dist = 0.01,
    verbose = TRUE
)
## 23:47:05 UMAP embedding parameters a = 1.896 b = 0.8006
## 23:47:05 Read 35 rows and found 12 numeric columns
## 23:47:05 Using Annoy for neighbor search, n_neighbors = 15
## 23:47:06 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:47:06 Writing NN index file to temp file /var/folders/h1/78b2tkd552ngjl6_ps_5sw7r0000gn/T//RtmpKzbwuV/file51cd2697bdc4
## 23:47:06 Searching Annoy index using 4 threads, search_k = 1500
## 23:47:06 Annoy recall = 100%
## 23:47:06 Commencing smooth kNN distance calibration using 4 threads
## 23:47:07 Initializing from normalized Laplacian + noise
## 23:47:07 Commencing optimization for 500 epochs, with 510 positive edges
## 23:47:07 Optimization finished

embedding <- cbind(
    embedding,
    embedding_umap
) %>%
    dplyr::rename(
        x_umap = "1",
        y_umap = "2"
    )
p_embedding_umap <- embedding %>%
    mutate(
        category = "UMAP"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_umap,
        y = y_umap,
        color = lineage,
        shape = cell_line
    ) +
    labs(x = "Dim 1", y = "Dim 2") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding$lineage))
        )
    ) +
    scale_shape_manual(values = c(15, 16))

Fig. 3C

p_embedding_tsne +
    p_embedding_umap +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(nrow = 1, guides = "collect")

file_name <- "Rplot_embedding.pdf"
if (!file.exists(file_name)) {
    ggsave(
        filename = file_name,
        useDingbats = FALSE,
        plot = last_plot(),
        device = NULL,
        path = NULL,
        scale = 1,
        width = 55 * 2.5,
        height = 55,
        units = c("mm"),
    )
}



Fig. 6BCE
Use whole transcriptomes.

p_embedding_pca1 +
    add_point_labels(
        x = x_pca,
        y = y_pca,
        z = sample_description2
    ) +
    p_embedding_tsne +
    add_point_labels(
        x = x_tsne,
        y = y_tsne,
        z = sample_description2
    ) +
    p_embedding_umap +
    add_point_labels(
        x = x_umap,
        y = y_umap,
        z = sample_description2
    ) +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(ncol = 1, guides = "collect")

Naïve pluripotency comparison

library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:Matrix':
## 
##     which
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
## 
##     expand
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:rlang':
## 
##     exprs
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
library("BiocParallel")
samples_naïve_H9_h2iL <- c(
    "GSM4096615",
    "GSM4096616",
    "GSM4096617"
)

samples_naïve_H9_t2iLGo <- c(
    "GSM4096612",
    "GSM4096613",
    "GSM4096614"
)
cts <- matrix_readcount_use[, c(samples_naïve_H9_h2iL, samples_naïve_H9_t2iLGo)]
col_data <- data.frame(
    sample = c(
        rep("naïve_H9_h2iL", times = length(samples_naïve_H9_h2iL)),
        rep("naïve_H9_t2iLGo", times = length(samples_naïve_H9_t2iLGo))
    ),
    row.names = c(samples_naïve_H9_h2iL, samples_naïve_H9_t2iLGo)
) %>%
    mutate(
        sample = factor(
            sample,
            levels = c("naïve_H9_t2iLGo", "naïve_H9_h2iL")
        )
    )

dds <- DESeqDataSetFromMatrix(
    countData = cts,
    colData = col_data,
    design = ~sample
)
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

dds <- DESeq(dds)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing

deseq2_out <- results(dds)
# mcols(deseq2_out)
deseq2_out
## log2 fold change (MLE): sample naïve H9 h2iL vs naïve H9 t2iLGo 
## Wald test p-value: sample naïve H9 h2iL vs naïve H9 t2iLGo 
## DataFrame with 33538 rows and 6 columns
##                              baseMean log2FoldChange     lfcSE      stat
##                             <numeric>      <numeric> <numeric> <numeric>
## ENSG00000243485_MIR1302-2HG  0.189347      -1.179870   4.08047 -0.289150
## ENSG00000237613_FAM138A      0.000000             NA        NA        NA
## ENSG00000186092_OR4F5        0.000000             NA        NA        NA
## ENSG00000238009_AL627309.1   0.145485       0.743684   4.08047  0.182254
## ENSG00000239945_AL627309.3   0.000000             NA        NA        NA
## ...                               ...            ...       ...       ...
## ENSG00000277856_AC233755.2    0.00000             NA        NA        NA
## ENSG00000275063_AC233755.1    0.00000             NA        NA        NA
## ENSG00000271254_AC240274.1   24.21898      -0.751286  0.396887 -1.892947
## ENSG00000277475_AC213203.1    1.95519       0.794062  1.365314  0.581596
## ENSG00000268674_FAM231C       0.00000             NA        NA        NA
##                                pvalue      padj
##                             <numeric> <numeric>
## ENSG00000243485_MIR1302-2HG  0.772466        NA
## ENSG00000237613_FAM138A            NA        NA
## ENSG00000186092_OR4F5              NA        NA
## ENSG00000238009_AL627309.1   0.855383        NA
## ENSG00000239945_AL627309.3         NA        NA
## ...                               ...       ...
## ENSG00000277856_AC233755.2         NA        NA
## ENSG00000275063_AC233755.1         NA        NA
## ENSG00000271254_AC240274.1   0.058365  0.175178
## ENSG00000277475_AC213203.1   0.560839        NA
## ENSG00000268674_FAM231C            NA        NA
summary(deseq2_out)
## 
## out of 24527 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1901, 7.8%
## LFC < 0 (down)     : 2051, 8.4%
## outliers [1]       : 0, 0%
## low counts [2]     : 9793, 40%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results


Fig. 4G

FEATURES_TO_LABEL <- c(
    "EGR1",
    "CDH8",
    "THBS1",
    "DPT"
)

FEATURES_TO_LABEL <- rownames(matrix_readcount_use)[
    gene_symbo_info$X2 %in% FEATURES_TO_LABEL
]

deseq2_out %>%
    as.data.frame() %>%
    mutate(
        group = case_when(
            log2FoldChange >= 1.5 & padj <= 0.05 ~ "High in naïve_H9_h2iL",
            log2FoldChange <= -1.5 & padj <= 0.05 ~ "High in naïve_H9_t2iLGo",
            TRUE ~ "Not significant"
        )
    ) %>%
    plot_volcano(
        x = log2FoldChange,
        y = -log10(padj),
        z = group, xlim = c(-6, 6)
    ) +
    geom_text_repel(
        data = as.data.frame(deseq2_out)[FEATURES_TO_LABEL, ] %>%
            mutate(
                symbol = rownames(.),
                symbol = str_remove(string = symbol, pattern = "^E.+_")
            ),
        aes(log2FoldChange,
            -log10(padj),
            label = symbol
        ),
        box.padding = 0.5,
        size = 1.8,
        family = "Arial",
        color = "black",
        max.iter = 10000,
        max.overlaps = Inf,
        nudge_x = .15,
        nudge_y = 1,
        segment.curvature = -0.1,
        segment.ncp = 3,
        segment.angle = 20,
    )
## Warning: Removed 18804 rows containing missing values (geom_point).

Expression

Heatmap

embedding %>%
    group_by(sample_description2) %>%
    summarise(
        num_samples = n()
    ) %>%
    gt::gt() %>%
    gt::cols_label(sample_description2 = "group") %>%
    gt::tab_options(table.font.size = "median")
## `summarise()` ungrouping output (override with `.groups` argument)
group num_samples
H9_ADE 2
H9_DE 3
H9_h2iL_nES 3
H9_h2iL_PrE 3
H9_pES 2
H9_RSeT_nEnd 3
H9_RSeT_nES 2
H9_RSeT_PrE 4
H9_t2iLGo_nEnd 3
H9_t2iLGo_nES 3
H9_t2iLGo_PrE 3
HUES4_ADE 2
HUES4_pES 2


FEATUES_SELECTED <- c(
    "ENSG00000159231_CBR3",
    "ENSG00000107562_CXCL12",
    "ENSG00000143494_VASH2",
    "ENSG00000241186_TDGF1",
    "ENSG00000091972_CD200",
    "ENSG00000181449_SOX2",
    "ENSG00000156574_NODAL",
    "ENSG00000184344_GDF3",
    "ENSG00000165140_FBP1",
    "ENSG00000203909_DPPA5",
    "ENSG00000075388_FGF4",
    "ENSG00000048545_GUCA1A",
    "ENSG00000147596_PRDM14",
    "ENSG00000111704_NANOG",
    "ENSG00000143768_LEFTY2",
    "ENSG00000171872_KLF17",
    "ENSG00000004848_ARX",
    "ENSG00000186103_ARGFX",
    "ENSG00000105370_LIM2",
    #
    "ENSG00000146374_RSPO3",
    "ENSG00000275410_HNF1B",
    "ENSG00000196188_CTSE",
    "ENSG00000164736_SOX17",
    "ENSG00000050730_TNIP3",
    "ENSG00000148702_HABP2",
    "ENSG00000162998_FRZB",
    "ENSG00000128564_VGF",
    "ENSG00000170558_CDH2",
    "ENSG00000134853_PDGFRA",
    "ENSG00000153162_BMP6",
    "ENSG00000125848_FLRT3",
    "ENSG00000187498_COL4A1",
    "ENSG00000134871_COL4A2",
    "ENSG00000115414_FN1",
    "ENSG00000087303_NID2",
    "ENSG00000118137_APOA1",
    "ENSG00000017427_IGF1",
    "ENSG00000107731_UNC5B",
    "ENSG00000136574_GATA4",
    "ENSG00000140937_CDH11",
    "ENSG00000198336_MYL4",
    "ENSG00000141448_GATA6"
)

SAMPLES_SELECTED <- c(
    "GSM4096632",
    "GSM4096633",
    "GSM4096634",
    #
    "GSM4096623",
    "GSM4096621",
    "GSM4096622",
    #
    "GSM4096620",
    "GSM4096618",
    "GSM4096619",
    #
    "GSM4096610",
    "GSM4096611",
    #
    "GSM4096624",
    "GSM4096625",
    #
    "GSM4096615",
    "GSM4096616",
    "GSM4096617",
    #
    "GSM4096613",
    "GSM4096612",
    "GSM4096614"
)


Fig. 6A

calc_cpm(matrix_readcount_use[, SAMPLES_SELECTED])[FEATUES_SELECTED, ] %>%
    as.matrix() %>%
    as.data.frame() %>%
    rownames_to_column(var = "feature") %>%
    pivot_longer(
        -c("feature"),
        names_to = "sample"
    ) %>%
    mutate(
        feature = factor(
            feature,
            levels = FEATUES_SELECTED %>% rev(.)
        ),
        sample = factor(
            sample,
            levels = SAMPLES_SELECTED
        )
    ) %>%
    plot_heatmap(
        x = sample,
        y = feature,
        z = log10(value + 1),
        y_order = rev(FEATUES_SELECTED)
    ) +
    scale_x_discrete(
        labels = embedding[match(SAMPLES_SELECTED, embedding$sample_name), "sample_description"]
    ) +
    geom_hline(yintercept = 23.5, color = "green") +
    geom_vline(xintercept = 9.5, color = "green")

R session info

devtools::session_info()$platform
##  setting  value                       
##  version  R version 4.0.2 (2020-06-22)
##  os       macOS Catalina 10.15.6      
##  system   x86_64, darwin19.5.0        
##  ui       unknown                     
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2020-08-10
devtools::session_info()$pack %>%
    as_tibble() %>%
    select(
        package,
        loadedversion,
        date,
        `source`
    ) %>%
    # print(n = nrow(.))
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")
package loadedversion date source
annotate 1.66.0 2020-04-27 Bioconductor
AnnotationDbi 1.50.3 2020-07-25 Bioconductor
assertthat 0.2.1 2019-03-21 CRAN (R 4.0.0)
backports 1.1.8 2020-06-17 CRAN (R 4.0.1)
Biobase 2.48.0 2020-04-27 Bioconductor
BiocGenerics 0.34.0 2020-04-27 Bioconductor
BiocParallel 1.22.0 2020-04-27 Bioconductor
bit 4.0.4 2020-08-04 CRAN (R 4.0.2)
bit64 4.0.2 2020-07-30 CRAN (R 4.0.2)
bitops 1.0-6 2013-08-17 CRAN (R 4.0.0)
blob 1.2.1 2020-01-20 CRAN (R 4.0.0)
broom 0.7.0.9001 2020-08-09 Github (tidymodels/broom@a0bb105)
callr 3.4.3.9000 2020-07-31 Github (r-lib/callr@b96da8f)
cellranger 1.1.0 2016-07-27 CRAN (R 4.0.0)
checkmate 2.0.0 2020-02-06 CRAN (R 4.0.0)
cli 2.0.2 2020-02-28 CRAN (R 4.0.0)
codetools 0.2-16 2018-12-24 CRAN (R 4.0.2)
colorspace 1.4-1 2019-03-18 CRAN (R 4.0.0)
crayon 1.3.4 2017-09-16 CRAN (R 4.0.0)
DBI 1.1.0 2019-12-15 CRAN (R 4.0.0)
dbplyr 1.4.4.9000 2020-07-28 Github (tidyverse/dbplyr@a6ed629)
DelayedArray 0.14.1 2020-07-14 Bioconductor
desc 1.2.0 2018-05-01 CRAN (R 4.0.0)
DESeq2 1.28.1 2020-05-12 Bioconductor
devtools 2.3.1.9000 2020-07-31 Github (r-lib/devtools@df619ce)
digest 0.6.25 2020-02-23 CRAN (R 4.0.0)
dplyr 1.0.1.9001 2020-08-10 Github (tidyverse/dplyr@166aed1)
ellipsis 0.3.1.9000 2020-07-18 Github (r-lib/ellipsis@57a5071)
evaluate 0.14 2019-05-28 CRAN (R 4.0.0)
extrafont 0.17 2014-12-08 CRAN (R 4.0.2)
extrafontdb 1.0 2012-06-11 CRAN (R 4.0.0)
fansi 0.4.1 2020-01-08 CRAN (R 4.0.0)
farver 2.0.3 2020-01-16 CRAN (R 4.0.0)
forcats 0.5.0.9000 2020-05-28 Github (tidyverse/forcats@ab81d1b)
fs 1.5.0.9000 2020-08-03 Github (r-lib/fs@93e70a9)
genefilter 1.70.0 2020-04-27 Bioconductor
geneplotter 1.66.0 2020-04-27 Bioconductor
generics 0.0.2 2018-11-29 CRAN (R 4.0.0)
GenomeInfoDb 1.24.2 2020-06-15 Bioconductor
GenomeInfoDbData 1.2.3 2020-04-24 Bioconductor
GenomicRanges 1.40.0 2020-04-27 Bioconductor
ggplot2 3.3.2.9000 2020-08-04 Github (tidyverse/ggplot2@6d91349)
ggrepel 0.9.0 2020-07-24 Github (slowkow/ggrepel@4d0ef50)
ggthemes 4.2.0 2019-05-13 CRAN (R 4.0.0)
glue 1.4.1.9000 2020-07-07 Github (tidyverse/glue@205f18b)
gt 0.2.2 2020-08-06 Github (rstudio/gt@c97cb4c)
gtable 0.3.0 2019-03-25 CRAN (R 4.0.0)
haven 2.3.1 2020-06-01 CRAN (R 4.0.0)
hms 0.5.3 2020-01-08 CRAN (R 4.0.0)
htmltools 0.5.0 2020-06-16 CRAN (R 4.0.1)
httr 1.4.2 2020-07-20 CRAN (R 4.0.2)
IRanges 2.22.2 2020-05-21 Bioconductor
jsonlite 1.7.0 2020-06-25 CRAN (R 4.0.2)
knitr 1.29 2020-06-23 CRAN (R 4.0.2)
labeling 0.3 2014-08-23 CRAN (R 4.0.0)
lattice 0.20-41 2020-04-02 CRAN (R 4.0.2)
lifecycle 0.2.0 2020-03-06 CRAN (R 4.0.0)
locfit 1.5-9.4 2020-03-25 CRAN (R 4.0.0)
lubridate 1.7.9 2020-07-11 Github (tidyverse/lubridate@de2ee09)
magrittr 1.5.0.9000 2020-08-04 Github (tidyverse/magrittr@ba52096)
Matrix 1.2-18 2019-11-27 CRAN (R 4.0.2)
matrixStats 0.56.0 2020-03-13 CRAN (R 4.0.0)
memoise 1.1.0 2017-04-21 CRAN (R 4.0.0)
modelr 0.1.8.9000 2020-05-19 Github (tidyverse/modelr@16168e0)
munsell 0.5.0 2018-06-12 CRAN (R 4.0.0)
patchwork 1.0.1.9000 2020-06-22 Github (thomasp85/patchwork@82a5e03)
pillar 1.4.6.9000 2020-07-21 Github (r-lib/pillar@8aef8f2)
pkgbuild 1.1.0.9000 2020-07-14 Github (r-lib/pkgbuild@3a87bd9)
pkgconfig 2.0.3 2019-09-22 CRAN (R 4.0.0)
pkgload 1.1.0 2020-05-29 CRAN (R 4.0.0)
prettyunits 1.1.1.9000 2020-08-10 Github (r-lib/prettyunits@b1cdad8)
processx 3.4.3 2020-07-05 CRAN (R 4.0.2)
ps 1.3.3 2020-05-08 CRAN (R 4.0.0)
purrr 0.3.4.9000 2020-08-10 Github (tidyverse/purrr@5de5ad2)
R6 2.4.1.9000 2020-07-18 Github (r-lib/R6@1415d11)
RColorBrewer 1.1-2 2014-12-07 CRAN (R 4.0.0)
Rcpp 1.0.5 2020-07-06 CRAN (R 4.0.2)
RcppAnnoy 0.0.16 2020-03-08 CRAN (R 4.0.0)
RCurl 1.98-1.2 2020-04-18 CRAN (R 4.0.0)
readr 1.3.1.9000 2020-07-16 Github (tidyverse/readr@2ab51b2)
readxl 1.3.1.9000 2020-05-28 Github (tidyverse/readxl@3815961)
remotes 2.2.0.9000 2020-08-06 Github (r-lib/remotes@5a546ad)
reprex 0.3.0 2019-05-16 CRAN (R 4.0.0)
rlang 0.4.7.9000 2020-07-31 Github (r-lib/rlang@a144ac0)
rmarkdown 2.3.3 2020-07-25 Github (rstudio/rmarkdown@204aa41)
rprojroot 1.3-2 2018-01-03 CRAN (R 4.0.0)
RSpectra 0.16-0 2019-12-01 CRAN (R 4.0.2)
RSQLite 2.2.0 2020-01-07 CRAN (R 4.0.0)
rstudioapi 0.11.0-9000 2020-07-31 Github (rstudio/rstudioapi@aa17630)
Rtsne 0.16 2020-07-03 Github (jkrijthe/Rtsne@14b195f)
Rttf2pt1 1.3.8 2020-01-10 CRAN (R 4.0.0)
rvest 0.3.6 2020-07-25 CRAN (R 4.0.2)
S4Vectors 0.26.1 2020-05-16 Bioconductor
sass 0.2.0 2020-03-18 CRAN (R 4.0.2)
scales 1.1.1.9000 2020-07-24 Github (r-lib/scales@9ff4757)
sessioninfo 1.1.1.9000 2020-07-18 Github (r-lib/sessioninfo@791705b)
stringi 1.4.6 2020-02-17 CRAN (R 4.0.0)
stringr 1.4.0.9000 2020-06-01 Github (tidyverse/stringr@f70c4ba)
styler 1.3.2.9000 2020-07-25 Github (r-lib/styler@16d815e)
SummarizedExperiment 1.18.2 2020-07-09 Bioconductor
survival 3.2-3 2020-06-13 CRAN (R 4.0.2)
testthat 2.99.0.9000 2020-07-28 Github (r-lib/testthat@0af11cd)
tibble 3.0.3.9000 2020-07-21 Github (tidyverse/tibble@b4eec19)
tidyr 1.1.1.9000 2020-07-31 Github (tidyverse/tidyr@61e9209)
tidyselect 1.1.0.9000 2020-07-11 Github (tidyverse/tidyselect@69fdc96)
tidyverse 1.3.0.9000 2020-06-01 Github (hadley/tidyverse@8a0bb99)
usethis 1.6.1.9001 2020-08-06 Github (r-lib/usethis@51fcdb8)
uwot 0.1.8.9000 2020-08-03 Github (jlmelville/uwot@db9e397)
vctrs 0.3.2.9000 2020-08-04 Github (r-lib/vctrs@1b3cd7c)
viridisLite 0.3.0 2018-02-01 CRAN (R 4.0.0)
withr 2.2.0 2020-04-20 CRAN (R 4.0.0)
xfun 0.16 2020-07-24 CRAN (R 4.0.2)
XML 3.99-0.5 2020-07-23 CRAN (R 4.0.2)
xml2 1.3.2 2020-04-23 CRAN (R 4.0.0)
xtable 1.8-4 2019-04-21 CRAN (R 4.0.0)
XVector 0.28.0 2020-04-27 Bioconductor
yaml 2.2.1 2020-02-01 CRAN (R 4.0.0)
zlibbioc 1.34.0 2020-04-27 Bioconductor