Data preparation

SEED_USE <- 20200616
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE <- 30

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)

Data loading

Prepare metadata for single cells.

# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE138012
geo_info <- tibble::tribble(
    ~sample_name, ~sample_description,
    "GSM4096610",         "H9_pES_R1",
    "GSM4096611",         "H9_pES_R2",
    "GSM4096612",  "H9_t2iLGo_nES_R2",
    "GSM4096613",  "H9_t2iLGo_nES_R1",
    "GSM4096614",  "H9_t2iLGo_nES_R3",
    "GSM4096615",    "H9_h2iL_nES_R1",
    "GSM4096616",    "H9_h2iL_nES_R2",
    "GSM4096617",    "H9_h2iL_nES_R3",
    "GSM4096618",  "H9_t2iLGo_PrE_R2",
    "GSM4096619",  "H9_t2iLGo_PrE_R3",
    "GSM4096620",  "H9_t2iLGo_PrE_R1",
    "GSM4096621",    "H9_h2iL_PrE_R2",
    "GSM4096622",    "H9_h2iL_PrE_R3",
    "GSM4096623",    "H9_h2iL_PrE_R1",
    "GSM4096624",      "HUES4_pES_R1",
    "GSM4096625",      "HUES4_pES_R2",
    "GSM4096626",         "H9_ADE_R1",
    "GSM4096627",         "H9_ADE_R2",
    "GSM4096628",      "HUES4_ADE_R2",
    "GSM4096629",      "HUES4_ADE_R1",
    "GSM4096630",    "H9_RSeT_nES_R1",
    "GSM4096631",    "H9_RSeT_nES_R2",
    "GSM4096632", "H9_t2iLGo_nEnd_R1",
    "GSM4096633", "H9_t2iLGo_nEnd_R2",
    "GSM4096634", "H9_t2iLGo_nEnd_R3",
    "GSM4096635",  "H9_RSeT_PrE_R1-1",
    "GSM4096636",  "H9_RSeT_PrE_R2-1",
    "GSM4096637",          "H9_DE_R1",
    "GSM4096638",          "H9_DE_R2",
    "GSM4096639",          "H9_DE_R3",
    "GSM4096640",  "H9_RSeT_PrE_R1-2",
    "GSM4096641",  "H9_RSeT_PrE_R2-2",
    "GSM4096642",   "H9_RSeT_nEnd_R1",
    "GSM4096643",   "H9_RSeT_nEnd_R2",
    "GSM4096644",   "H9_RSeT_nEnd_R3"
)

cell_metadata <- read_delim(
    file = "../SraRunTable.txt",
    delim = ","
) %>%
    rename_all(. %>% tolower()) %>%
    `colnames<-`(str_replace(
        string = colnames(.),
        pattern = " ", replacement = "_"
    )) %>%
    select(
        sample_name,
        run,
        biosample,
        cell_type,
        background,
        source_name,
        stage
    ) %>%
    left_join(geo_info, by = "sample_name") %>%
    mutate(
        sample_description2 = str_remove(
            string = sample_description,
            pattern = "_R\\d.*$"
        ),
        lineage = str_remove(
            string = sample_description2, pattern = ".+_"
        ),
        lineage = factor(
            lineage,
            levels = c("nES", "pES", "PrE", "nEnd", "DE", "ADE")
        )
    ) %>%
    dplyr::rename(cell_line = "background")

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   AvgSpotLen = col_double(),
##   Bases = col_double(),
##   Bytes = col_double(),
##   ReleaseDate = col_datetime(format = "")
## )

## See spec(...) for full column specifications.

cell_metadata

cell_metadata %>%
    dplyr::count(lineage) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_samples") %>%
    gt::tab_options(table.font.size = "median")

lineage	num_samples
nES	8
pES	4
PrE	10
nEnd	6
DE	3
ADE	4

Load featureCounts output.

matrix_readcount_use <- read_delim(
    file = "../matrix/xaa_Aligned.sortedByCoord.out_deduped_q10_gene_id_featureCounts.txt.gz",
    delim = "\t",
    col_names = TRUE,
    skip = 1
) %>%
    select(-(2:6))

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Geneid = col_character(),
##   Chr = col_character(),
##   Start = col_character(),
##   End = col_character(),
##   Strand = col_character()
## )

## See spec(...) for full column specifications.

colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) %>%
    str_remove(
        pattern = "_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam"
    ) %>%
    str_remove(pattern = "../aln/")

matrix_readcount_use <- Matrix(
    data = as.matrix(matrix_readcount_use[, -1]),
    dimnames = list(
        matrix_readcount_use$Geneid,
        colnames(matrix_readcount_use)[-1]
    ),
    sparse = TRUE
)

matrix_readcount_use <- matrix_readcount_use[, colnames(matrix_readcount_use)]
matrix_readcount_use %>%
    dim()

## [1] 33538    35

stopifnot(
    rownames(matrix_readcount_use) == gene_symbo_info$X1
)
rownames(matrix_readcount_use) <- paste(
    rownames(matrix_readcount_use),
    gene_symbo_info$X2,
    sep = "_"
)

colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) %>%
    enframe(value = "run") %>%
    left_join(cell_metadata, by = c("run" = "run")) %>%
    pull("sample_name")


matrix_cpm_use <- calc_cpm(m = matrix_readcount_use)

Preprocessing

Summarize sequencing statistics.

cell_metadata %>%
    mutate(
        num_umis = matrix_readcount_use[, .$sample_name] %>% colSums(),
        num_genes = (matrix_readcount_use[, .$sample_name] > 0) %>% colSums()
    ) %>%
    group_by(cell_type) %>%
    summarise(
        num_samples = n(),
        num_umis = median(num_umis),
        num_genes = median(num_genes)
    ) %>%
    arrange(cell_type) %>%
    as.data.frame() %>%
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")

## `summarise()` ungrouping output (override with `.groups` argument)

cell_type	num_samples	num_umis	num_genes
Conventional pluripotent stem cells	4	7886898	21352.0
Endoderm	7	10541488	21312.0
Extra-embryonic endoderm	16	10733904	21656.0
Naive pluripotent stem cells	8	10144388	20880.5

cell_metadata %>%
    mutate(
        num_umis = matrix_readcount_use[, .$sample_name] %>% colSums(),
        num_genes = (matrix_readcount_use[, .$sample_name] > 0) %>% colSums()
    ) %>%
    group_by(lineage) %>%
    summarise(
        num_samples = n(),
        num_umis = median(num_umis),
        num_genes = median(num_genes)
    ) %>%
    arrange(lineage) %>%
    as.data.frame() %>%
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")

## `summarise()` ungrouping output (override with `.groups` argument)

lineage	num_samples	num_umis	num_genes
nES	8	10144388	20880.5
pES	4	7886898	21352.0
PrE	10	10912106	21923.5
nEnd	6	10372790	20912.5
DE	3	11025105	21312.0
ADE	4	9065590	21527.0

Filter uninformative genes.

matrix_readcount_norm <- matrix_readcount_use
matrix_readcount_norm <- matrix_readcount_norm[
    Matrix::rowSums(
        matrix_readcount_norm
    ) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]
dim(matrix_readcount_norm)

## [1] 22274    35

Median normalize matrix.

matrix_readcount_norm@x <- median(colSums(matrix_readcount_norm)) *
    (matrix_readcount_norm@x / rep.int(
        colSums(matrix_readcount_norm),
        diff(matrix_readcount_norm@p)
    ))

Dimensionality reduction

PCA

matrix_readcount_norm_log <- matrix_readcount_norm
matrix_readcount_norm_log@x <- log1p(matrix_readcount_norm_log@x)

# z-score
matrix_readcount_norm_log_scaled <- t(
    scale(
        t(
            matrix_readcount_norm_log
        ),
        center = TRUE, scale = TRUE
    )
)
# features_var <- apply(matrix_readcount_norm_log_scaled, 1, var)
# features_use <- rownames(matrix_readcount_norm_log_scaled)[features_var > 0]

pca_out <- prcomp(
    t(matrix_readcount_norm_log_scaled),
    center = FALSE,
    scale = FALSE
)

summary(pca_out)$imp %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column(var = "component") %>%
    mutate(
        rank = 1:n()
    ) %>%
    # slice(1:33) %>%
    ggplot(
        aes(
            x = rank,
            y = `Proportion of Variance`
        )
    ) +
    geom_point(size = 0.3) +
    theme_bw() +
    scale_x_continuous(
        name = "Component",
        breaks = c(1, seq(5, 30, 5))
    ) +
    scale_y_continuous(
        name = "Proportion of variance",
        labels = scales::percent
    ) +
    theme(
        axis.title = ggplot2::element_text(family = "Arial", size = 6),
        axis.text = ggplot2::element_text(family = "Arial", size = 6),
        panel.grid.minor = ggplot2::element_blank()
    )

file_name <- "Rplot_pca_variance_explained.pdf"
if (!file.exists(file_name)) {
    ggsave(
        filename = file_name,
        useDingbats = FALSE,
        plot = last_plot(),
        device = NULL,
        path = NULL,
        scale = 1,
        width = 55 * 1.5,
        height = 55,
        units = c("mm"),
    )
}

embedding <- pca_out$x %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample_name") %>%
    select(sample_name:PC3) %>%
    left_join(cell_metadata, by = "sample_name") %>%
    dplyr::rename(
        x_pca = PC1,
        y_pca = PC2,
        z_pca = PC3
    ) %>%
    relocate(x_pca, y_pca, z_pca, .after = last_col())

p_embedding_pca1 <- embedding %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_pca,
        y = y_pca,
        color = lineage,
        shape = cell_line
    ) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding$lineage))
        )
    ) +
    scale_shape_manual(values = c(15, 16)) + # seq(length(unique(embedding$cell_line)))) +
    add_xy_label_pca()


p_embedding_pca2 <- embedding %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_pca,
        y = z_pca,
        color = lineage,
        shape = cell_line
    ) +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding$lineage))
        )
    ) +
    scale_shape_manual(values = c(15, 16)) +
    add_xy_label_pca(y = "PC3")

p_embedding_pca1 +
    p_embedding_pca2 +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(guides = "collect")

file_name <- "Rplot_embedding_pca.pdf"
if (!file.exists(file_name)) {
    ggsave(
        filename = file_name,
        useDingbats = FALSE,
        plot = last_plot(),
        device = NULL,
        path = NULL,
        scale = 1,
        width = 55 * 2.5,
        height = 55,
        units = c("mm"),
    )
}

t-SNE

N_COMPONENTS <- 12

set.seed(seed = SEED_USE)
embedding_rtsne <- Rtsne::Rtsne(
    X = pca_out$x[, 1:N_COMPONENTS],
    perplexity = 5,
    check_duplicates = FALSE,
    pca = FALSE,
    max_iter = 3000,
    verbose = TRUE
)$Y
## Read the 35 x 12 data matrix successfully!
## Using no_dims = 2, perplexity = 5.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.556735)!
## Learning embedding...
## Iteration 50: error is 57.329608 (50 iterations in 0.00 seconds)
## Iteration 100: error is 57.746020 (50 iterations in 0.00 seconds)
## Iteration 150: error is 54.897412 (50 iterations in 0.00 seconds)
## Iteration 200: error is 59.825444 (50 iterations in 0.00 seconds)
## Iteration 250: error is 54.314094 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.980490 (50 iterations in 0.00 seconds)
## Iteration 350: error is 1.421003 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.850012 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.454588 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.284679 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.128779 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.114347 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.109010 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.084064 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.077202 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.076562 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.074529 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.078825 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.077370 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.272412 (50 iterations in 0.00 seconds)
## Iteration 1050: error is 0.079904 (50 iterations in 0.00 seconds)
## Iteration 1100: error is 0.073611 (50 iterations in 0.00 seconds)
## Iteration 1150: error is 0.074615 (50 iterations in 0.00 seconds)
## Iteration 1200: error is 0.074144 (50 iterations in 0.00 seconds)
## Iteration 1250: error is 0.154058 (50 iterations in 0.00 seconds)
## Iteration 1300: error is 0.078669 (50 iterations in 0.00 seconds)
## Iteration 1350: error is 0.077498 (50 iterations in 0.00 seconds)
## Iteration 1400: error is 0.076977 (50 iterations in 0.00 seconds)
## Iteration 1450: error is 0.076548 (50 iterations in 0.00 seconds)
## Iteration 1500: error is 0.077090 (50 iterations in 0.00 seconds)
## Iteration 1550: error is 0.074893 (50 iterations in 0.00 seconds)
## Iteration 1600: error is 0.073004 (50 iterations in 0.00 seconds)
## Iteration 1650: error is 0.072412 (50 iterations in 0.00 seconds)
## Iteration 1700: error is 0.077486 (50 iterations in 0.00 seconds)
## Iteration 1750: error is 0.078381 (50 iterations in 0.00 seconds)
## Iteration 1800: error is 0.077254 (50 iterations in 0.00 seconds)
## Iteration 1850: error is 0.077347 (50 iterations in 0.00 seconds)
## Iteration 1900: error is 0.078200 (50 iterations in 0.00 seconds)
## Iteration 1950: error is 0.075983 (50 iterations in 0.00 seconds)
## Iteration 2000: error is 0.076613 (50 iterations in 0.00 seconds)
## Iteration 2050: error is 0.078509 (50 iterations in 0.00 seconds)
## Iteration 2100: error is 0.077969 (50 iterations in 0.00 seconds)
## Iteration 2150: error is 0.077406 (50 iterations in 0.00 seconds)
## Iteration 2200: error is 0.078023 (50 iterations in 0.00 seconds)
## Iteration 2250: error is 0.076945 (50 iterations in 0.00 seconds)
## Iteration 2300: error is 0.077633 (50 iterations in 0.00 seconds)
## Iteration 2350: error is 0.076848 (50 iterations in 0.00 seconds)
## Iteration 2400: error is 0.076851 (50 iterations in 0.00 seconds)
## Iteration 2450: error is 0.076830 (50 iterations in 0.00 seconds)
## Iteration 2500: error is 0.076023 (50 iterations in 0.00 seconds)
## Iteration 2550: error is 0.073397 (50 iterations in 0.00 seconds)
## Iteration 2600: error is 0.077205 (50 iterations in 0.00 seconds)
## Iteration 2650: error is 0.076949 (50 iterations in 0.00 seconds)
## Iteration 2700: error is 0.078875 (50 iterations in 0.00 seconds)
## Iteration 2750: error is 0.077259 (50 iterations in 0.00 seconds)
## Iteration 2800: error is 0.077331 (50 iterations in 0.00 seconds)
## Iteration 2850: error is 0.076959 (50 iterations in 0.00 seconds)
## Iteration 2900: error is 0.076526 (50 iterations in 0.00 seconds)
## Iteration 2950: error is 0.072785 (50 iterations in 0.00 seconds)
## Iteration 3000: error is 0.077396 (50 iterations in 0.00 seconds)
## Fitting performed in 0.14 seconds.

embedding <- cbind(
    embedding,
    embedding_rtsne
) %>%
    dplyr::rename(
        x_tsne = "1",
        y_tsne = "2"
    )

p_embedding_tsne <- embedding %>%
    mutate(
        category = "t-SNE"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_tsne,
        y = y_tsne,
        color = lineage,
        shape = cell_line
    ) +
    labs(x = "Dim 1", y = "Dim 2") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding$lineage))
        )
    ) +
    scale_shape_manual(values = c(15, 16))

embedding %>%
    dplyr::count(lineage) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_samples") %>%
    gt::tab_options(table.font.size = "median")

lineage	num_samples
nES	8
pES	4
PrE	10
nEnd	6
DE	3
ADE	4

UMAP

N_COMPONENTS <- 12

set.seed(seed = SEED_USE)
embedding_umap <- uwot::umap(
    X = pca_out$x[, 1:N_COMPONENTS],
    n_neighbors = 15,
    n_components = 2,
    metric = "cosine",
    spread = 1,
    min_dist = 0.01,
    verbose = TRUE
)
## 23:47:05 UMAP embedding parameters a = 1.896 b = 0.8006
## 23:47:05 Read 35 rows and found 12 numeric columns
## 23:47:05 Using Annoy for neighbor search, n_neighbors = 15
## 23:47:06 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:47:06 Writing NN index file to temp file /var/folders/h1/78b2tkd552ngjl6_ps_5sw7r0000gn/T//RtmpKzbwuV/file51cd2697bdc4
## 23:47:06 Searching Annoy index using 4 threads, search_k = 1500
## 23:47:06 Annoy recall = 100%
## 23:47:06 Commencing smooth kNN distance calibration using 4 threads
## 23:47:07 Initializing from normalized Laplacian + noise
## 23:47:07 Commencing optimization for 500 epochs, with 510 positive edges
## 23:47:07 Optimization finished

embedding <- cbind(
    embedding,
    embedding_umap
) %>%
    dplyr::rename(
        x_umap = "1",
        y_umap = "2"
    )

p_embedding_umap <- embedding %>%
    mutate(
        category = "UMAP"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_umap,
        y = y_umap,
        color = lineage,
        shape = cell_line
    ) +
    labs(x = "Dim 1", y = "Dim 2") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 10")(
            length(unique(embedding$lineage))
        )
    ) +
    scale_shape_manual(values = c(15, 16))

Fig. 3C

p_embedding_tsne +
    p_embedding_umap +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(nrow = 1, guides = "collect")

file_name <- "Rplot_embedding.pdf"
if (!file.exists(file_name)) {
    ggsave(
        filename = file_name,
        useDingbats = FALSE,
        plot = last_plot(),
        device = NULL,
        path = NULL,
        scale = 1,
        width = 55 * 2.5,
        height = 55,
        units = c("mm"),
    )
}

Fig. 6BCE
Use whole transcriptomes.

p_embedding_pca1 +
    add_point_labels(
        x = x_pca,
        y = y_pca,
        z = sample_description2
    ) +
    p_embedding_tsne +
    add_point_labels(
        x = x_tsne,
        y = y_tsne,
        z = sample_description2
    ) +
    p_embedding_umap +
    add_point_labels(
        x = x_umap,
        y = y_umap,
        z = sample_description2
    ) +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(ncol = 1, guides = "collect")

Naïve pluripotency comparison

library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:Matrix':
## 
##     which
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
## 
##     expand
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:rlang':
## 
##     exprs
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
library("BiocParallel")

samples_naïve_H9_h2iL <- c(
    "GSM4096615",
    "GSM4096616",
    "GSM4096617"
)

samples_naïve_H9_t2iLGo <- c(
    "GSM4096612",
    "GSM4096613",
    "GSM4096614"
)

cts <- matrix_readcount_use[, c(samples_naïve_H9_h2iL, samples_naïve_H9_t2iLGo)]
col_data <- data.frame(
    sample = c(
        rep("naïve_H9_h2iL", times = length(samples_naïve_H9_h2iL)),
        rep("naïve_H9_t2iLGo", times = length(samples_naïve_H9_t2iLGo))
    ),
    row.names = c(samples_naïve_H9_h2iL, samples_naïve_H9_t2iLGo)
) %>%
    mutate(
        sample = factor(
            sample,
            levels = c("naïve_H9_t2iLGo", "naïve_H9_h2iL")
        )
    )

dds <- DESeqDataSetFromMatrix(
    countData = cts,
    colData = col_data,
    design = ~sample
)
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

dds <- DESeq(dds)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing

deseq2_out <- results(dds)
# mcols(deseq2_out)
deseq2_out
## log2 fold change (MLE): sample naïve H9 h2iL vs naïve H9 t2iLGo 
## Wald test p-value: sample naïve H9 h2iL vs naïve H9 t2iLGo 
## DataFrame with 33538 rows and 6 columns
##                              baseMean log2FoldChange     lfcSE      stat
##                             <numeric>      <numeric> <numeric> <numeric>
## ENSG00000243485_MIR1302-2HG  0.189347      -1.179870   4.08047 -0.289150
## ENSG00000237613_FAM138A      0.000000             NA        NA        NA
## ENSG00000186092_OR4F5        0.000000             NA        NA        NA
## ENSG00000238009_AL627309.1   0.145485       0.743684   4.08047  0.182254
## ENSG00000239945_AL627309.3   0.000000             NA        NA        NA
## ...                               ...            ...       ...       ...
## ENSG00000277856_AC233755.2    0.00000             NA        NA        NA
## ENSG00000275063_AC233755.1    0.00000             NA        NA        NA
## ENSG00000271254_AC240274.1   24.21898      -0.751286  0.396887 -1.892947
## ENSG00000277475_AC213203.1    1.95519       0.794062  1.365314  0.581596
## ENSG00000268674_FAM231C       0.00000             NA        NA        NA
##                                pvalue      padj
##                             <numeric> <numeric>
## ENSG00000243485_MIR1302-2HG  0.772466        NA
## ENSG00000237613_FAM138A            NA        NA
## ENSG00000186092_OR4F5              NA        NA
## ENSG00000238009_AL627309.1   0.855383        NA
## ENSG00000239945_AL627309.3         NA        NA
## ...                               ...       ...
## ENSG00000277856_AC233755.2         NA        NA
## ENSG00000275063_AC233755.1         NA        NA
## ENSG00000271254_AC240274.1   0.058365  0.175178
## ENSG00000277475_AC213203.1   0.560839        NA
## ENSG00000268674_FAM231C            NA        NA

summary(deseq2_out)

## 
## out of 24527 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1901, 7.8%
## LFC < 0 (down)     : 2051, 8.4%
## outliers [1]       : 0, 0%
## low counts [2]     : 9793, 40%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Fig. 4G

FEATURES_TO_LABEL <- c(
    "EGR1",
    "CDH8",
    "THBS1",
    "DPT"
)

FEATURES_TO_LABEL <- rownames(matrix_readcount_use)[
    gene_symbo_info$X2 %in% FEATURES_TO_LABEL
]

deseq2_out %>%
    as.data.frame() %>%
    mutate(
        group = case_when(
            log2FoldChange >= 1.5 & padj <= 0.05 ~ "High in naïve_H9_h2iL",
            log2FoldChange <= -1.5 & padj <= 0.05 ~ "High in naïve_H9_t2iLGo",
            TRUE ~ "Not significant"
        )
    ) %>%
    plot_volcano(
        x = log2FoldChange,
        y = -log10(padj),
        z = group, xlim = c(-6, 6)
    ) +
    geom_text_repel(
        data = as.data.frame(deseq2_out)[FEATURES_TO_LABEL, ] %>%
            mutate(
                symbol = rownames(.),
                symbol = str_remove(string = symbol, pattern = "^E.+_")
            ),
        aes(log2FoldChange,
            -log10(padj),
            label = symbol
        ),
        box.padding = 0.5,
        size = 1.8,
        family = "Arial",
        color = "black",
        max.iter = 10000,
        max.overlaps = Inf,
        nudge_x = .15,
        nudge_y = 1,
        segment.curvature = -0.1,
        segment.ncp = 3,
        segment.angle = 20,
    )

## Warning: Removed 18804 rows containing missing values (geom_point).

Expression

Heatmap

embedding %>%
    group_by(sample_description2) %>%
    summarise(
        num_samples = n()
    ) %>%
    gt::gt() %>%
    gt::cols_label(sample_description2 = "group") %>%
    gt::tab_options(table.font.size = "median")

## `summarise()` ungrouping output (override with `.groups` argument)

group	num_samples
H9_ADE	2
H9_DE	3
H9_h2iL_nES	3
H9_h2iL_PrE	3
H9_pES	2
H9_RSeT_nEnd	3
H9_RSeT_nES	2
H9_RSeT_PrE	4
H9_t2iLGo_nEnd	3
H9_t2iLGo_nES	3
H9_t2iLGo_PrE	3
HUES4_ADE	2
HUES4_pES	2

FEATUES_SELECTED <- c(
    "ENSG00000159231_CBR3",
    "ENSG00000107562_CXCL12",
    "ENSG00000143494_VASH2",
    "ENSG00000241186_TDGF1",
    "ENSG00000091972_CD200",
    "ENSG00000181449_SOX2",
    "ENSG00000156574_NODAL",
    "ENSG00000184344_GDF3",
    "ENSG00000165140_FBP1",
    "ENSG00000203909_DPPA5",
    "ENSG00000075388_FGF4",
    "ENSG00000048545_GUCA1A",
    "ENSG00000147596_PRDM14",
    "ENSG00000111704_NANOG",
    "ENSG00000143768_LEFTY2",
    "ENSG00000171872_KLF17",
    "ENSG00000004848_ARX",
    "ENSG00000186103_ARGFX",
    "ENSG00000105370_LIM2",
    #
    "ENSG00000146374_RSPO3",
    "ENSG00000275410_HNF1B",
    "ENSG00000196188_CTSE",
    "ENSG00000164736_SOX17",
    "ENSG00000050730_TNIP3",
    "ENSG00000148702_HABP2",
    "ENSG00000162998_FRZB",
    "ENSG00000128564_VGF",
    "ENSG00000170558_CDH2",
    "ENSG00000134853_PDGFRA",
    "ENSG00000153162_BMP6",
    "ENSG00000125848_FLRT3",
    "ENSG00000187498_COL4A1",
    "ENSG00000134871_COL4A2",
    "ENSG00000115414_FN1",
    "ENSG00000087303_NID2",
    "ENSG00000118137_APOA1",
    "ENSG00000017427_IGF1",
    "ENSG00000107731_UNC5B",
    "ENSG00000136574_GATA4",
    "ENSG00000140937_CDH11",
    "ENSG00000198336_MYL4",
    "ENSG00000141448_GATA6"
)

SAMPLES_SELECTED <- c(
    "GSM4096632",
    "GSM4096633",
    "GSM4096634",
    #
    "GSM4096623",
    "GSM4096621",
    "GSM4096622",
    #
    "GSM4096620",
    "GSM4096618",
    "GSM4096619",
    #
    "GSM4096610",
    "GSM4096611",
    #
    "GSM4096624",
    "GSM4096625",
    #
    "GSM4096615",
    "GSM4096616",
    "GSM4096617",
    #
    "GSM4096613",
    "GSM4096612",
    "GSM4096614"
)

Fig. 6A

calc_cpm(matrix_readcount_use[, SAMPLES_SELECTED])[FEATUES_SELECTED, ] %>%
    as.matrix() %>%
    as.data.frame() %>%
    rownames_to_column(var = "feature") %>%
    pivot_longer(
        -c("feature"),
        names_to = "sample"
    ) %>%
    mutate(
        feature = factor(
            feature,
            levels = FEATUES_SELECTED %>% rev(.)
        ),
        sample = factor(
            sample,
            levels = SAMPLES_SELECTED
        )
    ) %>%
    plot_heatmap(
        x = sample,
        y = feature,
        z = log10(value + 1),
        y_order = rev(FEATUES_SELECTED)
    ) +
    scale_x_discrete(
        labels = embedding[match(SAMPLES_SELECTED, embedding$sample_name), "sample_description"]
    ) +
    geom_hline(yintercept = 23.5, color = "green") +
    geom_vline(xintercept = 9.5, color = "green")

R session info

devtools::session_info()$platform

##  setting  value                       
##  version  R version 4.0.2 (2020-06-22)
##  os       macOS Catalina 10.15.6      
##  system   x86_64, darwin19.5.0        
##  ui       unknown                     
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2020-08-10

devtools::session_info()$pack %>%
    as_tibble() %>%
    select(
        package,
        loadedversion,
        date,
        `source`
    ) %>%
    # print(n = nrow(.))
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")

package	loadedversion	date	source
annotate	1.66.0	2020-04-27	Bioconductor
AnnotationDbi	1.50.3	2020-07-25	Bioconductor
assertthat	0.2.1	2019-03-21	CRAN (R 4.0.0)
backports	1.1.8	2020-06-17	CRAN (R 4.0.1)
Biobase	2.48.0	2020-04-27	Bioconductor
BiocGenerics	0.34.0	2020-04-27	Bioconductor
BiocParallel	1.22.0	2020-04-27	Bioconductor
bit	4.0.4	2020-08-04	CRAN (R 4.0.2)
bit64	4.0.2	2020-07-30	CRAN (R 4.0.2)
bitops	1.0-6	2013-08-17	CRAN (R 4.0.0)
blob	1.2.1	2020-01-20	CRAN (R 4.0.0)
broom	0.7.0.9001	2020-08-09	Github (tidymodels/broom@a0bb105)
callr	3.4.3.9000	2020-07-31	Github (r-lib/callr@b96da8f)
cellranger	1.1.0	2016-07-27	CRAN (R 4.0.0)
checkmate	2.0.0	2020-02-06	CRAN (R 4.0.0)
cli	2.0.2	2020-02-28	CRAN (R 4.0.0)
codetools	0.2-16	2018-12-24	CRAN (R 4.0.2)
colorspace	1.4-1	2019-03-18	CRAN (R 4.0.0)
crayon	1.3.4	2017-09-16	CRAN (R 4.0.0)
DBI	1.1.0	2019-12-15	CRAN (R 4.0.0)
dbplyr	1.4.4.9000	2020-07-28	Github (tidyverse/dbplyr@a6ed629)
DelayedArray	0.14.1	2020-07-14	Bioconductor
desc	1.2.0	2018-05-01	CRAN (R 4.0.0)
DESeq2	1.28.1	2020-05-12	Bioconductor
devtools	2.3.1.9000	2020-07-31	Github (r-lib/devtools@df619ce)
digest	0.6.25	2020-02-23	CRAN (R 4.0.0)
dplyr	1.0.1.9001	2020-08-10	Github (tidyverse/dplyr@166aed1)
ellipsis	0.3.1.9000	2020-07-18	Github (r-lib/ellipsis@57a5071)
evaluate	0.14	2019-05-28	CRAN (R 4.0.0)
extrafont	0.17	2014-12-08	CRAN (R 4.0.2)
extrafontdb	1.0	2012-06-11	CRAN (R 4.0.0)
fansi	0.4.1	2020-01-08	CRAN (R 4.0.0)
farver	2.0.3	2020-01-16	CRAN (R 4.0.0)
forcats	0.5.0.9000	2020-05-28	Github (tidyverse/forcats@ab81d1b)
fs	1.5.0.9000	2020-08-03	Github (r-lib/fs@93e70a9)
genefilter	1.70.0	2020-04-27	Bioconductor
geneplotter	1.66.0	2020-04-27	Bioconductor
generics	0.0.2	2018-11-29	CRAN (R 4.0.0)
GenomeInfoDb	1.24.2	2020-06-15	Bioconductor
GenomeInfoDbData	1.2.3	2020-04-24	Bioconductor
GenomicRanges	1.40.0	2020-04-27	Bioconductor
ggplot2	3.3.2.9000	2020-08-04	Github (tidyverse/ggplot2@6d91349)
ggrepel	0.9.0	2020-07-24	Github (slowkow/ggrepel@4d0ef50)
ggthemes	4.2.0	2019-05-13	CRAN (R 4.0.0)
glue	1.4.1.9000	2020-07-07	Github (tidyverse/glue@205f18b)
gt	0.2.2	2020-08-06	Github (rstudio/gt@c97cb4c)
gtable	0.3.0	2019-03-25	CRAN (R 4.0.0)
haven	2.3.1	2020-06-01	CRAN (R 4.0.0)
hms	0.5.3	2020-01-08	CRAN (R 4.0.0)
htmltools	0.5.0	2020-06-16	CRAN (R 4.0.1)
httr	1.4.2	2020-07-20	CRAN (R 4.0.2)
IRanges	2.22.2	2020-05-21	Bioconductor
jsonlite	1.7.0	2020-06-25	CRAN (R 4.0.2)
knitr	1.29	2020-06-23	CRAN (R 4.0.2)
labeling	0.3	2014-08-23	CRAN (R 4.0.0)
lattice	0.20-41	2020-04-02	CRAN (R 4.0.2)
lifecycle	0.2.0	2020-03-06	CRAN (R 4.0.0)
locfit	1.5-9.4	2020-03-25	CRAN (R 4.0.0)
lubridate	1.7.9	2020-07-11	Github (tidyverse/lubridate@de2ee09)
magrittr	1.5.0.9000	2020-08-04	Github (tidyverse/magrittr@ba52096)
Matrix	1.2-18	2019-11-27	CRAN (R 4.0.2)
matrixStats	0.56.0	2020-03-13	CRAN (R 4.0.0)
memoise	1.1.0	2017-04-21	CRAN (R 4.0.0)
modelr	0.1.8.9000	2020-05-19	Github (tidyverse/modelr@16168e0)
munsell	0.5.0	2018-06-12	CRAN (R 4.0.0)
patchwork	1.0.1.9000	2020-06-22	Github (thomasp85/patchwork@82a5e03)
pillar	1.4.6.9000	2020-07-21	Github (r-lib/pillar@8aef8f2)
pkgbuild	1.1.0.9000	2020-07-14	Github (r-lib/pkgbuild@3a87bd9)
pkgconfig	2.0.3	2019-09-22	CRAN (R 4.0.0)
pkgload	1.1.0	2020-05-29	CRAN (R 4.0.0)
prettyunits	1.1.1.9000	2020-08-10	Github (r-lib/prettyunits@b1cdad8)
processx	3.4.3	2020-07-05	CRAN (R 4.0.2)
ps	1.3.3	2020-05-08	CRAN (R 4.0.0)
purrr	0.3.4.9000	2020-08-10	Github (tidyverse/purrr@5de5ad2)
R6	2.4.1.9000	2020-07-18	Github (r-lib/R6@1415d11)
RColorBrewer	1.1-2	2014-12-07	CRAN (R 4.0.0)
Rcpp	1.0.5	2020-07-06	CRAN (R 4.0.2)
RcppAnnoy	0.0.16	2020-03-08	CRAN (R 4.0.0)
RCurl	1.98-1.2	2020-04-18	CRAN (R 4.0.0)
readr	1.3.1.9000	2020-07-16	Github (tidyverse/readr@2ab51b2)
readxl	1.3.1.9000	2020-05-28	Github (tidyverse/readxl@3815961)
remotes	2.2.0.9000	2020-08-06	Github (r-lib/remotes@5a546ad)
reprex	0.3.0	2019-05-16	CRAN (R 4.0.0)
rlang	0.4.7.9000	2020-07-31	Github (r-lib/rlang@a144ac0)
rmarkdown	2.3.3	2020-07-25	Github (rstudio/rmarkdown@204aa41)
rprojroot	1.3-2	2018-01-03	CRAN (R 4.0.0)
RSpectra	0.16-0	2019-12-01	CRAN (R 4.0.2)
RSQLite	2.2.0	2020-01-07	CRAN (R 4.0.0)
rstudioapi	0.11.0-9000	2020-07-31	Github (rstudio/rstudioapi@aa17630)
Rtsne	0.16	2020-07-03	Github (jkrijthe/Rtsne@14b195f)
Rttf2pt1	1.3.8	2020-01-10	CRAN (R 4.0.0)
rvest	0.3.6	2020-07-25	CRAN (R 4.0.2)
S4Vectors	0.26.1	2020-05-16	Bioconductor
sass	0.2.0	2020-03-18	CRAN (R 4.0.2)
scales	1.1.1.9000	2020-07-24	Github (r-lib/scales@9ff4757)
sessioninfo	1.1.1.9000	2020-07-18	Github (r-lib/sessioninfo@791705b)
stringi	1.4.6	2020-02-17	CRAN (R 4.0.0)
stringr	1.4.0.9000	2020-06-01	Github (tidyverse/stringr@f70c4ba)
styler	1.3.2.9000	2020-07-25	Github (r-lib/styler@16d815e)
SummarizedExperiment	1.18.2	2020-07-09	Bioconductor
survival	3.2-3	2020-06-13	CRAN (R 4.0.2)
testthat	2.99.0.9000	2020-07-28	Github (r-lib/testthat@0af11cd)
tibble	3.0.3.9000	2020-07-21	Github (tidyverse/tibble@b4eec19)
tidyr	1.1.1.9000	2020-07-31	Github (tidyverse/tidyr@61e9209)
tidyselect	1.1.0.9000	2020-07-11	Github (tidyverse/tidyselect@69fdc96)
tidyverse	1.3.0.9000	2020-06-01	Github (hadley/tidyverse@8a0bb99)
usethis	1.6.1.9001	2020-08-06	Github (r-lib/usethis@51fcdb8)
uwot	0.1.8.9000	2020-08-03	Github (jlmelville/uwot@db9e397)
vctrs	0.3.2.9000	2020-08-04	Github (r-lib/vctrs@1b3cd7c)
viridisLite	0.3.0	2018-02-01	CRAN (R 4.0.0)
withr	2.2.0	2020-04-20	CRAN (R 4.0.0)
xfun	0.16	2020-07-24	CRAN (R 4.0.2)
XML	3.99-0.5	2020-07-23	CRAN (R 4.0.2)
xml2	1.3.2	2020-04-23	CRAN (R 4.0.0)
xtable	1.8-4	2019-04-21	CRAN (R 4.0.0)
XVector	0.28.0	2020-04-27	Bioconductor
yaml	2.2.1	2020-02-01	CRAN (R 4.0.0)
zlibbioc	1.34.0	2020-04-27	Bioconductor

Transcriptomic analysis of expandable naïve extra-embryonic endoderm derived from naïve human pluripotent stem cells

JD

2020-08-10

Data preparation

Functions loading

Data loading

Preprocessing

Dimensionality reduction

PCA

t-SNE

UMAP

Naïve pluripotency comparison

Expression

Heatmap

R session info