Linneberg-Agerholm, M., Wong, Y.F., Romero Herrera, J.A., Monteiro, R.S., Anderson, K.G.V., and Brickman, J.M. (2019). Naïve human pluripotent stem cells respond to Wnt, Nodal and LIF signalling to produce expandable naïve extra-embryonic endoderm. Development 146.
Load required packages.
library(tidyverse)
library(magrittr)
library(Matrix)
library(extrafont)
library(ggrepel)
library(patchwork)
# library(tidylog)
Sys.time()
## [1] "2020-08-10 23:46:59 CDT"
SEED_USE <- 20200616
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE <- 30
source(
file = file.path(
SCRIPT_DIR,
"utilities.R"
)
)
Prepare metadata for single cells.
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE138012
geo_info <- tibble::tribble(
~sample_name, ~sample_description,
"GSM4096610", "H9_pES_R1",
"GSM4096611", "H9_pES_R2",
"GSM4096612", "H9_t2iLGo_nES_R2",
"GSM4096613", "H9_t2iLGo_nES_R1",
"GSM4096614", "H9_t2iLGo_nES_R3",
"GSM4096615", "H9_h2iL_nES_R1",
"GSM4096616", "H9_h2iL_nES_R2",
"GSM4096617", "H9_h2iL_nES_R3",
"GSM4096618", "H9_t2iLGo_PrE_R2",
"GSM4096619", "H9_t2iLGo_PrE_R3",
"GSM4096620", "H9_t2iLGo_PrE_R1",
"GSM4096621", "H9_h2iL_PrE_R2",
"GSM4096622", "H9_h2iL_PrE_R3",
"GSM4096623", "H9_h2iL_PrE_R1",
"GSM4096624", "HUES4_pES_R1",
"GSM4096625", "HUES4_pES_R2",
"GSM4096626", "H9_ADE_R1",
"GSM4096627", "H9_ADE_R2",
"GSM4096628", "HUES4_ADE_R2",
"GSM4096629", "HUES4_ADE_R1",
"GSM4096630", "H9_RSeT_nES_R1",
"GSM4096631", "H9_RSeT_nES_R2",
"GSM4096632", "H9_t2iLGo_nEnd_R1",
"GSM4096633", "H9_t2iLGo_nEnd_R2",
"GSM4096634", "H9_t2iLGo_nEnd_R3",
"GSM4096635", "H9_RSeT_PrE_R1-1",
"GSM4096636", "H9_RSeT_PrE_R2-1",
"GSM4096637", "H9_DE_R1",
"GSM4096638", "H9_DE_R2",
"GSM4096639", "H9_DE_R3",
"GSM4096640", "H9_RSeT_PrE_R1-2",
"GSM4096641", "H9_RSeT_PrE_R2-2",
"GSM4096642", "H9_RSeT_nEnd_R1",
"GSM4096643", "H9_RSeT_nEnd_R2",
"GSM4096644", "H9_RSeT_nEnd_R3"
)
cell_metadata <- read_delim(
file = "../SraRunTable.txt",
delim = ","
) %>%
rename_all(. %>% tolower()) %>%
`colnames<-`(str_replace(
string = colnames(.),
pattern = " ", replacement = "_"
)) %>%
select(
sample_name,
run,
biosample,
cell_type,
background,
source_name,
stage
) %>%
left_join(geo_info, by = "sample_name") %>%
mutate(
sample_description2 = str_remove(
string = sample_description,
pattern = "_R\\d.*$"
),
lineage = str_remove(
string = sample_description2, pattern = ".+_"
),
lineage = factor(
lineage,
levels = c("nES", "pES", "PrE", "nEnd", "DE", "ADE")
)
) %>%
dplyr::rename(cell_line = "background")
## Parsed with column specification:
## cols(
## .default = col_character(),
## AvgSpotLen = col_double(),
## Bases = col_double(),
## Bytes = col_double(),
## ReleaseDate = col_datetime(format = "")
## )
## See spec(...) for full column specifications.
cell_metadata
cell_metadata %>%
dplyr::count(lineage) %>%
gt::gt() %>%
gt::cols_label(n = "num_samples") %>%
gt::tab_options(table.font.size = "median")
lineage | num_samples |
---|---|
nES | 8 |
pES | 4 |
PrE | 10 |
nEnd | 6 |
DE | 3 |
ADE | 4 |
Load featureCounts output.
matrix_readcount_use <- read_delim(
file = "../matrix/xaa_Aligned.sortedByCoord.out_deduped_q10_gene_id_featureCounts.txt.gz",
delim = "\t",
col_names = TRUE,
skip = 1
) %>%
select(-(2:6))
## Parsed with column specification:
## cols(
## .default = col_double(),
## Geneid = col_character(),
## Chr = col_character(),
## Start = col_character(),
## End = col_character(),
## Strand = col_character()
## )
## See spec(...) for full column specifications.
colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) %>%
str_remove(
pattern = "_rnaseq/aln/Aligned.sortedByCoord.out_deduped.bam"
) %>%
str_remove(pattern = "../aln/")
matrix_readcount_use <- Matrix(
data = as.matrix(matrix_readcount_use[, -1]),
dimnames = list(
matrix_readcount_use$Geneid,
colnames(matrix_readcount_use)[-1]
),
sparse = TRUE
)
matrix_readcount_use <- matrix_readcount_use[, colnames(matrix_readcount_use)]
matrix_readcount_use %>%
dim()
## [1] 33538 35
stopifnot(
rownames(matrix_readcount_use) == gene_symbo_info$X1
)
rownames(matrix_readcount_use) <- paste(
rownames(matrix_readcount_use),
gene_symbo_info$X2,
sep = "_"
)
colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) %>%
enframe(value = "run") %>%
left_join(cell_metadata, by = c("run" = "run")) %>%
pull("sample_name")
matrix_cpm_use <- calc_cpm(m = matrix_readcount_use)
Summarize sequencing statistics.
cell_metadata %>%
mutate(
num_umis = matrix_readcount_use[, .$sample_name] %>% colSums(),
num_genes = (matrix_readcount_use[, .$sample_name] > 0) %>% colSums()
) %>%
group_by(cell_type) %>%
summarise(
num_samples = n(),
num_umis = median(num_umis),
num_genes = median(num_genes)
) %>%
arrange(cell_type) %>%
as.data.frame() %>%
gt::gt() %>%
gt::tab_options(table.font.size = "median")
## `summarise()` ungrouping output (override with `.groups` argument)
cell_type | num_samples | num_umis | num_genes |
---|---|---|---|
Conventional pluripotent stem cells | 4 | 7886898 | 21352.0 |
Endoderm | 7 | 10541488 | 21312.0 |
Extra-embryonic endoderm | 16 | 10733904 | 21656.0 |
Naive pluripotent stem cells | 8 | 10144388 | 20880.5 |
cell_metadata %>%
mutate(
num_umis = matrix_readcount_use[, .$sample_name] %>% colSums(),
num_genes = (matrix_readcount_use[, .$sample_name] > 0) %>% colSums()
) %>%
group_by(lineage) %>%
summarise(
num_samples = n(),
num_umis = median(num_umis),
num_genes = median(num_genes)
) %>%
arrange(lineage) %>%
as.data.frame() %>%
gt::gt() %>%
gt::tab_options(table.font.size = "median")
## `summarise()` ungrouping output (override with `.groups` argument)
lineage | num_samples | num_umis | num_genes |
---|---|---|---|
nES | 8 | 10144388 | 20880.5 |
pES | 4 | 7886898 | 21352.0 |
PrE | 10 | 10912106 | 21923.5 |
nEnd | 6 | 10372790 | 20912.5 |
DE | 3 | 11025105 | 21312.0 |
ADE | 4 | 9065590 | 21527.0 |
Filter uninformative genes.
matrix_readcount_norm <- matrix_readcount_use
matrix_readcount_norm <- matrix_readcount_norm[
Matrix::rowSums(
matrix_readcount_norm
) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]
dim(matrix_readcount_norm)
## [1] 22274 35
Median normalize matrix.
matrix_readcount_norm@x <- median(colSums(matrix_readcount_norm)) *
(matrix_readcount_norm@x / rep.int(
colSums(matrix_readcount_norm),
diff(matrix_readcount_norm@p)
))
matrix_readcount_norm_log <- matrix_readcount_norm
matrix_readcount_norm_log@x <- log1p(matrix_readcount_norm_log@x)
# z-score
matrix_readcount_norm_log_scaled <- t(
scale(
t(
matrix_readcount_norm_log
),
center = TRUE, scale = TRUE
)
)
# features_var <- apply(matrix_readcount_norm_log_scaled, 1, var)
# features_use <- rownames(matrix_readcount_norm_log_scaled)[features_var > 0]
pca_out <- prcomp(
t(matrix_readcount_norm_log_scaled),
center = FALSE,
scale = FALSE
)
summary(pca_out)$imp %>%
t() %>%
as.data.frame() %>%
rownames_to_column(var = "component") %>%
mutate(
rank = 1:n()
) %>%
# slice(1:33) %>%
ggplot(
aes(
x = rank,
y = `Proportion of Variance`
)
) +
geom_point(size = 0.3) +
theme_bw() +
scale_x_continuous(
name = "Component",
breaks = c(1, seq(5, 30, 5))
) +
scale_y_continuous(
name = "Proportion of variance",
labels = scales::percent
) +
theme(
axis.title = ggplot2::element_text(family = "Arial", size = 6),
axis.text = ggplot2::element_text(family = "Arial", size = 6),
panel.grid.minor = ggplot2::element_blank()
)
file_name <- "Rplot_pca_variance_explained.pdf"
if (!file.exists(file_name)) {
ggsave(
filename = file_name,
useDingbats = FALSE,
plot = last_plot(),
device = NULL,
path = NULL,
scale = 1,
width = 55 * 1.5,
height = 55,
units = c("mm"),
)
}
embedding <- pca_out$x %>%
as.data.frame() %>%
rownames_to_column(var = "sample_name") %>%
select(sample_name:PC3) %>%
left_join(cell_metadata, by = "sample_name") %>%
dplyr::rename(
x_pca = PC1,
y_pca = PC2,
z_pca = PC3
) %>%
relocate(x_pca, y_pca, z_pca, .after = last_col())
p_embedding_pca1 <- embedding %>%
mutate(
category = "PCA"
) %>%
sample_frac() %>%
plot_pca(
x = x_pca,
y = y_pca,
color = lineage,
shape = cell_line
) +
scale_color_manual(
values = ggthemes::tableau_color_pal("Tableau 10")(
length(unique(embedding$lineage))
)
) +
scale_shape_manual(values = c(15, 16)) + # seq(length(unique(embedding$cell_line)))) +
add_xy_label_pca()
p_embedding_pca2 <- embedding %>%
mutate(
category = "PCA"
) %>%
sample_frac() %>%
plot_pca(
x = x_pca,
y = z_pca,
color = lineage,
shape = cell_line
) +
scale_color_manual(
values = ggthemes::tableau_color_pal("Tableau 10")(
length(unique(embedding$lineage))
)
) +
scale_shape_manual(values = c(15, 16)) +
add_xy_label_pca(y = "PC3")
p_embedding_pca1 +
p_embedding_pca2 +
plot_annotation(theme = theme(plot.margin = margin())) +
plot_layout(guides = "collect")
file_name <- "Rplot_embedding_pca.pdf"
if (!file.exists(file_name)) {
ggsave(
filename = file_name,
useDingbats = FALSE,
plot = last_plot(),
device = NULL,
path = NULL,
scale = 1,
width = 55 * 2.5,
height = 55,
units = c("mm"),
)
}
N_COMPONENTS <- 12
set.seed(seed = SEED_USE)
embedding_rtsne <- Rtsne::Rtsne(
X = pca_out$x[, 1:N_COMPONENTS],
perplexity = 5,
check_duplicates = FALSE,
pca = FALSE,
max_iter = 3000,
verbose = TRUE
)$Y
## Read the 35 x 12 data matrix successfully!
## Using no_dims = 2, perplexity = 5.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.556735)!
## Learning embedding...
## Iteration 50: error is 57.329608 (50 iterations in 0.00 seconds)
## Iteration 100: error is 57.746020 (50 iterations in 0.00 seconds)
## Iteration 150: error is 54.897412 (50 iterations in 0.00 seconds)
## Iteration 200: error is 59.825444 (50 iterations in 0.00 seconds)
## Iteration 250: error is 54.314094 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.980490 (50 iterations in 0.00 seconds)
## Iteration 350: error is 1.421003 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.850012 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.454588 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.284679 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.128779 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.114347 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.109010 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.084064 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.077202 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.076562 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.074529 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.078825 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.077370 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.272412 (50 iterations in 0.00 seconds)
## Iteration 1050: error is 0.079904 (50 iterations in 0.00 seconds)
## Iteration 1100: error is 0.073611 (50 iterations in 0.00 seconds)
## Iteration 1150: error is 0.074615 (50 iterations in 0.00 seconds)
## Iteration 1200: error is 0.074144 (50 iterations in 0.00 seconds)
## Iteration 1250: error is 0.154058 (50 iterations in 0.00 seconds)
## Iteration 1300: error is 0.078669 (50 iterations in 0.00 seconds)
## Iteration 1350: error is 0.077498 (50 iterations in 0.00 seconds)
## Iteration 1400: error is 0.076977 (50 iterations in 0.00 seconds)
## Iteration 1450: error is 0.076548 (50 iterations in 0.00 seconds)
## Iteration 1500: error is 0.077090 (50 iterations in 0.00 seconds)
## Iteration 1550: error is 0.074893 (50 iterations in 0.00 seconds)
## Iteration 1600: error is 0.073004 (50 iterations in 0.00 seconds)
## Iteration 1650: error is 0.072412 (50 iterations in 0.00 seconds)
## Iteration 1700: error is 0.077486 (50 iterations in 0.00 seconds)
## Iteration 1750: error is 0.078381 (50 iterations in 0.00 seconds)
## Iteration 1800: error is 0.077254 (50 iterations in 0.00 seconds)
## Iteration 1850: error is 0.077347 (50 iterations in 0.00 seconds)
## Iteration 1900: error is 0.078200 (50 iterations in 0.00 seconds)
## Iteration 1950: error is 0.075983 (50 iterations in 0.00 seconds)
## Iteration 2000: error is 0.076613 (50 iterations in 0.00 seconds)
## Iteration 2050: error is 0.078509 (50 iterations in 0.00 seconds)
## Iteration 2100: error is 0.077969 (50 iterations in 0.00 seconds)
## Iteration 2150: error is 0.077406 (50 iterations in 0.00 seconds)
## Iteration 2200: error is 0.078023 (50 iterations in 0.00 seconds)
## Iteration 2250: error is 0.076945 (50 iterations in 0.00 seconds)
## Iteration 2300: error is 0.077633 (50 iterations in 0.00 seconds)
## Iteration 2350: error is 0.076848 (50 iterations in 0.00 seconds)
## Iteration 2400: error is 0.076851 (50 iterations in 0.00 seconds)
## Iteration 2450: error is 0.076830 (50 iterations in 0.00 seconds)
## Iteration 2500: error is 0.076023 (50 iterations in 0.00 seconds)
## Iteration 2550: error is 0.073397 (50 iterations in 0.00 seconds)
## Iteration 2600: error is 0.077205 (50 iterations in 0.00 seconds)
## Iteration 2650: error is 0.076949 (50 iterations in 0.00 seconds)
## Iteration 2700: error is 0.078875 (50 iterations in 0.00 seconds)
## Iteration 2750: error is 0.077259 (50 iterations in 0.00 seconds)
## Iteration 2800: error is 0.077331 (50 iterations in 0.00 seconds)
## Iteration 2850: error is 0.076959 (50 iterations in 0.00 seconds)
## Iteration 2900: error is 0.076526 (50 iterations in 0.00 seconds)
## Iteration 2950: error is 0.072785 (50 iterations in 0.00 seconds)
## Iteration 3000: error is 0.077396 (50 iterations in 0.00 seconds)
## Fitting performed in 0.14 seconds.
embedding <- cbind(
embedding,
embedding_rtsne
) %>%
dplyr::rename(
x_tsne = "1",
y_tsne = "2"
)
p_embedding_tsne <- embedding %>%
mutate(
category = "t-SNE"
) %>%
sample_frac() %>%
plot_pca(
x = x_tsne,
y = y_tsne,
color = lineage,
shape = cell_line
) +
labs(x = "Dim 1", y = "Dim 2") +
scale_color_manual(
values = ggthemes::tableau_color_pal("Tableau 10")(
length(unique(embedding$lineage))
)
) +
scale_shape_manual(values = c(15, 16))
embedding %>%
dplyr::count(lineage) %>%
gt::gt() %>%
gt::cols_label(n = "num_samples") %>%
gt::tab_options(table.font.size = "median")
lineage | num_samples |
---|---|
nES | 8 |
pES | 4 |
PrE | 10 |
nEnd | 6 |
DE | 3 |
ADE | 4 |
N_COMPONENTS <- 12
set.seed(seed = SEED_USE)
embedding_umap <- uwot::umap(
X = pca_out$x[, 1:N_COMPONENTS],
n_neighbors = 15,
n_components = 2,
metric = "cosine",
spread = 1,
min_dist = 0.01,
verbose = TRUE
)
## 23:47:05 UMAP embedding parameters a = 1.896 b = 0.8006
## 23:47:05 Read 35 rows and found 12 numeric columns
## 23:47:05 Using Annoy for neighbor search, n_neighbors = 15
## 23:47:06 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:47:06 Writing NN index file to temp file /var/folders/h1/78b2tkd552ngjl6_ps_5sw7r0000gn/T//RtmpKzbwuV/file51cd2697bdc4
## 23:47:06 Searching Annoy index using 4 threads, search_k = 1500
## 23:47:06 Annoy recall = 100%
## 23:47:06 Commencing smooth kNN distance calibration using 4 threads
## 23:47:07 Initializing from normalized Laplacian + noise
## 23:47:07 Commencing optimization for 500 epochs, with 510 positive edges
## 23:47:07 Optimization finished
embedding <- cbind(
embedding,
embedding_umap
) %>%
dplyr::rename(
x_umap = "1",
y_umap = "2"
)
p_embedding_umap <- embedding %>%
mutate(
category = "UMAP"
) %>%
sample_frac() %>%
plot_pca(
x = x_umap,
y = y_umap,
color = lineage,
shape = cell_line
) +
labs(x = "Dim 1", y = "Dim 2") +
scale_color_manual(
values = ggthemes::tableau_color_pal("Tableau 10")(
length(unique(embedding$lineage))
)
) +
scale_shape_manual(values = c(15, 16))
Fig. 3C
p_embedding_tsne +
p_embedding_umap +
plot_annotation(theme = theme(plot.margin = margin())) +
plot_layout(nrow = 1, guides = "collect")
file_name <- "Rplot_embedding.pdf"
if (!file.exists(file_name)) {
ggsave(
filename = file_name,
useDingbats = FALSE,
plot = last_plot(),
device = NULL,
path = NULL,
scale = 1,
width = 55 * 2.5,
height = 55,
units = c("mm"),
)
}
Fig. 6BCE
Use whole transcriptomes.
p_embedding_pca1 +
add_point_labels(
x = x_pca,
y = y_pca,
z = sample_description2
) +
p_embedding_tsne +
add_point_labels(
x = x_tsne,
y = y_tsne,
z = sample_description2
) +
p_embedding_umap +
add_point_labels(
x = x_umap,
y = y_umap,
z = sample_description2
) +
plot_annotation(theme = theme(plot.margin = margin())) +
plot_layout(ncol = 1, guides = "collect")
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:Matrix':
##
## which
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:rlang':
##
## exprs
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
##
## simplify
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
library("BiocParallel")
samples_naïve_H9_h2iL <- c(
"GSM4096615",
"GSM4096616",
"GSM4096617"
)
samples_naïve_H9_t2iLGo <- c(
"GSM4096612",
"GSM4096613",
"GSM4096614"
)
cts <- matrix_readcount_use[, c(samples_naïve_H9_h2iL, samples_naïve_H9_t2iLGo)]
col_data <- data.frame(
sample = c(
rep("naïve_H9_h2iL", times = length(samples_naïve_H9_h2iL)),
rep("naïve_H9_t2iLGo", times = length(samples_naïve_H9_t2iLGo))
),
row.names = c(samples_naïve_H9_h2iL, samples_naïve_H9_t2iLGo)
) %>%
mutate(
sample = factor(
sample,
levels = c("naïve_H9_t2iLGo", "naïve_H9_h2iL")
)
)
dds <- DESeqDataSetFromMatrix(
countData = cts,
colData = col_data,
design = ~sample
)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
dds <- DESeq(dds)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
deseq2_out <- results(dds)
# mcols(deseq2_out)
deseq2_out
## log2 fold change (MLE): sample naïve H9 h2iL vs naïve H9 t2iLGo
## Wald test p-value: sample naïve H9 h2iL vs naïve H9 t2iLGo
## DataFrame with 33538 rows and 6 columns
## baseMean log2FoldChange lfcSE stat
## <numeric> <numeric> <numeric> <numeric>
## ENSG00000243485_MIR1302-2HG 0.189347 -1.179870 4.08047 -0.289150
## ENSG00000237613_FAM138A 0.000000 NA NA NA
## ENSG00000186092_OR4F5 0.000000 NA NA NA
## ENSG00000238009_AL627309.1 0.145485 0.743684 4.08047 0.182254
## ENSG00000239945_AL627309.3 0.000000 NA NA NA
## ... ... ... ... ...
## ENSG00000277856_AC233755.2 0.00000 NA NA NA
## ENSG00000275063_AC233755.1 0.00000 NA NA NA
## ENSG00000271254_AC240274.1 24.21898 -0.751286 0.396887 -1.892947
## ENSG00000277475_AC213203.1 1.95519 0.794062 1.365314 0.581596
## ENSG00000268674_FAM231C 0.00000 NA NA NA
## pvalue padj
## <numeric> <numeric>
## ENSG00000243485_MIR1302-2HG 0.772466 NA
## ENSG00000237613_FAM138A NA NA
## ENSG00000186092_OR4F5 NA NA
## ENSG00000238009_AL627309.1 0.855383 NA
## ENSG00000239945_AL627309.3 NA NA
## ... ... ...
## ENSG00000277856_AC233755.2 NA NA
## ENSG00000275063_AC233755.1 NA NA
## ENSG00000271254_AC240274.1 0.058365 0.175178
## ENSG00000277475_AC213203.1 0.560839 NA
## ENSG00000268674_FAM231C NA NA
summary(deseq2_out)
##
## out of 24527 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1901, 7.8%
## LFC < 0 (down) : 2051, 8.4%
## outliers [1] : 0, 0%
## low counts [2] : 9793, 40%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Fig. 4G
FEATURES_TO_LABEL <- c(
"EGR1",
"CDH8",
"THBS1",
"DPT"
)
FEATURES_TO_LABEL <- rownames(matrix_readcount_use)[
gene_symbo_info$X2 %in% FEATURES_TO_LABEL
]
deseq2_out %>%
as.data.frame() %>%
mutate(
group = case_when(
log2FoldChange >= 1.5 & padj <= 0.05 ~ "High in naïve_H9_h2iL",
log2FoldChange <= -1.5 & padj <= 0.05 ~ "High in naïve_H9_t2iLGo",
TRUE ~ "Not significant"
)
) %>%
plot_volcano(
x = log2FoldChange,
y = -log10(padj),
z = group, xlim = c(-6, 6)
) +
geom_text_repel(
data = as.data.frame(deseq2_out)[FEATURES_TO_LABEL, ] %>%
mutate(
symbol = rownames(.),
symbol = str_remove(string = symbol, pattern = "^E.+_")
),
aes(log2FoldChange,
-log10(padj),
label = symbol
),
box.padding = 0.5,
size = 1.8,
family = "Arial",
color = "black",
max.iter = 10000,
max.overlaps = Inf,
nudge_x = .15,
nudge_y = 1,
segment.curvature = -0.1,
segment.ncp = 3,
segment.angle = 20,
)
## Warning: Removed 18804 rows containing missing values (geom_point).
embedding %>%
group_by(sample_description2) %>%
summarise(
num_samples = n()
) %>%
gt::gt() %>%
gt::cols_label(sample_description2 = "group") %>%
gt::tab_options(table.font.size = "median")
## `summarise()` ungrouping output (override with `.groups` argument)
group | num_samples |
---|---|
H9_ADE | 2 |
H9_DE | 3 |
H9_h2iL_nES | 3 |
H9_h2iL_PrE | 3 |
H9_pES | 2 |
H9_RSeT_nEnd | 3 |
H9_RSeT_nES | 2 |
H9_RSeT_PrE | 4 |
H9_t2iLGo_nEnd | 3 |
H9_t2iLGo_nES | 3 |
H9_t2iLGo_PrE | 3 |
HUES4_ADE | 2 |
HUES4_pES | 2 |
FEATUES_SELECTED <- c(
"ENSG00000159231_CBR3",
"ENSG00000107562_CXCL12",
"ENSG00000143494_VASH2",
"ENSG00000241186_TDGF1",
"ENSG00000091972_CD200",
"ENSG00000181449_SOX2",
"ENSG00000156574_NODAL",
"ENSG00000184344_GDF3",
"ENSG00000165140_FBP1",
"ENSG00000203909_DPPA5",
"ENSG00000075388_FGF4",
"ENSG00000048545_GUCA1A",
"ENSG00000147596_PRDM14",
"ENSG00000111704_NANOG",
"ENSG00000143768_LEFTY2",
"ENSG00000171872_KLF17",
"ENSG00000004848_ARX",
"ENSG00000186103_ARGFX",
"ENSG00000105370_LIM2",
#
"ENSG00000146374_RSPO3",
"ENSG00000275410_HNF1B",
"ENSG00000196188_CTSE",
"ENSG00000164736_SOX17",
"ENSG00000050730_TNIP3",
"ENSG00000148702_HABP2",
"ENSG00000162998_FRZB",
"ENSG00000128564_VGF",
"ENSG00000170558_CDH2",
"ENSG00000134853_PDGFRA",
"ENSG00000153162_BMP6",
"ENSG00000125848_FLRT3",
"ENSG00000187498_COL4A1",
"ENSG00000134871_COL4A2",
"ENSG00000115414_FN1",
"ENSG00000087303_NID2",
"ENSG00000118137_APOA1",
"ENSG00000017427_IGF1",
"ENSG00000107731_UNC5B",
"ENSG00000136574_GATA4",
"ENSG00000140937_CDH11",
"ENSG00000198336_MYL4",
"ENSG00000141448_GATA6"
)
SAMPLES_SELECTED <- c(
"GSM4096632",
"GSM4096633",
"GSM4096634",
#
"GSM4096623",
"GSM4096621",
"GSM4096622",
#
"GSM4096620",
"GSM4096618",
"GSM4096619",
#
"GSM4096610",
"GSM4096611",
#
"GSM4096624",
"GSM4096625",
#
"GSM4096615",
"GSM4096616",
"GSM4096617",
#
"GSM4096613",
"GSM4096612",
"GSM4096614"
)
Fig. 6A
calc_cpm(matrix_readcount_use[, SAMPLES_SELECTED])[FEATUES_SELECTED, ] %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column(var = "feature") %>%
pivot_longer(
-c("feature"),
names_to = "sample"
) %>%
mutate(
feature = factor(
feature,
levels = FEATUES_SELECTED %>% rev(.)
),
sample = factor(
sample,
levels = SAMPLES_SELECTED
)
) %>%
plot_heatmap(
x = sample,
y = feature,
z = log10(value + 1),
y_order = rev(FEATUES_SELECTED)
) +
scale_x_discrete(
labels = embedding[match(SAMPLES_SELECTED, embedding$sample_name), "sample_description"]
) +
geom_hline(yintercept = 23.5, color = "green") +
geom_vline(xintercept = 9.5, color = "green")
devtools::session_info()$platform
## setting value
## version R version 4.0.2 (2020-06-22)
## os macOS Catalina 10.15.6
## system x86_64, darwin19.5.0
## ui unknown
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Chicago
## date 2020-08-10
devtools::session_info()$pack %>%
as_tibble() %>%
select(
package,
loadedversion,
date,
`source`
) %>%
# print(n = nrow(.))
gt::gt() %>%
gt::tab_options(table.font.size = "median")
package | loadedversion | date | source |
---|---|---|---|
annotate | 1.66.0 | 2020-04-27 | Bioconductor |
AnnotationDbi | 1.50.3 | 2020-07-25 | Bioconductor |
assertthat | 0.2.1 | 2019-03-21 | CRAN (R 4.0.0) |
backports | 1.1.8 | 2020-06-17 | CRAN (R 4.0.1) |
Biobase | 2.48.0 | 2020-04-27 | Bioconductor |
BiocGenerics | 0.34.0 | 2020-04-27 | Bioconductor |
BiocParallel | 1.22.0 | 2020-04-27 | Bioconductor |
bit | 4.0.4 | 2020-08-04 | CRAN (R 4.0.2) |
bit64 | 4.0.2 | 2020-07-30 | CRAN (R 4.0.2) |
bitops | 1.0-6 | 2013-08-17 | CRAN (R 4.0.0) |
blob | 1.2.1 | 2020-01-20 | CRAN (R 4.0.0) |
broom | 0.7.0.9001 | 2020-08-09 | Github (tidymodels/broom@a0bb105) |
callr | 3.4.3.9000 | 2020-07-31 | Github (r-lib/callr@b96da8f) |
cellranger | 1.1.0 | 2016-07-27 | CRAN (R 4.0.0) |
checkmate | 2.0.0 | 2020-02-06 | CRAN (R 4.0.0) |
cli | 2.0.2 | 2020-02-28 | CRAN (R 4.0.0) |
codetools | 0.2-16 | 2018-12-24 | CRAN (R 4.0.2) |
colorspace | 1.4-1 | 2019-03-18 | CRAN (R 4.0.0) |
crayon | 1.3.4 | 2017-09-16 | CRAN (R 4.0.0) |
DBI | 1.1.0 | 2019-12-15 | CRAN (R 4.0.0) |
dbplyr | 1.4.4.9000 | 2020-07-28 | Github (tidyverse/dbplyr@a6ed629) |
DelayedArray | 0.14.1 | 2020-07-14 | Bioconductor |
desc | 1.2.0 | 2018-05-01 | CRAN (R 4.0.0) |
DESeq2 | 1.28.1 | 2020-05-12 | Bioconductor |
devtools | 2.3.1.9000 | 2020-07-31 | Github (r-lib/devtools@df619ce) |
digest | 0.6.25 | 2020-02-23 | CRAN (R 4.0.0) |
dplyr | 1.0.1.9001 | 2020-08-10 | Github (tidyverse/dplyr@166aed1) |
ellipsis | 0.3.1.9000 | 2020-07-18 | Github (r-lib/ellipsis@57a5071) |
evaluate | 0.14 | 2019-05-28 | CRAN (R 4.0.0) |
extrafont | 0.17 | 2014-12-08 | CRAN (R 4.0.2) |
extrafontdb | 1.0 | 2012-06-11 | CRAN (R 4.0.0) |
fansi | 0.4.1 | 2020-01-08 | CRAN (R 4.0.0) |
farver | 2.0.3 | 2020-01-16 | CRAN (R 4.0.0) |
forcats | 0.5.0.9000 | 2020-05-28 | Github (tidyverse/forcats@ab81d1b) |
fs | 1.5.0.9000 | 2020-08-03 | Github (r-lib/fs@93e70a9) |
genefilter | 1.70.0 | 2020-04-27 | Bioconductor |
geneplotter | 1.66.0 | 2020-04-27 | Bioconductor |
generics | 0.0.2 | 2018-11-29 | CRAN (R 4.0.0) |
GenomeInfoDb | 1.24.2 | 2020-06-15 | Bioconductor |
GenomeInfoDbData | 1.2.3 | 2020-04-24 | Bioconductor |
GenomicRanges | 1.40.0 | 2020-04-27 | Bioconductor |
ggplot2 | 3.3.2.9000 | 2020-08-04 | Github (tidyverse/ggplot2@6d91349) |
ggrepel | 0.9.0 | 2020-07-24 | Github (slowkow/ggrepel@4d0ef50) |
ggthemes | 4.2.0 | 2019-05-13 | CRAN (R 4.0.0) |
glue | 1.4.1.9000 | 2020-07-07 | Github (tidyverse/glue@205f18b) |
gt | 0.2.2 | 2020-08-06 | Github (rstudio/gt@c97cb4c) |
gtable | 0.3.0 | 2019-03-25 | CRAN (R 4.0.0) |
haven | 2.3.1 | 2020-06-01 | CRAN (R 4.0.0) |
hms | 0.5.3 | 2020-01-08 | CRAN (R 4.0.0) |
htmltools | 0.5.0 | 2020-06-16 | CRAN (R 4.0.1) |
httr | 1.4.2 | 2020-07-20 | CRAN (R 4.0.2) |
IRanges | 2.22.2 | 2020-05-21 | Bioconductor |
jsonlite | 1.7.0 | 2020-06-25 | CRAN (R 4.0.2) |
knitr | 1.29 | 2020-06-23 | CRAN (R 4.0.2) |
labeling | 0.3 | 2014-08-23 | CRAN (R 4.0.0) |
lattice | 0.20-41 | 2020-04-02 | CRAN (R 4.0.2) |
lifecycle | 0.2.0 | 2020-03-06 | CRAN (R 4.0.0) |
locfit | 1.5-9.4 | 2020-03-25 | CRAN (R 4.0.0) |
lubridate | 1.7.9 | 2020-07-11 | Github (tidyverse/lubridate@de2ee09) |
magrittr | 1.5.0.9000 | 2020-08-04 | Github (tidyverse/magrittr@ba52096) |
Matrix | 1.2-18 | 2019-11-27 | CRAN (R 4.0.2) |
matrixStats | 0.56.0 | 2020-03-13 | CRAN (R 4.0.0) |
memoise | 1.1.0 | 2017-04-21 | CRAN (R 4.0.0) |
modelr | 0.1.8.9000 | 2020-05-19 | Github (tidyverse/modelr@16168e0) |
munsell | 0.5.0 | 2018-06-12 | CRAN (R 4.0.0) |
patchwork | 1.0.1.9000 | 2020-06-22 | Github (thomasp85/patchwork@82a5e03) |
pillar | 1.4.6.9000 | 2020-07-21 | Github (r-lib/pillar@8aef8f2) |
pkgbuild | 1.1.0.9000 | 2020-07-14 | Github (r-lib/pkgbuild@3a87bd9) |
pkgconfig | 2.0.3 | 2019-09-22 | CRAN (R 4.0.0) |
pkgload | 1.1.0 | 2020-05-29 | CRAN (R 4.0.0) |
prettyunits | 1.1.1.9000 | 2020-08-10 | Github (r-lib/prettyunits@b1cdad8) |
processx | 3.4.3 | 2020-07-05 | CRAN (R 4.0.2) |
ps | 1.3.3 | 2020-05-08 | CRAN (R 4.0.0) |
purrr | 0.3.4.9000 | 2020-08-10 | Github (tidyverse/purrr@5de5ad2) |
R6 | 2.4.1.9000 | 2020-07-18 | Github (r-lib/R6@1415d11) |
RColorBrewer | 1.1-2 | 2014-12-07 | CRAN (R 4.0.0) |
Rcpp | 1.0.5 | 2020-07-06 | CRAN (R 4.0.2) |
RcppAnnoy | 0.0.16 | 2020-03-08 | CRAN (R 4.0.0) |
RCurl | 1.98-1.2 | 2020-04-18 | CRAN (R 4.0.0) |
readr | 1.3.1.9000 | 2020-07-16 | Github (tidyverse/readr@2ab51b2) |
readxl | 1.3.1.9000 | 2020-05-28 | Github (tidyverse/readxl@3815961) |
remotes | 2.2.0.9000 | 2020-08-06 | Github (r-lib/remotes@5a546ad) |
reprex | 0.3.0 | 2019-05-16 | CRAN (R 4.0.0) |
rlang | 0.4.7.9000 | 2020-07-31 | Github (r-lib/rlang@a144ac0) |
rmarkdown | 2.3.3 | 2020-07-25 | Github (rstudio/rmarkdown@204aa41) |
rprojroot | 1.3-2 | 2018-01-03 | CRAN (R 4.0.0) |
RSpectra | 0.16-0 | 2019-12-01 | CRAN (R 4.0.2) |
RSQLite | 2.2.0 | 2020-01-07 | CRAN (R 4.0.0) |
rstudioapi | 0.11.0-9000 | 2020-07-31 | Github (rstudio/rstudioapi@aa17630) |
Rtsne | 0.16 | 2020-07-03 | Github (jkrijthe/Rtsne@14b195f) |
Rttf2pt1 | 1.3.8 | 2020-01-10 | CRAN (R 4.0.0) |
rvest | 0.3.6 | 2020-07-25 | CRAN (R 4.0.2) |
S4Vectors | 0.26.1 | 2020-05-16 | Bioconductor |
sass | 0.2.0 | 2020-03-18 | CRAN (R 4.0.2) |
scales | 1.1.1.9000 | 2020-07-24 | Github (r-lib/scales@9ff4757) |
sessioninfo | 1.1.1.9000 | 2020-07-18 | Github (r-lib/sessioninfo@791705b) |
stringi | 1.4.6 | 2020-02-17 | CRAN (R 4.0.0) |
stringr | 1.4.0.9000 | 2020-06-01 | Github (tidyverse/stringr@f70c4ba) |
styler | 1.3.2.9000 | 2020-07-25 | Github (r-lib/styler@16d815e) |
SummarizedExperiment | 1.18.2 | 2020-07-09 | Bioconductor |
survival | 3.2-3 | 2020-06-13 | CRAN (R 4.0.2) |
testthat | 2.99.0.9000 | 2020-07-28 | Github (r-lib/testthat@0af11cd) |
tibble | 3.0.3.9000 | 2020-07-21 | Github (tidyverse/tibble@b4eec19) |
tidyr | 1.1.1.9000 | 2020-07-31 | Github (tidyverse/tidyr@61e9209) |
tidyselect | 1.1.0.9000 | 2020-07-11 | Github (tidyverse/tidyselect@69fdc96) |
tidyverse | 1.3.0.9000 | 2020-06-01 | Github (hadley/tidyverse@8a0bb99) |
usethis | 1.6.1.9001 | 2020-08-06 | Github (r-lib/usethis@51fcdb8) |
uwot | 0.1.8.9000 | 2020-08-03 | Github (jlmelville/uwot@db9e397) |
vctrs | 0.3.2.9000 | 2020-08-04 | Github (r-lib/vctrs@1b3cd7c) |
viridisLite | 0.3.0 | 2018-02-01 | CRAN (R 4.0.0) |
withr | 2.2.0 | 2020-04-20 | CRAN (R 4.0.0) |
xfun | 0.16 | 2020-07-24 | CRAN (R 4.0.2) |
XML | 3.99-0.5 | 2020-07-23 | CRAN (R 4.0.2) |
xml2 | 1.3.2 | 2020-04-23 | CRAN (R 4.0.0) |
xtable | 1.8-4 | 2019-04-21 | CRAN (R 4.0.0) |
XVector | 0.28.0 | 2020-04-27 | Bioconductor |
yaml | 2.2.1 | 2020-02-01 | CRAN (R 4.0.0) |
zlibbioc | 1.34.0 | 2020-04-27 | Bioconductor |