West, R.C., Ming, H., Logsdon, D.M., Sun, J., Rajput, S.K., Kile, R.A., Schoolcraft, W.B., Roberts, R.M., Krisher, R.L., Jiang, Z., et al. (2019). Dynamics of trophoblast differentiation in peri-implantation-stage human embryos. Proc. Natl. Acad. Sci. U. S. A. 116, 22635–22644.
Load required packages.
library(tidyverse)
library(magrittr)
library(Matrix)
library(extrafont)
library(ggrepel)
library(patchwork)
# library(tidylog)
Sys.time()
## [1] "2020-08-19 15:26:39 CDT"
SEED_USE <- 20200616
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE <- 60
source(
file = file.path(
SCRIPT_DIR,
"utilities.R"
)
)
Prepare metadata for single cells.
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE130289
geo_info <- tibble::tribble(
~sample_name, ~sample_description,
"GSM3735306", "D10_E1_L1",
"GSM3735307", "D10_E1_L2",
"GSM3735308", "D10_E1_L3",
"GSM3735309", "D10_E1",
"GSM3735310", "D10_E1_S1",
"GSM3735311", "D10_E1_S2",
"GSM3735312", "D10_E1_S3",
"GSM3735313", "D10_E1_S4",
"GSM3735314", "D10_E1_S6",
"GSM3735315", "D10_E1_S7",
"GSM3735316", "D10_E1_S8",
"GSM3735317", "D10_E2_L10",
"GSM3735318", "D10_E2_L11",
"GSM3735319", "D10_E2_L1",
"GSM3735320", "D10_E2_L2",
"GSM3735321", "D10_E2_L3",
"GSM3735322", "D10_E2_L4",
"GSM3735323", "D10_E2_L5",
"GSM3735324", "D10_E2_L6",
"GSM3735325", "D10_E2_L7",
"GSM3735326", "D10_E2_L8",
"GSM3735327", "D10_E2_L9",
"GSM3735328", "D10_E2_S1",
"GSM3735329", "D10_E2_S2",
"GSM3735330", "D10_E2_S3",
"GSM3735331", "D10_E2_S4",
"GSM3735332", "D10_E2_S5",
"GSM3735333", "D10_E2_S6",
"GSM3735334", "D10_E2_S7",
"GSM3735335", "D10_E2_S8",
"GSM3735336", "D10_E3_S1",
"GSM3735337", "D10_E3_S2",
"GSM3735338", "D10_E3_S3",
"GSM3735339", "D10_E3_S4",
"GSM3735340", "D10_E3_S5",
"GSM3735341", "D10_E3_S6",
"GSM3735342", "D10_E3_S7",
"GSM3735343", "D10_E3_S8",
"GSM3735344", "D10_E4_L1",
"GSM3735345", "D10_E4_L2",
"GSM3735346", "D10_E4_L3",
"GSM3735347", "D10_E4_L4",
"GSM3735348", "D10_E4_L5",
"GSM3735349", "D10_E4_L6",
"GSM3735350", "D10_E4_S1",
"GSM3735351", "D10_E4_S2",
"GSM3735352", "D10_E4_S3",
"GSM3735353", "D10_E4_S4",
"GSM3735354", "D12_E3_S10",
"GSM3735355", "D12_E3_S11",
"GSM3735356", "D12_E3_S12",
"GSM3735357", "D12_E3_S13",
"GSM3735358", "D12_E3_S14",
"GSM3735359", "D12_E3_S15",
"GSM3735360", "D12_E3_S16",
"GSM3735361", "D12_E3_S17",
"GSM3735362", "D12_E3_S18",
"GSM3735363", "D12_E3_S19",
"GSM3735364", "D12_E3_S1",
"GSM3735365", "D12_E3_S20",
"GSM3735366", "D12_E3_S21",
"GSM3735367", "D12_E3_S22",
"GSM3735368", "D12_E3_S23",
"GSM3735369", "D12_E3_S24",
"GSM3735370", "D12_E3_S2",
"GSM3735371", "D12_E3_S3",
"GSM3735372", "D12_E3_S4",
"GSM3735373", "D12_E3_S5",
"GSM3735374", "D12_E3_S6",
"GSM3735375", "D12_E3_S7",
"GSM3735376", "D12_E3_S8",
"GSM3735377", "D12_E3_S9",
"GSM3735378", "D12_E4_EVT10",
"GSM3735379", "D12_E4_EVT11",
"GSM3735380", "D12_E4_EVT12",
"GSM3735381", "D12_E4_EVT13",
"GSM3735382", "D12_E4_EVT1",
"GSM3735383", "D12_E4_EVT2",
"GSM3735384", "D12_E4_EVT3",
"GSM3735385", "D12_E4_EVT4",
"GSM3735386", "D12_E4_EVT5",
"GSM3735387", "D12_E4_EVT6",
"GSM3735388", "D12_E4_EVT7",
"GSM3735389", "D12_E4_EVT8",
"GSM3735390", "D12_E4_L1",
"GSM3735391", "D12_E4_L2",
"GSM3735392", "D12_E4_L3",
"GSM3735393", "D12_E4_L4",
"GSM3735394", "D12_E4_L5",
"GSM3735395", "D12_E4_L6",
"GSM3735396", "D12_E4_S1",
"GSM3735397", "D12_E4_S2",
"GSM3735398", "D12_E4_S3",
"GSM3735399", "D12_E4_S4",
"GSM3735400", "D12_E6_L1",
"GSM3735401", "D12_E6_L2",
"GSM3735402", "D12_E6_L3",
"GSM3735403", "D12_E6_L4",
"GSM3735404", "D12_E6_L5",
"GSM3735405", "D12_E6_L6",
"GSM3735406", "D12_E6_S1",
"GSM3735407", "D12_E6_S2",
"GSM3735408", "D12_E6_S3",
"GSM3735409", "D12_E6_S4",
"GSM3735410", "D12_EVT2",
"GSM3735411", "D12_EVT3",
"GSM3735412", "D12_EVT4",
"GSM3735413", "D12_EVT5",
"GSM3735414", "D8_E1_S1",
"GSM3735415", "D8_E1_S2",
"GSM3735416", "D8_E1_S3",
"GSM3735417", "D8_E1_S4",
"GSM3735418", "D8_E1_S5",
"GSM3735419", "D8_E1_S6",
"GSM3735420", "D8_E1_S7",
"GSM3735421", "D8_E1_S8",
"GSM3735422", "D8_E2_S1",
"GSM3735423", "D8_E2_S2",
"GSM3735424", "D8_E2_S3",
"GSM3735425", "D8_E2_S4",
"GSM3735426", "D8_E2_S5",
"GSM3735427", "D8_E2_S6",
"GSM3735428", "D8_E2_S7",
"GSM3735429", "D8_E2_S8",
"GSM3735430", "D8_E3_L1",
"GSM3735431", "D8_E3_L2",
"GSM3735432", "D8_E3_L3",
"GSM3735433", "D8_E3_L4",
"GSM3735434", "D8_E3_S1",
"GSM3735435", "D8_E3_S2",
"GSM3735436", "D8_E3_S3",
"GSM3735437", "D8_E3_S4",
"GSM3735438", "D8_E3_S5",
"GSM3735439", "D8_E3_S6",
"GSM3735440", "D8_E3_S7",
"GSM3735441", "D8_E3_S8",
"GSM3735442", "D8_E4_S1",
"GSM3735443", "D8_E4_S2",
"GSM3735444", "D8_E4_S3"
)
# cell lineage annotation
cell_identity <- tibble::tribble(
~sample_description, ~lineage,
"D8_E1_S1", "S",
"D8_E1_S2", "S",
"D8_E1_S3", "S",
"D8_E1_S4", "S",
"D8_E1_S5", "S",
"D8_E1_S6", "S",
"D8_E1_S7", "S",
"D8_E2_S1", "S",
"D8_E2_S2", "S",
"D8_E2_S3", "S",
"D8_E2_S4", "S",
"D8_E2_S5", "S",
"D8_E2_S6", "S",
"D8_E2_S7", "S",
"D8_E2_S8", "S",
"D8_E3_S1", "S",
"D8_E3_S2", "S",
"D8_E3_S3", "S",
"D8_E3_S4", "S",
"D8_E3_S5", "S",
"D8_E3_S7", "S",
"D8_E3_S8", "S",
"D8_E4_S3", "S",
"D8_E1_S8", "Pre_L",
"D8_E4_S2", "Pre_E",
"D8_E3_L1", "L",
"D8_E3_L2", "L",
"D8_E3_L3", "L",
"D8_E3_L4", "L",
"D10_E1_S5", "S",
"D10_E1_S7", "S",
"D10_E1_S8", "S",
"D10_E3_S6", "S",
"D10_E3_S7", "S",
"D10_E2_S8", "Pre_L",
"D10_E2_S1", "Pre_L",
"D10_E2_S2", "Pre_L",
"D10_E1_S6", "Pre_L",
"D10_E1_S1", "Pre_L",
"D10_E1_S2", "Pre_L",
"D10_E1_S3", "Pre_L",
"D10_E1_S4", "Pre_L",
"D10_E4_S1", "Pre_L",
"D10_E4_S2", "Pre_L",
"D10_E4_S3", "Pre_L",
"D10_E4_S4", "Pre_L",
"D10_E3_S4", "Pre_E",
"D10_E3_S5", "Pre_E",
"D10_E2_S3", "Pre_E",
"D10_E2_S5", "Pre_E",
"D10_E2_S6", "Pre_E",
"D10_E2_S7", "Pre_E",
"D10_E4_L1", "L",
"D10_E4_L2", "L",
"D10_E4_L3", "L",
"D10_E4_L5", "L",
"D10_E4_L6", "L",
"D10_E1_L1", "L",
"D10_E1_L2", "L",
"D10_E1_L3", "L",
"D10_E2_L10", "L",
"D10_E2_L11", "L",
"D10_E2_L1", "L",
"D10_E2_L2", "L",
"D10_E2_L3", "L",
"D10_E2_L4", "L",
"D10_E2_L5", "L",
"D10_E2_L6", "L",
"D10_E2_L7", "L",
"D10_E2_L8", "L",
"D10_E2_L9", "L",
"D12_E3_S10", "S",
"D12_E3_S11", "S",
"D12_E3_S12", "S",
"D12_E3_S14", "S",
"D12_E3_S15", "S",
"D12_E3_S16", "S",
"D12_E3_S17", "S",
"D12_E3_S18", "S",
"D12_E3_S19", "S",
"D12_E3_S20", "S",
"D12_E3_S21", "S",
"D12_E3_S22", "S",
"D12_E3_S23", "S",
"D12_E3_S24", "S",
"D12_E3_S3", "S",
"D12_E3_S4", "S",
"D12_E3_S7", "S",
"D12_E3_S9", "S",
"D12_E3_S1", "Pre_L",
"D12_E3_S8", "Pre_L",
"D12_E3_S2", "Pre_E",
"D12_E3_S5", "Pre_E",
"D12_E3_S6", "Pre_E",
"D12_E4_S1", "Pre_E",
"D12_E4_S2", "Pre_E",
"D12_E4_S3", "Pre_E",
"D12_E4_S4", "Pre_E",
"D12_E6_S1", "Pre_E",
"D12_E6_S3", "Pre_E",
"D12_E6_S4", "Pre_E",
"D12_E4_L2", "L",
"D12_E4_L3", "L",
"D12_E4_L5", "L",
"D12_E6_L1", "L",
"D12_E6_L6", "L",
"D12_E4_L1", "L_E",
"D12_E4_L4", "L_E",
"D12_E4_L6", "L_E",
"D12_EVT2", "E",
"D12_EVT3", "E",
"D12_EVT4", "E",
"D12_EVT5", "E",
"D12_E4_EVT10", "E",
"D12_E4_EVT11", "E",
"D12_E4_EVT12", "E",
"D12_E4_EVT13", "E",
"D12_E4_EVT1", "E",
"D12_E4_EVT2", "E",
"D12_E4_EVT3", "E",
"D12_E4_EVT4", "E",
"D12_E4_EVT5", "E",
"D12_E4_EVT6", "E",
"D12_E4_EVT7", "E",
"D12_E4_EVT8", "E"
)
# https://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP193790
cell_metadata <- read_delim(
file = "../SraRunTable.txt",
delim = ","
) %>%
rename_all(. %>% tolower()) %>%
`colnames<-`(str_replace(
string = colnames(.),
pattern = " ", replacement = "_"
)) %>%
select(
sample_name,
run,
biosample,
cell_type,
source_name,
time
) %>%
left_join(geo_info, by = "sample_name")
## Parsed with column specification:
## cols(
## .default = col_character(),
## AvgSpotLen = col_double(),
## Bases = col_double(),
## Bytes = col_double(),
## ReleaseDate = col_datetime(format = "")
## )
## See spec(...) for full column specifications.
cell_metadata
Fix missing embryo info based on the preprocessed matrix and Table S2.
D10_E1 should be D10_E1_S5.
D12_EVT2, D12_EVT3, D12_EVT4, D12_EVT5 are from D12_E3.
matrix_readcount_preprocessed <- read_delim(
file = "GSE130289_Jiang_et_al.__human_trophoblast_RNAseq.txt.gz",
delim = "\t"
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Chromosome = col_character(),
## Strand = col_character(),
## `Gene Symbol` = col_character(),
## gene_biotype = col_character(),
## gene_name = col_character(),
## gene_source = col_character()
## )
## See spec(...) for full column specifications.
colnames(matrix_readcount_preprocessed)
## [1] "Chromosome" "Start" "Stop" "Strand" "Gene Symbol"
## [6] "gene_biotype" "gene_name" "gene_source" "D10_E1_L1" "D10_E1_L2"
## [11] "D10_E1_L3" "D10_E1_S1" "D10_E1_S2" "D10_E1_S3" "D10_E1_S4"
## [16] "D10_E1_S5" "D10_E1_S6" "D10_E1_S7" "D10_E1_S8" "D10_E2_L10"
## [21] "D10_E2_L11" "D10_E2_L1" "D10_E2_L2" "D10_E2_L3" "D10_E2_L4"
## [26] "D10_E2_L5" "D10_E2_L6" "D10_E2_L7" "D10_E2_L8" "D10_E2_L9"
## [31] "D10_E2_S1" "D10_E2_S2" "D10_E2_S3" "D10_E2_S4" "D10_E2_S5"
## [36] "D10_E2_S6" "D10_E2_S7" "D10_E2_S8" "D10_E3_S1" "D10_E3_S2"
## [41] "D10_E3_S4" "D10_E3_S5" "D10_E3_S6" "D10_E3_S7" "D10_E3_S8"
## [46] "D10_E4_L1" "D10_E4_L2" "D10_E4_L3" "D10_E4_L4" "D10_E4_L5"
## [51] "D10_E4_L6" "D10_E4_S1" "D10_E4_S2" "D10_E4_S3" "D10_E4_S4"
## [56] "D12_E3_S10" "D12_E3_S11" "D12_E3_S12" "D12_E3_S13" "D12_E3_S14"
## [61] "D12_E3_S15" "D12_E3_S16" "D12_E3_S17" "D12_E3_S18" "D12_E3_S19"
## [66] "D12_E3_S1" "D12_E3_S20" "D12_E3_S21" "D12_E3_S22" "D12_E3_S23"
## [71] "D12_E3_S24" "D12_E3_S2" "D12_E3_S3" "D12_E3_S4" "D12_E3_S5"
## [76] "D12_E3_S6" "D12_E3_S7" "D12_E3_S8" "D12_E3_S9" "D12_E4_EVT10"
## [81] "D12_E4_EVT11" "D12_E4_EVT12" "D12_E4_EVT13" "D12_E4_EVT1" "D12_E4_EVT2"
## [86] "D12_E4_EVT3" "D12_E4_EVT4" "D12_E4_EVT5" "D12_E4_EVT6" "D12_E4_EVT7"
## [91] "D12_E4_EVT8" "D12_E4_L1" "D12_E4_L2" "D12_E4_L3" "D12_E4_L4"
## [96] "D12_E4_L5" "D12_E4_L6" "D12_E4_S1" "D12_E4_S2" "D12_E4_S3"
## [101] "D12_E4_S4" "D12_E6_L1" "D12_E6_L2" "D12_E6_L3" "D12_E6_L4"
## [106] "D12_E6_L5" "D12_E6_L6" "D12_E6_S1" "D12_E6_S2" "D12_E6_S3"
## [111] "D12_E6_S4" "D12_EVT2" "D12_EVT3" "D12_EVT4" "D12_EVT5"
## [116] "D8_E1_S1" "D8_E1_S2" "D8_E1_S3" "D8_E1_S4" "D8_E1_S5"
## [121] "D8_E1_S6" "D8_E1_S7" "D8_E1_S8" "D8_E2_S1" "D8_E2_S2"
## [126] "D8_E2_S3" "D8_E2_S4" "D8_E2_S5" "D8_E2_S6" "D8_E2_S7"
## [131] "D8_E2_S8" "D8_E3_L2" "D8_E3_L3" "D8_E3_S1" "D8_E3_S2"
## [136] "D8_E3_S3" "D8_E3_S4" "D8_E3_S5" "D8_E3_S6" "D8_E3_S7"
## [141] "D8_E3_S8" "D8_E4_S2" "D8_E4_S3"
setdiff(colnames(matrix_readcount_preprocessed)[9:143], cell_metadata$sample_description)
## [1] "D10_E1_S5"
# D10_E1 should be D10_E1_S5
# D12_EVT2, D12_EVT3, D12_EVT4, D12_EVT5 are from D12_E3
cell_metadata <- cell_metadata %>%
mutate(
sample_description = case_when(
sample_description == "D10_E1" ~ "D10_E1_S5",
TRUE ~ .$sample_description
),
#
embryo = str_extract(
string = sample_description,
pattern = "^D\\d+_E\\d+"
),
embryo = case_when(
is.na(embryo) ~ "D12_E3",
TRUE ~ embryo
)
) %>%
left_join(cell_identity, by = "sample_description") %>%
mutate(
lineage2 = case_when(
lineage == c("S") ~ "CTB",
lineage == c("Pre_L") ~ "CTB",
lineage == c("Pre_E") ~ "CTB",
lineage == c("L") ~ "STB",
lineage == c("M") ~ "MTB",
lineage == c("E") ~ "MTB",
lineage == c("L_E") ~ "STB",
is.na(lineage) ~ "N/A",
TRUE ~ lineage
),
lineage2 = factor(
lineage2,
levels = c("CTB", "STB", "MTB", "N/A")
),
#
time2 = str_extract(
string = time,
pattern = str_extract(cell_metadata$time, pattern = "Day\\s\\d+")
),
time2 = factor(
time2,
levels = str_sort(unique(time2), numeric = TRUE)
),
#
embryo = factor(
embryo,
levels = c(
"D8_E1",
"D8_E2",
"D8_E3",
"D8_E4",
"D10_E1",
"D10_E2",
"D10_E3",
"D10_E4",
"D12_E3",
"D12_E4",
"D12_E6"
)
)
)
Table S2, raw
cell_metadata %>%
group_by(embryo, lineage) %>%
summarise(
num_cells = n()
) %>%
pivot_wider(names_from = lineage, values_from = num_cells) %>%
as.data.frame() %>%
mutate_if(is.numeric, replace_na, replace = 0) %>%
arrange(embryo) %>%
gt::gt() %>%
# gt::cols_label(n = "num_cells") %>%
gt::tab_options(table.font.size = "median")
## `summarise()` regrouping output by 'embryo' (override with `.groups` argument)
embryo | Pre_L | S | L | NA | Pre_E | E | L_E |
---|---|---|---|---|---|---|---|
D8_E1 | 1 | 7 | 0 | 0 | 0 | 0 | 0 |
D8_E2 | 0 | 8 | 0 | 0 | 0 | 0 | 0 |
D8_E3 | 0 | 7 | 4 | 1 | 0 | 0 | 0 |
D8_E4 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
D10_E1 | 5 | 3 | 3 | 0 | 0 | 0 | 0 |
D10_E2 | 3 | 0 | 11 | 1 | 4 | 0 | 0 |
D10_E3 | 0 | 2 | 0 | 4 | 2 | 0 | 0 |
D10_E4 | 4 | 0 | 5 | 1 | 0 | 0 | 0 |
D12_E3 | 2 | 18 | 0 | 1 | 3 | 4 | 0 |
D12_E4 | 0 | 0 | 3 | 0 | 4 | 12 | 3 |
D12_E6 | 0 | 0 | 2 | 5 | 3 | 0 | 0 |
Table S2
S: Small cell
Pre-L, Pre-E: Small cell
L: Primitive synthtium
E: Migratory cell
Note: S= small cell (CTB); Pre-L (Pre-STB), Pre-E (Pre-MTB); L= primitive synthtium (STB); E=Migratory cell (MTB)
cell_metadata %>%
group_by(embryo, lineage2) %>%
summarise(
num_cells = n()
) %>%
pivot_wider(names_from = lineage2, values_from = num_cells) %>%
as.data.frame() %>%
mutate_if(is.numeric, replace_na, replace = 0) %>%
arrange(embryo) %>%
gt::gt() %>%
# gt::cols_label(n = "num_cells") %>%
gt::cols_move_to_start(columns = vars(embryo, CTB, STB, MTB)) %>%
gt::tab_options(table.font.size = "median")
## `summarise()` regrouping output by 'embryo' (override with `.groups` argument)
embryo | CTB | STB | MTB | N/A |
---|---|---|---|---|
D8_E1 | 8 | 0 | 0 | 0 |
D8_E2 | 8 | 0 | 0 | 0 |
D8_E3 | 7 | 4 | 0 | 1 |
D8_E4 | 2 | 0 | 0 | 1 |
D10_E1 | 8 | 3 | 0 | 0 |
D10_E2 | 7 | 11 | 0 | 1 |
D10_E3 | 4 | 0 | 0 | 4 |
D10_E4 | 4 | 5 | 0 | 1 |
D12_E3 | 23 | 0 | 4 | 1 |
D12_E4 | 4 | 6 | 12 | 0 |
D12_E6 | 3 | 2 | 0 | 5 |
Load featureCounts output.
matrix_readcount_use <- read_delim(
file = "../matrix/xaa_Aligned.sortedByCoord.out_deduped_q10_gene_id_featureCounts.txt.gz",
delim = "\t",
col_names = TRUE,
skip = 1
) %>%
select(-(2:6))
## Parsed with column specification:
## cols(
## .default = col_double(),
## Geneid = col_character(),
## Chr = col_character(),
## Start = col_character(),
## End = col_character(),
## Strand = col_character()
## )
## See spec(...) for full column specifications.
colnames(matrix_readcount_use) <- str_extract(
string = colnames(matrix_readcount_use),
pattern = "S.+\\d+"
)
matrix_readcount_use <- Matrix(
data = as.matrix(matrix_readcount_use[, -1]),
dimnames = list(
matrix_readcount_use[, 1, drop = TRUE],
colnames(matrix_readcount_use)[-1]
),
sparse = TRUE
)
matrix_readcount_use <- matrix_readcount_use[, colnames(matrix_readcount_use)]
matrix_readcount_use %>%
dim()
## [1] 33538 139
stopifnot(
rownames(matrix_readcount_use) == gene_symbo_info$X1
)
rownames(matrix_readcount_use) <- paste(
rownames(matrix_readcount_use),
gene_symbo_info$X2,
sep = "_"
)
colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) %>%
enframe(value = "run") %>%
left_join(cell_metadata, by = c("run" = "run")) %>%
pull("sample_name")
matrix_cpm_use <- calc_cpm(m = matrix_readcount_use)
Summarize sequencing statistics.
cell_metadata %>%
mutate(
num_umis = matrix_readcount_use[, .$sample_name] %>% colSums(),
num_genes = (matrix_readcount_use[, .$sample_name] > 0) %>% colSums()
) %>%
group_by(time) %>%
summarise(
num_cells = n(),
num_umis = median(num_umis),
num_genes = median(num_genes)
) %>%
arrange(desc(time)) %>%
as.data.frame() %>%
gt::gt() %>%
gt::tab_options(table.font.size = "median")
## `summarise()` ungrouping output (override with `.groups` argument)
time | num_cells | num_umis | num_genes |
---|---|---|---|
Day 8 (Extended culture) | 31 | 8609696 | 11131.0 |
Day 12 (Extended culture) | 60 | 9432453 | 11708.5 |
Day 10 (Extended culture) | 48 | 6803354 | 12894.0 |
Filter uninformative genes.
matrix_readcount_norm <- matrix_readcount_use
matrix_readcount_norm <- matrix_readcount_norm[
Matrix::rowSums(
matrix_readcount_norm
) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]
dim(matrix_readcount_norm)
## [1] 19192 139
Median normalize matrix.
matrix_readcount_norm@x <- median(colSums(matrix_readcount_norm)) *
(matrix_readcount_norm@x / rep.int(
colSums(matrix_readcount_norm),
diff(matrix_readcount_norm@p)
))
matrix_readcount_norm_log <- matrix_readcount_norm
matrix_readcount_norm_log@x <- log1p(matrix_readcount_norm_log@x)
# z-score
matrix_readcount_norm_log_scaled <- t(
scale(
t(
matrix_readcount_norm_log
),
center = TRUE, scale = TRUE
)
)
# features_var <- apply(matrix_readcount_norm_log_scaled, 1, var)
# features_use <- rownames(matrix_readcount_norm_log_scaled)[features_var > 0]
pca_out <- prcomp(
t(matrix_readcount_norm_log_scaled),
center = FALSE,
scale = FALSE
)
summary(pca_out)$imp %>%
t() %>%
as.data.frame() %>%
rownames_to_column(var = "component") %>%
mutate(
rank = 1:n()
) %>%
slice(1:30) %>%
ggplot(
aes(
x = rank,
y = `Proportion of Variance`
)
) +
geom_point(size = 0.3) +
theme_bw() +
scale_x_continuous(
name = "Component",
breaks = c(1, seq(5, 30, 5))
) +
scale_y_continuous(
name = "Proportion of variance",
labels = scales::percent
) +
theme(
axis.title = ggplot2::element_text(family = "Arial", size = 6),
axis.text = ggplot2::element_text(family = "Arial", size = 6),
panel.grid.minor = ggplot2::element_blank()
)
embedding <- cell_metadata %>%
left_join(
pca_out$x %>%
as.data.frame() %>%
rownames_to_column(var = "sample_name"),
by = "sample_name"
) %>%
dplyr::select(sample_name:PC3) %>%
dplyr::rename(
x_pca = PC1,
y_pca = PC2,
z_pca = PC3
)
p_embedding_pca1 <- embedding %>%
mutate(
category = "PCA"
) %>%
sample_frac() %>%
plot_pca(
x = x_pca,
y = y_pca,
color = time2,
shape = lineage2
) +
scale_color_manual(
values = as.character(yarrr::piratepal(palette = "google"))
) +
# scale_shape_manual(values = 21) +
add_xy_label_pca()
p_embedding_pca2 <- embedding %>%
mutate(
category = "PCA"
) %>%
sample_frac() %>%
plot_pca(
x = x_pca,
y = z_pca,
color = time2,
shape = lineage2
) +
scale_color_manual(
values = as.character(yarrr::piratepal(palette = "google"))
) +
add_xy_label_pca(y = "PC3")
p_embedding_pca1 +
p_embedding_pca2 +
plot_annotation(theme = theme(plot.margin = margin())) +
plot_layout(guides = "collect")
N_COMPONENTS <- 15
set.seed(seed = SEED_USE)
embedding_rtsne <- Rtsne::Rtsne(
X = pca_out$x[, 1:N_COMPONENTS],
perplexity = 5,
check_duplicates = FALSE,
pca = FALSE,
max_iter = 3000,
verbose = TRUE
)$Y
## Read the 139 x 15 data matrix successfully!
## Using no_dims = 2, perplexity = 5.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.143471)!
## Learning embedding...
## Iteration 50: error is 68.233127 (50 iterations in 0.01 seconds)
## Iteration 100: error is 66.399104 (50 iterations in 0.01 seconds)
## Iteration 150: error is 65.218464 (50 iterations in 0.01 seconds)
## Iteration 200: error is 66.530819 (50 iterations in 0.01 seconds)
## Iteration 250: error is 64.809300 (50 iterations in 0.01 seconds)
## Iteration 300: error is 2.381987 (50 iterations in 0.01 seconds)
## Iteration 350: error is 1.116405 (50 iterations in 0.01 seconds)
## Iteration 400: error is 0.631274 (50 iterations in 0.01 seconds)
## Iteration 450: error is 0.480870 (50 iterations in 0.01 seconds)
## Iteration 500: error is 0.458526 (50 iterations in 0.01 seconds)
## Iteration 550: error is 0.457536 (50 iterations in 0.01 seconds)
## Iteration 600: error is 0.451522 (50 iterations in 0.01 seconds)
## Iteration 650: error is 0.440383 (50 iterations in 0.01 seconds)
## Iteration 700: error is 0.439046 (50 iterations in 0.01 seconds)
## Iteration 750: error is 0.435972 (50 iterations in 0.01 seconds)
## Iteration 800: error is 0.427690 (50 iterations in 0.01 seconds)
## Iteration 850: error is 0.424590 (50 iterations in 0.01 seconds)
## Iteration 900: error is 0.422803 (50 iterations in 0.01 seconds)
## Iteration 950: error is 0.421123 (50 iterations in 0.01 seconds)
## Iteration 1000: error is 0.418542 (50 iterations in 0.01 seconds)
## Iteration 1050: error is 0.418290 (50 iterations in 0.01 seconds)
## Iteration 1100: error is 0.414548 (50 iterations in 0.01 seconds)
## Iteration 1150: error is 0.413519 (50 iterations in 0.01 seconds)
## Iteration 1200: error is 0.412369 (50 iterations in 0.01 seconds)
## Iteration 1250: error is 0.406898 (50 iterations in 0.01 seconds)
## Iteration 1300: error is 0.409023 (50 iterations in 0.01 seconds)
## Iteration 1350: error is 0.405329 (50 iterations in 0.01 seconds)
## Iteration 1400: error is 0.405205 (50 iterations in 0.01 seconds)
## Iteration 1450: error is 0.409830 (50 iterations in 0.01 seconds)
## Iteration 1500: error is 0.407525 (50 iterations in 0.01 seconds)
## Iteration 1550: error is 0.407099 (50 iterations in 0.01 seconds)
## Iteration 1600: error is 0.405904 (50 iterations in 0.01 seconds)
## Iteration 1650: error is 0.405214 (50 iterations in 0.01 seconds)
## Iteration 1700: error is 0.404648 (50 iterations in 0.01 seconds)
## Iteration 1750: error is 0.404208 (50 iterations in 0.01 seconds)
## Iteration 1800: error is 0.406493 (50 iterations in 0.01 seconds)
## Iteration 1850: error is 0.408674 (50 iterations in 0.01 seconds)
## Iteration 1900: error is 0.406626 (50 iterations in 0.01 seconds)
## Iteration 1950: error is 0.407169 (50 iterations in 0.01 seconds)
## Iteration 2000: error is 0.409883 (50 iterations in 0.01 seconds)
## Iteration 2050: error is 0.408436 (50 iterations in 0.01 seconds)
## Iteration 2100: error is 0.406734 (50 iterations in 0.01 seconds)
## Iteration 2150: error is 0.404644 (50 iterations in 0.01 seconds)
## Iteration 2200: error is 0.402225 (50 iterations in 0.01 seconds)
## Iteration 2250: error is 0.405850 (50 iterations in 0.01 seconds)
## Iteration 2300: error is 0.405466 (50 iterations in 0.01 seconds)
## Iteration 2350: error is 0.404437 (50 iterations in 0.01 seconds)
## Iteration 2400: error is 0.401076 (50 iterations in 0.01 seconds)
## Iteration 2450: error is 0.403112 (50 iterations in 0.01 seconds)
## Iteration 2500: error is 0.404353 (50 iterations in 0.01 seconds)
## Iteration 2550: error is 0.404927 (50 iterations in 0.01 seconds)
## Iteration 2600: error is 0.403501 (50 iterations in 0.01 seconds)
## Iteration 2650: error is 0.403125 (50 iterations in 0.01 seconds)
## Iteration 2700: error is 0.404316 (50 iterations in 0.01 seconds)
## Iteration 2750: error is 0.407113 (50 iterations in 0.01 seconds)
## Iteration 2800: error is 0.403180 (50 iterations in 0.01 seconds)
## Iteration 2850: error is 0.405307 (50 iterations in 0.01 seconds)
## Iteration 2900: error is 0.404784 (50 iterations in 0.01 seconds)
## Iteration 2950: error is 0.404059 (50 iterations in 0.01 seconds)
## Iteration 3000: error is 0.403463 (50 iterations in 0.01 seconds)
## Fitting performed in 0.55 seconds.
embedding <- cbind(
embedding,
embedding_rtsne
) %>%
dplyr::rename(
x_tsne = "1",
y_tsne = "2"
)
p_embedding_tsne <- embedding %>%
mutate(
category = "t-SNE"
) %>%
sample_frac() %>%
plot_pca(
x = x_tsne,
y = y_tsne,
color = time2,
shape = lineage2
) +
labs(x = "Dim 1", y = "Dim 2") +
scale_color_manual(
values = as.character(yarrr::piratepal(palette = "google"))
)
embedding %>%
dplyr::count(time) %>%
gt::gt() %>%
gt::cols_label(n = "num_cells") %>%
gt::tab_options(table.font.size = "median")
time | num_cells |
---|---|
Day 10 (Extended culture) | 48 |
Day 12 (Extended culture) | 60 |
Day 8 (Extended culture) | 31 |
N_COMPONENTS <- 15
set.seed(seed = SEED_USE)
embedding_umap <- uwot::umap(
X = pca_out$x[, 1:N_COMPONENTS],
n_neighbors = 15,
n_components = 2,
metric = "cosine",
spread = 1,
min_dist = 0.01,
verbose = TRUE
)
## 15:26:50 UMAP embedding parameters a = 1.896 b = 0.8006
## 15:26:50 Read 139 rows and found 15 numeric columns
## 15:26:50 Using Annoy for neighbor search, n_neighbors = 15
## 15:26:51 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 15:26:51 Writing NN index file to temp file /var/folders/h1/78b2tkd552ngjl6_ps_5sw7r0000gn/T//RtmpwESxWL/file3d565a39ffef
## 15:26:51 Searching Annoy index using 4 threads, search_k = 1500
## 15:26:51 Annoy recall = 100%
## 15:26:51 Commencing smooth kNN distance calibration using 4 threads
## 15:26:52 Initializing from normalized Laplacian + noise
## 15:26:52 Commencing optimization for 500 epochs, with 2450 positive edges
## 15:26:52 Optimization finished
embedding <- cbind(
embedding,
embedding_umap
) %>%
dplyr::rename(
x_umap = "1",
y_umap = "2"
)
p_embedding_umap <- embedding %>%
mutate(
category = "UMAP"
) %>%
sample_frac() %>%
plot_pca(
x = x_umap,
y = y_umap,
color = time2,
shape = lineage2
) +
labs(x = "Dim 1", y = "Dim 2") +
scale_color_manual(
values = as.character(yarrr::piratepal(palette = "google"))
)
Fig. 3C
p_embedding_tsne +
p_embedding_umap +
plot_annotation(theme = theme(plot.margin = margin())) +
plot_layout(nrow = 1, guides = "collect")
p_embedding_tsne_embryo <- embedding %>%
mutate(
category = "t-SNE"
) %>%
sample_frac() %>%
plot_pca(
x = x_tsne,
y = y_tsne,
color = embryo,
shape = lineage2
) +
labs(x = "Dim 1", y = "Dim 2") +
scale_color_manual(
values = ggthemes::tableau_color_pal("Tableau 20")(
length(unique(embedding$embryo))
)
)
p_embedding_umap_embryo <- embedding %>%
mutate(
category = "UMAP"
) %>%
sample_frac() %>%
plot_pca(
x = x_umap,
y = y_umap,
color = embryo,
shape = lineage2
) +
labs(x = "Dim 1", y = "Dim 2") +
scale_color_manual(
values = ggthemes::tableau_color_pal("Tableau 20")(
length(unique(embedding$embryo))
)
)
p_embedding_tsne_embryo +
p_embedding_umap_embryo +
plot_annotation(theme = theme(plot.margin = margin())) +
plot_layout(nrow = 1, guides = "collect")
p_embedding_tsne_embryo +
add_point_labels(
x = x_tsne,
y = y_tsne,
z = embryo
)