West, R.C., Ming, H., Logsdon, D.M., Sun, J., Rajput, S.K., Kile, R.A., Schoolcraft, W.B., Roberts, R.M., Krisher, R.L., Jiang, Z., et al. (2019). Dynamics of trophoblast differentiation in peri-implantation-stage human embryos. Proc. Natl. Acad. Sci. U. S. A. 116, 22635–22644.

BioProject Accession: PRJNA534673
GEO Accession: GSE130289

Load required packages.

library(tidyverse)
library(magrittr)
library(Matrix)
library(extrafont)
library(ggrepel)
library(patchwork)
# library(tidylog)

Sys.time()

## [1] "2020-08-19 15:26:39 CDT"

Data preparation

SEED_USE <- 20200616
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE <- 60

Functions loading

source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)

Data loading

Prepare metadata for single cells.

# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE130289
geo_info <- tibble::tribble(
    ~sample_name, ~sample_description,
    "GSM3735306",         "D10_E1_L1",
    "GSM3735307",         "D10_E1_L2",
    "GSM3735308",         "D10_E1_L3",
    "GSM3735309",            "D10_E1",
    "GSM3735310",         "D10_E1_S1",
    "GSM3735311",         "D10_E1_S2",
    "GSM3735312",         "D10_E1_S3",
    "GSM3735313",         "D10_E1_S4",
    "GSM3735314",         "D10_E1_S6",
    "GSM3735315",         "D10_E1_S7",
    "GSM3735316",         "D10_E1_S8",
    "GSM3735317",        "D10_E2_L10",
    "GSM3735318",        "D10_E2_L11",
    "GSM3735319",         "D10_E2_L1",
    "GSM3735320",         "D10_E2_L2",
    "GSM3735321",         "D10_E2_L3",
    "GSM3735322",         "D10_E2_L4",
    "GSM3735323",         "D10_E2_L5",
    "GSM3735324",         "D10_E2_L6",
    "GSM3735325",         "D10_E2_L7",
    "GSM3735326",         "D10_E2_L8",
    "GSM3735327",         "D10_E2_L9",
    "GSM3735328",         "D10_E2_S1",
    "GSM3735329",         "D10_E2_S2",
    "GSM3735330",         "D10_E2_S3",
    "GSM3735331",         "D10_E2_S4",
    "GSM3735332",         "D10_E2_S5",
    "GSM3735333",         "D10_E2_S6",
    "GSM3735334",         "D10_E2_S7",
    "GSM3735335",         "D10_E2_S8",
    "GSM3735336",         "D10_E3_S1",
    "GSM3735337",         "D10_E3_S2",
    "GSM3735338",         "D10_E3_S3",
    "GSM3735339",         "D10_E3_S4",
    "GSM3735340",         "D10_E3_S5",
    "GSM3735341",         "D10_E3_S6",
    "GSM3735342",         "D10_E3_S7",
    "GSM3735343",         "D10_E3_S8",
    "GSM3735344",         "D10_E4_L1",
    "GSM3735345",         "D10_E4_L2",
    "GSM3735346",         "D10_E4_L3",
    "GSM3735347",         "D10_E4_L4",
    "GSM3735348",         "D10_E4_L5",
    "GSM3735349",         "D10_E4_L6",
    "GSM3735350",         "D10_E4_S1",
    "GSM3735351",         "D10_E4_S2",
    "GSM3735352",         "D10_E4_S3",
    "GSM3735353",         "D10_E4_S4",
    "GSM3735354",        "D12_E3_S10",
    "GSM3735355",        "D12_E3_S11",
    "GSM3735356",        "D12_E3_S12",
    "GSM3735357",        "D12_E3_S13",
    "GSM3735358",        "D12_E3_S14",
    "GSM3735359",        "D12_E3_S15",
    "GSM3735360",        "D12_E3_S16",
    "GSM3735361",        "D12_E3_S17",
    "GSM3735362",        "D12_E3_S18",
    "GSM3735363",        "D12_E3_S19",
    "GSM3735364",         "D12_E3_S1",
    "GSM3735365",        "D12_E3_S20",
    "GSM3735366",        "D12_E3_S21",
    "GSM3735367",        "D12_E3_S22",
    "GSM3735368",        "D12_E3_S23",
    "GSM3735369",        "D12_E3_S24",
    "GSM3735370",         "D12_E3_S2",
    "GSM3735371",         "D12_E3_S3",
    "GSM3735372",         "D12_E3_S4",
    "GSM3735373",         "D12_E3_S5",
    "GSM3735374",         "D12_E3_S6",
    "GSM3735375",         "D12_E3_S7",
    "GSM3735376",         "D12_E3_S8",
    "GSM3735377",         "D12_E3_S9",
    "GSM3735378",      "D12_E4_EVT10",
    "GSM3735379",      "D12_E4_EVT11",
    "GSM3735380",      "D12_E4_EVT12",
    "GSM3735381",      "D12_E4_EVT13",
    "GSM3735382",       "D12_E4_EVT1",
    "GSM3735383",       "D12_E4_EVT2",
    "GSM3735384",       "D12_E4_EVT3",
    "GSM3735385",       "D12_E4_EVT4",
    "GSM3735386",       "D12_E4_EVT5",
    "GSM3735387",       "D12_E4_EVT6",
    "GSM3735388",       "D12_E4_EVT7",
    "GSM3735389",       "D12_E4_EVT8",
    "GSM3735390",         "D12_E4_L1",
    "GSM3735391",         "D12_E4_L2",
    "GSM3735392",         "D12_E4_L3",
    "GSM3735393",         "D12_E4_L4",
    "GSM3735394",         "D12_E4_L5",
    "GSM3735395",         "D12_E4_L6",
    "GSM3735396",         "D12_E4_S1",
    "GSM3735397",         "D12_E4_S2",
    "GSM3735398",         "D12_E4_S3",
    "GSM3735399",         "D12_E4_S4",
    "GSM3735400",         "D12_E6_L1",
    "GSM3735401",         "D12_E6_L2",
    "GSM3735402",         "D12_E6_L3",
    "GSM3735403",         "D12_E6_L4",
    "GSM3735404",         "D12_E6_L5",
    "GSM3735405",         "D12_E6_L6",
    "GSM3735406",         "D12_E6_S1",
    "GSM3735407",         "D12_E6_S2",
    "GSM3735408",         "D12_E6_S3",
    "GSM3735409",         "D12_E6_S4",
    "GSM3735410",          "D12_EVT2",
    "GSM3735411",          "D12_EVT3",
    "GSM3735412",          "D12_EVT4",
    "GSM3735413",          "D12_EVT5",
    "GSM3735414",          "D8_E1_S1",
    "GSM3735415",          "D8_E1_S2",
    "GSM3735416",          "D8_E1_S3",
    "GSM3735417",          "D8_E1_S4",
    "GSM3735418",          "D8_E1_S5",
    "GSM3735419",          "D8_E1_S6",
    "GSM3735420",          "D8_E1_S7",
    "GSM3735421",          "D8_E1_S8",
    "GSM3735422",          "D8_E2_S1",
    "GSM3735423",          "D8_E2_S2",
    "GSM3735424",          "D8_E2_S3",
    "GSM3735425",          "D8_E2_S4",
    "GSM3735426",          "D8_E2_S5",
    "GSM3735427",          "D8_E2_S6",
    "GSM3735428",          "D8_E2_S7",
    "GSM3735429",          "D8_E2_S8",
    "GSM3735430",          "D8_E3_L1",
    "GSM3735431",          "D8_E3_L2",
    "GSM3735432",          "D8_E3_L3",
    "GSM3735433",          "D8_E3_L4",
    "GSM3735434",          "D8_E3_S1",
    "GSM3735435",          "D8_E3_S2",
    "GSM3735436",          "D8_E3_S3",
    "GSM3735437",          "D8_E3_S4",
    "GSM3735438",          "D8_E3_S5",
    "GSM3735439",          "D8_E3_S6",
    "GSM3735440",          "D8_E3_S7",
    "GSM3735441",          "D8_E3_S8",
    "GSM3735442",          "D8_E4_S1",
    "GSM3735443",          "D8_E4_S2",
    "GSM3735444",          "D8_E4_S3"
)

# cell lineage annotation
cell_identity <- tibble::tribble(
    ~sample_description, ~lineage,
    "D8_E1_S1", "S",
    "D8_E1_S2", "S",
    "D8_E1_S3", "S",
    "D8_E1_S4", "S",
    "D8_E1_S5", "S",
    "D8_E1_S6", "S",
    "D8_E1_S7", "S",
    "D8_E2_S1", "S",
    "D8_E2_S2", "S",
    "D8_E2_S3", "S",
    "D8_E2_S4", "S",
    "D8_E2_S5", "S",
    "D8_E2_S6", "S",
    "D8_E2_S7", "S",
    "D8_E2_S8", "S",
    "D8_E3_S1", "S",
    "D8_E3_S2", "S",
    "D8_E3_S3", "S",
    "D8_E3_S4", "S",
    "D8_E3_S5", "S",
    "D8_E3_S7", "S",
    "D8_E3_S8", "S",
    "D8_E4_S3", "S",
    "D8_E1_S8", "Pre_L",
    "D8_E4_S2", "Pre_E",
    "D8_E3_L1", "L",
    "D8_E3_L2", "L",
    "D8_E3_L3", "L",
    "D8_E3_L4", "L",
    "D10_E1_S5", "S",
    "D10_E1_S7", "S",
    "D10_E1_S8", "S",
    "D10_E3_S6", "S",
    "D10_E3_S7", "S",
    "D10_E2_S8", "Pre_L",
    "D10_E2_S1", "Pre_L",
    "D10_E2_S2", "Pre_L",
    "D10_E1_S6", "Pre_L",
    "D10_E1_S1", "Pre_L",
    "D10_E1_S2", "Pre_L",
    "D10_E1_S3", "Pre_L",
    "D10_E1_S4", "Pre_L",
    "D10_E4_S1", "Pre_L",
    "D10_E4_S2", "Pre_L",
    "D10_E4_S3", "Pre_L",
    "D10_E4_S4", "Pre_L",
    "D10_E3_S4", "Pre_E",
    "D10_E3_S5", "Pre_E",
    "D10_E2_S3", "Pre_E",
    "D10_E2_S5", "Pre_E",
    "D10_E2_S6", "Pre_E",
    "D10_E2_S7", "Pre_E",
    "D10_E4_L1", "L",
    "D10_E4_L2", "L",
    "D10_E4_L3", "L",
    "D10_E4_L5", "L",
    "D10_E4_L6", "L",
    "D10_E1_L1", "L",
    "D10_E1_L2", "L",
    "D10_E1_L3", "L",
    "D10_E2_L10", "L",
    "D10_E2_L11", "L",
    "D10_E2_L1", "L",
    "D10_E2_L2", "L",
    "D10_E2_L3", "L",
    "D10_E2_L4", "L",
    "D10_E2_L5", "L",
    "D10_E2_L6", "L",
    "D10_E2_L7", "L",
    "D10_E2_L8", "L",
    "D10_E2_L9", "L",
    "D12_E3_S10", "S",
    "D12_E3_S11", "S",
    "D12_E3_S12", "S",
    "D12_E3_S14", "S",
    "D12_E3_S15", "S",
    "D12_E3_S16", "S",
    "D12_E3_S17", "S",
    "D12_E3_S18", "S",
    "D12_E3_S19", "S",
    "D12_E3_S20", "S",
    "D12_E3_S21", "S",
    "D12_E3_S22", "S",
    "D12_E3_S23", "S",
    "D12_E3_S24", "S",
    "D12_E3_S3", "S",
    "D12_E3_S4", "S",
    "D12_E3_S7", "S",
    "D12_E3_S9", "S",
    "D12_E3_S1", "Pre_L",
    "D12_E3_S8", "Pre_L",
    "D12_E3_S2", "Pre_E",
    "D12_E3_S5", "Pre_E",
    "D12_E3_S6", "Pre_E",
    "D12_E4_S1", "Pre_E",
    "D12_E4_S2", "Pre_E",
    "D12_E4_S3", "Pre_E",
    "D12_E4_S4", "Pre_E",
    "D12_E6_S1", "Pre_E",
    "D12_E6_S3", "Pre_E",
    "D12_E6_S4", "Pre_E",
    "D12_E4_L2", "L",
    "D12_E4_L3", "L",
    "D12_E4_L5", "L",
    "D12_E6_L1", "L",
    "D12_E6_L6", "L",
    "D12_E4_L1", "L_E",
    "D12_E4_L4", "L_E",
    "D12_E4_L6", "L_E",
    "D12_EVT2", "E",
    "D12_EVT3", "E",
    "D12_EVT4", "E",
    "D12_EVT5", "E",
    "D12_E4_EVT10", "E",
    "D12_E4_EVT11", "E",
    "D12_E4_EVT12", "E",
    "D12_E4_EVT13", "E",
    "D12_E4_EVT1", "E",
    "D12_E4_EVT2", "E",
    "D12_E4_EVT3", "E",
    "D12_E4_EVT4", "E",
    "D12_E4_EVT5", "E",
    "D12_E4_EVT6", "E",
    "D12_E4_EVT7", "E",
    "D12_E4_EVT8", "E"
)

# https://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP193790
cell_metadata <- read_delim(
    file = "../SraRunTable.txt",
    delim = ","
) %>%
    rename_all(. %>% tolower()) %>%
    `colnames<-`(str_replace(
        string = colnames(.),
        pattern = " ", replacement = "_"
    )) %>%
    select(
        sample_name,
        run,
        biosample,
        cell_type,
        source_name,
        time
    ) %>%
    left_join(geo_info, by = "sample_name")

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   AvgSpotLen = col_double(),
##   Bases = col_double(),
##   Bytes = col_double(),
##   ReleaseDate = col_datetime(format = "")
## )

## See spec(...) for full column specifications.

cell_metadata

Fix missing embryo info based on the preprocessed matrix and Table S2.

D10_E1 should be D10_E1_S5.
D12_EVT2, D12_EVT3, D12_EVT4, D12_EVT5 are from D12_E3.

matrix_readcount_preprocessed <- read_delim(
    file = "GSE130289_Jiang_et_al.__human_trophoblast_RNAseq.txt.gz",
    delim = "\t"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Chromosome = col_character(),
##   Strand = col_character(),
##   `Gene Symbol` = col_character(),
##   gene_biotype = col_character(),
##   gene_name = col_character(),
##   gene_source = col_character()
## )
## See spec(...) for full column specifications.
colnames(matrix_readcount_preprocessed)
##   [1] "Chromosome"   "Start"        "Stop"         "Strand"       "Gene Symbol" 
##   [6] "gene_biotype" "gene_name"    "gene_source"  "D10_E1_L1"    "D10_E1_L2"   
##  [11] "D10_E1_L3"    "D10_E1_S1"    "D10_E1_S2"    "D10_E1_S3"    "D10_E1_S4"   
##  [16] "D10_E1_S5"    "D10_E1_S6"    "D10_E1_S7"    "D10_E1_S8"    "D10_E2_L10"  
##  [21] "D10_E2_L11"   "D10_E2_L1"    "D10_E2_L2"    "D10_E2_L3"    "D10_E2_L4"   
##  [26] "D10_E2_L5"    "D10_E2_L6"    "D10_E2_L7"    "D10_E2_L8"    "D10_E2_L9"   
##  [31] "D10_E2_S1"    "D10_E2_S2"    "D10_E2_S3"    "D10_E2_S4"    "D10_E2_S5"   
##  [36] "D10_E2_S6"    "D10_E2_S7"    "D10_E2_S8"    "D10_E3_S1"    "D10_E3_S2"   
##  [41] "D10_E3_S4"    "D10_E3_S5"    "D10_E3_S6"    "D10_E3_S7"    "D10_E3_S8"   
##  [46] "D10_E4_L1"    "D10_E4_L2"    "D10_E4_L3"    "D10_E4_L4"    "D10_E4_L5"   
##  [51] "D10_E4_L6"    "D10_E4_S1"    "D10_E4_S2"    "D10_E4_S3"    "D10_E4_S4"   
##  [56] "D12_E3_S10"   "D12_E3_S11"   "D12_E3_S12"   "D12_E3_S13"   "D12_E3_S14"  
##  [61] "D12_E3_S15"   "D12_E3_S16"   "D12_E3_S17"   "D12_E3_S18"   "D12_E3_S19"  
##  [66] "D12_E3_S1"    "D12_E3_S20"   "D12_E3_S21"   "D12_E3_S22"   "D12_E3_S23"  
##  [71] "D12_E3_S24"   "D12_E3_S2"    "D12_E3_S3"    "D12_E3_S4"    "D12_E3_S5"   
##  [76] "D12_E3_S6"    "D12_E3_S7"    "D12_E3_S8"    "D12_E3_S9"    "D12_E4_EVT10"
##  [81] "D12_E4_EVT11" "D12_E4_EVT12" "D12_E4_EVT13" "D12_E4_EVT1"  "D12_E4_EVT2" 
##  [86] "D12_E4_EVT3"  "D12_E4_EVT4"  "D12_E4_EVT5"  "D12_E4_EVT6"  "D12_E4_EVT7" 
##  [91] "D12_E4_EVT8"  "D12_E4_L1"    "D12_E4_L2"    "D12_E4_L3"    "D12_E4_L4"   
##  [96] "D12_E4_L5"    "D12_E4_L6"    "D12_E4_S1"    "D12_E4_S2"    "D12_E4_S3"   
## [101] "D12_E4_S4"    "D12_E6_L1"    "D12_E6_L2"    "D12_E6_L3"    "D12_E6_L4"   
## [106] "D12_E6_L5"    "D12_E6_L6"    "D12_E6_S1"    "D12_E6_S2"    "D12_E6_S3"   
## [111] "D12_E6_S4"    "D12_EVT2"     "D12_EVT3"     "D12_EVT4"     "D12_EVT5"    
## [116] "D8_E1_S1"     "D8_E1_S2"     "D8_E1_S3"     "D8_E1_S4"     "D8_E1_S5"    
## [121] "D8_E1_S6"     "D8_E1_S7"     "D8_E1_S8"     "D8_E2_S1"     "D8_E2_S2"    
## [126] "D8_E2_S3"     "D8_E2_S4"     "D8_E2_S5"     "D8_E2_S6"     "D8_E2_S7"    
## [131] "D8_E2_S8"     "D8_E3_L2"     "D8_E3_L3"     "D8_E3_S1"     "D8_E3_S2"    
## [136] "D8_E3_S3"     "D8_E3_S4"     "D8_E3_S5"     "D8_E3_S6"     "D8_E3_S7"    
## [141] "D8_E3_S8"     "D8_E4_S2"     "D8_E4_S3"
setdiff(colnames(matrix_readcount_preprocessed)[9:143], cell_metadata$sample_description)
## [1] "D10_E1_S5"

# D10_E1 should be D10_E1_S5
# D12_EVT2, D12_EVT3, D12_EVT4, D12_EVT5 are from D12_E3
cell_metadata <- cell_metadata %>%
    mutate(
        sample_description = case_when(
            sample_description == "D10_E1" ~ "D10_E1_S5",
            TRUE ~ .$sample_description
        ),
        #
        embryo = str_extract(
            string = sample_description,
            pattern = "^D\\d+_E\\d+"
        ),
        embryo = case_when(
            is.na(embryo) ~ "D12_E3",
            TRUE ~ embryo
        )
    ) %>%
    left_join(cell_identity, by = "sample_description") %>%
    mutate(
        lineage2 = case_when(
            lineage == c("S") ~ "CTB",
            lineage == c("Pre_L") ~ "CTB",
            lineage == c("Pre_E") ~ "CTB",
            lineage == c("L") ~ "STB",
            lineage == c("M") ~ "MTB",
            lineage == c("E") ~ "MTB",
            lineage == c("L_E") ~ "STB",
            is.na(lineage) ~ "N/A",
            TRUE ~ lineage
        ),
        lineage2 = factor(
            lineage2,
            levels = c("CTB", "STB", "MTB", "N/A")
        ),
        #
        time2 = str_extract(
            string = time,
            pattern = str_extract(cell_metadata$time, pattern = "Day\\s\\d+")
        ),
        time2 = factor(
            time2,
            levels = str_sort(unique(time2), numeric = TRUE)
        ),
        #
        embryo = factor(
            embryo,
            levels = c(
                "D8_E1",
                "D8_E2",
                "D8_E3",
                "D8_E4",
                "D10_E1",
                "D10_E2",
                "D10_E3",
                "D10_E4",
                "D12_E3",
                "D12_E4",
                "D12_E6"
            )
        )
    )

Table S2, raw

cell_metadata %>%
    group_by(embryo, lineage) %>%
    summarise(
        num_cells = n()
    ) %>%
    pivot_wider(names_from = lineage, values_from = num_cells) %>%
    as.data.frame() %>%
    mutate_if(is.numeric, replace_na, replace = 0) %>%
    arrange(embryo) %>%
    gt::gt() %>%
    # gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

## `summarise()` regrouping output by 'embryo' (override with `.groups` argument)

embryo	Pre_L	S	L	NA	Pre_E	E	L_E
D8_E1	1	7	0	0	0	0	0
D8_E2	0	8	0	0	0	0	0
D8_E3	0	7	4	1	0	0	0
D8_E4	0	1	0	1	1	0	0
D10_E1	5	3	3	0	0	0	0
D10_E2	3	0	11	1	4	0	0
D10_E3	0	2	0	4	2	0	0
D10_E4	4	0	5	1	0	0	0
D12_E3	2	18	0	1	3	4	0
D12_E4	0	0	3	0	4	12	3
D12_E6	0	0	2	5	3	0	0

Table S2

S: Small cell
Pre-L, Pre-E: Small cell
L: Primitive synthtium
E: Migratory cell

Note: S= small cell (CTB); Pre-L (Pre-STB), Pre-E (Pre-MTB); L= primitive synthtium (STB); E=Migratory cell (MTB)

cell_metadata %>%
    group_by(embryo, lineage2) %>%
    summarise(
        num_cells = n()
    ) %>%
    pivot_wider(names_from = lineage2, values_from = num_cells) %>%
    as.data.frame() %>%
    mutate_if(is.numeric, replace_na, replace = 0) %>%
    arrange(embryo) %>%
    gt::gt() %>%
    # gt::cols_label(n = "num_cells") %>%
    gt::cols_move_to_start(columns = vars(embryo, CTB, STB, MTB)) %>%
    gt::tab_options(table.font.size = "median")

## `summarise()` regrouping output by 'embryo' (override with `.groups` argument)

embryo	CTB	STB	MTB	N/A
D8_E1	8	0	0	0
D8_E2	8	0	0	0
D8_E3	7	4	0	1
D8_E4	2	0	0	1
D10_E1	8	3	0	0
D10_E2	7	11	0	1
D10_E3	4	0	0	4
D10_E4	4	5	0	1
D12_E3	23	0	4	1
D12_E4	4	6	12	0
D12_E6	3	2	0	5

Load featureCounts output.

matrix_readcount_use <- read_delim(
    file = "../matrix/xaa_Aligned.sortedByCoord.out_deduped_q10_gene_id_featureCounts.txt.gz",
    delim = "\t",
    col_names = TRUE,
    skip = 1
) %>%
    select(-(2:6))

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Geneid = col_character(),
##   Chr = col_character(),
##   Start = col_character(),
##   End = col_character(),
##   Strand = col_character()
## )

## See spec(...) for full column specifications.

colnames(matrix_readcount_use) <- str_extract(
    string = colnames(matrix_readcount_use),
    pattern = "S.+\\d+"
)

matrix_readcount_use <- Matrix(
    data = as.matrix(matrix_readcount_use[, -1]),
    dimnames = list(
        matrix_readcount_use[, 1, drop = TRUE],
        colnames(matrix_readcount_use)[-1]
    ),
    sparse = TRUE
)

matrix_readcount_use <- matrix_readcount_use[, colnames(matrix_readcount_use)]
matrix_readcount_use %>%
    dim()

## [1] 33538   139

stopifnot(
    rownames(matrix_readcount_use) == gene_symbo_info$X1
)
rownames(matrix_readcount_use) <- paste(
    rownames(matrix_readcount_use),
    gene_symbo_info$X2,
    sep = "_"
)

colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) %>%
    enframe(value = "run") %>%
    left_join(cell_metadata, by = c("run" = "run")) %>%
    pull("sample_name")

matrix_cpm_use <- calc_cpm(m = matrix_readcount_use)

Lineage inference

Dimensionality reduction

Preprocessing

Summarize sequencing statistics.

cell_metadata %>%
    mutate(
        num_umis = matrix_readcount_use[, .$sample_name] %>% colSums(),
        num_genes = (matrix_readcount_use[, .$sample_name] > 0) %>% colSums()
    ) %>%
    group_by(time) %>%
    summarise(
        num_cells = n(),
        num_umis = median(num_umis),
        num_genes = median(num_genes)
    ) %>%
    arrange(desc(time)) %>%
    as.data.frame() %>%
    gt::gt() %>%
    gt::tab_options(table.font.size = "median")

## `summarise()` ungrouping output (override with `.groups` argument)

time	num_cells	num_umis	num_genes
Day 8 (Extended culture)	31	8609696	11131.0
Day 12 (Extended culture)	60	9432453	11708.5
Day 10 (Extended culture)	48	6803354	12894.0

Filter uninformative genes.

matrix_readcount_norm <- matrix_readcount_use
matrix_readcount_norm <- matrix_readcount_norm[
    Matrix::rowSums(
        matrix_readcount_norm
    ) >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
]
dim(matrix_readcount_norm)

## [1] 19192   139

Median normalize matrix.

matrix_readcount_norm@x <- median(colSums(matrix_readcount_norm)) *
    (matrix_readcount_norm@x / rep.int(
        colSums(matrix_readcount_norm),
        diff(matrix_readcount_norm@p)
    ))

PCA

matrix_readcount_norm_log <- matrix_readcount_norm
matrix_readcount_norm_log@x <- log1p(matrix_readcount_norm_log@x)

# z-score
matrix_readcount_norm_log_scaled <- t(
    scale(
        t(
            matrix_readcount_norm_log
        ),
        center = TRUE, scale = TRUE
    )
)
# features_var <- apply(matrix_readcount_norm_log_scaled, 1, var)
# features_use <- rownames(matrix_readcount_norm_log_scaled)[features_var > 0]

pca_out <- prcomp(
    t(matrix_readcount_norm_log_scaled),
    center = FALSE,
    scale = FALSE
)

summary(pca_out)$imp %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column(var = "component") %>%
    mutate(
        rank = 1:n()
    ) %>%
    slice(1:30) %>%
    ggplot(
        aes(
            x = rank,
            y = `Proportion of Variance`
        )
    ) +
    geom_point(size = 0.3) +
    theme_bw() +
    scale_x_continuous(
        name = "Component",
        breaks = c(1, seq(5, 30, 5))
    ) +
    scale_y_continuous(
        name = "Proportion of variance",
        labels = scales::percent
    ) +
    theme(
        axis.title = ggplot2::element_text(family = "Arial", size = 6),
        axis.text = ggplot2::element_text(family = "Arial", size = 6),
        panel.grid.minor = ggplot2::element_blank()
    )

embedding <- cell_metadata %>%
    left_join(
        pca_out$x %>%
            as.data.frame() %>%
            rownames_to_column(var = "sample_name"),
        by = "sample_name"
    ) %>%
    dplyr::select(sample_name:PC3) %>%
    dplyr::rename(
        x_pca = PC1,
        y_pca = PC2,
        z_pca = PC3
    )

p_embedding_pca1 <- embedding %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_pca,
        y = y_pca,
        color = time2,
        shape = lineage2
    ) +
    scale_color_manual(
        values = as.character(yarrr::piratepal(palette = "google"))
    ) +
    # scale_shape_manual(values = 21) +
    add_xy_label_pca()

p_embedding_pca2 <- embedding %>%
    mutate(
        category = "PCA"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_pca,
        y = z_pca,
        color = time2,
        shape = lineage2
    ) +
    scale_color_manual(
        values = as.character(yarrr::piratepal(palette = "google"))
    ) +
    add_xy_label_pca(y = "PC3")

p_embedding_pca1 +
    p_embedding_pca2 +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(guides = "collect")

t-SNE

N_COMPONENTS <- 15

set.seed(seed = SEED_USE)
embedding_rtsne <- Rtsne::Rtsne(
    X = pca_out$x[, 1:N_COMPONENTS],
    perplexity = 5,
    check_duplicates = FALSE,
    pca = FALSE,
    max_iter = 3000,
    verbose = TRUE
)$Y
## Read the 139 x 15 data matrix successfully!
## Using no_dims = 2, perplexity = 5.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.143471)!
## Learning embedding...
## Iteration 50: error is 68.233127 (50 iterations in 0.01 seconds)
## Iteration 100: error is 66.399104 (50 iterations in 0.01 seconds)
## Iteration 150: error is 65.218464 (50 iterations in 0.01 seconds)
## Iteration 200: error is 66.530819 (50 iterations in 0.01 seconds)
## Iteration 250: error is 64.809300 (50 iterations in 0.01 seconds)
## Iteration 300: error is 2.381987 (50 iterations in 0.01 seconds)
## Iteration 350: error is 1.116405 (50 iterations in 0.01 seconds)
## Iteration 400: error is 0.631274 (50 iterations in 0.01 seconds)
## Iteration 450: error is 0.480870 (50 iterations in 0.01 seconds)
## Iteration 500: error is 0.458526 (50 iterations in 0.01 seconds)
## Iteration 550: error is 0.457536 (50 iterations in 0.01 seconds)
## Iteration 600: error is 0.451522 (50 iterations in 0.01 seconds)
## Iteration 650: error is 0.440383 (50 iterations in 0.01 seconds)
## Iteration 700: error is 0.439046 (50 iterations in 0.01 seconds)
## Iteration 750: error is 0.435972 (50 iterations in 0.01 seconds)
## Iteration 800: error is 0.427690 (50 iterations in 0.01 seconds)
## Iteration 850: error is 0.424590 (50 iterations in 0.01 seconds)
## Iteration 900: error is 0.422803 (50 iterations in 0.01 seconds)
## Iteration 950: error is 0.421123 (50 iterations in 0.01 seconds)
## Iteration 1000: error is 0.418542 (50 iterations in 0.01 seconds)
## Iteration 1050: error is 0.418290 (50 iterations in 0.01 seconds)
## Iteration 1100: error is 0.414548 (50 iterations in 0.01 seconds)
## Iteration 1150: error is 0.413519 (50 iterations in 0.01 seconds)
## Iteration 1200: error is 0.412369 (50 iterations in 0.01 seconds)
## Iteration 1250: error is 0.406898 (50 iterations in 0.01 seconds)
## Iteration 1300: error is 0.409023 (50 iterations in 0.01 seconds)
## Iteration 1350: error is 0.405329 (50 iterations in 0.01 seconds)
## Iteration 1400: error is 0.405205 (50 iterations in 0.01 seconds)
## Iteration 1450: error is 0.409830 (50 iterations in 0.01 seconds)
## Iteration 1500: error is 0.407525 (50 iterations in 0.01 seconds)
## Iteration 1550: error is 0.407099 (50 iterations in 0.01 seconds)
## Iteration 1600: error is 0.405904 (50 iterations in 0.01 seconds)
## Iteration 1650: error is 0.405214 (50 iterations in 0.01 seconds)
## Iteration 1700: error is 0.404648 (50 iterations in 0.01 seconds)
## Iteration 1750: error is 0.404208 (50 iterations in 0.01 seconds)
## Iteration 1800: error is 0.406493 (50 iterations in 0.01 seconds)
## Iteration 1850: error is 0.408674 (50 iterations in 0.01 seconds)
## Iteration 1900: error is 0.406626 (50 iterations in 0.01 seconds)
## Iteration 1950: error is 0.407169 (50 iterations in 0.01 seconds)
## Iteration 2000: error is 0.409883 (50 iterations in 0.01 seconds)
## Iteration 2050: error is 0.408436 (50 iterations in 0.01 seconds)
## Iteration 2100: error is 0.406734 (50 iterations in 0.01 seconds)
## Iteration 2150: error is 0.404644 (50 iterations in 0.01 seconds)
## Iteration 2200: error is 0.402225 (50 iterations in 0.01 seconds)
## Iteration 2250: error is 0.405850 (50 iterations in 0.01 seconds)
## Iteration 2300: error is 0.405466 (50 iterations in 0.01 seconds)
## Iteration 2350: error is 0.404437 (50 iterations in 0.01 seconds)
## Iteration 2400: error is 0.401076 (50 iterations in 0.01 seconds)
## Iteration 2450: error is 0.403112 (50 iterations in 0.01 seconds)
## Iteration 2500: error is 0.404353 (50 iterations in 0.01 seconds)
## Iteration 2550: error is 0.404927 (50 iterations in 0.01 seconds)
## Iteration 2600: error is 0.403501 (50 iterations in 0.01 seconds)
## Iteration 2650: error is 0.403125 (50 iterations in 0.01 seconds)
## Iteration 2700: error is 0.404316 (50 iterations in 0.01 seconds)
## Iteration 2750: error is 0.407113 (50 iterations in 0.01 seconds)
## Iteration 2800: error is 0.403180 (50 iterations in 0.01 seconds)
## Iteration 2850: error is 0.405307 (50 iterations in 0.01 seconds)
## Iteration 2900: error is 0.404784 (50 iterations in 0.01 seconds)
## Iteration 2950: error is 0.404059 (50 iterations in 0.01 seconds)
## Iteration 3000: error is 0.403463 (50 iterations in 0.01 seconds)
## Fitting performed in 0.55 seconds.

embedding <- cbind(
    embedding,
    embedding_rtsne
) %>%
    dplyr::rename(
        x_tsne = "1",
        y_tsne = "2"
    )

p_embedding_tsne <- embedding %>%
    mutate(
        category = "t-SNE"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_tsne,
        y = y_tsne,
        color = time2,
        shape = lineage2
    ) +
    labs(x = "Dim 1", y = "Dim 2") +
    scale_color_manual(
        values = as.character(yarrr::piratepal(palette = "google"))
    )

embedding %>%
    dplyr::count(time) %>%
    gt::gt() %>%
    gt::cols_label(n = "num_cells") %>%
    gt::tab_options(table.font.size = "median")

time	num_cells
Day 10 (Extended culture)	48
Day 12 (Extended culture)	60
Day 8 (Extended culture)	31

UMAP

N_COMPONENTS <- 15

set.seed(seed = SEED_USE)
embedding_umap <- uwot::umap(
    X = pca_out$x[, 1:N_COMPONENTS],
    n_neighbors = 15,
    n_components = 2,
    metric = "cosine",
    spread = 1,
    min_dist = 0.01,
    verbose = TRUE
)
## 15:26:50 UMAP embedding parameters a = 1.896 b = 0.8006
## 15:26:50 Read 139 rows and found 15 numeric columns
## 15:26:50 Using Annoy for neighbor search, n_neighbors = 15
## 15:26:51 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 15:26:51 Writing NN index file to temp file /var/folders/h1/78b2tkd552ngjl6_ps_5sw7r0000gn/T//RtmpwESxWL/file3d565a39ffef
## 15:26:51 Searching Annoy index using 4 threads, search_k = 1500
## 15:26:51 Annoy recall = 100%
## 15:26:51 Commencing smooth kNN distance calibration using 4 threads
## 15:26:52 Initializing from normalized Laplacian + noise
## 15:26:52 Commencing optimization for 500 epochs, with 2450 positive edges
## 15:26:52 Optimization finished

embedding <- cbind(
    embedding,
    embedding_umap
) %>%
    dplyr::rename(
        x_umap = "1",
        y_umap = "2"
    )

p_embedding_umap <- embedding %>%
    mutate(
        category = "UMAP"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_umap,
        y = y_umap,
        color = time2,
        shape = lineage2
    ) +
    labs(x = "Dim 1", y = "Dim 2") +
    scale_color_manual(
        values = as.character(yarrr::piratepal(palette = "google"))
    )

Fig. 3C

p_embedding_tsne +
    p_embedding_umap +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(nrow = 1, guides = "collect")

p_embedding_tsne_embryo <- embedding %>%
    mutate(
        category = "t-SNE"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_tsne,
        y = y_tsne,
        color = embryo,
        shape = lineage2
    ) +
    labs(x = "Dim 1", y = "Dim 2") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding$embryo))
        )
    )

p_embedding_umap_embryo <- embedding %>%
    mutate(
        category = "UMAP"
    ) %>%
    sample_frac() %>%
    plot_pca(
        x = x_umap,
        y = y_umap,
        color = embryo,
        shape = lineage2
    ) +
    labs(x = "Dim 1", y = "Dim 2") +
    scale_color_manual(
        values = ggthemes::tableau_color_pal("Tableau 20")(
            length(unique(embedding$embryo))
        )
    )

p_embedding_tsne_embryo +
    p_embedding_umap_embryo +
    plot_annotation(theme = theme(plot.margin = margin())) +
    plot_layout(nrow = 1, guides = "collect")

p_embedding_tsne_embryo +
    add_point_labels(
        x = x_tsne,
        y = y_tsne,
        z = embryo
    )

Transcriptomic analysis of trophoblast differentiation in peri-implantation–stage human embryos

JD

2020-08-19