Genotyping of Immortalized Primary Endometrial Stromal Cells (IESCs)

Published

Sun Sep 10, 2023 03:28:41-05:00


[1] "2023-09-10 03:28:31 CDT"
[1] "America/Chicago"

Preparation

PROJECT_DIR <- "/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation"

Functions

score_snps <- function(m, cells, snp_concensus) {
    furrr::future_map(c(cells), \(x) {
        selected_cell <- x

        snps_for_testing <- rownames(
            m
        )[m[, selected_cell] != 0]
        snps_for_testing <- snps_for_testing[
            snps_for_testing %in% names(snp_concensus)
        ]
        # cat(length(snps_for_testing), "\n")

        data.frame(
            cell = selected_cell,
            score = mean(
                m[
                    snps_for_testing,
                    selected_cell
                ] == snp_concensus[snps_for_testing]
            ),
            num_snps = length(snps_for_testing)
        )
    }) |>
        dplyr::bind_rows() |>
        dplyr::mutate(batch = stringr::str_remove(cell, "_[A-Z]{16}$"))
}

Load required packages.

library(tidyverse)
## ── Attaching core tidyverse packages ─────────────────── tidyverse 2.0.0.9000 ──
## ✔ dplyr     1.1.3          ✔ readr     2.1.4.9000
## ✔ forcats   1.0.0.9000     ✔ stringr   1.5.0.9000
## ✔ ggplot2   3.4.3.9000     ✔ tibble    3.2.1.9005
## ✔ lubridate 1.9.2.9000     ✔ tidyr     1.3.0.9000
## ✔ purrr     1.0.2.9000     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Matrix)
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(patchwork)
library(extrafont)
## Registering fonts with R
library(furrr)
## Loading required package: future
source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)
get_mode <- function(v, exclude_zero = FALSE) {
    if (exclude_zero) {
        v <- v[v != 0]
    }

    uniqv <- unique(v)
    uniqv[which.max(tabulate(match(v, uniqv)))]
}

normalize <- function(x) {
    return((x - min(x)) / (max(x) - min(x)))
}

add_panel_border <- function() {
    ggplot2::theme(
        plot.background = element_rect(
            color = "grey70", fill = NA, linewidth = 0.25
        )
    )
}
`%+replace%` <- ggplot2::`%+replace%`
`%<-%` <- zeallot::`%<-%`

Python env

ad <- reticulate::import(module = "anndata", convert = TRUE)
print(ad$`__version__`)
[1] "0.8.0"
reticulate::py_config()
python:         /Users/jialei/.pyenv/shims/python
libpython:      /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/libpython3.10.dylib
pythonhome:     /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3:/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3
version:        3.10.9 | packaged by conda-forge | (main, Feb  2 2023, 20:26:08) [Clang 14.0.6 ]
numpy:          /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/numpy
numpy_version:  1.24.3
anndata:        /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/anndata

NOTE: Python version was forced by RETICULATE_PYTHON

Matrix

SNP matrix

VARTRIX_VCF <- "vartrix_LW203"
VARTRIX_MATRIX_DIR <- "no-duplicates_umi_mapq10_consensus"

matrix_snp <- purrr::map(c(
    "LW186", "LW187", "LW188",
    "LW189", "LW202", "LW203", "LW204"
), \(x) {
    read_snp_matrix(
        matrix_dir = file.path(
            PROJECT_DIR, "genotyping", x,
            VARTRIX_VCF, VARTRIX_MATRIX_DIR
        )
    )
}) |>
    purrr::reduce(cbind)

matrix_snp |> dim()
[1] 48238 39755
cell_metadata_snp <- Matrix::colSums(matrix_snp) |>
    tibble::enframe(name = "cell", value = "num_umis") |>
    dplyr::mutate(
        batch = stringr::str_remove(
            string = cell,
            pattern = "_[A-Z]{16}"
        )
    )
cell_metadata_snp |> head()
# A tibble: 6 × 3
  cell                   num_umis batch
  <chr>                     <dbl> <chr>
1 LW186_AAACCCAAGCCTCACG      348 LW186
2 LW186_AAACCCAAGTTGTAGA      396 LW186
3 LW186_AAACCCACAAATCGGG      271 LW186
4 LW186_AAACCCACAGCCTACG       96 LW186
5 LW186_AAACCCACAGGAAGTC       77 LW186
6 LW186_AAACCCACATTCAGGT      198 LW186

Expression

adata_files <- purrr::map(c("LW119", "LW120", "LW121", "LW122"), \(x) {
    file.path(
        PROJECT_DIR,
        "raw",
        x,
        "matrix",
        "adata.h5ad"
    )
})

purrr::map_lgl(adata_files, file.exists)
[1] TRUE TRUE TRUE TRUE
BACKED <- NULL
matrix_readcount_use <- purrr::map(adata_files, \(x) {
    cat(x, "\n")
    ad$read_h5ad(
        filename = x, backed = BACKED
    ) |>
        extract_matrix_from_adata()
}) |>
    purrr::reduce(cbind)
/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation/raw/LW119/matrix/adata.h5ad 
/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation/raw/LW120/matrix/adata.h5ad 
/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation/raw/LW121/matrix/adata.h5ad 
/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation/raw/LW122/matrix/adata.h5ad 
matrix_readcount_use <- matrix_readcount_use[
    , sort(colnames(matrix_readcount_use))
]
matrix_readcount_use |> dim()
[1] 33538  8831

Score cells

LW203

cells_LW203 <- cell_metadata_snp |>
    dplyr::filter(batch == "LW203") |>
    dplyr::pull(cell)

cells_LW203 |> length()
[1] 2571

IESCs

snp_group <- split(
    rownames(matrix_snp),
    ceiling(seq_along(rownames(matrix_snp)) / 1000)
)

# get concensus snp table
snp_concensus_LW203 <- purrr::map(seq_len(length(snp_group)), \(x) {
    apply(
        matrix_snp[snp_group[[x]], cells_LW203],
        MARGIN = 1, get_mode, TRUE
    )
}) |>
    unlist()

snp_concensus_LW203 <- snp_concensus_LW203[!is.na(snp_concensus_LW203)]

Scoring

snp_scores_LW203 <- score_snps(
    m = matrix_snp,
    cells = cells_LW203,
    snp_concensus = snp_concensus_LW203
)

LW187, LW189

cells_LW187_LW189 <- cell_metadata_snp |>
    dplyr::filter(batch %in% c("LW187", "LW189")) |>
    dplyr::pull(cell) |>
    sort()

Scoring

snp_scores_LW187_LW189 <- score_snps(
    m = matrix_snp,
    cells = cells_LW187_LW189,
    snp_concensus = snp_concensus_LW203
)

LW202, LW204

cells_LW202_LW204 <- cell_metadata_snp |>
    dplyr::filter(batch %in% c("LW202", "LW204")) |>
    dplyr::pull(cell) |>
    sort()

Scoring

snp_scores_LW202_LW204 <- score_snps(
    m = matrix_snp,
    cells = cells_LW202_LW204,
    snp_concensus = snp_concensus_LW203
)

LW186, LW188

cells_LW186_LW188 <- cell_metadata_snp |>
    dplyr::filter(batch %in% c("LW186", "LW188")) |>
    dplyr::pull(cell) |>
    sort()

Scoring

snp_scores_LW186_LW188 <- score_snps(
    m = matrix_snp,
    cells = cells_LW186_LW188,
    snp_concensus = snp_concensus_LW203
)
file_name <- "snp_scores_LW186_LW188.csv"
# file_name <- file.path(OUTPUT_DIR, file_name)
cat(file_name, "\n")
snp_scores_LW186_LW188.csv 
if (!file.exists(file_name)) {
    vroom::vroom_write(
        x = snp_scores_LW186_LW188,
        file = file_name,
        delim = ","
    )
}

Calc cutoff

MINIMAL_SNPS <- 20
BINWIDTH <- 0.0005

cutoff <- system(
    glue::glue(
        file.path(
            PROJECT_DIR, "genotyping", "call_gmm.py"
        ),
        " ",
        file.path(
            # OUTPUT_DIR,
            "snp_scores_LW186_LW188.csv"
        )
    ),
    intern = TRUE
) |>
    stringr::str_remove(
        pattern = "\\["
    ) |>
    stringr::str_remove(
        pattern = "\\]"
    ) |>
    stringr::str_split(
        pattern = " "
    ) |>
    unlist() |>
    as.numeric() |>
    rev()

snp_score_cutoff <- cutoff[4]
snp_score_cutoff
[1] 0.8289474

Visualize

color_palette_assay <- c(
    "Blastocyst (fibronectin)" = "#F28E2B",
    "Blastocyst (stromal cells)" = "#4E79A7",
    "Blastoid (fibronectin)" = "#E15759",
    "Blastoid (stromal cells)" = "#76B7B2",
    "Stromal cells" = "#59A14F"
)
ann_text <- data.frame(
    num_cells = snp_scores_LW203 |>
        dplyr::filter(
            num_snps >= MINIMAL_SNPS,
            batch == "LW203",
            score <= snp_score_cutoff
        ) |>
        nrow(),
    batch = "LW203"
)
p_histogram_snp_scores <- list(
    snp_scores_LW203,
    snp_scores_LW187_LW189,
    snp_scores_LW186_LW188,
    snp_scores_LW202_LW204
) |>
    dplyr::bind_rows() |>
    dplyr::mutate(
        batch_annotated = dplyr::case_when(
            batch %in% c("LW187", "LW189") ~ "Blastocyst (fibronectin)",
            batch %in% c("LW186", "LW188") ~ "Blastocyst (IESCs)",
            batch %in% c("LW202") ~ "Blastoid (fibronectin)",
            batch %in% c("LW203") ~ "IESCs",
            batch %in% c("LW204") ~ "Blastoid (IESCs)"
        ),
        label = dplyr::case_when(
            batch %in% c("LW187", "LW189") ~ paste(
                batch_annotated, "LW187, LW189",
                sep = "\n"
            ),
            batch %in% c("LW186", "LW188") ~ paste(
                batch_annotated, "LW186, LW188",
                sep = "\n"
            ),
            batch %in% c("LW202") ~ paste(
                batch_annotated, "LW202",
                sep = "\n"
            ),
            batch %in% c("LW203") ~ paste(
                batch_annotated, "LW203 ",
                sep = "\n"
            ),
            batch %in% c("LW204") ~ paste(
                batch_annotated, "LW204",
                sep = "\n"
            ),
        )
    ) |>
    dplyr::filter(
        num_snps >= MINIMAL_SNPS,
    ) |>
    dplyr::mutate(
        color = dplyr::case_when(
            score >= snp_score_cutoff ~ "#F8B195",
            TRUE ~ "grey35"
        )
    ) |>
    {
        \(x) {
            x |>
                ggplot2::ggplot(
                    ggplot2::aes(
                        x = score, fill = color
                    )
                ) +
                ggplot2::geom_histogram(
                    binwidth = 0.02,
                    boundary = 0,
                    closed = "left"
                ) +
                ggplot2::geom_vline(
                    xintercept = snp_score_cutoff,
                    color = "salmon",
                    linewidth = 0.25
                ) +
                ggplot2::scale_x_continuous(
                    name = "SNP score"
                ) +
                ggplot2::scale_y_continuous(
                    name = "Number of cells"
                ) +
                ggplot2::scale_fill_identity() +
                ggplot2::facet_wrap(
                    ggplot2::vars(label),
                    scales = "free_y",
                    nrow = 1
                ) +
                ggplot2::theme_grey(
                    base_size = 6, base_family = "Arial"
                ) +
                ggplot2::theme(
                    legend.background = ggplot2::element_blank(),
                    legend.key = ggplot2::element_blank(),
                    #
                    # panel.background = ggplot2::element_blank(),
                    #
                    plot.background = ggplot2::element_blank(),
                    strip.text = ggplot2::element_text(
                        family = "Arial", size = 6
                    )
                )
        }
    }()
g <- ggplot2::ggplot_gtable(
    ggplot2::ggplot_build(p_histogram_snp_scores)
)
stript <- which(grepl("strip-t", g$layout$name))


k <- 1
for (i in stript) {
    j <- which(grepl("rect", g$grobs[[i]]$grobs[[1]]$childrenOrder))
    g$grobs[[i]]$grobs[[1]]$children[[j]]$gp$fill <- setNames(
        object = color_palette_assay,
        nm = seq_len(length(color_palette_assay))
    )[k]
    k <- k + 1
}
grid::grid.draw(g)

R session info

devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.1 (2023-06-16)
 os       macOS Ventura 13.5.2
 system   aarch64, darwin22.4.0
 ui       unknown
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Chicago
 date     2023-09-10
 pandoc   2.19.2 @ /Users/jialei/.pyenv/shims/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package     * version     date (UTC) lib source
 cachem        1.0.8       2023-05-01 [1] CRAN (R 4.3.0)
 callr         3.7.3       2022-11-02 [1] CRAN (R 4.3.0)
 cli           3.6.1       2023-03-23 [1] CRAN (R 4.3.0)
 codetools     0.2-19      2023-02-01 [2] CRAN (R 4.3.1)
 colorspace    2.1-0       2023-01-23 [1] CRAN (R 4.3.0)
 crayon        1.5.2       2022-09-29 [1] CRAN (R 4.3.0)
 devtools      2.4.5.9000  2023-08-11 [1] Github (r-lib/devtools@163c3f2)
 digest        0.6.33      2023-07-07 [1] CRAN (R 4.3.1)
 dplyr       * 1.1.3       2023-09-03 [1] CRAN (R 4.3.1)
 ellipsis      0.3.2       2021-04-29 [1] CRAN (R 4.3.0)
 evaluate      0.21        2023-05-05 [1] CRAN (R 4.3.0)
 extrafont   * 0.19        2023-01-18 [1] CRAN (R 4.3.0)
 extrafontdb   1.0         2012-06-11 [1] CRAN (R 4.3.0)
 fansi         1.0.4       2023-01-22 [1] CRAN (R 4.3.0)
 farver        2.1.1       2022-07-06 [1] CRAN (R 4.3.0)
 fastmap       1.1.1       2023-02-24 [1] CRAN (R 4.3.0)
 forcats     * 1.0.0.9000  2023-04-23 [1] Github (tidyverse/forcats@4a8525a)
 fs            1.6.3       2023-07-20 [1] CRAN (R 4.3.1)
 furrr       * 0.3.1       2022-08-15 [1] CRAN (R 4.3.1)
 future      * 1.33.0      2023-07-01 [1] CRAN (R 4.3.1)
 generics      0.1.3       2022-07-05 [1] CRAN (R 4.3.0)
 ggplot2     * 3.4.3.9000  2023-09-05 [1] Github (tidyverse/ggplot2@d180248)
 globals       0.16.2      2022-11-21 [1] CRAN (R 4.3.0)
 glue          1.6.2.9000  2023-04-23 [1] Github (tidyverse/glue@cbac82a)
 gtable        0.3.4.9000  2023-08-22 [1] Github (r-lib/gtable@c410a54)
 hms           1.1.3       2023-03-21 [1] CRAN (R 4.3.0)
 htmltools     0.5.6       2023-08-10 [1] CRAN (R 4.3.1)
 htmlwidgets   1.6.2       2023-03-17 [1] CRAN (R 4.3.0)
 jsonlite      1.8.7       2023-06-29 [1] CRAN (R 4.3.1)
 knitr         1.43        2023-05-25 [1] CRAN (R 4.3.0)
 labeling      0.4.3       2023-08-29 [1] CRAN (R 4.3.1)
 lattice       0.21-8      2023-04-05 [2] CRAN (R 4.3.1)
 lifecycle     1.0.3       2022-10-07 [1] CRAN (R 4.3.0)
 listenv       0.9.0       2022-12-16 [1] CRAN (R 4.3.0)
 lubridate   * 1.9.2.9000  2023-07-22 [1] Github (tidyverse/lubridate@cae67ea)
 magrittr      2.0.3       2022-03-30 [1] CRAN (R 4.3.0)
 Matrix      * 1.6-1       2023-08-14 [2] CRAN (R 4.3.1)
 memoise       2.0.1       2021-11-26 [1] CRAN (R 4.3.0)
 munsell       0.5.0       2018-06-12 [1] CRAN (R 4.3.0)
 parallelly    1.36.0      2023-05-26 [1] CRAN (R 4.3.0)
 patchwork   * 1.1.3.9000  2023-08-17 [1] Github (thomasp85/patchwork@51a6eff)
 pillar        1.9.0       2023-03-22 [1] CRAN (R 4.3.0)
 pkgbuild      1.4.2       2023-06-26 [1] CRAN (R 4.3.1)
 pkgconfig     2.0.3       2019-09-22 [1] CRAN (R 4.3.0)
 pkgload       1.3.2.9000  2023-07-05 [1] Github (r-lib/pkgload@3cf9896)
 png           0.1-8       2022-11-29 [1] CRAN (R 4.3.0)
 prettyunits   1.1.1.9000  2023-04-23 [1] Github (r-lib/prettyunits@8706d89)
 processx      3.8.2       2023-06-30 [1] CRAN (R 4.3.1)
 ps            1.7.5       2023-04-18 [1] CRAN (R 4.3.0)
 purrr       * 1.0.2.9000  2023-08-11 [1] Github (tidyverse/purrr@ac4f5a9)
 R.cache       0.16.0      2022-07-21 [1] CRAN (R 4.3.0)
 R.methodsS3   1.8.2       2022-06-13 [1] CRAN (R 4.3.0)
 R.oo          1.25.0      2022-06-12 [1] CRAN (R 4.3.0)
 R.utils       2.12.2      2022-11-11 [1] CRAN (R 4.3.0)
 R6            2.5.1.9000  2023-04-23 [1] Github (r-lib/R6@e97cca7)
 Rcpp          1.0.11      2023-07-06 [1] CRAN (R 4.3.1)
 readr       * 2.1.4.9000  2023-08-03 [1] Github (tidyverse/readr@80e4dc1)
 remotes       2.4.2.9000  2023-06-09 [1] Github (r-lib/remotes@8875171)
 reticulate    1.31        2023-08-10 [1] CRAN (R 4.3.1)
 rlang         1.1.1.9000  2023-06-09 [1] Github (r-lib/rlang@c55f602)
 rmarkdown     2.24.2      2023-09-07 [1] Github (rstudio/rmarkdown@8d2d9b8)
 rstudioapi    0.15.0.9000 2023-09-07 [1] Github (rstudio/rstudioapi@19c80c0)
 Rttf2pt1      1.3.12      2023-01-22 [1] CRAN (R 4.3.0)
 scales        1.2.1       2022-08-20 [1] CRAN (R 4.3.0)
 sessioninfo   1.2.2       2021-12-06 [1] CRAN (R 4.3.0)
 stringi       1.7.12      2023-01-11 [1] CRAN (R 4.3.0)
 stringr     * 1.5.0.9000  2023-08-11 [1] Github (tidyverse/stringr@08ff36f)
 styler      * 1.10.2      2023-08-30 [1] Github (r-lib/styler@1976817)
 tibble      * 3.2.1.9005  2023-05-28 [1] Github (tidyverse/tibble@4de5c15)
 tidyr       * 1.3.0.9000  2023-04-23 [1] Github (tidyverse/tidyr@0764e65)
 tidyselect    1.2.0       2022-10-10 [1] CRAN (R 4.3.0)
 tidyverse   * 2.0.0.9000  2023-04-23 [1] Github (tidyverse/tidyverse@8ec2e1f)
 timechange    0.2.0       2023-01-11 [1] CRAN (R 4.3.0)
 tzdb          0.4.0       2023-05-12 [1] CRAN (R 4.3.0)
 usethis       2.2.2.9000  2023-07-11 [1] Github (r-lib/usethis@467ff57)
 utf8          1.2.3       2023-01-31 [1] CRAN (R 4.3.0)
 vctrs         0.6.3       2023-06-14 [1] CRAN (R 4.3.0)
 withr         2.5.0       2022-03-03 [1] CRAN (R 4.3.0)
 xfun          0.40        2023-08-09 [1] CRAN (R 4.3.1)
 yaml          2.3.7       2023-01-23 [1] CRAN (R 4.3.0)
 zeallot       0.1.0.9000  2023-06-11 [1] Github (nteetor/zeallot@55167bf)

 [1] /opt/homebrew/lib/R/4.3/site-library
 [2] /opt/homebrew/Cellar/r/4.3.1/lib/R/library

─ Python configuration ───────────────────────────────────────────────────────
 python:         /Users/jialei/.pyenv/shims/python
 libpython:      /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/libpython3.10.dylib
 pythonhome:     /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3:/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3
 version:        3.10.9 | packaged by conda-forge | (main, Feb  2 2023, 20:26:08) [Clang 14.0.6 ]
 numpy:          /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/numpy
 numpy_version:  1.24.3
 anndata:        /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/anndata
 
 NOTE: Python version was forced by RETICULATE_PYTHON

──────────────────────────────────────────────────────────────────────────────