Genome-Wide Chromatin Accessibility Matrix

Published

Sat Aug 12, 2023 07:27:20-05:00


[1] "2023-08-12 07:27:10 CDT"
[1] "America/Chicago"

Preparation

PROJECT_DIR <- file.path(
    "/Users/jialei/Dropbox/Data/Projects/UTSW",
    "/Cardiomyopathy/atac-seq"
)

Functions

Load required packages.

library(tidyverse)
## ── Attaching core tidyverse packages ─────────────────── tidyverse 2.0.0.9000 ──
## ✔ dplyr     1.1.2.9000     ✔ readr     2.1.4.9000
## ✔ forcats   1.0.0.9000     ✔ stringr   1.5.0.9000
## ✔ ggplot2   3.4.2.9000     ✔ tibble    3.2.1.9005
## ✔ lubridate 1.9.2.9000     ✔ tidyr     1.3.0.9000
## ✔ purrr     1.0.2.9000     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Matrix)
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(patchwork)
library(extrafont)
## Registering fonts with R
source(
    file = file.path(
        SCRIPT_DIR,
        "utilities.R"
    )
)
`%+replace%` <- ggplot2::`%+replace%`

Python env

np <- reticulate::import("numpy", convert = TRUE)
cat("numpy version:", np$`__version__`, "\n")
numpy version: 1.24.3 
reticulate::py_config()
python:         /Users/jialei/.pyenv/shims/python
libpython:      /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/libpython3.10.dylib
pythonhome:     /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3:/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3
version:        3.10.9 | packaged by conda-forge | (main, Feb  2 2023, 20:26:08) [Clang 14.0.6 ]
numpy:          /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/numpy
numpy_version:  1.24.3
numpy:          /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/numpy

NOTE: Python version was forced by RETICULATE_PYTHON

Preprocessing

Matrix

# load matrix
feature_counts_files <- list.files(
    path = file.path(PROJECT_DIR, "defined_peaks/summarize_counts")
)

feature_counts_files <- feature_counts_files[
    stringr::str_detect(feature_counts_files, pattern = "merged.+.bz2")
]

matrix_readcount_use <- purrr::map(feature_counts_files, \(x) {
    dt <- data.table::fread(
        file = file.path(PROJECT_DIR, "defined_peaks/summarize_counts", x)
    )

    features <- dt[["Geneid"]]
    dt <- dt[, .SD, .SDcols = patterns("/project")]
    colnames(dt) <- stringr::str_remove(
        string = colnames(dt),
        pattern = "_atacseq/alignments/Aligned_sorted_deduped_q10.bam"
    ) |>
        stringr::str_remove(pattern = "^.+/")

    m <- as.matrix(dt, sparse = TRUE)
    rownames(m) <- features

    return(m)
})

matrix_readcount_use <- purrr::reduce(matrix_readcount_use, cbind)
dim(matrix_readcount_use)
[1] 207021     87
# rename samples
sample_ids <- tibble::tribble(
    ~old, ~new,
    "heart_fresh1_1", "F1_1",
    "heart_fresh1_2", "F1_2",
    "heart_fresh2_1", "F2_1",
    "heart_fresh2_2", "F2_2",
    "heart_fresh5_1", "F5_1",
    "heart_fresh5_2", "F5_2",
    "heart_myectomy_p11_1", "HOCM11_1",
    "heart_myectomy_p11_2", "HOCM11_2",
    "heart_myectomy_p7_1", "HOCM7_1",
    "heart_myectomy_p7_2", "HOCM7_2",
    #
    "heart_myectomy_p4_2", "MYEC4_2"
)

colnames(matrix_readcount_use) <- colnames(matrix_readcount_use) |>
    tibble::enframe(value = "sample") |>
    dplyr::left_join(sample_ids, by = c("sample" = "old")) |>
    dplyr::mutate(
        new = case_when(
            is.na(new) ~ sample,
            TRUE ~ new
        ),
        new = stringr::str_remove(string = new, pattern = "heart_"),
        #
        new = case_when(
            stringr::str_detect(
                string = new,
                pattern = "[hocm|Hocm|MYEC]"
            ) ~ stringr::str_to_upper(new),
            TRUE ~ stringr::str_to_title(new)
        )
    ) |>
    dplyr::pull(new)
# inspect matrix
matrix_readcount_use[1:5, 1:8]
                F1_1 F1_2 F2_1 F2_2 F5_1 F5_2 P3_1 P3_2
1_181358_181567    4    3    5   14   10   11    8    1
1_183716_183885    4    4    6    9    2    7    3    5
1_183999_184321    3    3    9   15    2   11    5   10
1_191239_191880   19   17   24   28   14   28    9   10
1_267894_268128    5    2   10   21    8   14    2    8
# filter blocked regions
blocked_regions <- read.table(
    file = file.path(
        dirname(PROJECT_DIR),
        "misc/annotations",
        "ENCFF419RSJ.bed.gz"
    ),
    stringsAsFactors = FALSE
)

blocked_regions <- GenomicRanges::GRanges(
    blocked_regions[, 1],
    IRanges::IRanges(blocked_regions[, 2] + 1, blocked_regions[, 3])
)

if (ensembldb::seqlevelsStyle(blocked_regions) == "UCSC") {
    ensembldb::seqlevelsStyle(blocked_regions) <- "Ensembl"
}

peak_regions <- stringr::str_split(
    string = rownames(matrix_readcount_use),
    pattern = "_", simplify = TRUE
) |>
    as.data.frame()
peak_regions <- GenomicRanges::GRanges(
    peak_regions[, 1],
    IRanges::IRanges(
        as.numeric(peak_regions[, 2]) + 1,
        as.numeric(peak_regions[, 3])
    )
)

idy1 <- S4Vectors::queryHits(
    GenomicRanges::findOverlaps(peak_regions, blocked_regions)
)

idy2 <- grep("Y|MT|GL|KI", peak_regions)
idy <- unique(c(idy1, idy2))

matrix_readcount_use <- matrix_readcount_use[-idy, ]
dim(matrix_readcount_use)
[1] 206024     87
# sanity check
stopifnot(
    dim(matrix_readcount_use) == c(206024, 87)
)
# check memory usage
walk(list(matrix_readcount_use), \(x) {
    print(object.size(x), units = "auto", standard = "SI")
})
89.8 MB

R session info

devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.1 (2023-06-16)
 os       macOS Ventura 13.5
 system   aarch64, darwin22.4.0
 ui       unknown
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Chicago
 date     2023-08-12
 pandoc   2.19.2 @ /Users/jialei/.pyenv/shims/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package              * version     date (UTC) lib source
 abind                  1.4-5       2016-07-21 [1] CRAN (R 4.3.0)
 AnnotationDbi          1.62.2      2023-07-02 [1] Bioconductor
 AnnotationFilter       1.24.0      2023-04-25 [1] Bioconductor
 Biobase                2.60.0      2023-04-25 [1] Bioconductor
 BiocFileCache          2.8.0       2023-04-25 [1] Bioconductor
 BiocGenerics           0.46.0      2023-04-25 [1] Bioconductor
 BiocIO                 1.10.0      2023-04-25 [1] Bioconductor
 BiocParallel           1.34.2      2023-05-22 [1] Bioconductor
 biomaRt                2.56.1      2023-06-09 [1] Bioconductor
 Biostrings             2.68.1      2023-05-16 [1] Bioconductor
 bit                    4.0.5       2022-11-15 [1] CRAN (R 4.3.0)
 bit64                  4.0.5       2020-08-30 [1] CRAN (R 4.3.0)
 bitops                 1.0-7       2021-04-24 [1] CRAN (R 4.3.0)
 blob                   1.2.4       2023-03-17 [1] CRAN (R 4.3.0)
 cachem                 1.0.8       2023-05-01 [1] CRAN (R 4.3.0)
 callr                  3.7.3       2022-11-02 [1] CRAN (R 4.3.0)
 cli                    3.6.1       2023-03-23 [1] CRAN (R 4.3.0)
 codetools              0.2-19      2023-02-01 [2] CRAN (R 4.3.1)
 colorspace             2.1-0       2023-01-23 [1] CRAN (R 4.3.0)
 crayon                 1.5.2       2022-09-29 [1] CRAN (R 4.3.0)
 curl                   5.0.1       2023-06-07 [1] CRAN (R 4.3.0)
 data.table             1.14.8      2023-02-17 [1] CRAN (R 4.3.0)
 DBI                    1.1.3       2022-06-18 [1] CRAN (R 4.3.0)
 dbplyr                 2.3.3       2023-07-07 [1] CRAN (R 4.3.1)
 DelayedArray           0.26.7      2023-07-28 [1] Bioconductor
 devtools               2.4.5.9000  2023-08-11 [1] Github (r-lib/devtools@163c3f2)
 digest                 0.6.33      2023-07-07 [1] CRAN (R 4.3.1)
 dplyr                * 1.1.2.9000  2023-07-19 [1] Github (tidyverse/dplyr@c963d4d)
 ellipsis               0.3.2       2021-04-29 [1] CRAN (R 4.3.0)
 ensembldb              2.24.0      2023-04-25 [1] Bioconductor
 evaluate               0.21        2023-05-05 [1] CRAN (R 4.3.0)
 extrafont            * 0.19        2023-01-18 [1] CRAN (R 4.3.0)
 extrafontdb            1.0         2012-06-11 [1] CRAN (R 4.3.0)
 fansi                  1.0.4       2023-01-22 [1] CRAN (R 4.3.0)
 fastmap                1.1.1       2023-02-24 [1] CRAN (R 4.3.0)
 filelock               1.0.2       2018-10-05 [1] CRAN (R 4.3.1)
 forcats              * 1.0.0.9000  2023-04-23 [1] Github (tidyverse/forcats@4a8525a)
 fs                     1.6.3       2023-07-20 [1] CRAN (R 4.3.1)
 generics               0.1.3       2022-07-05 [1] CRAN (R 4.3.0)
 GenomeInfoDb           1.36.1      2023-06-21 [1] Bioconductor
 GenomeInfoDbData       1.2.10      2023-04-23 [1] Bioconductor
 GenomicAlignments      1.36.0      2023-04-25 [1] Bioconductor
 GenomicFeatures        1.52.1      2023-06-22 [1] Bioconductor
 GenomicRanges          1.52.0      2023-04-25 [1] Bioconductor
 ggplot2              * 3.4.2.9000  2023-08-11 [1] Github (tidyverse/ggplot2@2cd0e96)
 glue                   1.6.2.9000  2023-04-23 [1] Github (tidyverse/glue@cbac82a)
 gtable                 0.3.3.9000  2023-04-23 [1] Github (r-lib/gtable@c56fd4f)
 hms                    1.1.3       2023-03-21 [1] CRAN (R 4.3.0)
 htmltools              0.5.6       2023-08-10 [1] CRAN (R 4.3.1)
 htmlwidgets            1.6.2       2023-03-17 [1] CRAN (R 4.3.0)
 httr                   1.4.6       2023-05-08 [1] CRAN (R 4.3.0)
 IRanges                2.34.1      2023-06-22 [1] Bioconductor
 jsonlite               1.8.7       2023-06-29 [1] CRAN (R 4.3.1)
 KEGGREST               1.40.0      2023-04-25 [1] Bioconductor
 knitr                  1.43        2023-05-25 [1] CRAN (R 4.3.0)
 lattice                0.21-8      2023-04-05 [2] CRAN (R 4.3.1)
 lazyeval               0.2.2       2019-03-15 [1] CRAN (R 4.3.0)
 lifecycle              1.0.3       2022-10-07 [1] CRAN (R 4.3.0)
 lubridate            * 1.9.2.9000  2023-07-22 [1] Github (tidyverse/lubridate@cae67ea)
 magrittr               2.0.3       2022-03-30 [1] CRAN (R 4.3.0)
 Matrix               * 1.6-0       2023-07-08 [2] CRAN (R 4.3.1)
 MatrixGenerics         1.12.3      2023-07-30 [1] Bioconductor
 matrixStats            1.0.0       2023-06-02 [1] CRAN (R 4.3.0)
 memoise                2.0.1       2021-11-26 [1] CRAN (R 4.3.0)
 munsell                0.5.0       2018-06-12 [1] CRAN (R 4.3.0)
 patchwork            * 1.1.2.9000  2023-08-11 [1] Github (thomasp85/patchwork@bd57553)
 pillar                 1.9.0       2023-03-22 [1] CRAN (R 4.3.0)
 pkgbuild               1.4.2       2023-06-26 [1] CRAN (R 4.3.1)
 pkgconfig              2.0.3       2019-09-22 [1] CRAN (R 4.3.0)
 pkgload                1.3.2.9000  2023-07-05 [1] Github (r-lib/pkgload@3cf9896)
 png                    0.1-8       2022-11-29 [1] CRAN (R 4.3.0)
 prettyunits            1.1.1.9000  2023-04-23 [1] Github (r-lib/prettyunits@8706d89)
 processx               3.8.2       2023-06-30 [1] CRAN (R 4.3.1)
 progress               1.2.2       2019-05-16 [1] CRAN (R 4.3.0)
 ProtGenerics           1.32.0      2023-04-25 [1] Bioconductor
 ps                     1.7.5       2023-04-18 [1] CRAN (R 4.3.0)
 purrr                * 1.0.2.9000  2023-08-11 [1] Github (tidyverse/purrr@ac4f5a9)
 R.cache                0.16.0      2022-07-21 [1] CRAN (R 4.3.0)
 R.methodsS3            1.8.2       2022-06-13 [1] CRAN (R 4.3.0)
 R.oo                   1.25.0      2022-06-12 [1] CRAN (R 4.3.0)
 R.utils                2.12.2      2022-11-11 [1] CRAN (R 4.3.0)
 R6                     2.5.1.9000  2023-04-23 [1] Github (r-lib/R6@e97cca7)
 ragg                   1.2.5       2023-01-12 [1] CRAN (R 4.3.0)
 rappdirs               0.3.3       2021-01-31 [1] CRAN (R 4.3.0)
 Rcpp                   1.0.11      2023-07-06 [1] CRAN (R 4.3.1)
 RCurl                  1.98-1.12   2023-03-27 [1] CRAN (R 4.3.0)
 readr                * 2.1.4.9000  2023-08-03 [1] Github (tidyverse/readr@80e4dc1)
 remotes                2.4.2.9000  2023-06-09 [1] Github (r-lib/remotes@8875171)
 restfulr               0.0.15      2022-06-16 [1] CRAN (R 4.3.0)
 reticulate             1.31        2023-08-10 [1] CRAN (R 4.3.1)
 rjson                  0.2.21      2022-01-09 [1] CRAN (R 4.3.0)
 rlang                  1.1.1.9000  2023-06-09 [1] Github (r-lib/rlang@c55f602)
 rmarkdown              2.23.4      2023-07-27 [1] Github (rstudio/rmarkdown@054d735)
 Rsamtools              2.16.0      2023-04-25 [1] Bioconductor
 RSQLite                2.3.1       2023-04-03 [1] CRAN (R 4.3.0)
 rstudioapi             0.15.0.9000 2023-07-19 [1] Github (rstudio/rstudioapi@feceaef)
 rtracklayer            1.60.0      2023-04-25 [1] Bioconductor
 Rttf2pt1               1.3.12      2023-01-22 [1] CRAN (R 4.3.0)
 S4Arrays               1.0.5       2023-07-24 [1] Bioconductor
 S4Vectors              0.38.1      2023-05-02 [1] Bioconductor
 scales                 1.2.1       2022-08-20 [1] CRAN (R 4.3.0)
 sessioninfo            1.2.2       2021-12-06 [1] CRAN (R 4.3.0)
 stringi                1.7.12      2023-01-11 [1] CRAN (R 4.3.0)
 stringr              * 1.5.0.9000  2023-08-11 [1] Github (tidyverse/stringr@08ff36f)
 styler               * 1.10.1      2023-07-17 [1] Github (r-lib/styler@aca7223)
 SummarizedExperiment   1.30.2      2023-06-06 [1] Bioconductor
 systemfonts            1.0.4       2022-02-11 [1] CRAN (R 4.3.0)
 textshaping            0.3.6       2021-10-13 [1] CRAN (R 4.3.0)
 tibble               * 3.2.1.9005  2023-05-28 [1] Github (tidyverse/tibble@4de5c15)
 tidyr                * 1.3.0.9000  2023-04-23 [1] Github (tidyverse/tidyr@0764e65)
 tidyselect             1.2.0       2022-10-10 [1] CRAN (R 4.3.0)
 tidyverse            * 2.0.0.9000  2023-04-23 [1] Github (tidyverse/tidyverse@8ec2e1f)
 timechange             0.2.0       2023-01-11 [1] CRAN (R 4.3.0)
 tzdb                   0.4.0       2023-05-12 [1] CRAN (R 4.3.0)
 usethis                2.2.2.9000  2023-07-11 [1] Github (r-lib/usethis@467ff57)
 utf8                   1.2.3       2023-01-31 [1] CRAN (R 4.3.0)
 vctrs                  0.6.3       2023-06-14 [1] CRAN (R 4.3.0)
 withr                  2.5.0       2022-03-03 [1] CRAN (R 4.3.0)
 xfun                   0.40        2023-08-09 [1] CRAN (R 4.3.1)
 XML                    3.99-0.14   2023-03-19 [1] CRAN (R 4.3.0)
 xml2                   1.3.5       2023-07-06 [1] CRAN (R 4.3.1)
 XVector                0.40.0      2023-04-25 [1] Bioconductor
 yaml                   2.3.7       2023-01-23 [1] CRAN (R 4.3.0)
 zlibbioc               1.46.0      2023-04-25 [1] Bioconductor

 [1] /opt/homebrew/lib/R/4.3/site-library
 [2] /opt/homebrew/Cellar/r/4.3.1/lib/R/library

─ Python configuration ───────────────────────────────────────────────────────
 python:         /Users/jialei/.pyenv/shims/python
 libpython:      /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/libpython3.10.dylib
 pythonhome:     /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3:/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3
 version:        3.10.9 | packaged by conda-forge | (main, Feb  2 2023, 20:26:08) [Clang 14.0.6 ]
 numpy:          /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/numpy
 numpy_version:  1.24.3
 numpy:          /Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/numpy
 
 NOTE: Python version was forced by RETICULATE_PYTHON

──────────────────────────────────────────────────────────────────────────────


Citation

BibTeX citation:
@article{bhattacharyya2023,
  author = {Bhattacharyya, Samadrita and Duan, Jialei and J. Vela, Ryan
    and Bhakta, Minoti and Bajona, Pietro and P.A. Mammen, Pradeep and
    C. Hon, Gary and V. Munshi, Nikhil},
  publisher = {American Heart Association},
  title = {Accurate {Classification} of {Cardiomyopathy} {Diagnosis} by
    {Chromatin} {Accessibility}},
  journal = {Circulation},
  volume = {146},
  number = {11},
  pages = {878 - 881},
  date = {2023-08-12},
  url = {https://doi.org/10.1161/CIRCULATIONAHA.122.059659},
  doi = {10.1161/CIRCULATIONAHA.122.059659},
  langid = {en}
}
For attribution, please cite this work as:
Bhattacharyya, Samadrita, Jialei Duan, Ryan J. Vela, Minoti Bhakta, Pietro Bajona, Pradeep P.A. Mammen, Gary C. Hon, and Nikhil V. Munshi. 2023. “Accurate Classification of Cardiomyopathy Diagnosis by Chromatin Accessibility.” Circulation 146 (11): 878–81. https://doi.org/10.1161/CIRCULATIONAHA.122.059659.