Data Preprocessing

Author

Jialei Duan

Published

Sun Sep 25, 2022 00:38:46-05:00

DOI

10.1038/s41586-021-03356-y

Abstract

Human blastoids provide a readily accessible, scalable, versatile and perturbable alternative to blastocysts for studying early human development, understanding early pregnancy loss and gaining insights into early developmental defects.


from datetime import datetime

# Show when this notebook was executed (local time, second resolution).
datetime.now().strftime("%Y-%m-%d %H:%M:%S")
'2022-09-25 00:36:28'
import sys

sys.path.append("/Users/jialei/Dropbox/Data/Projects/UTSW/Scripts/utilities")

from pathlib import Path

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse
import seaborn as sns
from matplotlib import __version__ as mpl_version
# Report the interpreter and library versions used for this run.
print(sys.version)
for pkg_name, pkg_version in (
    ("numpy", np.__version__),
    ("pandas", pd.__version__),
    ("scipy", scipy.__version__),
    ("matplotlib", mpl_version),
    ("seaborn", sns.__version__),
):
    print(pkg_name, pkg_version)
3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33) 
[Clang 13.0.1 ]
numpy 1.22.4
pandas 1.4.4
scipy 1.9.1
matplotlib 3.5.3
seaborn 0.12.0
# Matplotlib defaults applied to every figure drawn below.
params = {
    # Fonts
    "font.family": "sans-serif",
    "font.sans-serif": "Arial",
    "mathtext.default": "regular",
    # Output
    "pdf.fonttype": 42,
    "figure.dpi": 1.5 * 96,
}
plt.rcParams.update(params)
from utilities import (
    calc_mt_percentage,
    plot_barplot_mt_distribution,
    read_10x_matrix,
)

Parameters

# Root directory of the project; all input paths below are built from it.
PROJECT_DIR = Path("/Users/jialei/Dropbox/Data/Projects/UTSW/Human_blastoid")

Preprocessing

Part 1

# Batch IDs; each one is also the name of its directory under PROJECT_DIR / "raw".
batches = ["LW36", "LW58", "LW59", "LW60", "LW61"]

# One cell-list file per batch, read from raw/<batch>/scrublet/.
# Entry k belongs to batches[k] (presumably Scrublet singlet calls, judging
# by the file names -- confirm against the upstream step).
scrublet_files = [
    "cells_singlet_log_0.235_9129_637.txt",
    "cells_singlet_log_0.162_5628_874.txt",
    "cells_singlet_log_0.174_5494_626.txt",
    "cells_singlet_log_0.196_5512_189.txt",
    "cells_singlet_log_0.195_6611_707.txt",
]

# zip() would silently drop trailing entries if the two lists got out of
# sync, so fail loudly instead.
if len(batches) != len(scrublet_files):
    raise ValueError(
        f"batches ({len(batches)}) and scrublet_files "
        f"({len(scrublet_files)}) must have the same length"
    )

# Result of calc_mt_percentage for each batch, keyed by batch ID.
mt_ratio = dict()

for i, j in zip(batches, scrublet_files):
    print(i, j)

    m = read_10x_matrix(
        data_directory=PROJECT_DIR / "raw" / i / "filtered_feature_bc_matrix",
        cell_id_prefix=i,
        features_selected="Gene Expression",
    )

    # Build the cell IDs to keep: strip trailing whitespace, drop the "-1"
    # text, and prefix with the batch ID. The context manager closes the
    # file handle once the list has been read.
    with open(file=PROJECT_DIR / "raw" / i / "scrublet" / j, mode="r") as fh:
        cells_included = [
            i + "_" + ii.rstrip().replace("-1", "") for ii in fh
        ]

    # The matrix is transposed so that barcodes index obs and features index var.
    adata = ad.AnnData(
        X=m["matrix"].T,
        obs=pd.DataFrame(m["barcodes"], index=m["barcodes"], columns=["cell"]),
        var=pd.DataFrame(
            m["features"], index=m["features"], columns=["features"]
        ),
        dtype=np.int64,
    )

    # Keep only the listed cells, then hand the matrix back in its original
    # orientation (transposed again) together with the feature names.
    adata = adata[cells_included, :]
    mt_ratio[i] = calc_mt_percentage(adata.X.T, adata.var.index)

    # Drop the per-batch objects before loading the next batch.
    del m
    del adata
LW36 cells_singlet_log_0.235_9129_637.txt
LW58 cells_singlet_log_0.162_5628_874.txt
LW59 cells_singlet_log_0.174_5494_626.txt
LW60 cells_singlet_log_0.196_5512_189.txt
LW61 cells_singlet_log_0.195_6611_707.txt
# Draw one figure per batch from the values stored in mt_ratio.
for batch_id, batch_values in mt_ratio.items():
    print(batch_id)

    fig, ax = plt.subplots(figsize=(4, 3))
    plot_barplot_mt_distribution(x=batch_values, ax=ax)

    plt.tight_layout()
    plt.show()

    # Close the figure explicitly so figures do not accumulate across batches.
    plt.close(fig)
LW36

 

LW58

 

LW59

 

LW60

 

LW61

 

Part 2

batches = ["LW49", "LW50", "LW51", "LW52"]
adatas = []

# Load every batch into its own AnnData object and collect them in adatas.
for batch_id in batches:
    print(batch_id)

    tenx = read_10x_matrix(
        data_directory=(
            PROJECT_DIR / "raw" / batch_id / "filtered_feature_bc_matrix"
        ),
        cell_id_prefix=batch_id,
        features_selected="Gene Expression",
    )

    obs_frame = pd.DataFrame(
        tenx["barcodes"], index=tenx["barcodes"], columns=["cell"]
    )
    var_frame = pd.DataFrame(
        tenx["features"], index=tenx["features"], columns=["features"]
    )

    # Transpose so that barcodes index obs and features index var.
    adatas.append(
        ad.AnnData(
            X=tenx["matrix"].T,
            obs=obs_frame,
            var=var_frame,
            dtype=np.int64,
        )
    )

    # Drop the per-batch temporaries before loading the next batch.
    del tenx, obs_frame, var_frame
LW49
LW50
LW51
LW52
# File listing the cell IDs to keep, one per line.
cells_included_file = (
    PROJECT_DIR
    / "raw"
    / "feeder_cell_detection"
    / "filtered_feature_bc_matrix_scrublet"
    / "clustering"
    / "LW49_LW50_LW51_LW52/cells_included.txt"
)

# Read the IDs inside a context manager so the file handle is closed
# as soon as the list has been built.
with open(cells_included_file) as fh:
    cells_included = [line.rstrip() for line in fh]

# Merge the per-batch objects, then keep only the listed cells.
adata = ad.concat(adatas)
adata = adata[cells_included, :]
adata
View of AnnData object with n_obs × n_vars = 10842 × 33538
    obs: 'cell'
# Compute the metric over the merged, filtered cells and plot it.
# The matrix is passed transposed, together with the feature names.
mt_ratio = calc_mt_percentage(adata.X.T, adata.var_names)

fig, ax = plt.subplots(figsize=(4, 3))
plot_barplot_mt_distribution(x=mt_ratio, ax=ax)

plt.tight_layout()
plt.show()

plt.close(fig)

 

 

Citation

BibTeX citation:
@article{yu,
  author = {Leqian Yu and Yulei Wei and Jialei Duan and Daniel A.
    Schmitz and Masahiro Sakurai and Lei Wang and Kunhua Wang and Shuhua
    Zhao and Gary C. Hon and Jun Wu},
  editor = {},
  publisher = {Nature Publishing Group},
  title = {Blastocyst-Like Structures Generated from Human Pluripotent
    Stem Cells},
  journal = {Nature},
  volume = {591},
  number = {7851},
  pages = {620--626},
  date = {2021},
  url = {https://doi.org/10.1038/s41586-021-03356-y},
  doi = {10.1038/s41586-021-03356-y},
  langid = {en},
  abstract = {Human blastoids provide a readily accessible, scalable,
    versatile and perturbable alternative to blastocysts for studying
    early human development, understanding early pregnancy loss and
    gaining insights into early developmental defects.}
}
For attribution, please cite this work as:
Leqian Yu, Yulei Wei, Jialei Duan, Daniel A. Schmitz, Masahiro Sakurai, Lei Wang, Kunhua Wang, Shuhua Zhao, Gary C. Hon, and Jun Wu. 2021. “Blastocyst-Like Structures Generated from Human Pluripotent Stem Cells.” Nature 591 (7851): 620–26. https://doi.org/10.1038/s41586-021-03356-y.