Sys.time()
[1] "2023-02-25 17:07:22 CST"
Sys.time()
[1] "2023-02-25 17:07:22 CST"
[1] "America/Chicago"
import multiprocessing
import sys

# Make the shared `utilities` helpers importable on both the laptop and the
# cluster; whichever path exists will win at import time.
sys.path.append("/Users/jialei/Dropbox/Data/Projects/UTSW/Scripts/utilities")
sys.path.append("/project/GCRB/Hon_lab/s166631/00.bin/utilities/")

from pathlib import Path
from random import sample

import harmonypy as hm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import phate
import scanpy as sc
import umap

from utilities import (
    plot_embedding,
    plot_pca_variance_explained,
    plot_cluster_composition,
)
# Global matplotlib settings: fonttype 42 embeds TrueType fonts in PDFs
# (editable text in Illustrator), and figures render at 1.5x base DPI.
params = {
    "pdf.fonttype": 42,
    "font.family": "sans-serif",
    "font.sans-serif": "Arial",
    "mathtext.default": "regular",
    "figure.dpi": 96 * 1.5,
}
plt.rcParams.update(params)
# Project location and analysis hyper-parameters.
PROJECT_DIR = (
    Path("/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation") / "raw"
)

SEED = 20210719  # confirmed by UMAP(random_state=20210719) in the run log
N_COMPONENTS = 100  # number of principal components to compute

# Cell / gene QC thresholds.
MINIMAL_NUM_GENES_REQUIRED_FOR_CELL = 200
MINIMAL_NUM_CELLS_REQUIRED_FOR_GENE = 30
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE = 60
MT_PERCENTAGE_CUTOFF = 0.2  # drop cells with mt_percentage above this value

N_THREADS = multiprocessing.cpu_count() - 1
N_SAMPLING = None  # optional cap on number of cells; None disables subsampling

# NOTE(review): HVG and ZERO_CENTER are referenced later in this script but
# their definitions were lost from this transcript — confirm intended values.
HVG = True
ZERO_CENTER = True
# This study; LW119_LW120_LW121_LW122
libraries = ["LW119", "LW120", "LW121", "LW122"]
adatas = [
    sc.read_h5ad(filename=PROJECT_DIR / i / "matrix" / "adata.h5ad")
    for i in libraries
]
# Yu et al. 2021; PRJNA632839; keep only the LW60 and LW61 batches.
adata = sc.read_h5ad(
    filename=PROJECT_DIR
    / "PRJNA632839/LW36_LW58_LW59_LW60_LW61"
    / "matrix"
    / "adata.h5ad"
)
adatas.append(adata[adata.obs["batch"].isin(["LW60", "LW61"]), :])
# filter mt_percentage: drop cells whose mitochondrial fraction exceeds the
# cutoff, in every dataset.
adatas = [
    i[i.obs["mt_percentage"] <= MT_PERCENTAGE_CUTOFF, :]
    for i in adatas
]
# Public datasets:
# Petropoulos et al. 2016; PRJEB11202
# Xiang et al. 2020; PRJNA562548
# Tyser et al. 2021; PRJEB40781
# Zheng et al. 2019; PRJNA555602
# Yanagida et al. 2021; PRJNA720968
libraries = [
    "PRJEB11202",
    "PRJNA562548",
    "PRJEB40781",
    "PRJNA555602",
    "PRJNA720968",
]
adatas_public = [
    sc.read_h5ad(filename=PROJECT_DIR / "public" / i / "matrix" / "adata.h5ad")
    for i in libraries
]
adatas.extend(adatas_public)
del adatas_public  # free the duplicate references; data now lives in `adatas`
# merge adatas into a single AnnData and report pre-filtering QC numbers
adata = sc.concat(adatas, axis=0)

print(f"Raw median UMIs per cell: {np.median(a=adata.X.sum(axis=1).A1):,}")
## Raw median UMIs per cell: 19,032.0
print(f"Number of cells before filtering: {adata.n_obs:,}")
## Number of cells before filtering: 31,481
print(f"Number of features before filtering: {adata.n_vars:,}")
## Number of features before filtering: 33,538
print(adata.obs["batch"].value_counts())
## GSM3956280 5454
## LW61 5156
## LW121 5130
## GSM3956281 4512
## LW60 4497
## LW119 2236
## PRJEB11202 1529
## PRJEB40781 1195
## PRJNA562548 555
## PRJNA720968 495
## LW120 435
## LW122 287
## Name: batch, dtype: int64
# reorder cells by barcode for a deterministic row order
adata = adata[np.sort(adata.obs.index), :]

# filter cells: keep cells expressing at least the minimum number of genes
adata = adata[
    (adata.X > 0).sum(axis=1).A1 >= MINIMAL_NUM_GENES_REQUIRED_FOR_CELL, :
]

# Keep a genes-x-cells copy of the raw counts (for CPM later) and the
# pre-gene-filtering feature names that index its rows.
matrix_cpm_use = adata.X.transpose(copy=True)
features = adata.var_names
# sample cells (optional; disabled when N_SAMPLING is None)
if N_SAMPLING:
    adata = adata[sample(list(adata.obs.index.values), N_SAMPLING), :]
    print(f"Number of cells after sampling: {adata.n_obs:,}")

# filter features: keep genes detected in enough cells AND with enough counts
row_idx = np.logical_and(
    (adata.X > 0).sum(axis=0).A1 >= MINIMAL_NUM_CELLS_REQUIRED_FOR_GENE,
    (adata.X).sum(axis=0).A1 >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
)
adata = adata[:, row_idx]
print(f"Median UMIs per cell: {np.median(a=adata.X.sum(axis=1).A1):,}")
## Median UMIs per cell: 19,032.0
print(f"Number of cells after filtering: {adata.n_obs:,}")
## Number of cells after filtering: 31,481
print(f"Number of features after filtering: {adata.n_vars:,}")
## Number of features after filtering: 26,297

# normalize: scale each cell to the dataset-wide median UMI count
sc.pp.normalize_total(
    adata=adata,
    target_sum=np.median(a=adata.X.sum(axis=1).A1),
    exclude_highly_expressed=False,
)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_normalization.py:138: UserWarning: Revieved a view of an AnnData. Making a copy.
view_to_actual(adata)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_normalization.py:138: FutureWarning: X.dtype being converted to np.float32 from int64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.
view_to_actual(adata)
# logarithmize: X <- log(1 + X)
sc.pp.log1p(adata)

# detect hvgs (highly variable genes), computed per batch
if HVG:
    sc.pp.highly_variable_genes(
        adata,
        min_mean=0.0125,
        max_mean=3,
        min_disp=0.5,
        flavor="seurat",
        batch_key="batch",
    )
# standardize
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
# Scale each gene (optionally zero-centered), then run PCA on the result.
sc.pp.scale(
    adata,
    zero_center=ZERO_CENTER,
    max_value=None,
    copy=False,
    layer=None,
    obsm=None,
)

sc.tl.pca(
    adata,
    n_comps=N_COMPONENTS,
    zero_center=ZERO_CENTER,
    svd_solver="arpack",
    random_state=SEED,
    return_info=False,
    use_highly_variable=HVG,
    dtype="float64",
    copy=False,
    chunked=False,
)
principal_components = adata.obsm["X_pca"]
# Elbow plot of explained variance for the leading PCs.
with plt.style.context("ggplot"):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 2.5))
    plot_pca_variance_explained(
        x=adata.uns["pca"]["variance_ratio"],
        num_pcs=30,
        ax=ax,
    )
    ax.set(ylim=(-0.0025, max(ax.get_ylim())))
    plt.tight_layout()
    plt.show()
    plt.close(fig=fig)
<AxesSubplot: xlabel='Principal component', ylabel='Explained variance ratio'>
[(-0.0025, 0.049999999999999996)]
# remove batch effect: collect per-cell batch labels for Harmony
batches = adata.obs["batch"].values
print(adata.obs["batch"].value_counts())
GSM3956280 5454
LW61 5156
LW121 5130
GSM3956281 4512
LW60 4497
LW119 2236
PRJEB11202 1529
PRJEB40781 1195
PRJNA562548 555
PRJNA720968 495
LW120 435
LW122 287
Name: batch, dtype: int64
# Run Harmony only when there is more than one batch to integrate;
# otherwise pass the PCs through unchanged (as float64 either way).
if len(set(batches)) > 1:
    ho = hm.run_harmony(
        data_mat=principal_components,
        meta_data=pd.DataFrame(
            {"batch": batches}, index=adata.obs.index.values
        ),
        vars_use="batch",
        random_state=SEED,
    )
    # Harmony returns components x cells; transpose back to cells x components.
    principal_components_selected = ho.Z_corr.T.astype(np.float64)
else:
    principal_components_selected = principal_components.astype(np.float64)
2023-02-25 17:08:20,808 - harmonypy - INFO - Iteration 1 of 10
2023-02-25 17:08:28,362 - harmonypy - INFO - Iteration 2 of 10
2023-02-25 17:08:35,736 - harmonypy - INFO - Converged after 2 iterations
# Use the first 16 corrected PCs for the 2-D UMAP embedding.
principal_components_use = principal_components_selected[:, range(16)]

embedding_umap = umap.UMAP(
    n_neighbors=15,
    n_components=2,
    metric="euclidean",
    min_dist=0.1,
    spread=1.0,
    random_state=SEED,
    transform_seed=42,
    verbose=True,
).fit_transform(principal_components_use)
UMAP(random_state=20210719, verbose=True)
Sat Feb 25 17:08:37 2023 Construct fuzzy simplicial set
Sat Feb 25 17:08:37 2023 Finding Nearest Neighbors
Sat Feb 25 17:08:37 2023 Building RP forest with 14 trees
Sat Feb 25 17:08:37 2023 NN descent for 15 iterations
1 / 15
2 / 15
3 / 15
Stopping threshold met -- exiting after 3 iterations
Sat Feb 25 17:08:44 2023 Finished Nearest Neighbor Search
Sat Feb 25 17:08:45 2023 Construct embedding
Sat Feb 25 17:08:57 2023 Finished embedding
Epochs completed: 0%| 0/200 [00:00]
Epochs completed: 0%| 1/200 [00:00]
Epochs completed: 2%| 2 5/200 [00:00]
Epochs completed: 4%| 3 7/200 [00:00]
Epochs completed: 4%| 4 9/200 [00:00]
Epochs completed: 6%| 5 11/200 [00:00]
Epochs completed: 6%| 6 13/200 [00:00]
Epochs completed: 8%| 7 15/200 [00:00]
Epochs completed: 8%| 8 17/200 [00:01]
Epochs completed: 10%| 9 19/200 [00:01]
Epochs completed: 10%| # 21/200 [00:01]
Epochs completed: 12%| #1 23/200 [00:01]
Epochs completed: 12%| #2 25/200 [00:01]
Epochs completed: 14%| #3 27/200 [00:01]
Epochs completed: 14%| #4 29/200 [00:01]
Epochs completed: 16%| #5 31/200 [00:01]
Epochs completed: 16%| #6 33/200 [00:01]
Epochs completed: 18%| #7 35/200 [00:02]
Epochs completed: 18%| #8 37/200 [00:02]
Epochs completed: 20%| #9 39/200 [00:02]
Epochs completed: 20%| ## 41/200 [00:02]
Epochs completed: 22%| ##1 43/200 [00:02]
Epochs completed: 22%| ##2 45/200 [00:02]
Epochs completed: 24%| ##3 47/200 [00:02]
Epochs completed: 24%| ##4 49/200 [00:02]
Epochs completed: 26%| ##5 51/200 [00:02]
Epochs completed: 26%| ##6 53/200 [00:03]
Epochs completed: 28%| ##7 55/200 [00:03]
Epochs completed: 28%| ##8 57/200 [00:03]
Epochs completed: 30%| ##9 59/200 [00:03]
Epochs completed: 30%| ### 61/200 [00:03]
Epochs completed: 32%| ###1 63/200 [00:03]
Epochs completed: 32%| ###2 65/200 [00:03]
Epochs completed: 34%| ###3 67/200 [00:03]
Epochs completed: 34%| ###4 69/200 [00:03]
Epochs completed: 36%| ###5 71/200 [00:04]
Epochs completed: 36%| ###6 73/200 [00:04]
Epochs completed: 38%| ###7 75/200 [00:04]
Epochs completed: 38%| ###8 77/200 [00:04]
Epochs completed: 40%| ###9 79/200 [00:04]
Epochs completed: 40%| #### 81/200 [00:04]
Epochs completed: 42%| ####1 83/200 [00:04]
Epochs completed: 42%| ####2 85/200 [00:04]
Epochs completed: 44%| ####3 87/200 [00:04]
Epochs completed: 44%| ####4 89/200 [00:05]
Epochs completed: 46%| ####5 91/200 [00:05]
Epochs completed: 46%| ####6 93/200 [00:05]
Epochs completed: 48%| ####7 95/200 [00:05]
Epochs completed: 48%| ####8 97/200 [00:05]
Epochs completed: 50%| ####9 99/200 [00:05]
Epochs completed: 50%| ##### 101/200 [00:05]
Epochs completed: 52%| #####1 103/200 [00:05]
Epochs completed: 52%| #####2 105/200 [00:05]
Epochs completed: 54%| #####3 107/200 [00:06]
Epochs completed: 55%| #####4 109/200 [00:06]
Epochs completed: 56%| #####5 111/200 [00:06]
Epochs completed: 56%| #####6 113/200 [00:06]
Epochs completed: 57%| #####7 115/200 [00:06]
Epochs completed: 58%| #####8 117/200 [00:06]
Epochs completed: 60%| #####9 119/200 [00:06]
Epochs completed: 60%| ###### 121/200 [00:06]
Epochs completed: 62%| ######1 123/200 [00:06]
Epochs completed: 62%| ######2 125/200 [00:07]
Epochs completed: 64%| ######3 127/200 [00:07]
Epochs completed: 64%| ######4 129/200 [00:07]
Epochs completed: 66%| ######5 131/200 [00:07]
Epochs completed: 66%| ######6 133/200 [00:07]
Epochs completed: 68%| ######7 135/200 [00:07]
Epochs completed: 68%| ######8 137/200 [00:07]
Epochs completed: 70%| ######9 139/200 [00:07]
Epochs completed: 70%| ####### 141/200 [00:07]
Epochs completed: 72%| #######1 143/200 [00:08]
Epochs completed: 72%| #######2 145/200 [00:08]
Epochs completed: 74%| #######3 147/200 [00:08]
Epochs completed: 74%| #######4 149/200 [00:08]
Epochs completed: 76%| #######5 151/200 [00:08]
Epochs completed: 76%| #######6 153/200 [00:08]
Epochs completed: 78%| #######7 155/200 [00:08]
Epochs completed: 78%| #######8 157/200 [00:08]
Epochs completed: 80%| #######9 159/200 [00:08]
Epochs completed: 80%| ######## 161/200 [00:09]
Epochs completed: 82%| ########1 163/200 [00:09]
Epochs completed: 82%| ########2 165/200 [00:09]
Epochs completed: 84%| ########3 167/200 [00:09]
Epochs completed: 84%| ########4 169/200 [00:09]
Epochs completed: 86%| ########5 171/200 [00:09]
Epochs completed: 86%| ########6 173/200 [00:09]
Epochs completed: 88%| ########7 175/200 [00:09]
Epochs completed: 88%| ########8 177/200 [00:09]
Epochs completed: 90%| ########9 179/200 [00:10]
Epochs completed: 90%| ######### 181/200 [00:10]
Epochs completed: 92%| #########1 183/200 [00:10]
Epochs completed: 92%| #########2 185/200 [00:10]
Epochs completed: 94%| #########3 187/200 [00:10]
Epochs completed: 94%| #########4 189/200 [00:10]
Epochs completed: 96%| #########5 191/200 [00:10]
Epochs completed: 96%| #########6 193/200 [00:10]
Epochs completed: 98%| #########7 195/200 [00:10]
Epochs completed: 98%| #########8 197/200 [00:11]
Epochs completed: 100%| #########9 199/200 [00:11]
Epochs completed: 100%| ########## 200/200 [00:11]
# Rebuild a lightweight AnnData on the corrected PCs (replaces the gene-level
# AnnData; only cell names and batch labels are carried over), then cluster.
adata = sc.AnnData(
    X=principal_components_use,
    obs={"cell": adata.obs.index.values, "batch": batches},
    dtype="float64",
)
adata.obs.index = adata.obs["cell"]

# n_pcs=0 with use_rep=None makes neighbors use .X (the corrected PCs) directly.
sc.pp.neighbors(
    adata=adata,
    n_neighbors=30,
    n_pcs=0,
    use_rep=None,
    knn=True,
    random_state=SEED,
    method="umap",
    metric="euclidean",
    copy=False,
)

sc.tl.leiden(
    adata=adata,
    resolution=1,
    random_state=SEED,
    key_added="leiden",
    directed=True,
    use_weights=True,
    partition_type=None,
    copy=False,
)
# Assemble one table per cell: batch, Leiden cluster, and UMAP coordinates.
embedding = pd.DataFrame(
    data=np.concatenate(
        (adata.obs[["batch", "leiden"]], embedding_umap),
        axis=1,
    ),
    index=adata.obs["cell"],
    columns=["batch", "leiden", "x_umap_min_dist=0.1", "y_umap_min_dist=0.1"],
)
# np.concatenate produced object dtype; restore numeric columns.
embedding = embedding.astype(
    {
        "leiden": int,
        "x_umap_min_dist=0.1": float,
        "y_umap_min_dist=0.1": float,
    }
)
# UMAP colored by Leiden cluster.
fig, ax = plt.subplots(nrows=1 * 1, ncols=1, figsize=(4 * 1, 3 * 1))
plot_embedding(
    embedding=embedding.loc[:, ["x_umap_min_dist=0.1", "y_umap_min_dist=0.1"]],
    ax=ax,
    color_values=[str(i) for i in embedding["leiden"]],
    title="UMAP; Leiden",
    show_color_value_labels=True,
    rasterise=True,
)
plt.tight_layout()
plt.show()
plt.close(fig=fig)

# Per-batch cell counts.
embedding.groupby(by="batch").size().to_frame(name="num_cells")
num_cells
batch
GSM3956280 5454
GSM3956281 4512
LW119 2236
LW120 435
LW121 5130
LW122 287
LW60 4497
LW61 5156
PRJEB11202 1529
PRJEB40781 1195
PRJNA562548 555
PRJNA720968 495
# Fraction of each batch within each Leiden cluster.
cluster_composition = (
    (
        embedding.groupby(by="leiden").aggregate("batch").value_counts()
        / embedding.groupby(by="leiden").aggregate("batch").size()
    )
    .to_frame(name="percentage")
    .reset_index()
)
cluster_composition = cluster_composition.astype({"leiden": str})

# Cluster labels sorted descending so the composition plot stacks top-down.
clusters_selected = list(
    np.sort(embedding["leiden"].unique()).astype(dtype=str)[::-1]
)
batches_selected = list(embedding["batch"].unique())

# Fix plotting order via ordered categoricals.
cluster_composition = cluster_composition.astype(
    {
        "leiden": pd.api.types.CategoricalDtype(
            categories=clusters_selected, ordered=True
        ),
        "batch": pd.api.types.CategoricalDtype(
            categories=batches_selected, ordered=True
        ),
    }
)
# Stacked-bar view of batch composition per cluster.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4.5, 4))
plot_cluster_composition(
    cluster_composition=cluster_composition,
    x="batch",
    y="leiden",
    ax=ax,
    x_order=batches_selected,
    y_order=clusters_selected,
)
plt.tight_layout()
plt.show()
plt.close(fig=fig)
# In-place CPM normalization of the genes-x-cells CSC counts matrix:
# each stored value is scaled by 1e6 / (its cell's total counts).
# np.repeat expands per-column totals to per-nonzero-entry via indptr.
matrix_cpm_use.data = (
    1_000_000
    * matrix_cpm_use.data
    / np.repeat(matrix_cpm_use.sum(axis=0).A1, np.diff(matrix_cpm_use.indptr))
)
# Marker genes to visualize, grouped by lineage:
# epiblast / pluripotency, hypoblast / endoderm, trophectoderm.
FEATURES_SELECTED = [
    "ENSG00000204531_POU5F1",
    "ENSG00000203909_DPPA5",
    "ENSG00000184344_GDF3",
    "ENSG00000156574_NODAL",
    "ENSG00000111704_NANOG",
    "ENSG00000075388_FGF4",
    #
    "ENSG00000164736_SOX17",
    "ENSG00000125798_FOXA2",
    "ENSG00000136574_GATA4",
    "ENSG00000141448_GATA6",
    "ENSG00000134853_PDGFRA",
    "ENSG00000115414_FN1",
    #
    "ENSG00000107485_GATA3",
    "ENSG00000118777_ABCG2",
    "ENSG00000165556_CDX2",
    "ENSG00000137869_CYP19A1",
    "ENSG00000172818_OVOL1",
    "ENSG00000126353_CCR7",
]
# One UMAP panel per marker gene, colored by log10(CPM + 1) expression.
for selected_feature in FEATURES_SELECTED:
    print(selected_feature)
    # Row index of this gene in the genes-x-cells CPM matrix.
    idx = np.where([i == selected_feature for i in features])[0]
    values = matrix_cpm_use[idx, :].toarray().flatten()
    values = np.log10(values + 1)

    fig, ax = plt.subplots(nrows=1 * 1, ncols=1, figsize=(4 * 1, 3 * 1))
    p = plot_embedding(
        embedding=embedding.loc[
            adata.obs.index, ["x_umap_min_dist=0.1", "y_umap_min_dist=0.1"]
        ],
        ax=ax,
        color_values=values,
        title=f"UMAP; {selected_feature}",
        show_color_value_labels=False,
        marker_size=4,
        sort_values=True,
    )

    # Compact colorbar inset in the upper-left corner of the panel.
    ax_ins = ax.inset_axes((0.06, 0.70, 0.03, 0.2))
    cbar = fig.colorbar(
        mappable=p,
        cax=ax_ins,
        orientation="vertical",
        label="",
        ticks=range(np.ceil(max(values)).astype(int)),
    )
    cbar.outline.set_linewidth(w=0.2)
    cbar.outline.set_visible(b=False)
    ax_ins.tick_params(
        axis="y",
        direction="out",
        length=1.5,
        width=0.2,
        color="#333333",
        pad=1.25,
        labelsize=6,
        labelcolor="#4D4D4D",
    )

    plt.tight_layout()
    plt.show()
    plt.close(fig=fig)
ENSG00000204531_POU5F1
ENSG00000203909_DPPA5
ENSG00000184344_GDF3
ENSG00000156574_NODAL
ENSG00000111704_NANOG
ENSG00000075388_FGF4
ENSG00000164736_SOX17
ENSG00000125798_FOXA2
ENSG00000136574_GATA4
ENSG00000141448_GATA6
ENSG00000134853_PDGFRA
ENSG00000115414_FN1
ENSG00000107485_GATA3
ENSG00000118777_ABCG2
ENSG00000165556_CDX2
ENSG00000137869_CYP19A1
ENSG00000172818_OVOL1
ENSG00000126353_CCR7