Sys.time()
[1] "2023-02-25 17:04:52 CST"
Sys.time()
[1] "2023-02-25 17:04:52 CST"
[1] "America/Chicago"
import multiprocessing
import sys
"/Users/jialei/Dropbox/Data/Projects/UTSW/Scripts/utilities")
sys.path.append("/project/GCRB/Hon_lab/s166631/00.bin/utilities/")
sys.path.append(
from pathlib import Path
from random import sample
import harmonypy as hm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import phate
import scanpy as sc
import umap
from utilities import (
plot_embedding,
plot_pca_variance_explained,
plot_cluster_composition, )
= {
params "pdf.fonttype": 42,
"font.family": "sans-serif",
"font.sans-serif": "Arial",
"mathtext.default": "regular",
"figure.dpi": 96 * 1.5,
} plt.rcParams.update(params)
= Path(
PROJECT_DIR "/Users/jialei/Dropbox/Data/Projects/UTSW/Peri-implantation") / "raw"
= 20210719
SEED = 100
N_COMPONENTS
= 200
MINIMAL_NUM_GENES_REQUIRED_FOR_CELL = 30
MINIMAL_NUM_CELLS_REQUIRED_FOR_GENE = 60
MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE = 0.2
MT_PERCENTAGE_CUTOFF
= multiprocessing.cpu_count() - 1
N_THREADS = None N_SAMPLING
This study
= ["LW186", "LW187", "LW188", "LW189", "LW202", "LW203", "LW204"]
libraries = [
adatas =PROJECT_DIR / i / "matrix" / "adata.h5ad")
sc.read_h5ad(filenamefor i in libraries
]
= [
adatas
i["mt_percentage"] <= MT_PERCENTAGE_CUTOFF,
i.obs[
]for i in adatas
]
# merge adatas
= sc.concat(adatas, axis=0) adata
print(f"Raw median UMIs per cell: {np.median(a=adata.X.sum(axis=1).A1):,}")
## Raw median UMIs per cell: 3,728.0
print(f"Number of cells before filtering: {adata.n_obs:,}")
## Number of cells before filtering: 39,755
print(f"Number of features before filtering: {adata.n_vars:,}")
## Number of features before filtering: 33,538
print(adata.obs["batch"].value_counts())
## LW186 10977
## LW204 7596
## LW188 6838
## LW202 4833
## LW187 3690
## LW189 3250
## LW203 2571
## Name: batch, dtype: int64
# reorder cells
= adata[np.sort(adata.obs.index), :]
adata
# filter cells
= adata[
adata > 0).sum(axis=1).A1 >= MINIMAL_NUM_GENES_REQUIRED_FOR_CELL, :
(adata.X ]
= adata.X.transpose(copy=True)
matrix_cpm_use = adata.var_names features
# sample cells
if N_SAMPLING:
= adata[sample(list(adata.obs.index.values), N_SAMPLING), :]
adata print(f"Number of cells after sampling: {adata.n_obs:,}")
# filter features
= np.logical_and(
row_idx > 0).sum(axis=0).A1 >= MINIMAL_NUM_CELLS_REQUIRED_FOR_GENE,
(adata.X sum(axis=0).A1 >= MINIMAL_NUM_COUNTS_REQUIRED_FOR_GENE,
(adata.X).
)= adata[:, row_idx] adata
print(f"Median UMIs per cell: {np.median(a=adata.X.sum(axis=1).A1):,}")
## Median UMIs per cell: 3,725.0
print(f"Number of cells after filtering: {adata.n_obs:,}")
## Number of cells after filtering: 39,755
print(f"Number of features after filtering: {adata.n_vars:,}")
## Number of features after filtering: 16,037
# normalize
sc.pp.normalize_total(=adata,
adata=np.median(a=adata.X.sum(axis=1).A1),
target_sum=False,
exclude_highly_expressed )
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_normalization.py:138: UserWarning: Revieved a view of an AnnData. Making a copy.
view_to_actual(adata)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_normalization.py:138: FutureWarning: X.dtype being converted to np.float32 from int64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.
view_to_actual(adata)
# logarithmize
sc.pp.log1p(adata)
# detect hvgs
if HVG:
sc.pp.highly_variable_genes(
adata,=0.0125,
min_mean=3,
max_mean=0.5,
min_disp="seurat",
flavor="batch",
batch_key
)
# standardize
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
/Users/jialei/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py:475: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
hvg = hvg.append(missing_hvg, ignore_index=True)
sc.pp.scale(
adata,=ZERO_CENTER,
zero_center=None,
max_value=False,
copy=None,
layer=None,
obsm )
sc.tl.pca(
adata,=N_COMPONENTS,
n_comps=ZERO_CENTER,
zero_center="arpack",
svd_solver=SEED,
random_state=False,
return_info=HVG,
use_highly_variable="float64",
dtype=False,
copy=False,
chunked
)
= adata.obsm["X_pca"] principal_components
with plt.style.context("ggplot"):
= plt.subplots(nrows=1, ncols=1, figsize=(5, 2.5))
fig, ax
plot_pca_variance_explained(=adata.uns["pca"]["variance_ratio"],
x=30,
num_pcs=ax,
ax
)
set(ylim=(-0.0025, max(ax.get_ylim())))
ax.
plt.tight_layout()
plt.show()
=fig) plt.close(fig
<AxesSubplot: xlabel='Principal component', ylabel='Explained variance ratio'>
[(-0.0025, 0.049999999999999996)]
# remove batch effect
= adata.obs["batch"].values
batches print(adata.obs["batch"].value_counts())
LW186 10977
LW204 7596
LW188 6838
LW202 4833
LW187 3690
LW189 3250
LW203 2571
Name: batch, dtype: int64
if len(set(batches)) > 1:
= hm.run_harmony(
ho =principal_components,
data_mat=pd.DataFrame(
meta_data"batch": batches}, index=adata.obs.index.values
{
),="batch",
vars_use=SEED,
random_state
)= ho.Z_corr.T.astype(np.float64)
principal_components_selected
else:
= principal_components.astype(np.float64) principal_components_selected
2023-02-25 17:05:41,697 - harmonypy - INFO - Iteration 1 of 10
2023-02-25 17:05:48,968 - harmonypy - INFO - Iteration 2 of 10
2023-02-25 17:05:56,651 - harmonypy - INFO - Converged after 2 iterations
= principal_components_selected[:, range(28)] principal_components_use
= umap.UMAP(
embedding_umap =15,
n_neighbors=2,
n_components="euclidean",
metric=0.1,
min_dist=1.0,
spread=SEED,
random_state=42,
transform_seed=True,
verbose ).fit_transform(principal_components_use)
UMAP(random_state=20210719, verbose=True)
Sat Feb 25 17:05:58 2023 Construct fuzzy simplicial set
Sat Feb 25 17:05:58 2023 Finding Nearest Neighbors
Sat Feb 25 17:05:58 2023 Building RP forest with 15 trees
Sat Feb 25 17:05:58 2023 NN descent for 15 iterations
1 / 15
2 / 15
3 / 15
4 / 15
Stopping threshold met -- exiting after 4 iterations
Sat Feb 25 17:06:05 2023 Finished Nearest Neighbor Search
Sat Feb 25 17:06:06 2023 Construct embedding
Sat Feb 25 17:06:21 2023 Finished embedding
Epochs completed: 0%| 0/200 [00:00]
Epochs completed: 0%| 1/200 [00:00]
Epochs completed: 2%| 2 4/200 [00:00]
Epochs completed: 3%| 3 6/200 [00:00]
Epochs completed: 4%| 4 8/200 [00:00]
Epochs completed: 5%| 5 10/200 [00:00]
Epochs completed: 6%| 6 12/200 [00:00]
Epochs completed: 7%| 7 14/200 [00:01]
Epochs completed: 8%| 8 16/200 [00:01]
Epochs completed: 9%| 9 18/200 [00:01]
Epochs completed: 10%| # 20/200 [00:01]
Epochs completed: 11%| #1 22/200 [00:01]
Epochs completed: 12%| #2 24/200 [00:01]
Epochs completed: 13%| #3 26/200 [00:01]
Epochs completed: 14%| #4 28/200 [00:02]
Epochs completed: 15%| #5 30/200 [00:02]
Epochs completed: 16%| #6 32/200 [00:02]
Epochs completed: 17%| #7 34/200 [00:02]
Epochs completed: 18%| #8 36/200 [00:02]
Epochs completed: 19%| #9 38/200 [00:02]
Epochs completed: 20%| ## 40/200 [00:02]
Epochs completed: 21%| ##1 42/200 [00:03]
Epochs completed: 22%| ##2 44/200 [00:03]
Epochs completed: 23%| ##3 46/200 [00:03]
Epochs completed: 24%| ##4 48/200 [00:03]
Epochs completed: 25%| ##5 50/200 [00:03]
Epochs completed: 26%| ##6 52/200 [00:03]
Epochs completed: 27%| ##7 54/200 [00:03]
Epochs completed: 28%| ##8 56/200 [00:04]
Epochs completed: 29%| ##9 58/200 [00:04]
Epochs completed: 30%| ### 60/200 [00:04]
Epochs completed: 31%| ###1 62/200 [00:04]
Epochs completed: 32%| ###2 64/200 [00:04]
Epochs completed: 33%| ###3 66/200 [00:04]
Epochs completed: 34%| ###4 68/200 [00:04]
Epochs completed: 35%| ###5 70/200 [00:05]
Epochs completed: 36%| ###6 72/200 [00:05]
Epochs completed: 37%| ###7 74/200 [00:05]
Epochs completed: 38%| ###8 76/200 [00:05]
Epochs completed: 39%| ###9 78/200 [00:05]
Epochs completed: 40%| #### 80/200 [00:05]
Epochs completed: 41%| ####1 82/200 [00:05]
Epochs completed: 42%| ####2 84/200 [00:06]
Epochs completed: 43%| ####3 86/200 [00:06]
Epochs completed: 44%| ####4 88/200 [00:06]
Epochs completed: 45%| ####5 90/200 [00:06]
Epochs completed: 46%| ####6 92/200 [00:06]
Epochs completed: 47%| ####6 94/200 [00:06]
Epochs completed: 48%| ####8 96/200 [00:07]
Epochs completed: 49%| ####9 98/200 [00:07]
Epochs completed: 50%| ##### 100/200 [00:07]
Epochs completed: 51%| #####1 102/200 [00:07]
Epochs completed: 52%| #####2 104/200 [00:07]
Epochs completed: 53%| #####3 106/200 [00:07]
Epochs completed: 54%| #####4 108/200 [00:07]
Epochs completed: 55%| #####5 110/200 [00:08]
Epochs completed: 56%| #####6 112/200 [00:08]
Epochs completed: 57%| #####6 114/200 [00:08]
Epochs completed: 58%| #####8 116/200 [00:08]
Epochs completed: 59%| #####8 118/200 [00:08]
Epochs completed: 60%| ###### 120/200 [00:08]
Epochs completed: 61%| ######1 122/200 [00:08]
Epochs completed: 62%| ######2 124/200 [00:09]
Epochs completed: 63%| ######3 126/200 [00:09]
Epochs completed: 64%| ######4 128/200 [00:09]
Epochs completed: 65%| ######5 130/200 [00:09]
Epochs completed: 66%| ######6 132/200 [00:09]
Epochs completed: 67%| ######7 134/200 [00:09]
Epochs completed: 68%| ######8 136/200 [00:09]
Epochs completed: 69%| ######9 138/200 [00:10]
Epochs completed: 70%| ####### 140/200 [00:10]
Epochs completed: 71%| #######1 142/200 [00:10]
Epochs completed: 72%| #######2 144/200 [00:10]
Epochs completed: 73%| #######3 146/200 [00:10]
Epochs completed: 74%| #######4 148/200 [00:10]
Epochs completed: 75%| #######5 150/200 [00:10]
Epochs completed: 76%| #######6 152/200 [00:11]
Epochs completed: 77%| #######7 154/200 [00:11]
Epochs completed: 78%| #######8 156/200 [00:11]
Epochs completed: 79%| #######9 158/200 [00:11]
Epochs completed: 80%| ######## 160/200 [00:11]
Epochs completed: 81%| ########1 162/200 [00:11]
Epochs completed: 82%| ########2 164/200 [00:11]
Epochs completed: 83%| ########2 166/200 [00:12]
Epochs completed: 84%| ########4 168/200 [00:12]
Epochs completed: 85%| ########5 170/200 [00:12]
Epochs completed: 86%| ########6 172/200 [00:12]
Epochs completed: 87%| ########7 174/200 [00:12]
Epochs completed: 88%| ########8 176/200 [00:12]
Epochs completed: 89%| ########9 178/200 [00:12]
Epochs completed: 90%| ######### 180/200 [00:13]
Epochs completed: 91%| #########1 182/200 [00:13]
Epochs completed: 92%| #########2 184/200 [00:13]
Epochs completed: 93%| #########3 186/200 [00:13]
Epochs completed: 94%| #########3 188/200 [00:13]
Epochs completed: 95%| #########5 190/200 [00:13]
Epochs completed: 96%| #########6 192/200 [00:13]
Epochs completed: 97%| #########7 194/200 [00:14]
Epochs completed: 98%| #########8 196/200 [00:14]
Epochs completed: 99%| #########9 198/200 [00:14]
Epochs completed: 100%| ########## 200/200 [00:14]
Epochs completed: 100%| ########## 200/200 [00:14]
= sc.AnnData(
adata =principal_components_use,
X={"cell": adata.obs.index.values, "batch": batches},
obs="float64",
dtype
)= adata.obs["cell"]
adata.obs.index
sc.pp.neighbors(=adata,
adata=30,
n_neighbors=0,
n_pcs=None,
use_rep=True,
knn=SEED,
random_state="umap",
method="euclidean",
metric=False,
copy
)
sc.tl.leiden(=adata,
adata=1,
resolution=SEED,
random_state="leiden",
key_added=True,
directed=True,
use_weights=None,
partition_type=False,
copy )
= pd.DataFrame(
embedding =np.concatenate(
data"batch", "leiden"]], embedding_umap),
(adata.obs[[=1,
axis
),=adata.obs["cell"],
index=["batch", "leiden", "x_umap_min_dist=0.1", "y_umap_min_dist=0.1"],
columns
)
= embedding.astype(
embedding
{"leiden": int,
"x_umap_min_dist=0.1": float,
"y_umap_min_dist=0.1": float,
} )
= plt.subplots(nrows=1 * 1, ncols=1, figsize=(4 * 1, 3 * 1))
fig, ax
plot_embedding(=embedding.loc[:, ["x_umap_min_dist=0.1", "y_umap_min_dist=0.1"]],
embedding=ax,
ax=[str(i) for i in embedding["leiden"]],
color_values="UMAP; Leiden",
title=True,
show_color_value_labels=True,
rasterise
)
plt.tight_layout() plt.show()
=fig) plt.close(fig
="batch").size().to_frame(name="num_cells") embedding.groupby(by
num_cells
batch
LW186 10977
LW187 3690
LW188 6838
LW189 3250
LW202 4833
LW203 2571
LW204 7596
= (
cluster_composition
(="leiden").aggregate("batch").value_counts()
embedding.groupby(by/ embedding.groupby(by="leiden").aggregate("batch").size()
)="percentage")
.to_frame(name
.reset_index()
)
= cluster_composition.astype({"leiden": str})
cluster_composition
= list(
clusters_selected "leiden"].unique()).astype(dtype=str)[::-1]
np.sort(embedding[
)
= list(embedding["batch"].unique())
batches_selected
= cluster_composition.astype(
cluster_composition
{"leiden": pd.api.types.CategoricalDtype(
=clusters_selected, ordered=True
categories
),"batch": pd.api.types.CategoricalDtype(
=batches_selected, ordered=True
categories
),
} )
= plt.subplots(nrows=1, ncols=1, figsize=(4.5, 4))
fig, ax
plot_cluster_composition(=cluster_composition,
cluster_composition="batch",
x="leiden",
y=ax,
ax=batches_selected,
x_order=clusters_selected,
y_order
)
plt.tight_layout() plt.show()
=fig) plt.close(fig
= (
matrix_cpm_use.data 1_000_000
* matrix_cpm_use.data
/ np.repeat(matrix_cpm_use.sum(axis=0).A1, np.diff(matrix_cpm_use.indptr))
)
= [
FEATURES_SELECTED "ENSG00000204531_POU5F1",
"ENSG00000203909_DPPA5",
"ENSG00000184344_GDF3",
"ENSG00000156574_NODAL",
"ENSG00000111704_NANOG",
"ENSG00000075388_FGF4",
#
"ENSG00000164736_SOX17",
"ENSG00000125798_FOXA2",
"ENSG00000136574_GATA4",
"ENSG00000141448_GATA6",
"ENSG00000134853_PDGFRA",
"ENSG00000115414_FN1",
#
"ENSG00000107485_GATA3",
"ENSG00000118777_ABCG2",
"ENSG00000165556_CDX2",
"ENSG00000137869_CYP19A1",
"ENSG00000172818_OVOL1",
"ENSG00000126353_CCR7"
]
for selected_feature in FEATURES_SELECTED:
print(selected_feature)
= np.where([i == selected_feature for i in features])[0]
idx = matrix_cpm_use[idx, :].toarray().flatten()
values = np.log10(values + 1)
values
= plt.subplots(nrows=1 * 1, ncols=1, figsize=(4 * 1, 3 * 1))
fig, ax
= plot_embedding(
p =embedding.loc[
embedding"x_umap_min_dist=0.1", "y_umap_min_dist=0.1"]
adata.obs.index, [
],=ax,
ax=values,
color_values=f"UMAP; {selected_feature}",
title=False,
show_color_value_labels=4,
marker_size=True,
sort_values
)
= ax.inset_axes((0.06, 0.70, 0.03, 0.2))
ax_ins
= fig.colorbar(
cbar =p,
mappable=ax_ins,
cax="vertical",
orientation="",
label=range(np.ceil(max(values)).astype(int)),
ticks
)
=0.2)
cbar.outline.set_linewidth(w=False)
cbar.outline.set_visible(b
ax_ins.tick_params(="y",
axis="out",
direction=1.5,
length=0.2,
width="#333333",
color=1.25,
pad=6,
labelsize="#4D4D4D",
labelcolor
)
plt.tight_layout()
plt.show()
=fig) plt.close(fig
ENSG00000204531_POU5F1
ENSG00000203909_DPPA5
ENSG00000184344_GDF3
ENSG00000156574_NODAL
ENSG00000111704_NANOG
ENSG00000075388_FGF4
ENSG00000164736_SOX17
ENSG00000125798_FOXA2
ENSG00000136574_GATA4
ENSG00000141448_GATA6
ENSG00000134853_PDGFRA
ENSG00000115414_FN1
ENSG00000107485_GATA3
ENSG00000118777_ABCG2
ENSG00000165556_CDX2
ENSG00000137869_CYP19A1
ENSG00000172818_OVOL1
ENSG00000126353_CCR7