scirpy.datasets.wu2020
- scirpy.datasets.wu2020()
Return the dataset from [WMdA+20] as a MuData object.
140k cells, of which 100k have TCRs.
Note
Scirpy example datasets are managed through Pooch.
By default, the dataset will be downloaded into your operating system’s default cache directory (see
pooch.os_cache()
for more details). If it has already been downloaded, it will be retrieved from the cache. You can override the default cache dir by setting the
SCIRPY_DATA_DIR
environment variable to a path of your preference. This is how the dataset was processed:
# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     formats: py:light,ipynb
#     notebook_metadata_filter: -kernelspec
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.14.4
# ---

# +
# %load_ext autoreload
# %autoreload 2
import sys

import scanpy as sc

# +
# Extend the import path before importing scirpy below.
sys.path.insert(0, "../../..")

import os
from glob import glob
from multiprocessing import Pool

import anndata
import numpy as np
import pandas as pd
from mudata import MuData

import scirpy as ir

# + language="bash"
# mkdir -p data
# cd data
# wget --no-verbose -O GSE139555_raw.tar "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE139555&format=file"
# wget --no-verbose -O GSE139555_tcell_metadata.txt.gz "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE139555&format=file&file=GSE139555%5Ftcell%5Fmetadata%2Etxt%2Egz"
# tar xvf GSE139555_raw.tar

# + language="bash"
# cd data
# for f in *.matrix.mtx.gz; do
#   dirname=${f/\.matrix\.mtx\.gz/}
#   mkdir $dirname
#   mv $dirname.genes.tsv.gz $dirname/genes.tsv.gz
#   mv $dirname.matrix.mtx.gz $dirname/matrix.mtx.gz
#   mv $dirname.barcodes.tsv.gz $dirname/barcodes.tsv.gz
#   mv $dirname.filtered_contig_annotations.csv.gz $dirname/filtered_contig_annotations.csv.gz
#   # fix missing feature type column
#   zcat $dirname/genes.tsv.gz | awk '{print $0 "\tGene Expression"}' | gzip > $dirname/features.tsv.gz
# done
# -

mtx_paths = glob("data/GSM*")

mtx_paths

metadata_all = pd.read_csv(
    "data/GSE139555_tcell_metadata.txt.gz", sep="\t", index_col=0
)

# Keep the published UMAP coordinates separate from the per-cell annotations.
umap = metadata_all[["UMAP_1", "UMAP_2"]]
metadata = metadata_all[["ident", "patient", "sample", "source", "clonotype"]]
# The `_orig` suffix marks columns taken from the downloaded metadata table.
metadata = metadata.rename(
    columns={"clonotype": "clonotype_orig", "ident": "cluster_orig"}
)

metadata


def _load_adata(path):
    """Load the gene expression and TCR data of a single sample.

    Parameters
    ----------
    path
        Sample directory (one of the ``data/GSM*`` folders). The sample ID is
        the upper-cased part of the path after the last ``-``.

    Returns
    -------
    Tuple ``(adata, adata_tcr)``: the gene expression object, restricted to
    cells that have an entry in ``metadata``, and the TCR object. Cell
    barcodes are prefixed with ``<sample_id>_`` in both.
    """
    sample_id = path.split("-")[-1].upper()
    in_sample = metadata["sample"] == sample_id
    obs = metadata.loc[in_sample, :]
    umap_coords = umap.loc[in_sample, :].values

    adata = sc.read_10x_mtx(path)
    adata_tcr = ir.io.read_10x_vdj(
        os.path.join(path, "filtered_contig_annotations.csv.gz")
    )

    # Prefix barcodes with the sample ID so that they match the metadata index.
    adata.obs_names = [f"{sample_id}_{barcode}" for barcode in adata.obs_names]
    adata_tcr.obs_names = [
        f"{sample_id}_{barcode}" for barcode in adata_tcr.obs_names
    ]

    # subset to cells with annotated metadata only
    adata = adata[obs.index, :].copy()

    # all metadata except clonotype_orig in GEX modality
    adata.obs = adata.obs.join(obs.drop(columns=["clonotype_orig"]), how="inner")
    assert adata.shape[0] == umap_coords.shape[0]
    adata.obsm["X_umap_orig"] = umap_coords

    # #356: workaround for https://github.com/scverse/muon/issues/93
    adata_tcr.X = np.ones((adata_tcr.shape[0], 0))

    # clonotype orig column in TCR modality
    adata_tcr.obs = adata_tcr.obs.join(
        obs.loc[:, ["clonotype_orig"]], how="left", validate="one_to_one"
    )
    return adata, adata_tcr


# The context manager releases the worker processes even if loading a sample
# fails; `map` has returned all results by the time the block exits.
with Pool() as pool:
    adatas = pool.map(_load_adata, mtx_paths)

adatas, adatas_airr = zip(*adatas)

adata = anndata.concat(adatas)

adata_airr = anndata.concat(adatas_airr)

# mirror the x-coordinate of the original UMAP (x -> max(x) - x)
adata.obsm["X_umap_orig"][:, 0] = (
    np.max(adata.obsm["X_umap_orig"][:, 0]) - adata.obsm["X_umap_orig"][:, 0]
)

mdata = MuData({"gex": adata, "airr": adata_airr})

mdata

adata.obs

adata_airr.obs

mdata.obs

sc.pl.embedding(adata, "umap_orig", color="cluster_orig", legend_loc="on data")

mdata.write_h5mu("wu2020.h5mu", compression="lzf")
- Return type