scirpy.datasets.wu2020

scirpy.datasets.wu2020()

Return the dataset from [WMdA+20] as a MuData object.

140k cells, of which 100k have TCRs.

Note

Scirpy example datasets are managed through Pooch.

By default, the dataset will be downloaded into your operating system’s default cache directory (see pooch.os_cache() for more details). If it has already been downloaded, it will be retrieved from the cache.

You can override the default cache dir by setting the SCIRPY_DATA_DIR environment variable to a path of your preference.

This is how the dataset was processed:

# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     formats: py:light,ipynb
#     notebook_metadata_filter: -kernelspec
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.14.4
# ---

# +
# %load_ext autoreload
# %autoreload 2
import sys

import scanpy as sc

# +
sys.path.insert(0, "../../..")
import os
from glob import glob
from multiprocessing import Pool

import anndata
import numpy as np
import pandas as pd
from mudata import MuData

import scirpy as ir

# + language="bash"
# mkdir -p data
# cd data
# wget --no-verbose -O GSE139555_raw.tar "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE139555&format=file"
# wget --no-verbose -O GSE139555_tcell_metadata.txt.gz "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE139555&format=file&file=GSE139555%5Ftcell%5Fmetadata%2Etxt%2Egz"
# tar xvf GSE139555_raw.tar

# + language="bash"
# cd data
# for f in *.matrix.mtx.gz; do
#   dirname=${f/\.matrix\.mtx\.gz/}
#   mkdir $dirname
#   mv $dirname.genes.tsv.gz $dirname/genes.tsv.gz
#   mv $dirname.matrix.mtx.gz $dirname/matrix.mtx.gz
#   mv $dirname.barcodes.tsv.gz $dirname/barcodes.tsv.gz
#   mv $dirname.filtered_contig_annotations.csv.gz $dirname/filtered_contig_annotations.csv.gz
#   # fix missing feature type column
#   zcat $dirname/genes.tsv.gz |  awk '{print $0 "\tGene Expression"}' | gzip > $dirname/features.tsv.gz
# done
# -

# Collect everything matching `data/GSM*` -- presumably the per-sample
# directories created by the bash cell above.
mtx_paths = glob("data/GSM*")

mtx_paths

# Per-cell annotation table; its first column becomes the index.
metadata_all = pd.read_csv("data/GSE139555_tcell_metadata.txt.gz", sep="\t", index_col=0)

# Keep the UMAP coordinates apart from the remaining annotations.
umap = metadata_all.loc[:, ["UMAP_1", "UMAP_2"]]

# Select the annotation columns of interest and tag the cluster and clonotype
# columns with an `_orig` suffix.
metadata = metadata_all.loc[
    :, ["ident", "patient", "sample", "source", "clonotype"]
].rename(columns={"ident": "cluster_orig", "clonotype": "clonotype_orig"})

metadata


def _load_adata(path):
    """Load the gene expression and TCR data of a single sample directory.

    The sample ID is the upper-cased part of `path` after the last ``-``. It is
    used to select the matching rows of the module-level `metadata` and `umap`
    tables and to prefix all cell barcodes.

    Parameters
    ----------
    path
        Directory containing the 10x matrix files and
        `filtered_contig_annotations.csv.gz`.

    Returns
    -------
    Tuple `(adata, adata_tcr)` with the gene expression and the TCR data.
    """
    sample_id = path.rsplit("-", 1)[-1].upper()

    # Rows of the annotation tables that belong to this sample
    in_sample = metadata["sample"] == sample_id
    sample_obs = metadata.loc[in_sample, :]
    sample_umap = umap.loc[in_sample, :].values

    def _prefixed(barcodes):
        # Prefix each barcode with the sample ID.
        return [f"{sample_id}_{barcode}" for barcode in barcodes]

    # --- gene expression modality ---
    gex = sc.read_10x_mtx(path)
    gex.obs_names = _prefixed(gex.obs_names)
    # subset to cells with annotated metadata only
    gex = gex[sample_obs.index, :].copy()
    # all metadata except clonotype_orig goes into the GEX modality
    gex.obs = gex.obs.join(sample_obs.drop(columns=["clonotype_orig"]), how="inner")
    assert gex.shape[0] == sample_umap.shape[0]
    gex.obsm["X_umap_orig"] = sample_umap

    # --- TCR modality ---
    tcr = ir.io.read_10x_vdj(os.path.join(path, "filtered_contig_annotations.csv.gz"))
    tcr.obs_names = _prefixed(tcr.obs_names)
    # #356: workaround for https://github.com/scverse/muon/issues/93
    tcr.X = np.ones((tcr.shape[0], 0))
    # the clonotype_orig column goes into the TCR modality
    tcr.obs = tcr.obs.join(
        sample_obs.loc[:, ["clonotype_orig"]], how="left", validate="one_to_one"
    )

    return gex, tcr


# Load all samples in parallel. Using the pool as a context manager ensures
# the worker processes are shut down even if loading one of the samples raises
# (previously the pool was only closed on the success path).
with Pool() as p:
    adatas = p.map(_load_adata, mtx_paths)

# `_load_adata` returns one (GEX, TCR) pair per sample; regroup the pairs into
# one tuple per modality.
adatas, adatas_airr = zip(*adatas)

# Combine the per-sample objects of each modality into a single object.
adata = anndata.concat(adatas)

adata_airr = anndata.concat(adatas_airr)

# Mirror the original UMAP along its first axis: x -> max(x) - x
umap_x = adata.obsm["X_umap_orig"][:, 0]
adata.obsm["X_umap_orig"][:, 0] = np.max(umap_x) - umap_x

# Bundle both modalities into one MuData container.
modalities = {"gex": adata, "airr": adata_airr}
mdata = MuData(modalities)

mdata

adata.obs

adata_airr.obs

mdata.obs

# Plot the original embedding coloured by the original cluster annotation.
sc.pl.embedding(
    adata,
    basis="umap_orig",
    color="cluster_orig",
    legend_loc="on data",
)

mdata.write_h5mu("wu2020.h5mu", compression="lzf")
Return type

MuData