scirpy.datasets.wu2020

scirpy.datasets.wu2020()

Return the dataset from [WMdA+20] as AnnData object.

200k cells, of which 100k have TCRs.

This is how the dataset was processed:

# %load_ext autoreload
# %autoreload 2
import scanpy as sc
import sys

sys.path.insert(0, "../../..")
import scirpy as ir
from multiprocessing import Pool
import os
import pandas as pd
from glob import glob
import numpy as np

# + language="bash"
# mkdir -p data
# cd data
# wget -O GSE139555_raw.tar "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE139555&format=file"
# wget -O GSE139555_tcell_metadata.txt.gz "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE139555&format=file&file=GSE139555%5Ftcell%5Fmetadata%2Etxt%2Egz"
# tar xvf GSE139555_raw.tar

# + language="bash"
# cd data
# for f in *.matrix.mtx.gz; do
#   dirname=${f/\.matrix\.mtx\.gz/}
#   mkdir $dirname
#   mv $dirname.genes.tsv.gz $dirname/genes.tsv.gz
#   mv $dirname.matrix.mtx.gz $dirname/matrix.mtx.gz
#   mv $dirname.barcodes.tsv.gz $dirname/barcodes.tsv.gz
#   mv $dirname.filtered_contig_annotations.csv.gz $dirname/filtered_contig_annotations.csv.gz
#   # fix missing feature type column
#   zcat $dirname/genes.tsv.gz |  awk '{print $0 "\tGene Expression"}' | gzip > $dirname/features.tsv.gz
# done
# -

mtx_paths = glob("data/GSM*")

mtx_paths

metadata_all = pd.read_csv(
    "data/GSE139555_tcell_metadata.txt.gz", sep="\t", index_col=0
)

umap = metadata_all[["UMAP_1", "UMAP_2"]]

metadata = metadata_all[["ident", "patient", "sample", "source", "clonotype"]]

metadata = metadata.rename(
    columns={"clonotype": "clonotype_orig", "ident": "cluster_orig"}
)

metadata


def _load_adata(path):
    sample_id = path.split("-")[-1].upper()
    obs = metadata.loc[metadata["sample"] == sample_id, :]
    umap_coords = umap.loc[metadata["sample"] == sample_id, :].values
    adata = sc.read_10x_mtx(path)
    adata_tcr = ir.io.read_10x_vdj(
        os.path.join(path, "filtered_contig_annotations.csv.gz")
    )
    adata.obs_names = [
        "{}_{}".format(sample_id, barcode) for barcode in adata.obs_names
    ]
    adata_tcr.obs_names = [
        "{}_{}".format(sample_id, barcode) for barcode in adata_tcr.obs_names
    ]
    # subset to T cells only
    adata = adata[obs.index, :].copy()
    adata.obs = adata.obs.join(obs, how="inner")
    assert adata.shape[0] == umap_coords.shape[0]
    adata.obsm["X_umap_orig"] = umap_coords
    ir.pp.merge_with_ir(adata, adata_tcr)
    return adata


p = Pool()
adatas = p.map(_load_adata, mtx_paths)
p.close()

adata = adatas[0].concatenate(adatas[1:])

# inverse umap X -coordinate
adata.obsm["X_umap_orig"][:, 0] = (
    np.max(adata.obsm["X_umap_orig"][:, 0]) - adata.obsm["X_umap_orig"][:, 0]
)

np.sum(adata.obs["has_ir"] == "True")

adata.write_h5ad("wu2020.h5ad", compression="lzf")

adata.obs

sc.pl.embedding(adata, "umap_orig", color="cluster_orig", legend_loc="on data")
Return type

AnnDataAnnData