scirpy.datasets.maynard2020

scirpy.datasets.maynard2020()

Return the dataset from [MMR+20] as AnnData object.

21k cells from NSCLC profiled with Smart-seq2, of which 3,500 have TCRs and 1,500 have BCRs.

The raw FASTQ files have been obtained from PRJNA591860 and processed using the nf-core Smart-seq2 pipeline.

The processed files have been imported and transformed into an anndata.AnnData object using the following script:

# %env OPENBLAS_NUM_THREADS=16
# %env OMP_NUM_THREADS=16
# %env MKL_NUM_THREADS=16
# %env OMP_NUM_cpus=16
# %env MKL_NUM_cpus=16
# %env OPENBLAS_NUM_cpus=16
import sys

sys.path.insert(0, "../../..")

import scirpy as ir
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path

# The dataset has been downloaded from ENA and then processed using the Smart-seq2 Pipeline:
# https://github.com/nf-core/smartseq2/

DATASET_DIR = Path("/data/datasets/Maynard_Bivona_2020_NSCLC/")

# ### Read counts and TPMs

count_mat = pd.read_csv(
    DATASET_DIR / "smartseq2_pipeline/resultCOUNT.txt",
    sep="\t",
    low_memory=False,
    index_col="Geneid",
)

tpm_mat = pd.read_csv(
    DATASET_DIR / "smartseq2_pipeline/resultTPM.txt", sep="\t", low_memory=False
)

# summarize to gene symbol for the ~300 duplicated symbols.
tpm_mat_symbol = tpm_mat.drop("gene_id", axis="columns").groupby("gene_symbol").sum()

# ### Read and sanitize metadata

# +
sample_info = pd.read_csv(DATASET_DIR / "scripts/sra_sample_info.csv", low_memory=False)
cell_metadata = pd.read_csv(
    DATASET_DIR / "scripts/cell_metadata.csv", low_memory=False, index_col=0
)

# combine metadata
meta = sample_info.merge(
    cell_metadata, left_on="cell_ID", right_on="cell_id"
).set_index("Run")
# -

meta = meta.drop(
    [
        "Assay Type",
        "AvgSpotLen",
        "SRA Study",
        "ReleaseDate",
        "Bases",
        "disease",
        "Biomaterial_provider",
        "BioProject",
        "Isolate",
        "Sample Name",
        "BioSample",
        "BioSampleModel",
        "Bytes",
        "Center Name",
        "Consent",
        "DATASTORE filetype",
        "DATASTORE provider",
        "DATASTORE region",
        "Experiment",
        "Instrument",
        "LibraryLayout",
        "Library Name",
        "LibrarySelection",
        "cell_ID",
        "LibrarySource",
        "Organism",
        "Platform",
        "gender",
        "SAMPLE_TYPE",
        "TISSUE",
    ],
    axis="columns",
).rename(
    {
        "Age": "age",
        "smokingHx": "smoking_status",
        "stage.at.dx": "stage_at_diagnosis",
    },
    axis="columns",
)

meta.tail()

# ### Find all cells for which we have both counts, TPM and annotation

has_counts = set(count_mat.columns)
has_tpm = set(tpm_mat.columns)
has_meta = set(meta.index.values)

cell_ids = np.array(list(has_counts & has_tpm & has_meta))

# ### Build adata

var = (
    pd.DataFrame(count_mat.index)
    .rename({"Geneid": "gene_symbol"}, axis="columns")
    .set_index("gene_symbol")
    .sort_index()
)

adata = sc.AnnData(
    X=csr_matrix(tpm_mat_symbol.loc[var.index, cell_ids].values.T),
    layers={"raw_counts": csr_matrix(count_mat.loc[var.index, cell_ids].values.T)},
    var=var,
    obs=meta.loc[cell_ids, :],
)

adata_tcr = ir.io.read_tracer(
    "/data/datasets/Maynard_Bivona_2020_NSCLC/smartseq2_pipeline/TraCeR"
)
adata_bcr = ir.io.read_bracer(
    "/data/datasets/Maynard_Bivona_2020_NSCLC/smartseq2_pipeline/BraCeR/filtered_BCR_summary/changeodb.tab"
)

ir.pp.merge_with_ir(adata, adata_tcr)
ir.pp.merge_with_ir(adata, adata_bcr)

# Write out the dataset
adata.write_h5ad("maynard2020.h5ad", compression="lzf")
Return type

AnnDataAnnData