torch_em.data.datasets.electron_microscopy.human_organoids

The Human Organoids dataset contains annotations for several organelles in EM images (mitochondria, nuclei, actin, entotic cell, and junctions) for patient-derived colorectal cancer organoids.

This dataset is from the publication https://doi.org/10.1016/j.devcel.2023.03.001. Please cite it if you use this dataset in your research.

The data itself can be downloaded from EMPIAR via aspera.

  • You can install aspera via mamba. We recommend to do this in a separate environment to avoid dependency issues:
    • $ mamba create -c conda-forge -c hcc -n aspera aspera-cli
  • After this you can run $ mamba activate aspera to have an environment with aspera installed.
  • You can then download the data for one of the three datasets like this:
    • ascp -QT -l 200m -P33001 -i <PREFIX>/etc/asperaweb_id_dsa.openssh emp_ext2@fasp.ebi.ac.uk:/<EMPIAR_ID> <PATH>
    • Where <PREFIX> is the path to the mamba environment, <EMPIAR_ID> the id of one of the three datasets and <PATH> where you want to download the data.
  • After this you can use the functions in this file if you use <PATH> as location for the data.

NOTE: We have implemented automatic download, but this leads to dependency issues, so we recommend to download the data manually and then run the loaders with the correct path.

  1"""The Human Organoids dataset contains annotations for several organelles in EM images
  2(mitochondria, nuclei, actin, entotic cell, and junctions) for patient-derived colorectal cancer organoids.
  3
  4This dataset is from the publication https://doi.org/10.1016/j.devcel.2023.03.001.
  5Please cite it if you use this dataset in your research.
  6
  7The data itself can be downloaded from EMPIAR via aspera.
  8- You can install aspera via mamba. We recommend to do this in a separate environment
  9  to avoid dependency issues:
 10    - `$ mamba create -c conda-forge -c hcc -n aspera aspera-cli`
 11- After this you can run `$ mamba activate aspera` to have an environment with aspera installed.
 12- You can then download the data for one of the three datasets like this:
 13    - ascp -QT -l 200m -P33001 -i <PREFIX>/etc/asperaweb_id_dsa.openssh emp_ext2@fasp.ebi.ac.uk:/<EMPIAR_ID> <PATH>
 14    - Where <PREFIX> is the path to the mamba environment, <EMPIAR_ID> the id of one of the three datasets
 15      and <PATH> where you want to download the data.
 16- After this you can use the functions in this file if you use <PATH> as location for the data.
 17
 18NOTE: We have implemented automatic download, but this leads to dependency
 19issues, so we recommend to download the data manually and then run the loaders with the correct path.
 20"""
 21
 22import os
 23from glob import glob
 24from tqdm import tqdm
 25from pathlib import Path
 26from typing import Union, List, Literal, Tuple
 27
 28from torch.utils.data import Dataset, DataLoader
 29
 30import torch_em
 31
 32from .. import util
 33
 34
 35def _prepare_dataset(data_root):
 36    import mrcfile
 37    import h5py
 38
 39    raw_paths = glob(os.path.join(data_root, "*bin2.mrc"))
 40    for raw_path in tqdm(raw_paths, desc="Preprocessing volumes"):
 41        vol_path = Path(raw_path).with_suffix(".h5")
 42        if os.path.exists(vol_path):
 43            continue
 44
 45        with mrcfile.open(raw_path, "r") as f:
 46            raw = f.data
 47
 48        # Get the corresponding label paths.
 49        label_paths = [p for p in glob(raw_path.replace(".mrc", "*.mrc")) if p != raw_path]
 50
 51        labels = {}
 52        for label_path in label_paths:
 53            label_name = Path(label_path).stem.split("_")[-1]
 54
 55            if label_name == "cell":  # A simple replacement for one outlier case.
 56                label_name = "entotic_cell"
 57
 58            with mrcfile.open(label_path, "r") as f:
 59                curr_label = f.data
 60
 61            labels[label_name] = curr_label
 62
 63        # Finally, drop them all in a single h5 file.
 64        with h5py.File(vol_path, "w") as f:
 65            f.create_dataset("raw", data=raw, chunks=(8, 128, 128), compression="gzip")
 66            for lname, lvol in labels.items():
 67                f.create_dataset(lname, data=lvol, chunks=(8, 128, 128), compression="gzip")
 68
 69        # And finally, remove all other volumes.
 70        os.remove(raw_path)
 71        [os.remove(p) for p in label_paths]
 72
 73
 74def get_human_organoids_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 75    """Download the Human Organoids data.
 76
 77    Args:
 78        path: Filepath to a folder where the downloaded data will be saved.
 79        download: Whether to download the data if it is not present.
 80
 81    Returns:
 82        The filepath for the downloaded data.
 83    """
 84    access_id = "11380"
 85    data_path = util.download_source_empiar(path, access_id, download)
 86
 87    data_root = os.path.join(data_path, "data")
 88    assert os.path.exists(data_root)
 89
 90    _prepare_dataset(data_root)
 91
 92    return data_root
 93
 94
 95def get_human_organoids_paths(
 96    path: Union[os.PathLike, str],
 97    organelle: Literal["mitos", "nuclei", "actin", "entotic_cell", "junctions"],
 98    download: bool = False,
 99) -> List[str]:
100    """Get the paths to Human Organoids data.
101
102    Args:
103        path: Filepath to a folder where the downloaded data will be saved.
104        organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
105        download: Whether to download the data if it is not present.
106
107    Returns:
108        List of filepaths for the volumetric data (both raw images and labels included).
109    """
110    import h5py
111
112    assert isinstance(organelle, str) and organelle in ["mitos", "nuclei", "actin", "entotic_cell", "junctions"], \
113        f"The choice of organelle '{organelle}' does not match the available choices."
114
115    data_path = get_human_organoids_data(path, download)
116    vol_paths = glob(os.path.join(data_path, "*.h5"))
117
118    # Filter out volumes without organelle labels.
119    vol_paths = [p for p in vol_paths if organelle in h5py.File(p, "r").keys()]
120    assert vol_paths, f"The provided organelle labels for '{organelle}' not found."
121
122    return vol_paths
123
124
def get_human_organoids_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    organelle: Literal["mitos", "nuclei", "actin", "entotic_cell", "junctions"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the dataset for the Human Organoids data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_human_organoids_paths(path, organelle, download)

    # Raw data and labels live in the same h5 files, just under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=organelle,
        patch_shape=patch_shape,
        **kwargs,
    )
154
155
def get_human_organoids_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    organelle: Literal["mitos", "nuclei", "actin", "entotic_cell", "junctions"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the dataloader for the Human Organoids data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route dataset-specific keyword arguments to the dataset, the rest to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    segmentation_ds = get_human_organoids_dataset(path, patch_shape, organelle, download, **dataset_kwargs)
    return torch_em.get_data_loader(segmentation_ds, batch_size, **dataloader_kwargs)
def get_human_organoids_data(path: Union[os.PathLike, str], download: bool = False) -> str:
75def get_human_organoids_data(path: Union[os.PathLike, str], download: bool = False) -> str:
76    """Download the Human Organoids data.
77
78    Args:
79        path: Filepath to a folder where the downloaded data will be saved.
80        download: Whether to download the data if it is not present.
81
82    Returns:
83        The filepath for the downloaded data.
84    """
85    access_id = "11380"
86    data_path = util.download_source_empiar(path, access_id, download)
87
88    data_root = os.path.join(data_path, "data")
89    assert os.path.exists(data_root)
90
91    _prepare_dataset(data_root)
92
93    return data_root

Download the Human Organoids data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath for the downloaded data.

def get_human_organoids_paths( path: Union[os.PathLike, str], organelle: Literal['mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'], download: bool = False) -> List[str]:
 96def get_human_organoids_paths(
 97    path: Union[os.PathLike, str],
 98    organelle: Literal["mitos", "nuclei", "actin", "entotic_cell", "junctions"],
 99    download: bool = False,
100) -> List[str]:
101    """Get the paths to Human Organoids data.
102
103    Args:
104        path: Filepath to a folder where the downloaded data will be saved.
105        organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
106        download: Whether to download the data if it is not present.
107
108    Returns:
109        List of filepaths for the volumetric data (both raw images and labels included).
110    """
111    import h5py
112
113    assert isinstance(organelle, str) and organelle in ["mitos", "nuclei", "actin", "entotic_cell", "junctions"], \
114        f"The choice of organelle '{organelle}' does not match the available choices."
115
116    data_path = get_human_organoids_data(path, download)
117    vol_paths = glob(os.path.join(data_path, "*.h5"))
118
119    # Filter out volumes without organelle labels.
120    vol_paths = [p for p in vol_paths if organelle in h5py.File(p, "r").keys()]
121    assert vol_paths, f"The provided organelle labels for '{organelle}' not found."
122
123    return vol_paths

Get the paths to Human Organoids data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the volumetric data (both raw images and labels included).

def get_human_organoids_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], organelle: Literal['mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
126def get_human_organoids_dataset(
127    path: Union[os.PathLike, str],
128    patch_shape: Tuple[int, ...],
129    organelle: Literal["mitos", "nuclei", "actin", "entotic_cell", "junctions"],
130    download: bool = False,
131    **kwargs
132) -> Dataset:
133    """Get the dataset for the Human Organoids data.
134
135    Args:
136        path: Filepath to a folder where the downloaded data will be saved.
137        patch_shape: The patch shape to use for training.
138        organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
139        download: Whether to download the data if it is not present.
140        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
141
142    Returns:
143        The segmentation dataset.
144    """
145    vol_paths = get_human_organoids_paths(path, organelle, download)
146
147    return torch_em.default_segmentation_dataset(
148        raw_paths=vol_paths,
149        raw_key="raw",
150        label_paths=vol_paths,
151        label_key=organelle,
152        patch_shape=patch_shape,
153        **kwargs,
154    )

Get the dataset for the Human Organoids data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_human_organoids_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], organelle: Literal['mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
157def get_human_organoids_loader(
158    path: Union[os.PathLike, str],
159    batch_size: int,
160    patch_shape: Tuple[int, ...],
161    organelle: Literal["mitos", "nuclei", "actin", "entotic_cell", "junctions"],
162    download: bool = False,
163    **kwargs
164) -> DataLoader:
165    """Get the dataloader for the Human Organoids data.
166
167    Args:
168        path: Filepath to a folder where the downloaded data will be saved.
169        batch_size: The batch size for training.
170        patch_shape: The patch shape to use for training.
171        organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
172        download: Whether to download the data if it is not present.
173        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
174
175    Returns:
176        The DataLoader.
177    """
178    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
179    dataset = get_human_organoids_dataset(path, patch_shape, organelle, download, **ds_kwargs)
180    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the dataloader for the Human Organoids data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • organelle: The choice of organelle from 'mitos', 'nuclei', 'actin', 'entotic_cell', 'junctions'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.