torch_em.data.datasets.electron_microscopy.densecell

The DenseCell dataset contains annotations for semantic segmentation of densely-packed cellular organelles in serial block-face scanning electron microscopy (SBF-SEM) images of platelet tissue.

The dataset was published in https://doi.org/10.1038/s41598-021-81590-0. Please cite this publication if you use the dataset in your research.

  1"""The DenseCell dataset contains annotations for semantic segmentation of densely-packed cellular organelles
  2in serial block-face scanning electron microscopy (SBF-SEM) images of platelet tissue.
  3
  4The dataset was published in https://doi.org/10.1038/s41598-021-81590-0.
  5Please cite this publication if you use the dataset in your research.
  6"""
  7
  8import os
  9from shutil import rmtree
 10from typing import Tuple, Union, Literal, Optional
 11
 12import numpy as np
 13
 14import torch_em
 15
 16from torch.utils.data import Dataset, DataLoader
 17
 18from .. import util
 19
 20
# Download location of the zipped platelet SBF-SEM TIFF data (Dropbox, direct download).
URL = "https://www.dropbox.com/s/68yclbraqq1diza/platelet_data_1219.zip?dl=1"
# No checksum is published for this archive, so download integrity is not verified.
CHECKSUM = None

# Mapping from the semantic label id stored in the label TIFFs to the organelle name.
# Label 0 (not listed) is background.
ORGANELLES = {
    1: "cell",
    2: "mitochondrion",
    3: "alpha_granule",
    4: "canalicular_vessel",
    5: "dense_granule",
    6: "dense_core",
}

# File names of the raw image stack and label stack inside the extracted archive, per split.
# Note that the validation split is called 'eval' in the archive.
SPLIT_FILES = {
    "train": {"images": "train-images.tif", "labels": "train-labels.tif"},
    "val": {"images": "eval-images.tif", "labels": "eval-labels.tif"},
    "test": {"images": "test-images.tif", "labels": "test-labels.tif"},
}
 38
 39
 40def get_densecell_data(
 41    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
 42) -> str:
 43    """Download the DenseCell dataset.
 44
 45    Args:
 46        path: Filepath to a folder where the downloaded data will be saved.
 47        split: The split to download. Either 'train', 'val', or 'test'.
 48        download: Whether to download the data if it is not present.
 49
 50    Returns:
 51        The filepath for the downloaded data.
 52    """
 53    import h5py
 54    import tifffile
 55
 56    data_path = os.path.join(path, f"densecell_{split}.h5")
 57    if os.path.exists(data_path):
 58        with h5py.File(data_path, "r") as f:
 59            if "labels/original" in f:
 60                return data_path
 61
 62        # Remove old file with outdated structure.
 63        os.remove(data_path)
 64
 65    os.makedirs(path, exist_ok=True)
 66
 67    # Download and extract the ZIP if the source TIFFs are not available.
 68    platelet_dir = os.path.join(path, "platelet_data")
 69    if not os.path.exists(platelet_dir):
 70        zip_path = os.path.join(path, "platelet_data_1219.zip")
 71        util.download_source(zip_path, URL, download, checksum=CHECKSUM)
 72        util.unzip(zip_path, path, remove=True)
 73
 74    assert os.path.exists(platelet_dir), f"Expected extracted directory at {platelet_dir}"
 75
 76    for _split, files in SPLIT_FILES.items():
 77        out_path = os.path.join(path, f"densecell_{_split}.h5")
 78        if os.path.exists(out_path):
 79            with h5py.File(out_path, "r") as f:
 80                if "labels/original" in f:
 81                    continue
 82
 83            os.remove(out_path)
 84
 85        raw = tifffile.imread(os.path.join(platelet_dir, files["images"]))
 86        labels = tifffile.imread(os.path.join(platelet_dir, files["labels"]))
 87        assert raw.shape == labels.shape, f"Shape mismatch for {_split}: {raw.shape} vs {labels.shape}"
 88
 89        labels = labels.astype(np.uint8)
 90        with h5py.File(out_path, "w") as f:
 91            f.create_dataset("raw", data=raw, compression="gzip")
 92            f.create_dataset("labels/original", data=labels, compression="gzip")
 93            for label_id, name in ORGANELLES.items():
 94                # For cells, use all non-background labels to avoid holes from internal organelles.
 95                if name == "cell":
 96                    binary_mask = (labels >= 1).astype(np.uint8)
 97                else:
 98                    binary_mask = (labels == label_id).astype(np.uint8)
 99
100                f.create_dataset(f"labels/{name}", data=binary_mask, compression="gzip")
101
102    rmtree(platelet_dir)
103
104    assert os.path.exists(data_path), data_path
105    return data_path
106
107
def get_densecell_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Get paths to the DenseCell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath for the stored data.
    """
    # `get_densecell_data` ensures the HDF5 file for the requested split exists
    # and already returns its path, so we simply forward its result.
    return get_densecell_data(path, split, download)
124
125
def get_densecell_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int, int],
    label_choice: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get dataset for segmentation of organelles in SBF-SEM platelet images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        patch_shape: The patch shape to use for training.
        label_choice: The organelle to segment. Available choices are:
            'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'.
            If None, uses 'original' which contains all semantic labels (0-6).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If an invalid split or label choice is passed.
    """
    # Validate with explicit exceptions instead of 'assert', so the checks
    # are not stripped when running with 'python -O'.
    if split not in ("train", "val", "test"):
        raise ValueError(f"Invalid split: '{split}'. Choose from 'train', 'val' or 'test'.")

    if label_choice is None:
        label_key = "labels/original"
    else:
        valid_choices = list(ORGANELLES.values())
        if label_choice not in valid_choices:
            raise ValueError(f"'{label_choice}' is not valid. Choose from {valid_choices}.")
        label_key = f"labels/{label_choice}"

    data_path = get_densecell_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_path,
        raw_key="raw",
        label_paths=data_path,
        label_key=label_key,
        patch_shape=patch_shape,
        **kwargs
    )
168
169
def get_densecell_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    label_choice: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get dataloader for segmentation of organelles in SBF-SEM platelet images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        label_choice: The organelle to segment. Available choices are:
            'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'.
            If None, uses 'original' which contains all semantic labels (0-6).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
    Returns:
        The PyTorch DataLoader.
    """
    # Separate the dataset-construction kwargs from the DataLoader kwargs.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_densecell_dataset(
        path, split, patch_shape, label_choice=label_choice, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size, **loader_kwargs)
URL = 'https://www.dropbox.com/s/68yclbraqq1diza/platelet_data_1219.zip?dl=1'
CHECKSUM = None
ORGANELLES = {1: 'cell', 2: 'mitochondrion', 3: 'alpha_granule', 4: 'canalicular_vessel', 5: 'dense_granule', 6: 'dense_core'}
SPLIT_FILES = {'train': {'images': 'train-images.tif', 'labels': 'train-labels.tif'}, 'val': {'images': 'eval-images.tif', 'labels': 'eval-labels.tif'}, 'test': {'images': 'test-images.tif', 'labels': 'test-labels.tif'}}
def get_densecell_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str:
 41def get_densecell_data(
 42    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
 43) -> str:
 44    """Download the DenseCell dataset.
 45
 46    Args:
 47        path: Filepath to a folder where the downloaded data will be saved.
 48        split: The split to download. Either 'train', 'val', or 'test'.
 49        download: Whether to download the data if it is not present.
 50
 51    Returns:
 52        The filepath for the downloaded data.
 53    """
 54    import h5py
 55    import tifffile
 56
 57    data_path = os.path.join(path, f"densecell_{split}.h5")
 58    if os.path.exists(data_path):
 59        with h5py.File(data_path, "r") as f:
 60            if "labels/original" in f:
 61                return data_path
 62
 63        # Remove old file with outdated structure.
 64        os.remove(data_path)
 65
 66    os.makedirs(path, exist_ok=True)
 67
 68    # Download and extract the ZIP if the source TIFFs are not available.
 69    platelet_dir = os.path.join(path, "platelet_data")
 70    if not os.path.exists(platelet_dir):
 71        zip_path = os.path.join(path, "platelet_data_1219.zip")
 72        util.download_source(zip_path, URL, download, checksum=CHECKSUM)
 73        util.unzip(zip_path, path, remove=True)
 74
 75    assert os.path.exists(platelet_dir), f"Expected extracted directory at {platelet_dir}"
 76
 77    for _split, files in SPLIT_FILES.items():
 78        out_path = os.path.join(path, f"densecell_{_split}.h5")
 79        if os.path.exists(out_path):
 80            with h5py.File(out_path, "r") as f:
 81                if "labels/original" in f:
 82                    continue
 83
 84            os.remove(out_path)
 85
 86        raw = tifffile.imread(os.path.join(platelet_dir, files["images"]))
 87        labels = tifffile.imread(os.path.join(platelet_dir, files["labels"]))
 88        assert raw.shape == labels.shape, f"Shape mismatch for {_split}: {raw.shape} vs {labels.shape}"
 89
 90        labels = labels.astype(np.uint8)
 91        with h5py.File(out_path, "w") as f:
 92            f.create_dataset("raw", data=raw, compression="gzip")
 93            f.create_dataset("labels/original", data=labels, compression="gzip")
 94            for label_id, name in ORGANELLES.items():
 95                # For cells, use all non-background labels to avoid holes from internal organelles.
 96                if name == "cell":
 97                    binary_mask = (labels >= 1).astype(np.uint8)
 98                else:
 99                    binary_mask = (labels == label_id).astype(np.uint8)
100
101                f.create_dataset(f"labels/{name}", data=binary_mask, compression="gzip")
102
103    rmtree(platelet_dir)
104
105    assert os.path.exists(data_path), data_path
106    return data_path

Download the DenseCell dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split to download. Either 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath for the downloaded data.

def get_densecell_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str:
109def get_densecell_paths(
110    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
111) -> str:
112    """Get paths to the DenseCell data.
113
114    Args:
115        path: Filepath to a folder where the downloaded data will be saved.
116        split: The data split. Either 'train', 'val', or 'test'.
117        download: Whether to download the data if it is not present.
118
119    Returns:
120        The filepath for the stored data.
121    """
122    get_densecell_data(path, split, download)
123    data_path = os.path.join(path, f"densecell_{split}.h5")
124    return data_path

Get paths to the DenseCell data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split. Either 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath for the stored data.

def get_densecell_dataset( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], patch_shape: Tuple[int, int, int], label_choice: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
127def get_densecell_dataset(
128    path: Union[os.PathLike, str],
129    split: Literal["train", "val", "test"],
130    patch_shape: Tuple[int, int, int],
131    label_choice: Optional[str] = None,
132    download: bool = False,
133    **kwargs
134) -> Dataset:
135    """Get dataset for segmentation of organelles in SBF-SEM platelet images.
136
137    Args:
138        path: Filepath to a folder where the downloaded data will be saved.
139        split: The data split. Either 'train', 'val', or 'test'.
140        patch_shape: The patch shape to use for training.
141        label_choice: The organelle to segment. Available choices are:
142            'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'.
143            If None, uses 'original' which contains all semantic labels (0-6).
144        download: Whether to download the data if it is not present.
145        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
146
147    Returns:
148        The segmentation dataset.
149    """
150    assert split in ("train", "val", "test")
151
152    if label_choice is None:
153        label_key = "labels/original"
154    else:
155        valid_choices = list(ORGANELLES.values())
156        assert label_choice in valid_choices, f"'{label_choice}' is not valid. Choose from {valid_choices}."
157        label_key = f"labels/{label_choice}"
158
159    data_path = get_densecell_paths(path, split, download)
160
161    return torch_em.default_segmentation_dataset(
162        raw_paths=data_path,
163        raw_key="raw",
164        label_paths=data_path,
165        label_key=label_key,
166        patch_shape=patch_shape,
167        **kwargs
168    )

Get dataset for segmentation of organelles in SBF-SEM platelet images.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split. Either 'train', 'val', or 'test'.
  • patch_shape: The patch shape to use for training.
  • label_choice: The organelle to segment. Available choices are: 'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'. If None, uses 'original' which contains all semantic labels (0-6).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_densecell_loader( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], patch_shape: Tuple[int, int, int], batch_size: int, label_choice: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
171def get_densecell_loader(
172    path: Union[os.PathLike, str],
173    split: Literal["train", "val", "test"],
174    patch_shape: Tuple[int, int, int],
175    batch_size: int,
176    label_choice: Optional[str] = None,
177    download: bool = False,
178    **kwargs
179) -> DataLoader:
180    """Get dataloader for segmentation of organelles in SBF-SEM platelet images.
181
182    Args:
183        path: Filepath to a folder where the downloaded data will be saved.
184        split: The data split. Either 'train', 'val', or 'test'.
185        patch_shape: The patch shape to use for training.
186        batch_size: The batch size for training.
187        label_choice: The organelle to segment. Available choices are:
188            'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'.
189            If None, uses 'original' which contains all semantic labels (0-6).
190        download: Whether to download the data if it is not present.
191        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
192
193    Returns:
194        The PyTorch DataLoader.
195    """
196    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
197    dataset = get_densecell_dataset(path, split, patch_shape, label_choice=label_choice, download=download, **ds_kwargs)
198    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get dataloader for segmentation of organelles in SBF-SEM platelet images.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split. Either 'train', 'val', or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • label_choice: The organelle to segment. Available choices are: 'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'. If None, uses 'original' which contains all semantic labels (0-6).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The PyTorch DataLoader.