torch_em.data.datasets.electron_microscopy.cremi

CREMI is a dataset for neuron segmentation in EM.

It contains three annotated volumes from the adult fruit-fly brain. It was held as a challenge at MICCAI 2016. For details on the dataset check out https://cremi.org/. Please cite the challenge if you use the dataset in your research.

  1"""CREMI is a dataset for neuron segmentation in EM.
  2
  3It contains three annotated volumes from the adult fruit-fly brain.
  4It was held as a challenge at MICCAI 2016. For details on the dataset check out https://cremi.org/.
  5Please cite the challenge if you use the dataset in your research.
  6"""
  7# TODO add support for realigned volumes
  8
  9import os
 10import warnings
 11from typing import Any, Dict, List, Optional, Tuple, Union
 12
 13import numpy as np
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20from ....transform.raw import standardize
 21
 22
# Download URLs for the CREMI data. The "cropped" and "padded" entries map the sample names
# ("A", "B", "C") to the URL of the corresponding volume. "realigned" is an empty placeholder
# (no URLs are registered for it here) and "defects" is a single URL, not a per-sample mapping.
 23CREMI_URLS = {
 24    "cropped": {
 25        "A": "https://cremi.org/static/data/sample_A_20160501.hdf",
 26        "B": "https://cremi.org/static/data/sample_B_20160501.hdf",
 27        "C": "https://cremi.org/static/data/sample_C_20160501.hdf",
 28    },
 29    "padded": {
 30        "A": "https://cremi.org/static/data/sample_A_padded_20160501.hdf",
 31        "B": "https://cremi.org/static/data/sample_B_padded_20160501.hdf",
 32        "C": "https://cremi.org/static/data/sample_C_padded_20160501.hdf",
 33    },
 34    "realigned": {},
 35    "defects": "https://zenodo.org/record/5767036/files/sample_ABC_padded_defects.h5"
 36}
# Expected checksums for the files listed in CREMI_URLS, using the same key layout. They are passed
# to `util.download_source` together with the matching URL.
# NOTE(review): the values look like SHA256 hex digests - confirm against `util.download_source`.
 37CHECKSUMS = {
 38    "cropped": {
 39        "A": "4c563d1b78acb2bcfb3ea958b6fe1533422f7f4a19f3e05b600bfa11430b510d",
 40        "B": "887e85521e00deead18c94a21ad71f278d88a5214c7edeed943130a1f4bb48b8",
 41        "C": "2874496f224d222ebc29d0e4753e8c458093e1d37bc53acd1b69b19ed1ae7052",
 42    },
 43    "padded": {
 44        "A": "c95dc4497ce0f0e7b70507c7253230fda95325ee91ea0d3253c9ef94b197050a",
 45        "B": "22917a25092d0b80175012c152da33e1a1d82e049c4a96dc747145d5ca5d1b87",
 46        "C": "aba27b165ef005d5fbebe0e3f8775ac903cf273b7e20381dff664d45065a3314",
 47    },
 48    "realigned": {},
 49    "defects": "7b06ffa34733b2c32956ea5005e0cf345e7d3a27477f42f7c905701cdc947bd0"
 50}
 51
 52
 53def get_cremi_data(
 54    path: Union[os.PathLike, str],
 55    samples: Tuple[str, ...] = ("A", "B", "C"),
 56    version: str = "cropped",
 57    use_realigned: bool = False,
 58    download: bool = False,
 59):
 60    """Download the CREMI training data.
 61
 62    Args:
 63        path: Filepath to a folder where the downloaded data will be saved.
 64        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 65        version: The dataset version to use. Either 'cropped' (default) or 'padded'.
 66        use_realigned: Use the realigned instead of the original training data.
 67        download: Whether to download the data if it is not present.
 68    """
 69    if use_realigned:
 70        # we need to sample batches in this case
 71        # sampler = torch_em.data.MinForegroundSampler(min_fraction=0.05, p_reject=.75)
 72        raise NotImplementedError
 73    if version not in CREMI_URLS:
 74        raise ValueError(f"Unknown version '{version}'. Choose from {list(CREMI_URLS.keys())}.")
 75
 76    urls = CREMI_URLS[version]
 77    checksums = CHECKSUMS[version]
 78    suffix = "_padded" if version == "padded" else ""
 79    os.makedirs(path, exist_ok=True)
 80    for name in samples:
 81        data_path = os.path.join(path, f"sample{name}{suffix}.h5")
 82        # CREMI SSL certificates expired, so we need to disable verification
 83        util.download_source(data_path, urls[name], download, checksums[name], verify=False)
 84
 85
 86def get_cremi_paths(
 87    path: Union[os.PathLike, str],
 88    samples: Tuple[str, ...] = ("A", "B", "C"),
 89    version: str = "cropped",
 90    use_realigned: bool = False,
 91    download: bool = False
 92) -> List[str]:
 93    """Get paths to the CREMI data.
 94
 95    Args:
 96        path: Filepath to a folder where the downloaded data will be saved.
 97        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 98        version: The dataset version to use. Either 'cropped' (default) or 'padded'.
 99            The padded volumes contain the same data with additional zero-padded borders;
100            label regions outside the original bounds are zero.
101        use_realigned: Use the realigned instead of the original training data.
102        download: Whether to download the data if it is not present.
103
104    Returns:
105        The filepaths to the training data.
106
107    Note:
108        The padded volumes are not available via the dataset and dataloader functions; use this function directly
109        if you need access to the padded data.
110    """
111    get_cremi_data(path, samples, version, use_realigned, download)
112    suffix = "_padded" if version == "padded" else ""
113    data_paths = [os.path.join(path, f"sample{name}{suffix}.h5") for name in samples]
114    return data_paths
115
116
def get_cremi_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Dict[str, Any] = {},
    defect_augmentation_kwargs: Dict[str, Any] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> Dataset:
    """Get the CREMI dataset for the segmentation of neurons in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The regions of interest to use for the samples, keyed by sample name.
            Samples without an entry use the full volume. `None` is treated like an empty dict.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
            Pass `None` to disable the defect augmentation. If no 'artifact_source' is given,
            the defect volume is downloaded to `path` and used as artifact source.
            The passed dictionary is not modified.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
            A 'raw_transform' given here is used as the normalizer of the defect augmentation
            pipeline when defect augmentation is enabled.

    Returns:
       The segmentation dataset.

    Raises:
        AssertionError: If `patch_shape` does not have three entries or `rois` is not a dict.
    """
    assert len(patch_shape) == 3
    # Treat `rois=None` like "no rois" instead of failing on `None.get` below.
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    data_paths = get_cremi_paths(path, samples, version="cropped", use_realigned=use_realigned, download=download)
    data_rois = [rois.get(name, np.s_[:, :, :]) for name in samples]

    # defect augmentations
    if defect_augmentation_kwargs is not None:
        # Work on a copy: updating the argument in place would modify the shared default dictionary
        # (and the caller's dictionary), so that an artifact source created for one `path` and
        # `patch_shape` would silently be reused by later calls with different settings.
        defect_augmentation_kwargs = dict(defect_augmentation_kwargs)

        if "artifact_source" not in defect_augmentation_kwargs:
            # download the defect volume
            url = CREMI_URLS["defects"]
            checksum = CHECKSUMS["defects"]
            defect_path = os.path.join(path, "cremi_defects.h5")
            util.download_source(defect_path, url, download, checksum)
            # The artifact patches are single slices with the in-plane extent of the training patches.
            defect_patch_shape = (1,) + tuple(patch_shape[1:])
            defect_augmentation_kwargs["artifact_source"] = torch_em.transform.get_artifact_source(
                defect_path, defect_patch_shape,
                min_mask_fraction=0.75,
                raw_key="defect_sections/raw",
                mask_key="defect_sections/mask"
            )

        if "raw_transform" in kwargs:
            warnings.warn(
                "'raw_transform' was found in kwargs. It will be used as the "
                "normalizer for the defect augmentation pipeline, which may lead to incorrect results "
                "if the normalizer maps to an unexpected data range."
            )
        raw_transform = torch_em.transform.get_raw_transform(
            normalizer=kwargs.pop("raw_transform", standardize),
            augmentation1=torch_em.transform.EMDefectAugmentation(**defect_augmentation_kwargs)
        )
        kwargs = util.update_kwargs(kwargs, "raw_transform", raw_transform)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels are stored in the same file per sample, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="volumes/raw",
        label_paths=data_paths,
        label_key="volumes/labels/neuron_ids",
        patch_shape=patch_shape,
        rois=data_rois,
        **kwargs
    )
200
201
def get_cremi_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Dict[str, Any] = {},
    defect_augmentation_kwargs: Dict[str, Any] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for EM neuron segmentation in the CREMI dataset.

    The dataset is created with `get_cremi_dataset` and wrapped via `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The regions of interest to use for the samples.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the extra keyword arguments into those for the dataset and those for the loader.
    split = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds_kwargs, dl_kwargs = split

    dataset = get_cremi_dataset(
        path,
        patch_shape,
        samples=samples,
        use_realigned=use_realigned,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        rois=rois,
        defect_augmentation_kwargs=defect_augmentation_kwargs,
        **ds_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dl_kwargs)
    return loader
CREMI_URLS = {'cropped': {'A': 'https://cremi.org/static/data/sample_A_20160501.hdf', 'B': 'https://cremi.org/static/data/sample_B_20160501.hdf', 'C': 'https://cremi.org/static/data/sample_C_20160501.hdf'}, 'padded': {'A': 'https://cremi.org/static/data/sample_A_padded_20160501.hdf', 'B': 'https://cremi.org/static/data/sample_B_padded_20160501.hdf', 'C': 'https://cremi.org/static/data/sample_C_padded_20160501.hdf'}, 'realigned': {}, 'defects': 'https://zenodo.org/record/5767036/files/sample_ABC_padded_defects.h5'}
CHECKSUMS = {'cropped': {'A': '4c563d1b78acb2bcfb3ea958b6fe1533422f7f4a19f3e05b600bfa11430b510d', 'B': '887e85521e00deead18c94a21ad71f278d88a5214c7edeed943130a1f4bb48b8', 'C': '2874496f224d222ebc29d0e4753e8c458093e1d37bc53acd1b69b19ed1ae7052'}, 'padded': {'A': 'c95dc4497ce0f0e7b70507c7253230fda95325ee91ea0d3253c9ef94b197050a', 'B': '22917a25092d0b80175012c152da33e1a1d82e049c4a96dc747145d5ca5d1b87', 'C': 'aba27b165ef005d5fbebe0e3f8775ac903cf273b7e20381dff664d45065a3314'}, 'realigned': {}, 'defects': '7b06ffa34733b2c32956ea5005e0cf345e7d3a27477f42f7c905701cdc947bd0'}
def get_cremi_data( path: Union[os.PathLike, str], samples: Tuple[str, ...] = ('A', 'B', 'C'), version: str = 'cropped', use_realigned: bool = False, download: bool = False):
def get_cremi_data(
    path: Union[os.PathLike, str],
    samples: Tuple[str, ...] = ("A", "B", "C"),
    version: str = "cropped",
    use_realigned: bool = False,
    download: bool = False,
):
    """Download the CREMI training data.

    Each requested sample is stored as `sample{name}.h5` (or `sample{name}_padded.h5`
    for the padded version) inside `path`, which is created if it does not exist.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        version: The dataset version to use. Either 'cropped' (default) or 'padded'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.

    Raises:
        NotImplementedError: If `use_realigned` is True; the realigned volumes are not supported yet.
        ValueError: If `version` is not a version that provides per-sample URLs ('cropped' or 'padded').
    """
    if use_realigned:
        # we need to sample batches in this case
        # sampler = torch_em.data.MinForegroundSampler(min_fraction=0.05, p_reject=.75)
        raise NotImplementedError("The realigned CREMI volumes are not supported yet.")

    # Only versions that map sample names to URLs are valid here. CREMI_URLS also has the keys
    # 'realigned' (empty placeholder) and 'defects' (a single URL string); accepting them would
    # only fail later with a confusing KeyError / TypeError when indexing by sample name.
    valid_versions = [name for name, entry in CREMI_URLS.items() if isinstance(entry, dict) and entry]
    if version not in valid_versions:
        raise ValueError(f"Unknown version '{version}'. Choose from {valid_versions}.")

    urls = CREMI_URLS[version]
    checksums = CHECKSUMS[version]
    suffix = "_padded" if version == "padded" else ""
    os.makedirs(path, exist_ok=True)
    for name in samples:
        data_path = os.path.join(path, f"sample{name}{suffix}.h5")
        # CREMI SSL certificates expired, so we need to disable verification.
        # The expected checksum is still handed to the download helper.
        util.download_source(data_path, urls[name], download, checksums[name], verify=False)

Download the CREMI training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • version: The dataset version to use. Either 'cropped' (default) or 'padded'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
def get_cremi_paths( path: Union[os.PathLike, str], samples: Tuple[str, ...] = ('A', 'B', 'C'), version: str = 'cropped', use_realigned: bool = False, download: bool = False) -> List[str]:
 87def get_cremi_paths(
 88    path: Union[os.PathLike, str],
 89    samples: Tuple[str, ...] = ("A", "B", "C"),
 90    version: str = "cropped",
 91    use_realigned: bool = False,
 92    download: bool = False
 93) -> List[str]:
 94    """Get paths to the CREMI data.
 95
 96    Args:
 97        path: Filepath to a folder where the downloaded data will be saved.
 98        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 99        version: The dataset version to use. Either 'cropped' (default) or 'padded'.
100            The padded volumes contain the same data with additional zero-padded borders;
101            label regions outside the original bounds are zero.
102        use_realigned: Use the realigned instead of the original training data.
103        download: Whether to download the data if it is not present.
104
105    Returns:
106        The filepaths to the training data.
107
108    Note:
109        The padded volumes are not available via the dataset and dataloader functions; use this function directly
110        if you need access to the padded data.
111    """
112    get_cremi_data(path, samples, version, use_realigned, download)
113    suffix = "_padded" if version == "padded" else ""
114    data_paths = [os.path.join(path, f"sample{name}{suffix}.h5") for name in samples]
115    return data_paths

Get paths to the CREMI data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • version: The dataset version to use. Either 'cropped' (default) or 'padded'. The padded volumes contain the same data with additional zero-padded borders; label regions outside the original bounds are zero.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
Returns:

The filepaths to the training data.

Note:

The padded volumes are not available via the dataset and dataloader functions; use this function directly if you need access to the padded data.

def get_cremi_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, rois: Dict[str, Any] = {}, defect_augmentation_kwargs: Dict[str, Any] = {'p_drop_slice': 0.025, 'p_low_contrast': 0.025, 'p_deform_slice': 0.0, 'deformation_mode': 'compress'}, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cremi_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Dict[str, Any] = {},
    defect_augmentation_kwargs: Dict[str, Any] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> Dataset:
    """Get the CREMI dataset for the segmentation of neurons in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The regions of interest to use for the samples, keyed by sample name.
            Samples without an entry use the full volume. `None` is treated like an empty dict.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
            Pass `None` to disable the defect augmentation. If no 'artifact_source' is given,
            the defect volume is downloaded to `path` and used as artifact source.
            The passed dictionary is not modified.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
            A 'raw_transform' given here is used as the normalizer of the defect augmentation
            pipeline when defect augmentation is enabled.

    Returns:
       The segmentation dataset.

    Raises:
        AssertionError: If `patch_shape` does not have three entries or `rois` is not a dict.
    """
    assert len(patch_shape) == 3
    # Treat `rois=None` like "no rois" instead of failing on `None.get` below.
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    data_paths = get_cremi_paths(path, samples, version="cropped", use_realigned=use_realigned, download=download)
    data_rois = [rois.get(name, np.s_[:, :, :]) for name in samples]

    # defect augmentations
    if defect_augmentation_kwargs is not None:
        # Work on a copy: updating the argument in place would modify the shared default dictionary
        # (and the caller's dictionary), so that an artifact source created for one `path` and
        # `patch_shape` would silently be reused by later calls with different settings.
        defect_augmentation_kwargs = dict(defect_augmentation_kwargs)

        if "artifact_source" not in defect_augmentation_kwargs:
            # download the defect volume
            url = CREMI_URLS["defects"]
            checksum = CHECKSUMS["defects"]
            defect_path = os.path.join(path, "cremi_defects.h5")
            util.download_source(defect_path, url, download, checksum)
            # The artifact patches are single slices with the in-plane extent of the training patches.
            defect_patch_shape = (1,) + tuple(patch_shape[1:])
            defect_augmentation_kwargs["artifact_source"] = torch_em.transform.get_artifact_source(
                defect_path, defect_patch_shape,
                min_mask_fraction=0.75,
                raw_key="defect_sections/raw",
                mask_key="defect_sections/mask"
            )

        if "raw_transform" in kwargs:
            warnings.warn(
                "'raw_transform' was found in kwargs. It will be used as the "
                "normalizer for the defect augmentation pipeline, which may lead to incorrect results "
                "if the normalizer maps to an unexpected data range."
            )
        raw_transform = torch_em.transform.get_raw_transform(
            normalizer=kwargs.pop("raw_transform", standardize),
            augmentation1=torch_em.transform.EMDefectAugmentation(**defect_augmentation_kwargs)
        )
        kwargs = util.update_kwargs(kwargs, "raw_transform", raw_transform)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels are stored in the same file per sample, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="volumes/raw",
        label_paths=data_paths,
        label_key="volumes/labels/neuron_ids",
        patch_shape=patch_shape,
        rois=data_rois,
        **kwargs
    )

Get the CREMI dataset for the segmentation of neurons in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • rois: The regions of interest to use for the samples.
  • defect_augmentation_kwargs: Keyword arguments for defect augmentations.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cremi_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, rois: Dict[str, Any] = {}, defect_augmentation_kwargs: Dict[str, Any] = {'p_drop_slice': 0.025, 'p_low_contrast': 0.025, 'p_deform_slice': 0.0, 'deformation_mode': 'compress'}, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cremi_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Dict[str, Any] = {},
    defect_augmentation_kwargs: Dict[str, Any] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for EM neuron segmentation in the CREMI dataset.

    The dataset is created with `get_cremi_dataset` and wrapped via `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The regions of interest to use for the samples.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the extra keyword arguments into those for the dataset and those for the loader.
    split = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds_kwargs, dl_kwargs = split

    dataset = get_cremi_dataset(
        path,
        patch_shape,
        samples=samples,
        use_realigned=use_realigned,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        rois=rois,
        defect_augmentation_kwargs=defect_augmentation_kwargs,
        **ds_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dl_kwargs)
    return loader

Get the DataLoader for EM neuron segmentation in the CREMI dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • rois: The regions of interest to use for the samples.
  • defect_augmentation_kwargs: Keyword arguments for defect augmentations.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.