torch_em.data.datasets.electron_microscopy.cremi

CREMI is a dataset for neuron segmentation in EM.

It contains three annotated volumes from the adult fruit-fly brain. It was held as a challenge at MICCAI 2016. For details on the dataset check out https://cremi.org/. Please cite the challenge if you use the dataset in your research.

  1"""CREMI is a dataset for neuron segmentation in EM.
  2
  3It contains three annotated volumes from the adult fruit-fly brain.
  4It was held as a challenge at MICCAI 2016. For details on the dataset check out https://cremi.org/.
  5Please cite the challenge if you use the dataset in your research.
  6"""
  7# TODO add support for realigned volumes
  8
  9import os
 10import warnings
 11from typing import Any, Dict, List, Optional, Tuple, Union
 12
 13import numpy as np
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20from ....transform.raw import standardize
 21
 22
# Download URLs for the CREMI data.
# "original": the three annotated training volumes (samples A, B, C) from cremi.org.
# "realigned": placeholder — realigned volumes are not supported yet (see TODO above).
# "defects": volume with defect sections, used for the defect augmentations.
CREMI_URLS = {
    "original": {
        "A": "https://cremi.org/static/data/sample_A_20160501.hdf",
        "B": "https://cremi.org/static/data/sample_B_20160501.hdf",
        "C": "https://cremi.org/static/data/sample_C_20160501.hdf",
    },
    "realigned": {},
    "defects": "https://zenodo.org/record/5767036/files/sample_ABC_padded_defects.h5"
}
# Checksums for verifying the downloads; the keys mirror CREMI_URLS.
# Presumably SHA256, given the 64-hex-digit length — confirm against util.download_source.
CHECKSUMS = {
    "original": {
        "A": "4c563d1b78acb2bcfb3ea958b6fe1533422f7f4a19f3e05b600bfa11430b510d",
        "B": "887e85521e00deead18c94a21ad71f278d88a5214c7edeed943130a1f4bb48b8",
        "C": "2874496f224d222ebc29d0e4753e8c458093e1d37bc53acd1b69b19ed1ae7052",
    },
    "realigned": {},
    "defects": "7b06ffa34733b2c32956ea5005e0cf345e7d3a27477f42f7c905701cdc947bd0"
}
 41
 42
 43def get_cremi_data(path: Union[os.PathLike, str], samples: Tuple[str], download: bool, use_realigned: bool = False):
 44    """Download the CREMI training data.
 45
 46    Args:
 47        path: Filepath to a folder where the downloaded data will be saved.
 48        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 49        download: Whether to download the data if it is not present.
 50        use_realigned: Use the realigned instead of the original training data.
 51    """
 52    if use_realigned:
 53        # we need to sample batches in this case
 54        # sampler = torch_em.data.MinForegroundSampler(min_fraction=0.05, p_reject=.75)
 55        raise NotImplementedError
 56    else:
 57        urls = CREMI_URLS["original"]
 58        checksums = CHECKSUMS["original"]
 59
 60    os.makedirs(path, exist_ok=True)
 61    for name in samples:
 62        url = urls[name]
 63        checksum = checksums[name]
 64        data_path = os.path.join(path, f"sample{name}.h5")
 65        # CREMI SSL certificates expired, so we need to disable verification
 66        util.download_source(data_path, url, download, checksum, verify=False)
 67
 68
 69def get_cremi_paths(
 70    path: Union[os.PathLike, str],
 71    samples: Tuple[str, ...] = ("A", "B", "C"),
 72    use_realigned: bool = False,
 73    download: bool = False
 74) -> List[str]:
 75    """Get paths to the CREMI data.
 76
 77    Args:
 78        path: Filepath to a folder where the downloaded data will be saved.
 79        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 80        use_realigned: Use the realigned instead of the original training data.
 81        download: Whether to download the data if it is not present.
 82
 83    Returns:
 84        The filepaths to the training data.
 85    """
 86    get_cremi_data(path, samples, download, use_realigned)
 87    data_paths = [os.path.join(path, f"sample{name}.h5") for name in samples]
 88    return data_paths
 89
 90
 91def get_cremi_dataset(
 92    path: Union[os.PathLike, str],
 93    patch_shape: Tuple[int, int, int],
 94    samples: Tuple[str, ...] = ("A", "B", "C"),
 95    use_realigned: bool = False,
 96    download: bool = False,
 97    offsets: Optional[List[List[int]]] = None,
 98    boundaries: bool = False,
 99    rois: Dict[str, Any] = {},
100    defect_augmentation_kwargs: Dict[str, Any] = {
101        "p_drop_slice": 0.025,
102        "p_low_contrast": 0.025,
103        "p_deform_slice": 0.0,
104        "deformation_mode": "compress",
105    },
106    **kwargs,
107) -> Dataset:
108    """Get the CREMI dataset for the segmentation of neurons in EM.
109
110    Args:
111        path: Filepath to a folder where the downloaded data will be saved.
112        patch_shape: The patch shape to use for training.
113        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
114        use_realigned: Use the realigned instead of the original training data.
115        download: Whether to download the data if it is not present.
116        offsets: Offset values for affinity computation used as target.
117        boundaries: Whether to compute boundaries as the target.
118        rois: The region of interests to use for the samples.
119        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
120        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
121
122    Returns:
123       The segmentation dataset.
124    """
125    assert len(patch_shape) == 3
126    if rois is not None:
127        assert isinstance(rois, dict)
128
129    data_paths = get_cremi_paths(path, samples, use_realigned, download)
130    data_rois = [rois.get(name, np.s_[:, :, :]) for name in samples]
131
132    if defect_augmentation_kwargs is not None and "artifact_source" not in defect_augmentation_kwargs:
133        # download the defect volume
134        url = CREMI_URLS["defects"]
135        checksum = CHECKSUMS["defects"]
136        defect_path = os.path.join(path, "cremi_defects.h5")
137        util.download_source(defect_path, url, download, checksum)
138        defect_patch_shape = (1,) + tuple(patch_shape[1:])
139        artifact_source = torch_em.transform.get_artifact_source(
140            defect_path, defect_patch_shape,
141            min_mask_fraction=0.75,
142            raw_key="defect_sections/raw",
143            mask_key="defect_sections/mask"
144        )
145        defect_augmentation_kwargs.update({"artifact_source": artifact_source})
146
147    # defect augmentations
148    if defect_augmentation_kwargs is not None:
149        if "raw_transform" in kwargs:
150            warnings.warn(
151                "'raw_transform' was found in kwargs. It will be used as the "
152                "normalizer for the defect augmentation pipeline, which may lead to incorrect results"
153                "if the normalizer maps to an unexpected data range."
154            )
155        raw_transform = torch_em.transform.get_raw_transform(
156            normalizer=kwargs.pop("raw_transform", standardize),
157            augmentation1=torch_em.transform.EMDefectAugmentation(**defect_augmentation_kwargs)
158        )
159        kwargs = util.update_kwargs(kwargs, "raw_transform", raw_transform)
160
161    kwargs, _ = util.add_instance_label_transform(
162        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
163    )
164
165    return torch_em.default_segmentation_dataset(
166        raw_paths=data_paths,
167        raw_key="volumes/raw",
168        label_paths=data_paths,
169        label_key="volumes/labels/neuron_ids",
170        patch_shape=patch_shape,
171        rois=data_rois,
172        **kwargs
173    )
174
175
def get_cremi_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Dict[str, Any] = {},
    defect_augmentation_kwargs: Dict[str, Any] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> DataLoader:
    """Create a DataLoader over the CREMI dataset for EM neuron segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The region of interests to use for the samples.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route each keyword argument either to the dataset factory or to the DataLoader.
    ds_kwargs, dl_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cremi_dataset(
        path=path,
        patch_shape=patch_shape,
        samples=samples,
        use_realigned=use_realigned,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        rois=rois,
        defect_augmentation_kwargs=defect_augmentation_kwargs,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size=batch_size, **dl_kwargs)
CREMI_URLS = {'original': {'A': 'https://cremi.org/static/data/sample_A_20160501.hdf', 'B': 'https://cremi.org/static/data/sample_B_20160501.hdf', 'C': 'https://cremi.org/static/data/sample_C_20160501.hdf'}, 'realigned': {}, 'defects': 'https://zenodo.org/record/5767036/files/sample_ABC_padded_defects.h5'}
CHECKSUMS = {'original': {'A': '4c563d1b78acb2bcfb3ea958b6fe1533422f7f4a19f3e05b600bfa11430b510d', 'B': '887e85521e00deead18c94a21ad71f278d88a5214c7edeed943130a1f4bb48b8', 'C': '2874496f224d222ebc29d0e4753e8c458093e1d37bc53acd1b69b19ed1ae7052'}, 'realigned': {}, 'defects': '7b06ffa34733b2c32956ea5005e0cf345e7d3a27477f42f7c905701cdc947bd0'}
def get_cremi_data( path: Union[os.PathLike, str], samples: Tuple[str], download: bool, use_realigned: bool = False):
44def get_cremi_data(path: Union[os.PathLike, str], samples: Tuple[str], download: bool, use_realigned: bool = False):
45    """Download the CREMI training data.
46
47    Args:
48        path: Filepath to a folder where the downloaded data will be saved.
49        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
50        download: Whether to download the data if it is not present.
51        use_realigned: Use the realigned instead of the original training data.
52    """
53    if use_realigned:
54        # we need to sample batches in this case
55        # sampler = torch_em.data.MinForegroundSampler(min_fraction=0.05, p_reject=.75)
56        raise NotImplementedError
57    else:
58        urls = CREMI_URLS["original"]
59        checksums = CHECKSUMS["original"]
60
61    os.makedirs(path, exist_ok=True)
62    for name in samples:
63        url = urls[name]
64        checksum = checksums[name]
65        data_path = os.path.join(path, f"sample{name}.h5")
66        # CREMI SSL certificates expired, so we need to disable verification
67        util.download_source(data_path, url, download, checksum, verify=False)

Download the CREMI training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • download: Whether to download the data if it is not present.
  • use_realigned: Use the realigned instead of the original training data.
def get_cremi_paths( path: Union[os.PathLike, str], samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False) -> List[str]:
70def get_cremi_paths(
71    path: Union[os.PathLike, str],
72    samples: Tuple[str, ...] = ("A", "B", "C"),
73    use_realigned: bool = False,
74    download: bool = False
75) -> List[str]:
76    """Get paths to the CREMI data.
77
78    Args:
79        path: Filepath to a folder where the downloaded data will be saved.
80        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
81        use_realigned: Use the realigned instead of the original training data.
82        download: Whether to download the data if it is not present.
83
84    Returns:
85        The filepaths to the training data.
86    """
87    get_cremi_data(path, samples, download, use_realigned)
88    data_paths = [os.path.join(path, f"sample{name}.h5") for name in samples]
89    return data_paths

Get paths to the CREMI data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
Returns:

The filepaths to the training data.

def get_cremi_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, rois: Dict[str, Any] = {}, defect_augmentation_kwargs: Dict[str, Any] = {'p_drop_slice': 0.025, 'p_low_contrast': 0.025, 'p_deform_slice': 0.0, 'deformation_mode': 'compress'}, **kwargs) -> torch.utils.data.dataset.Dataset:
 92def get_cremi_dataset(
 93    path: Union[os.PathLike, str],
 94    patch_shape: Tuple[int, int, int],
 95    samples: Tuple[str, ...] = ("A", "B", "C"),
 96    use_realigned: bool = False,
 97    download: bool = False,
 98    offsets: Optional[List[List[int]]] = None,
 99    boundaries: bool = False,
100    rois: Dict[str, Any] = {},
101    defect_augmentation_kwargs: Dict[str, Any] = {
102        "p_drop_slice": 0.025,
103        "p_low_contrast": 0.025,
104        "p_deform_slice": 0.0,
105        "deformation_mode": "compress",
106    },
107    **kwargs,
108) -> Dataset:
109    """Get the CREMI dataset for the segmentation of neurons in EM.
110
111    Args:
112        path: Filepath to a folder where the downloaded data will be saved.
113        patch_shape: The patch shape to use for training.
114        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
115        use_realigned: Use the realigned instead of the original training data.
116        download: Whether to download the data if it is not present.
117        offsets: Offset values for affinity computation used as target.
118        boundaries: Whether to compute boundaries as the target.
119        rois: The region of interests to use for the samples.
120        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
121        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
122
123    Returns:
124       The segmentation dataset.
125    """
126    assert len(patch_shape) == 3
127    if rois is not None:
128        assert isinstance(rois, dict)
129
130    data_paths = get_cremi_paths(path, samples, use_realigned, download)
131    data_rois = [rois.get(name, np.s_[:, :, :]) for name in samples]
132
133    if defect_augmentation_kwargs is not None and "artifact_source" not in defect_augmentation_kwargs:
134        # download the defect volume
135        url = CREMI_URLS["defects"]
136        checksum = CHECKSUMS["defects"]
137        defect_path = os.path.join(path, "cremi_defects.h5")
138        util.download_source(defect_path, url, download, checksum)
139        defect_patch_shape = (1,) + tuple(patch_shape[1:])
140        artifact_source = torch_em.transform.get_artifact_source(
141            defect_path, defect_patch_shape,
142            min_mask_fraction=0.75,
143            raw_key="defect_sections/raw",
144            mask_key="defect_sections/mask"
145        )
146        defect_augmentation_kwargs.update({"artifact_source": artifact_source})
147
148    # defect augmentations
149    if defect_augmentation_kwargs is not None:
150        if "raw_transform" in kwargs:
151            warnings.warn(
152                "'raw_transform' was found in kwargs. It will be used as the "
153                "normalizer for the defect augmentation pipeline, which may lead to incorrect results"
154                "if the normalizer maps to an unexpected data range."
155            )
156        raw_transform = torch_em.transform.get_raw_transform(
157            normalizer=kwargs.pop("raw_transform", standardize),
158            augmentation1=torch_em.transform.EMDefectAugmentation(**defect_augmentation_kwargs)
159        )
160        kwargs = util.update_kwargs(kwargs, "raw_transform", raw_transform)
161
162    kwargs, _ = util.add_instance_label_transform(
163        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
164    )
165
166    return torch_em.default_segmentation_dataset(
167        raw_paths=data_paths,
168        raw_key="volumes/raw",
169        label_paths=data_paths,
170        label_key="volumes/labels/neuron_ids",
171        patch_shape=patch_shape,
172        rois=data_rois,
173        **kwargs
174    )

Get the CREMI dataset for the segmentation of neurons in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • rois: The region of interests to use for the samples.
  • defect_augmentation_kwargs: Keyword arguments for defect augmentations.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cremi_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, rois: Dict[str, Any] = {}, defect_augmentation_kwargs: Dict[str, Any] = {'p_drop_slice': 0.025, 'p_low_contrast': 0.025, 'p_deform_slice': 0.0, 'deformation_mode': 'compress'}, **kwargs) -> torch.utils.data.dataloader.DataLoader:
177def get_cremi_loader(
178    path: Union[os.PathLike, str],
179    patch_shape: Tuple[int, int, int],
180    batch_size: int,
181    samples: Tuple[str, ...] = ("A", "B", "C"),
182    use_realigned: bool = False,
183    download: bool = False,
184    offsets: Optional[List[List[int]]] = None,
185    boundaries: bool = False,
186    rois: Dict[str, Any] = {},
187    defect_augmentation_kwargs: Dict[str, Any] = {
188        "p_drop_slice": 0.025,
189        "p_low_contrast": 0.025,
190        "p_deform_slice": 0.0,
191        "deformation_mode": "compress",
192    },
193    **kwargs,
194) -> DataLoader:
195    """Get the DataLoader for EM neuron segmentation in the CREMI dataset.
196
197    Args:
198        path: Filepath to a folder where the downloaded data will be saved.
199        patch_shape: The patch shape to use for training.
200        batch_size: The batch size for training.
201        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
202        use_realigned: Use the realigned instead of the original training data.
203        download: Whether to download the data if it is not present.
204        offsets: Offset values for affinity computation used as target.
205        boundaries: Whether to compute boundaries as the target.
206        rois: The region of interests to use for the samples.
207        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
208        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
209
210    Returns:
211        The DataLoader.
212    """
213    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
214    ds = get_cremi_dataset(
215        path=path,
216        patch_shape=patch_shape,
217        samples=samples,
218        use_realigned=use_realigned,
219        download=download,
220        offsets=offsets,
221        boundaries=boundaries,
222        rois=rois,
223        defect_augmentation_kwargs=defect_augmentation_kwargs,
224        **dataset_kwargs,
225    )
226    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)

Get the DataLoader for EM neuron segmentation in the CREMI dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • rois: The region of interests to use for the samples.
  • defect_augmentation_kwargs: Keyword arguments for defect augmentations.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.