torch_em.data.datasets.electron_microscopy.cremi

CREMI is a dataset for neuron segmentation in EM.

It contains three annotated volumes from the adult fruit-fly brain and was the basis of a segmentation challenge held at MICCAI 2016. For details on the dataset, check out https://cremi.org/. Please cite the challenge if you use the dataset in your research.
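A minimal usage sketch for getting started; the download folder, patch shape, sample selection, and batch size below are placeholder choices, not values prescribed by the dataset:

from torch_em.data.datasets.electron_microscopy.cremi import get_cremi_loader

# Download samples A and B (if not yet present) and build a DataLoader
# that yields raw / label patch pairs for 3d training.
loader = get_cremi_loader(
    path="./data/cremi",         # placeholder download folder
    patch_shape=(32, 256, 256),  # placeholder (z, y, x) patch shape
    batch_size=1,
    samples=("A", "B"),
    download=True,
)

for raw, labels in loader:
    print(raw.shape, labels.shape)
    break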

  1"""CREMI is a dataset for neuron segmentation in EM.
  2
  3It contains three annotated volumes from the adult fruit-fly brain.
  4It was held as a challenge at MICCAI 2016. For details on the dataset check out https://cremi.org/.
  5Please cite the challenge if you use the dataset in your research.
  6"""
  7# TODO add support for realigned volumes
  8
  9import os
 10from typing import Any, Dict, List, Optional, Tuple, Union
 11
 12import numpy as np
 13
 14from torch.utils.data import Dataset, DataLoader
 15
 16import torch_em
 17
 18from .. import util
 19
 20
 21CREMI_URLS = {
 22    "original": {
 23        "A": "https://cremi.org/static/data/sample_A_20160501.hdf",
 24        "B": "https://cremi.org/static/data/sample_B_20160501.hdf",
 25        "C": "https://cremi.org/static/data/sample_C_20160501.hdf",
 26    },
 27    "realigned": {},
 28    "defects": "https://zenodo.org/record/5767036/files/sample_ABC_padded_defects.h5"
 29}
 30CHECKSUMS = {
 31    "original": {
 32        "A": "4c563d1b78acb2bcfb3ea958b6fe1533422f7f4a19f3e05b600bfa11430b510d",
 33        "B": "887e85521e00deead18c94a21ad71f278d88a5214c7edeed943130a1f4bb48b8",
 34        "C": "2874496f224d222ebc29d0e4753e8c458093e1d37bc53acd1b69b19ed1ae7052",
 35    },
 36    "realigned": {},
 37    "defects": "7b06ffa34733b2c32956ea5005e0cf345e7d3a27477f42f7c905701cdc947bd0"
 38}
 39
 40
 41def get_cremi_data(path: Union[os.PathLike, str], samples: Tuple[str], download: bool, use_realigned: bool = False):
 42    """Download the CREMI training data.
 43
 44    Args:
 45        path: Filepath to a folder where the downloaded data will be saved.
 46        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 47        download: Whether to download the data if it is not present.
 48        use_realigned: Use the realigned instead of the original training data.
 49    """
 50    if use_realigned:
 51        # we need to sample batches in this case
 52        # sampler = torch_em.data.MinForegroundSampler(min_fraction=0.05, p_reject=.75)
 53        raise NotImplementedError
 54    else:
 55        urls = CREMI_URLS["original"]
 56        checksums = CHECKSUMS["original"]
 57
 58    os.makedirs(path, exist_ok=True)
 59    for name in samples:
 60        url = urls[name]
 61        checksum = checksums[name]
 62        data_path = os.path.join(path, f"sample{name}.h5")
 63        # CREMI SSL certificates expired, so we need to disable verification
 64        util.download_source(data_path, url, download, checksum, verify=False)
 65
 66
 67def get_cremi_paths(
 68    path: Union[os.PathLike, str],
 69    samples: Tuple[str, ...] = ("A", "B", "C"),
 70    use_realigned: bool = False,
 71    download: bool = False
 72) -> List[str]:
 73    """Get paths to the CREMI data.
 74
 75    Args:
 76        path: Filepath to a folder where the downloaded data will be saved.
 77        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 78        use_realigned: Use the realigned instead of the original training data.
 79        download: Whether to download the data if it is not present.
 80
 81    Returns:
 82        The filepaths to the training data.
 83    """
 84    get_cremi_data(path, samples, download, use_realigned)
 85    data_paths = [os.path.join(path, f"sample{name}.h5") for name in samples]
 86    return data_paths
 87
 88
 89def get_cremi_dataset(
 90    path: Union[os.PathLike, str],
 91    patch_shape: Tuple[int, int, int],
 92    samples: Tuple[str, ...] = ("A", "B", "C"),
 93    use_realigned: bool = False,
 94    download: bool = False,
 95    offsets: Optional[List[List[int]]] = None,
 96    boundaries: bool = False,
 97    rois: Dict[str, Any] = {},
 98    defect_augmentation_kwargs: Dict[str, Any] = {
 99        "p_drop_slice": 0.025,
100        "p_low_contrast": 0.025,
101        "p_deform_slice": 0.0,
102        "deformation_mode": "compress",
103    },
104    **kwargs,
105) -> Dataset:
106    """Get the CREMI dataset for the segmentation of neurons in EM.
107
108    Args:
109        path: Filepath to a folder where the downloaded data will be saved.
110        patch_shape: The patch shape to use for training.
111        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
112        use_realigned: Use the realigned instead of the original training data.
113        download: Whether to download the data if it is not present.
114        offsets: Offset values for affinity computation used as target.
115        boundaries: Whether to compute boundaries as the target.
116        rois: The region of interests to use for the samples.
117        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
118        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
119
120    Returns:
121       The segmentation dataset.
122    """
123    assert len(patch_shape) == 3
124    if rois is not None:
125        assert isinstance(rois, dict)
126
127    data_paths = get_cremi_paths(path, samples, use_realigned, download)
128    data_rois = [rois.get(name, np.s_[:, :, :]) for name in samples]
129
130    if defect_augmentation_kwargs is not None and "artifact_source" not in defect_augmentation_kwargs:
131        # download the defect volume
132        url = CREMI_URLS["defects"]
133        checksum = CHECKSUMS["defects"]
134        defect_path = os.path.join(path, "cremi_defects.h5")
135        util.download_source(defect_path, url, download, checksum)
136        defect_patch_shape = (1,) + tuple(patch_shape[1:])
137        artifact_source = torch_em.transform.get_artifact_source(
138            defect_path, defect_patch_shape,
139            min_mask_fraction=0.75,
140            raw_key="defect_sections/raw",
141            mask_key="defect_sections/mask"
142        )
143        defect_augmentation_kwargs.update({"artifact_source": artifact_source})
144
145    # defect augmentations
146    if defect_augmentation_kwargs is not None:
147        raw_transform = torch_em.transform.get_raw_transform(
148            augmentation1=torch_em.transform.EMDefectAugmentation(**defect_augmentation_kwargs)
149        )
150        kwargs = util.update_kwargs(kwargs, "raw_transform", raw_transform)
151
152    kwargs, _ = util.add_instance_label_transform(
153        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
154    )
155
156    return torch_em.default_segmentation_dataset(
157        raw_paths=data_paths,
158        raw_key="volumes/raw",
159        label_paths=data_paths,
160        label_key="volumes/labels/neuron_ids",
161        patch_shape=patch_shape,
162        rois=data_rois,
163        **kwargs
164    )
165
166
167def get_cremi_loader(
168    path: Union[os.PathLike, str],
169    patch_shape: Tuple[int, int, int],
170    batch_size: int,
171    samples: Tuple[str, ...] = ("A", "B", "C"),
172    use_realigned: bool = False,
173    download: bool = False,
174    offsets: Optional[List[List[int]]] = None,
175    boundaries: bool = False,
176    rois: Dict[str, Any] = {},
177    defect_augmentation_kwargs: Dict[str, Any] = {
178        "p_drop_slice": 0.025,
179        "p_low_contrast": 0.025,
180        "p_deform_slice": 0.0,
181        "deformation_mode": "compress",
182    },
183    **kwargs,
184) -> DataLoader:
185    """Get the DataLoader for EM neuron segmentation in the CREMI dataset.
186
187    Args:
188        path: Filepath to a folder where the downloaded data will be saved.
189        patch_shape: The patch shape to use for training.
190        batch_size: The batch size for training.
191        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
192        use_realigned: Use the realigned instead of the original training data.
193        download: Whether to download the data if it is not present.
194        offsets: Offset values for affinity computation used as target.
195        boundaries: Whether to compute boundaries as the target.
196        rois: The region of interests to use for the samples.
197        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
198        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
199
200    Returns:
201        The DataLoader.
202    """
203    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
204    ds = get_cremi_dataset(
205        path=path,
206        patch_shape=patch_shape,
207        samples=samples,
208        use_realigned=use_realigned,
209        download=download,
210        offsets=offsets,
211        boundaries=boundaries,
212        rois=rois,
213        defect_augmentation_kwargs=defect_augmentation_kwargs,
214        **dataset_kwargs,
215    )
216    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
CREMI_URLS = {
    'original': {
        'A': 'https://cremi.org/static/data/sample_A_20160501.hdf',
        'B': 'https://cremi.org/static/data/sample_B_20160501.hdf',
        'C': 'https://cremi.org/static/data/sample_C_20160501.hdf',
    },
    'realigned': {},
    'defects': 'https://zenodo.org/record/5767036/files/sample_ABC_padded_defects.h5',
}

CHECKSUMS = {
    'original': {
        'A': '4c563d1b78acb2bcfb3ea958b6fe1533422f7f4a19f3e05b600bfa11430b510d',
        'B': '887e85521e00deead18c94a21ad71f278d88a5214c7edeed943130a1f4bb48b8',
        'C': '2874496f224d222ebc29d0e4753e8c458093e1d37bc53acd1b69b19ed1ae7052',
    },
    'realigned': {},
    'defects': '7b06ffa34733b2c32956ea5005e0cf345e7d3a27477f42f7c905701cdc947bd0',
}
def get_cremi_data(path: Union[os.PathLike, str], samples: Tuple[str], download: bool, use_realigned: bool = False):

Download the CREMI training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • download: Whether to download the data if it is not present.
  • use_realigned: Use the realigned instead of the original training data.
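A minimal call, assuming a local target folder (the path is a placeholder):

from torch_em.data.datasets.electron_microscopy.cremi import get_cremi_data

# Download samples A and C to ./data/cremi; files that are already present are kept.
get_cremi_data(path="./data/cremi", samples=("A", "C"), download=True)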
def get_cremi_paths(path: Union[os.PathLike, str], samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False) -> List[str]:

Get paths to the CREMI data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
Returns:

The filepaths to the training data.
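The returned files are HDF5 volumes; the raw data and the instance labels live under the keys 'volumes/raw' and 'volumes/labels/neuron_ids' used further below. A short inspection sketch, assuming h5py is available (the folder path is a placeholder):

import h5py

from torch_em.data.datasets.electron_microscopy.cremi import get_cremi_paths

paths = get_cremi_paths("./data/cremi", samples=("A",), download=True)
with h5py.File(paths[0], "r") as f:
    print(f["volumes/raw"].shape)                # raw EM volume
    print(f["volumes/labels/neuron_ids"].shape)  # neuron instance labels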

def get_cremi_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    samples: Tuple[str, ...] = ('A', 'B', 'C'),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Dict[str, Any] = {},
    defect_augmentation_kwargs: Dict[str, Any] = {'p_drop_slice': 0.025, 'p_low_contrast': 0.025, 'p_deform_slice': 0.0, 'deformation_mode': 'compress'},
    **kwargs
) -> torch.utils.data.dataset.Dataset:

Get the CREMI dataset for the segmentation of neurons in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • rois: The regions of interest to use for the samples.
  • defect_augmentation_kwargs: Keyword arguments for defect augmentations.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.
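The rois argument maps sample names to numpy slice expressions; samples without an entry use the full volume (see data_rois in the source above). A sketch with boundary targets and a restricted ROI for sample A; the patch shape and slice bounds are placeholders:

import numpy as np

from torch_em.data.datasets.electron_microscopy.cremi import get_cremi_dataset

dataset = get_cremi_dataset(
    path="./data/cremi",
    patch_shape=(32, 256, 256),    # placeholder (z, y, x) patch shape
    samples=("A", "B", "C"),
    download=True,
    boundaries=True,               # use boundary maps instead of instance ids as the target
    rois={"A": np.s_[:75, :, :]},  # placeholder ROI: only the first 75 slices of sample A
)
print(len(dataset))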

def get_cremi_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Tuple[str, ...] = ('A', 'B', 'C'),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Dict[str, Any] = {},
    defect_augmentation_kwargs: Dict[str, Any] = {'p_drop_slice': 0.025, 'p_low_contrast': 0.025, 'p_deform_slice': 0.0, 'deformation_mode': 'compress'},
    **kwargs
) -> torch.utils.data.dataloader.DataLoader:

Get the DataLoader for EM neuron segmentation in the CREMI dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
  • use_realigned: Use the realigned instead of the original training data.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • rois: The regions of interest to use for the samples.
  • defect_augmentation_kwargs: Keyword arguments for defect augmentations.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.
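Passing offsets switches the target to affinity channels, and passing defect_augmentation_kwargs=None disables the defect augmentation (and the download of the defect volume), as can be read from the dataset source above. The offsets below are a common nearest-neighbor choice and purely illustrative; num_workers is forwarded to the PyTorch DataLoader:

from torch_em.data.datasets.electron_microscopy.cremi import get_cremi_loader

loader = get_cremi_loader(
    path="./data/cremi",
    patch_shape=(32, 256, 256),  # placeholder (z, y, x) patch shape
    batch_size=1,
    download=True,
    offsets=[[-1, 0, 0], [0, -1, 0], [0, 0, -1]],  # example affinity offsets
    defect_augmentation_kwargs=None,               # skip defect augmentations
    num_workers=4,                                 # goes to the PyTorch DataLoader
)

raw, target = next(iter(loader))
print(raw.shape, target.shape)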