torch_em.data.datasets.electron_microscopy.cremi
CREMI is a dataset for neuron segmentation in EM.
It contains three annotated volumes from the adult fruit-fly brain. It was held as a challenge at MICCAI 2016. For details on the dataset check out https://cremi.org/. Please cite the challenge if you use the dataset in your research.
"""CREMI is a dataset for neuron segmentation in EM.

It contains three annotated volumes from the adult fruit-fly brain.
It was held as a challenge at MICCAI 2016. For details on the dataset check out https://cremi.org/.
Please cite the challenge if you use the dataset in your research.
"""
# TODO add support for realigned volumes

import os
import warnings
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util
from ....transform.raw import standardize


CREMI_URLS = {
    "original": {
        "A": "https://cremi.org/static/data/sample_A_20160501.hdf",
        "B": "https://cremi.org/static/data/sample_B_20160501.hdf",
        "C": "https://cremi.org/static/data/sample_C_20160501.hdf",
    },
    "realigned": {},
    "defects": "https://zenodo.org/record/5767036/files/sample_ABC_padded_defects.h5"
}
CHECKSUMS = {
    "original": {
        "A": "4c563d1b78acb2bcfb3ea958b6fe1533422f7f4a19f3e05b600bfa11430b510d",
        "B": "887e85521e00deead18c94a21ad71f278d88a5214c7edeed943130a1f4bb48b8",
        "C": "2874496f224d222ebc29d0e4753e8c458093e1d37bc53acd1b69b19ed1ae7052",
    },
    "realigned": {},
    "defects": "7b06ffa34733b2c32956ea5005e0cf345e7d3a27477f42f7c905701cdc947bd0"
}


def get_cremi_data(
    path: Union[os.PathLike, str],
    samples: Tuple[str, ...],
    download: bool,
    use_realigned: bool = False,
):
    """Download the CREMI training data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        download: Whether to download the data if it is not present.
        use_realigned: Use the realigned instead of the original training data.

    Raises:
        NotImplementedError: If `use_realigned` is True.
        KeyError: If a name in `samples` has no entry in the URL / checksum tables.
    """
    if use_realigned:
        # NOTE: batches would need to be sampled in this case, e.g. with
        # torch_em.data.MinForegroundSampler(min_fraction=0.05, p_reject=.75)
        raise NotImplementedError("The realigned CREMI volumes are not supported yet.")

    urls = CREMI_URLS["original"]
    checksums = CHECKSUMS["original"]

    os.makedirs(path, exist_ok=True)
    for name in samples:
        url = urls[name]
        checksum = checksums[name]
        data_path = os.path.join(path, f"sample{name}.h5")
        # CREMI SSL certificates expired, so we need to disable verification
        util.download_source(data_path, url, download, checksum, verify=False)


def get_cremi_paths(
    path: Union[os.PathLike, str],
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False
) -> List[str]:
    """Get paths to the CREMI data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the training data.
    """
    get_cremi_data(path, samples, download, use_realigned)
    data_paths = [os.path.join(path, f"sample{name}.h5") for name in samples]
    return data_paths


def get_cremi_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Optional[Dict[str, Any]] = {},
    defect_augmentation_kwargs: Optional[Dict[str, Any]] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> Dataset:
    """Get the CREMI dataset for the segmentation of neurons in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The regions of interest to use for the samples, keyed by sample name.
            Samples without an entry (or `rois=None`) use the full volume.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
            Pass None to disable the defect augmentation. The dict is copied and never modified.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    # Work on a private copy: the default dict in the signature is shared across calls,
    # so writing 'artifact_source' into it would leak state into later calls.
    if defect_augmentation_kwargs is not None:
        defect_augmentation_kwargs = dict(defect_augmentation_kwargs)

    data_paths = get_cremi_paths(path, samples, use_realigned, download)
    data_rois = [rois.get(name, np.s_[:, :, :]) for name in samples]

    # defect augmentations
    if defect_augmentation_kwargs is not None:
        if "artifact_source" not in defect_augmentation_kwargs:
            # download the defect volume
            url = CREMI_URLS["defects"]
            checksum = CHECKSUMS["defects"]
            defect_path = os.path.join(path, "cremi_defects.h5")
            util.download_source(defect_path, url, download, checksum)
            defect_patch_shape = (1,) + tuple(patch_shape[1:])
            artifact_source = torch_em.transform.get_artifact_source(
                defect_path, defect_patch_shape,
                min_mask_fraction=0.75,
                raw_key="defect_sections/raw",
                mask_key="defect_sections/mask"
            )
            defect_augmentation_kwargs["artifact_source"] = artifact_source

        if "raw_transform" in kwargs:
            warnings.warn(
                "'raw_transform' was found in kwargs. It will be used as the "
                "normalizer for the defect augmentation pipeline, which may lead to incorrect results "
                "if the normalizer maps to an unexpected data range."
            )
        raw_transform = torch_em.transform.get_raw_transform(
            normalizer=kwargs.pop("raw_transform", standardize),
            augmentation1=torch_em.transform.EMDefectAugmentation(**defect_augmentation_kwargs)
        )
        kwargs = util.update_kwargs(kwargs, "raw_transform", raw_transform)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="volumes/raw",
        label_paths=data_paths,
        label_key="volumes/labels/neuron_ids",
        patch_shape=patch_shape,
        rois=data_rois,
        **kwargs
    )


def get_cremi_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Optional[Dict[str, Any]] = {},
    defect_augmentation_kwargs: Optional[Dict[str, Any]] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for EM neuron segmentation in the CREMI dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The regions of interest to use for the samples.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    # Pass a copy so that neither the shared default dict nor a caller-owned dict is modified downstream.
    if defect_augmentation_kwargs is not None:
        defect_augmentation_kwargs = dict(defect_augmentation_kwargs)
    ds = get_cremi_dataset(
        path=path,
        patch_shape=patch_shape,
        samples=samples,
        use_realigned=use_realigned,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        rois=rois,
        defect_augmentation_kwargs=defect_augmentation_kwargs,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
CREMI_URLS =
{'original': {'A': 'https://cremi.org/static/data/sample_A_20160501.hdf', 'B': 'https://cremi.org/static/data/sample_B_20160501.hdf', 'C': 'https://cremi.org/static/data/sample_C_20160501.hdf'}, 'realigned': {}, 'defects': 'https://zenodo.org/record/5767036/files/sample_ABC_padded_defects.h5'}
CHECKSUMS =
{'original': {'A': '4c563d1b78acb2bcfb3ea958b6fe1533422f7f4a19f3e05b600bfa11430b510d', 'B': '887e85521e00deead18c94a21ad71f278d88a5214c7edeed943130a1f4bb48b8', 'C': '2874496f224d222ebc29d0e4753e8c458093e1d37bc53acd1b69b19ed1ae7052'}, 'realigned': {}, 'defects': '7b06ffa34733b2c32956ea5005e0cf345e7d3a27477f42f7c905701cdc947bd0'}
def
get_cremi_data( path: Union[os.PathLike, str], samples: Tuple[str], download: bool, use_realigned: bool = False):
def get_cremi_data(
    path: Union[os.PathLike, str],
    samples: Tuple[str, ...],
    download: bool,
    use_realigned: bool = False,
):
    """Download the CREMI training data.

    Each requested sample is stored as `sample<NAME>.h5` inside `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        download: Whether to download the data if it is not present.
        use_realigned: Use the realigned instead of the original training data.

    Raises:
        NotImplementedError: If `use_realigned` is True.
        KeyError: If a name in `samples` has no entry in the URL / checksum tables.
    """
    if use_realigned:
        # NOTE: batches would need to be sampled in this case, e.g. with
        # torch_em.data.MinForegroundSampler(min_fraction=0.05, p_reject=.75)
        raise NotImplementedError("The realigned CREMI volumes are not supported yet.")

    urls = CREMI_URLS["original"]
    checksums = CHECKSUMS["original"]

    os.makedirs(path, exist_ok=True)
    for name in samples:
        url = urls[name]
        checksum = checksums[name]
        data_path = os.path.join(path, f"sample{name}.h5")
        # CREMI SSL certificates expired, so we need to disable verification
        util.download_source(data_path, url, download, checksum, verify=False)
Download the CREMI training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
- download: Whether to download the data if it is not present.
- use_realigned: Use the realigned instead of the original training data.
def
get_cremi_paths( path: Union[os.PathLike, str], samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False) -> List[str]:
def get_cremi_paths(
    path: Union[os.PathLike, str],
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False
) -> List[str]:
    """Return the file paths of the requested CREMI samples.

    The data is fetched through `get_cremi_data` first, then one path per sample is built.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the training data, in the order given by `samples`.
    """
    get_cremi_data(path, samples, download, use_realigned)

    sample_files = []
    for sample_name in samples:
        sample_files.append(os.path.join(path, f"sample{sample_name}.h5"))
    return sample_files
Get paths to the CREMI data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
- use_realigned: Use the realigned instead of the original training data.
- download: Whether to download the data if it is not present.
Returns:
The filepaths to the training data.
def
get_cremi_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, rois: Dict[str, Any] = {}, defect_augmentation_kwargs: Dict[str, Any] = {'p_drop_slice': 0.025, 'p_low_contrast': 0.025, 'p_deform_slice': 0.0, 'deformation_mode': 'compress'}, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cremi_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Optional[Dict[str, Any]] = {},
    defect_augmentation_kwargs: Optional[Dict[str, Any]] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> Dataset:
    """Get the CREMI dataset for the segmentation of neurons in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The regions of interest to use for the samples, keyed by sample name.
            Samples without an entry (or `rois=None`) use the full volume.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
            Pass None to disable the defect augmentation. If no 'artifact_source' is given,
            the defect volume is fetched and an artifact source is built from it.
            The dict is copied and never modified.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
            A 'raw_transform' given here is used as the normalizer of the defect
            augmentation pipeline (a warning is emitted in that case).

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    # Work on a private copy: the default dict in the signature is shared across calls,
    # so writing 'artifact_source' into it would leak the artifact source of the first
    # call (built for that call's path and patch shape) into all later calls.
    if defect_augmentation_kwargs is not None:
        defect_augmentation_kwargs = dict(defect_augmentation_kwargs)

    data_paths = get_cremi_paths(path, samples, use_realigned, download)
    data_rois = [rois.get(name, np.s_[:, :, :]) for name in samples]

    # defect augmentations
    if defect_augmentation_kwargs is not None:
        if "artifact_source" not in defect_augmentation_kwargs:
            # download the defect volume
            url = CREMI_URLS["defects"]
            checksum = CHECKSUMS["defects"]
            defect_path = os.path.join(path, "cremi_defects.h5")
            util.download_source(defect_path, url, download, checksum)
            # the artifact patches are single slices with the in-plane extent of the training patch
            defect_patch_shape = (1,) + tuple(patch_shape[1:])
            artifact_source = torch_em.transform.get_artifact_source(
                defect_path, defect_patch_shape,
                min_mask_fraction=0.75,
                raw_key="defect_sections/raw",
                mask_key="defect_sections/mask"
            )
            defect_augmentation_kwargs["artifact_source"] = artifact_source

        if "raw_transform" in kwargs:
            warnings.warn(
                "'raw_transform' was found in kwargs. It will be used as the "
                "normalizer for the defect augmentation pipeline, which may lead to incorrect results "
                "if the normalizer maps to an unexpected data range."
            )
        raw_transform = torch_em.transform.get_raw_transform(
            normalizer=kwargs.pop("raw_transform", standardize),
            augmentation1=torch_em.transform.EMDefectAugmentation(**defect_augmentation_kwargs)
        )
        kwargs = util.update_kwargs(kwargs, "raw_transform", raw_transform)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="volumes/raw",
        label_paths=data_paths,
        label_key="volumes/labels/neuron_ids",
        patch_shape=patch_shape,
        rois=data_rois,
        **kwargs
    )
Get the CREMI dataset for the segmentation of neurons in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
- use_realigned: Use the realigned instead of the original training data.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- rois: The regions of interest to use for the samples.
- defect_augmentation_kwargs: Keyword arguments for defect augmentations.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_cremi_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Tuple[str, ...] = ('A', 'B', 'C'), use_realigned: bool = False, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, rois: Dict[str, Any] = {}, defect_augmentation_kwargs: Dict[str, Any] = {'p_drop_slice': 0.025, 'p_low_contrast': 0.025, 'p_deform_slice': 0.0, 'deformation_mode': 'compress'}, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cremi_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Tuple[str, ...] = ("A", "B", "C"),
    use_realigned: bool = False,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    rois: Optional[Dict[str, Any]] = {},
    defect_augmentation_kwargs: Optional[Dict[str, Any]] = {
        "p_drop_slice": 0.025,
        "p_low_contrast": 0.025,
        "p_deform_slice": 0.0,
        "deformation_mode": "compress",
    },
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for EM neuron segmentation in the CREMI dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
        use_realigned: Use the realigned instead of the original training data.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        rois: The regions of interest to use for the samples.
        defect_augmentation_kwargs: Keyword arguments for defect augmentations.
            The dict is copied before it is forwarded, so it is never modified.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the ones for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    # The dataset factory may add entries to the dict it receives. Forward a copy so that
    # neither the default dict of this signature (shared across calls) nor a caller-owned
    # dict is modified.
    if defect_augmentation_kwargs is not None:
        defect_augmentation_kwargs = dict(defect_augmentation_kwargs)

    ds = get_cremi_dataset(
        path=path,
        patch_shape=patch_shape,
        samples=samples,
        use_realigned=use_realigned,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        rois=rois,
        defect_augmentation_kwargs=defect_augmentation_kwargs,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
Get the DataLoader for EM neuron segmentation in the CREMI dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- samples: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
- use_realigned: Use the realigned instead of the original training data.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- rois: The regions of interest to use for the samples.
- defect_augmentation_kwargs: Keyword arguments for defect augmentations.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.