torch_em.data.datasets.electron_microscopy.mitoem
MitoEM is a dataset for segmenting mitochondria in electron microscopy. It contains two large annotated volumes, one from rat cortex, the other from human cortex. This dataset was used for a segmentation challenge at ISBI 2022.
If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
"""MitoEM is a dataset for segmenting mitochondria in electron microscopy.
It contains two large annotated volumes, one from rat cortex, the other from human cortex.
This dataset was used for a segmentation challenge at ISBI 2022.

If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
"""

import os
from tqdm import tqdm
import multiprocessing
from shutil import rmtree
from concurrent import futures
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import imageio.v3 as imageio

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util


# Download locations of the zipped image stacks ("raw") and annotations ("labels"), per sample.
URLS = {
    "raw": {
        "human": "https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1",
        "rat": "https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip"
    },
    "labels": {
        "human": "https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1",
        "rat": "https://huggingface.co/datasets/pytc/MitoEM/resolve/main/EM30-R-mito-train-val-v2.zip"
    }
}

# Checksums passed to `util.download_source` together with the matching entry of URLS.
CHECKSUMS = {
    "raw": {
        "human": "98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041",
        "rat": "6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe"
    },
    "labels": {
        "human": "0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b",
        "rat": "c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86"
    }
}


def _check_data(path, sample):
    """Return True if the train, val and test n5 volumes of `sample` all exist in `path`."""
    splits = ["train", "val", "test"]
    expected_paths = [os.path.join(path, f"{sample}_{split}.n5") for split in splits]
    return all(os.path.exists(pp) for pp in expected_paths)


def _get_slices(folder):
    """Return the integer slice ids encoded in the file names of `folder`, in sorted file-name order.

    For names starting with 'im' the first two characters are dropped before the integer conversion,
    for all other names the first three characters are dropped.
    """
    files = os.listdir(folder)
    files.sort()
    files = [os.path.splitext(ff)[0] for ff in files]
    slice_ids = [int(ff[2:]) if ff.startswith('im') else int(ff[3:]) for ff in files]
    return slice_ids


def _load_vol(pattern, slice_ids, desc, n_threads, dtype=None):
    """Load the images `pattern % slice_id` for all `slice_ids` and stack them along the first axis.

    Args:
        pattern: The %-format pattern that yields the image path for a slice id.
        slice_ids: The slice ids to load. Must contain at least one element.
        desc: The description shown in the progress bar.
        n_threads: The number of threads used for loading the images.
        dtype: The data type of the output array. By default the data type of the first image is used.

    Returns:
        The array with the loaded images.
    """
    # The first image is loaded up-front to derive the shape (and default dtype) of the output.
    im0 = pattern % slice_ids[0]
    im0 = imageio.imread(im0)

    shape = (len(slice_ids),) + im0.shape

    dtype = im0.dtype if dtype is None else dtype
    out = np.zeros(shape, dtype=dtype)
    out[0] = im0

    def load_slice(z, slice_id):
        out[z] = imageio.imread(pattern % slice_id)

    zs = list(range(1, len(slice_ids)))
    assert len(zs) == len(slice_ids) - 1
    with futures.ThreadPoolExecutor(n_threads) as tp:
        # list(...) drains the lazy map so that all slices are loaded before the pool is left.
        list(tqdm(tp.map(load_slice, zs, slice_ids[1:]), total=len(slice_ids) - 1, desc=desc))

    return out


def _create_volume(out_path, im_folder, label_folder=None, z_start=None):
    """Write an n5 file with a 'raw' dataset and, if `label_folder` is given, a 'labels' dataset.

    Exactly one of `label_folder` and `z_start` must be given: with `label_folder` the slice ids are
    taken from the label files, with `z_start` the slices `z_start, ..., n_images - 1` are used.

    Args:
        out_path: The path of the n5 file to write.
        im_folder: The folder with the image files 'im%04i.png'.
        label_folder: The folder with the label files 'seg%04i.tif'.
        z_start: The first slice id to use if no label folder is given.

    Returns:
        The last slice id that was written to the volume.
    """
    import z5py

    if label_folder is None:
        assert z_start is not None
        n_slices = len(_get_slices(im_folder))
        # NOTE(review): the range starts at z_start itself. The caller passes the last slice id of the
        # validation volume, so that slice also ends up in the test volume - confirm whether this is intended.
        slices = list(range(z_start, n_slices))
    else:
        assert z_start is None
        slices = _get_slices(label_folder)

    n_threads = min(16, multiprocessing.cpu_count())
    raw = _load_vol(os.path.join(im_folder, "im%04i.png"), slices, "load raw", n_threads)
    if label_folder is not None:
        labels = _load_vol(os.path.join(label_folder, "seg%04i.tif"), slices, "load labels", n_threads, dtype="uint64")

    print("Write volume to", out_path)
    chunks = (32, 256, 256)
    with z5py.File(out_path, "a") as f:
        f.create_dataset("raw", data=raw, chunks=chunks, compression="gzip", n_threads=n_threads)
        if label_folder is not None:
            ds = f.create_dataset("labels", data=labels, chunks=chunks, compression="gzip", n_threads=n_threads)
            ds.attrs["maxId"] = int(labels.max()) + 1

    return slices[-1]


def _require_mitoem_sample(path, sample, download):
    """Download the raw and label archives of `sample` and convert them to train, val and test n5 volumes.

    The extracted image and label folders are removed once the three volumes have been written.
    """
    os.makedirs(path, exist_ok=True)

    for name in ("raw", "labels"):
        url = URLS[name][sample]
        checksum = CHECKSUMS[name][sample]
        # Both archives use the same temporary zip path; it is removed again by the unzip call.
        zip_path = os.path.join(path, f"{sample}.zip")
        util.download_source(zip_path, url, download, checksum)
        util.unzip(zip_path, path, remove=True)

    im_folder = os.path.join(path, "im")
    train_folder = os.path.join(path, "mito-train-v2")
    val_folder = os.path.join(path, "mito-val-v2")

    print("Create train volume")
    train_path = os.path.join(path, f"{sample}_train.n5")
    _create_volume(train_path, im_folder, train_folder)

    print("Create validation volume")
    val_path = os.path.join(path, f"{sample}_val.n5")
    z = _create_volume(val_path, im_folder, val_folder)

    # The test volume has no labels; it covers the image slices from the end of the validation volume on.
    print("Create test volume")
    test_path = os.path.join(path, f"{sample}_test.n5")
    _create_volume(test_path, im_folder, z_start=z)

    rmtree(im_folder)
    rmtree(train_folder)
    rmtree(val_folder)


def get_mitoem_data(path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool):
    """Download the MitoEM data and convert it to n5 volumes, if they are not present yet.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to download. The available samples are 'human' and 'rat'.
            A single sample may also be passed as a string.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
            A single split may also be passed as a string.
        download: Whether to download the data if it is not present.

    Raises:
        AssertionError: If an unknown sample or split is requested,
            or if the volume for a requested split does not exist after the data was created.
    """
    # A bare string would be broken up into its characters by set() below, so wrap it in a list.
    if isinstance(splits, str):
        splits = [splits]
    if isinstance(samples, str):
        samples = [samples]

    # 'test' is a valid split: a '<sample>_test.n5' volume is created next to the train and val volumes.
    assert len(set(splits) - {"train", "val", "test"}) == 0, f"{splits}"
    assert len(set(samples) - {"human", "rat"}) == 0, f"{samples}"
    os.makedirs(path, exist_ok=True)

    for sample in samples:
        if not _check_data(path, sample):
            print("The MitoEM data for sample", sample, "is not available yet and will be downloaded and created.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
            _require_mitoem_sample(path, sample, download)
            print("The MitoEM data for sample", sample, "has been created.")

        for split in splits:
            split_path = os.path.join(path, f"{sample}_{split}.n5")
            assert os.path.exists(split_path), split_path


def get_mitoem_paths(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
) -> List[str]:
    """Get paths for MitoEM data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
        samples: The samples to download. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths for the stored data.
    """
    if isinstance(splits, str):
        splits = [splits]

    if isinstance(samples, str):
        samples = [samples]

    get_mitoem_data(path, samples, splits, download)
    data_paths = [os.path.join(path, f"{sample}_{split}.n5") for split in splits for sample in samples]

    return data_paths


def get_mitoem_dataset(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MitoEM dataset for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
        patch_shape: The patch shape to use for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    data_paths = get_mitoem_paths(path, splits, samples, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels are stored in the same n5 files, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs
    )


def get_mitoem_loader(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MitoEM dataloader for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_mitoem_dataset(path, splits, patch_shape, samples, download, offsets, boundaries, binary, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS =
{'raw': {'human': 'https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip'}, 'labels': {'human': 'https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/MitoEM/resolve/main/EM30-R-mito-train-val-v2.zip'}}
CHECKSUMS =
{'raw': {'human': '98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041', 'rat': '6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe'}, 'labels': {'human': '0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b', 'rat': 'c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86'}}
def
get_mitoem_data( path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool):
def get_mitoem_data(path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool):
    """Download the MitoEM data and convert it to n5 volumes, if they are not present yet.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to download. The available samples are 'human' and 'rat'.
            A single sample may also be passed as a string.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
            A single split may also be passed as a string.
        download: Whether to download the data if it is not present.

    Raises:
        AssertionError: If an unknown sample or split is requested,
            or if the volume for a requested split does not exist after the data was created.
    """
    # A bare string would be broken up into its characters by set() below, so wrap it in a list.
    if isinstance(splits, str):
        splits = [splits]
    if isinstance(samples, str):
        samples = [samples]

    # 'test' is a valid split: a '<sample>_test.n5' volume is created next to the train and val volumes.
    assert len(set(splits) - {"train", "val", "test"}) == 0, f"{splits}"
    assert len(set(samples) - {"human", "rat"}) == 0, f"{samples}"
    os.makedirs(path, exist_ok=True)

    for sample in samples:
        if not _check_data(path, sample):
            print("The MitoEM data for sample", sample, "is not available yet and will be downloaded and created.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
            _require_mitoem_sample(path, sample, download)
            print("The MitoEM data for sample", sample, "has been created.")

        # Verify that every requested split is available for this sample.
        for split in splits:
            split_path = os.path.join(path, f"{sample}_{split}.n5")
            assert os.path.exists(split_path), split_path
Download the MitoEM training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- samples: The samples to download. The available samples are 'human' and 'rat'.
- splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
- download: Whether to download the data if it is not present.
def
get_mitoem_paths( path: Union[os.PathLike, str], splits: Sequence[str], samples: Sequence[str] = ('human', 'rat'), download: bool = False) -> List[str]:
def get_mitoem_paths(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
) -> List[str]:
    """Return the filepaths of the MitoEM n5 volumes for the requested splits and samples.

    The data is made available via `get_mitoem_data` before the paths are assembled.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
        samples: The samples to download. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths for the stored data, ordered by split first and by sample second.
    """
    # Accept a single name given as a plain string.
    splits = [splits] if isinstance(splits, str) else splits
    samples = [samples] if isinstance(samples, str) else samples

    get_mitoem_data(path, samples, splits, download)

    volume_paths = []
    for split_name in splits:
        for sample_name in samples:
            volume_paths.append(os.path.join(path, f"{sample_name}_{split_name}.n5"))
    return volume_paths
Get paths for MitoEM data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- samples: The samples to download. The available samples are 'human' and 'rat'.
- splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
- download: Whether to download the data if it is not present.
Returns:
The filepaths for the stored data.
def
get_mitoem_dataset( path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_mitoem_dataset(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Build the MitoEM dataset for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
        patch_shape: The patch shape to use for training. Must have three entries.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    volume_paths = get_mitoem_paths(path, splits, samples, download)

    # Let the shared helper update the dataset arguments according to the selected target options.
    dataset_kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels are read from the same n5 files, under different keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths, raw_key="raw",
        label_paths=volume_paths, label_key="labels",
        patch_shape=patch_shape, **dataset_kwargs
    )
    return dataset
Get the MitoEM dataset for the segmentation of mitochondria in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
- patch_shape: The patch shape to use for training.
- samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
.
Returns:
The segmentation dataset.
def
get_mitoem_loader( path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_mitoem_loader(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the MitoEM dataloader for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    mitoem_dataset = get_mitoem_dataset(
        path,
        splits,
        patch_shape,
        samples=samples,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs,
    )

    return torch_em.get_data_loader(mitoem_dataset, batch_size, **dataloader_kwargs)
Get the MitoEM dataloader for the segmentation of mitochondria in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
or for the PyTorch DataLoader.
Returns:
The DataLoader.