torch_em.data.datasets.electron_microscopy.mitoem
MitoEM is a dataset for segmenting mitochondria in electron microscopy. It contains two large annotated volumes, one from rat cortex, the other from human cortex. This dataset was used for a segmentation challenge at ISBI 2022.
If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
"""MitoEM is a dataset for segmenting mitochondria in electron microscopy.
It contains two large annotated volumes, one from rat cortex, the other from human cortex.
This dataset was used for a segmentation challenge at ISBI 2022.

If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
"""

import os
from tqdm import tqdm
import multiprocessing
from shutil import rmtree
from concurrent import futures
from typing import List, Optional, Sequence, Tuple, Union

import imageio
import numpy as np

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util


# Download locations of the zipped image stacks ("raw") and annotations ("labels") per sample.
URLS = {
    "raw": {
        "human": "https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1",
        "rat": "https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip"
    },
    "labels": {
        "human": "https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1",
        "rat": "https://huggingface.co/datasets/pytc/MitoEM/resolve/main/EM30-R-mito-train-val-v2.zip"
    }
}
# Checksums passed to `util.download_source` together with the matching entry of `URLS`.
CHECKSUMS = {
    "raw": {
        "human": "98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041",
        "rat": "6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe"
    },
    "labels": {
        "human": "0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b",
        "rat": "c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86"
    }
}


def _check_data(path, sample):
    """Return True if the train, val and test volumes of `sample` all exist in `path`."""
    splits = ["train", "val", "test"]
    expected_paths = [os.path.join(path, f"{sample}_{split}.n5") for split in splits]
    return all(os.path.exists(pp) for pp in expected_paths)


def get_slices(folder):
    """Get the slice ids encoded in the file names of a folder.

    Args:
        folder: The folder with one file per slice. File names that start with 'im' carry a
            two-character prefix; all other names are expected to carry a three-character
            prefix (e.g. 'seg'). The rest of the name (without extension) must be an integer.

    Returns:
        The slice ids, in the order of the sorted file names.
    """
    files = os.listdir(folder)
    files.sort()
    files = [os.path.splitext(ff)[0] for ff in files]
    slice_ids = [int(ff[2:]) if ff.startswith('im') else int(ff[3:]) for ff in files]
    return slice_ids


def _load_vol(pattern, slice_ids, desc, n_threads, dtype=None):
    """Load a stack of 2d images into a single volume.

    Args:
        pattern: A %-format pattern that maps a slice id to the image file path.
        slice_ids: The ids of the slices to load, in the order they are stacked.
        desc: The description shown by the progress bar.
        n_threads: The number of threads used for reading the images.
        dtype: The data type of the volume. By default the data type of the first image is used.

    Returns:
        The volume, with the slices stacked along the first axis.
    """
    # The first slice is read up front to determine the shape (and default dtype) of the volume.
    im0 = pattern % slice_ids[0]
    im0 = imageio.imread(im0)

    shape = (len(slice_ids),) + im0.shape

    dtype = im0.dtype if dtype is None else dtype
    out = np.zeros(shape, dtype=dtype)
    out[0] = im0

    def load_slice(z, slice_id):
        out[z] = imageio.imread(pattern % slice_id)

    # The remaining slices are read in parallel; each call writes to its own position in `out`.
    zs = list(range(1, len(slice_ids)))
    assert len(zs) == len(slice_ids) - 1
    with futures.ThreadPoolExecutor(n_threads) as tp:
        list(tqdm(tp.map(load_slice, zs, slice_ids[1:]), total=len(slice_ids) - 1, desc=desc))

    return out


def _create_volume(out_path, im_folder, label_folder=None, z_start=None):
    """Write the raw data, and optionally the labels, of a range of slices to an n5 file.

    Exactly one of `label_folder` and `z_start` must be given.

    Args:
        out_path: The path of the n5 file to write.
        im_folder: The folder with the raw images ('im%04i.png').
        label_folder: The folder with the label images ('seg%04i.tif'). If given, the slices
            found in this folder are exported together with their labels.
        z_start: The first slice to export for a volume without labels. All slices from
            `z_start` up to the number of files in `im_folder` are exported.

    Returns:
        The id of the last slice that was written.
    """
    import z5py

    if label_folder is None:
        assert z_start is not None
        n_slices = len(get_slices(im_folder))
        slices = list(range(z_start, n_slices))
    else:
        assert z_start is None
        slices = get_slices(label_folder)

    n_threads = min(16, multiprocessing.cpu_count())
    raw = _load_vol(os.path.join(im_folder, "im%04i.png"), slices, "load raw", n_threads)
    if label_folder is not None:
        labels = _load_vol(os.path.join(label_folder, "seg%04i.tif"), slices, "load labels", n_threads, dtype="uint64")

    print("Write volume to", out_path)
    chunks = (32, 256, 256)
    with z5py.File(out_path, "a") as f:
        f.create_dataset("raw", data=raw, chunks=chunks, compression="gzip", n_threads=n_threads)
        if label_folder is not None:
            ds = f.create_dataset("labels", data=labels, chunks=chunks, compression="gzip", n_threads=n_threads)
            ds.attrs["maxId"] = int(labels.max()) + 1

    return slices[-1]


def _require_mitoem_sample(path, sample, download):
    """Download one MitoEM sample and convert it into a train, val and test n5 volume.

    Args:
        path: The folder where the data is downloaded to and where the volumes are written.
        sample: The name of the sample, either 'human' or 'rat'.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)

    # Both archives are stored under the same temporary name; each is removed after unzipping.
    for name in ("raw", "labels"):
        url = URLS[name][sample]
        checksum = CHECKSUMS[name][sample]
        zip_path = os.path.join(path, f"{sample}.zip")
        util.download_source(zip_path, url, download, checksum)
        util.unzip(zip_path, path, remove=True)

    im_folder = os.path.join(path, "im")
    train_folder = os.path.join(path, "mito-train-v2")
    val_folder = os.path.join(path, "mito-val-v2")

    print("Create train volume")
    train_path = os.path.join(path, f"{sample}_train.n5")
    _create_volume(train_path, im_folder, train_folder)

    print("Create validation volume")
    val_path = os.path.join(path, f"{sample}_val.n5")
    z = _create_volume(val_path, im_folder, val_folder)

    # `_create_volume` returns the id of the last validation slice, so the test volume has to
    # start one slice later; otherwise that slice would be part of both volumes.
    print("Create test volume")
    test_path = os.path.join(path, f"{sample}_test.n5")
    _create_volume(test_path, im_folder, z_start=z + 1)

    # The extracted image folders are no longer needed once the volumes are written.
    rmtree(im_folder)
    rmtree(train_folder)
    rmtree(val_folder)


def get_mitoem_data(path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool):
    """Download the MitoEM data and convert it into one n5 volume per sample and split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to download. The available samples are 'human' and 'rat'.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
            The test volumes are written without labels.
        download: Whether to download the data if it is not present.
    """
    # A bare string would otherwise be validated character by character.
    if isinstance(splits, str):
        splits = [splits]
    if isinstance(samples, str):
        samples = [samples]

    # The test volume is created together with train and val, so it is a valid split.
    assert len(set(splits) - {"train", "val", "test"}) == 0, f"{splits}"
    assert len(set(samples) - {"human", "rat"}) == 0, f"{samples}"
    os.makedirs(path, exist_ok=True)

    for sample in samples:
        if not _check_data(path, sample):
            print("The MitoEM data for sample", sample, "is not available yet and will be downloaded and created.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
            _require_mitoem_sample(path, sample, download)
            print("The MitoEM data for sample", sample, "has been created.")

        for split in splits:
            split_path = os.path.join(path, f"{sample}_{split}.n5")
            assert os.path.exists(split_path), split_path


def get_mitoem_paths(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
) -> List[str]:
    """Get paths for MitoEM data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
        samples: The samples to download. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths for the stored data.
    """
    if isinstance(splits, str):
        splits = [splits]

    if isinstance(samples, str):
        samples = [samples]

    get_mitoem_data(path, samples, splits, download)
    data_paths = [os.path.join(path, f"{sample}_{split}.n5") for split in splits for sample in samples]

    return data_paths


def get_mitoem_dataset(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MitoEM dataset for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
        patch_shape: The patch shape to use for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    data_paths = get_mitoem_paths(path, splits, samples, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels are stored in the same n5 files under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs
    )


def get_mitoem_loader(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MitoEM dataloader for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_mitoem_dataset(path, splits, patch_shape, samples, download, offsets, boundaries, binary, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS =
{'raw': {'human': 'https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip'}, 'labels': {'human': 'https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/MitoEM/resolve/main/EM30-R-mito-train-val-v2.zip'}}
CHECKSUMS =
{'raw': {'human': '98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041', 'rat': '6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe'}, 'labels': {'human': '0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b', 'rat': 'c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86'}}
def
get_slices(folder):
def
get_mitoem_data( path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool):
def get_mitoem_data(path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool):
    """Download the MitoEM data and convert it into one n5 volume per sample and split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to download. The available samples are 'human' and 'rat'.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
        download: Whether to download the data if it is not present.
    """
    # A bare string would otherwise be validated character by character.
    if isinstance(splits, str):
        splits = [splits]
    if isinstance(samples, str):
        samples = [samples]

    # The test volume is created together with train and val, so it is a valid split.
    assert len(set(splits) - {"train", "val", "test"}) == 0, f"{splits}"
    assert len(set(samples) - {"human", "rat"}) == 0, f"{samples}"
    os.makedirs(path, exist_ok=True)

    for sample in samples:
        # All three volumes of a sample are created in one go if any of them is missing.
        if not _check_data(path, sample):
            print("The MitoEM data for sample", sample, "is not available yet and will be downloaded and created.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
            _require_mitoem_sample(path, sample, download)
            print("The MitoEM data for sample", sample, "has been created.")

        for split in splits:
            split_path = os.path.join(path, f"{sample}_{split}.n5")
            assert os.path.exists(split_path), split_path
Download the MitoEM training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- samples: The samples to download. The available samples are 'human' and 'rat'.
- splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
- download: Whether to download the data if it is not present.
def
get_mitoem_paths( path: Union[os.PathLike, str], splits: Sequence[str], samples: Sequence[str] = ('human', 'rat'), download: bool = False) -> List[str]:
def get_mitoem_paths(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
) -> List[str]:
    """Return the n5 file paths of the requested MitoEM volumes, creating the data if needed.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The data splits to use; a single name may be passed as a plain string.
        samples: The samples to use, 'human' and/or 'rat'; a single name may be passed as a plain string.
        download: Whether to download the data if it is not present.

    Returns:
        One path per combination of split and sample, ordered by split first and by sample second.
    """
    # A single name is shorthand for a one-element list.
    splits = [splits] if isinstance(splits, str) else splits
    samples = [samples] if isinstance(samples, str) else samples

    get_mitoem_data(path, samples, splits, download)

    volume_paths = []
    for split in splits:
        for sample in samples:
            volume_paths.append(os.path.join(path, f"{sample}_{split}.n5"))
    return volume_paths
Get paths for MitoEM data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- samples: The samples to download. The available samples are 'human' and 'rat'.
- splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
- download: Whether to download the data if it is not present.
Returns:
The filepaths for the stored data.
def
get_mitoem_dataset( path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_mitoem_dataset(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Build the MitoEM dataset for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset.
        patch_shape: The 3d patch shape to use for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    volume_paths = get_mitoem_paths(path, splits, samples, download)

    # The helper returns updated dataset kwargs for the requested target type;
    # its second return value is not needed here.
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    # Each n5 file serves as both raw and label source, distinguished by the dataset key.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths, raw_key="raw",
        label_paths=volume_paths, label_key="labels",
        patch_shape=patch_shape, **kwargs
    )
    return dataset
Get the MitoEM dataset for the segmentation of mitochondria in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
- patch_shape: The patch shape to use for training.
- samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
.
Returns:
The segmentation dataset.
def
get_mitoem_loader( path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_mitoem_loader(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the MitoEM dataloader for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the extra keyword arguments between the dataset and the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_mitoem_dataset(
        path=path,
        splits=splits,
        patch_shape=patch_shape,
        samples=samples,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader
Get the MitoEM dataloader for the segmentation of mitochondria in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
or for the PyTorch DataLoader.
Returns:
The DataLoader.