torch_em.data.datasets.electron_microscopy.mitoem
MitoEM is a dataset for segmenting mitochondria in electron microscopy.
It contains two large annotated volumes, one from rat cortex, the other from human cortex. This dataset was used for a segmentation challenge at ISBI 2022. If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
"""MitoEM is a dataset for segmenting mitochondria in electron microscopy.

It contains two large annotated volumes, one from rat cortex, the other from human cortex.
This dataset was used for a segmentation challenge at ISBI 2022.
If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
"""

import os
import multiprocessing
from concurrent import futures
from shutil import rmtree
from typing import List, Optional, Sequence, Tuple, Union

import imageio
import numpy as np
import torch_em
import z5py

from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from .. import util

# Download locations of the zipped image stacks ("raw") and annotations ("labels") per sample.
# Hugging Face files must be fetched via ".../resolve/main/..."; the ".../blob/main/..." form
# points to the HTML file viewer instead of the file content.
URLS = {
    "raw": {
        "human": "https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1",
        "rat": "https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip"
    },
    "labels": {
        "human": "https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1",
        "rat": "https://huggingface.co/datasets/pytc/MitoEM/resolve/main/EM30-R-mito-train-val-v2.zip"
    }
}
# Expected checksums of the archives in URLS; passed to `util.download_source`.
CHECKSUMS = {
    "raw": {
        "human": "98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041",
        "rat": "6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe"
    },
    "labels": {
        "human": "0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b",
        "rat": "c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86"
    }
}


def _check_data(path, sample):
    """Return True if the train, val and test n5 files of `sample` all exist in `path`."""
    splits = ["train", "val", "test"]
    expected_paths = [os.path.join(path, f"{sample}_{split}.n5") for split in splits]
    return all(os.path.exists(pp) for pp in expected_paths)


def get_slices(folder):
    """Get the sorted slice ids encoded in the file names of a folder.

    File names starting with 'im' are parsed after a two character prefix,
    all other names after a three character prefix (e.g. 'seg').

    Args:
        folder: The folder containing one file per slice.

    Returns:
        The integer slice ids, ordered by sorted file name.
    """
    files = os.listdir(folder)
    files.sort()
    files = [os.path.splitext(ff)[0] for ff in files]
    slice_ids = [int(ff[2:]) if ff.startswith('im') else int(ff[3:]) for ff in files]
    return slice_ids


def _load_vol(pattern, slice_ids, desc, n_threads, dtype=None):
    """Load the slices matching `pattern % slice_id` into a single stacked array.

    Args:
        pattern: A %-format string for the file path that takes the slice id.
        slice_ids: The ids of the slices to load, in stacking order.
        desc: The description shown in the progress bar.
        n_threads: The number of threads used for reading the slices.
        dtype: The data type of the output. By default the data type of the first slice is used.

    Returns:
        The array with one slice per entry along the first axis.
    """
    # The first slice is read up-front, because it determines the shape and default dtype.
    im0 = pattern % slice_ids[0]
    im0 = imageio.imread(im0)

    shape = (len(slice_ids),) + im0.shape

    dtype = im0.dtype if dtype is None else dtype
    out = np.zeros(shape, dtype=dtype)
    out[0] = im0

    def load_slice(z, slice_id):
        out[z] = imageio.imread(pattern % slice_id)

    zs = list(range(1, len(slice_ids)))
    assert len(zs) == len(slice_ids) - 1
    with futures.ThreadPoolExecutor(n_threads) as tp:
        # Consume the iterator so that all slices are loaded and worker errors surface here.
        list(tqdm(tp.map(load_slice, zs, slice_ids[1:]), total=len(slice_ids) - 1, desc=desc))

    return out


def _create_volume(out_path, im_folder, label_folder=None, z_start=None):
    """Convert image slices (and optionally label slices) into one n5 volume.

    Exactly one of `label_folder` and `z_start` must be given.

    Args:
        out_path: The path of the n5 file to write.
        im_folder: The folder with the raw slices, named 'im%04i.png'.
        label_folder: The folder with the label slices, named 'seg%04i.tif'.
            If given, the slice ids found in it select the raw slices.
        z_start: The first slice id of an unlabeled volume that extends to the
            number of slices in `im_folder`.

    Returns:
        The id of the last slice written to the volume.
    """
    if label_folder is None:
        assert z_start is not None
        n_slices = len(get_slices(im_folder))
        slices = list(range(z_start, n_slices))
    else:
        assert z_start is None
        slices = get_slices(label_folder)

    n_threads = min(16, multiprocessing.cpu_count())
    raw = _load_vol(os.path.join(im_folder, "im%04i.png"), slices, "load raw", n_threads)
    if label_folder is not None:
        labels = _load_vol(os.path.join(label_folder, "seg%04i.tif"), slices, "load labels", n_threads, dtype="uint64")

    print("Write volume to", out_path)
    chunks = (32, 256, 256)
    with z5py.File(out_path, "a") as f:
        f.create_dataset("raw", data=raw, chunks=chunks, compression="gzip", n_threads=n_threads)
        if label_folder is not None:
            ds = f.create_dataset("labels", data=labels, chunks=chunks, compression="gzip", n_threads=n_threads)
            # Note: the stored attribute is the largest label id plus one.
            ds.attrs["maxId"] = int(labels.max()) + 1

    return slices[-1]


def _require_mitoem_sample(path, sample, download):
    """Download the archives of a sample and convert them into train, val and test n5 volumes.

    Args:
        path: The folder where the archives are extracted and the volumes are written.
        sample: The name of the sample, a key of the entries in URLS ('human' or 'rat').
        download: Forwarded to `util.download_source`.
    """
    os.makedirs(path, exist_ok=True)

    for name in ("raw", "labels"):
        url = URLS[name][sample]
        checksum = CHECKSUMS[name][sample]
        # Raw and label archives share one file name; the archive is unzipped with
        # remove=True (presumably deleting it) before the next one is downloaded.
        zip_path = os.path.join(path, f"{sample}.zip")
        util.download_source(zip_path, url, download, checksum)
        util.unzip(zip_path, path, remove=True)

    im_folder = os.path.join(path, "im")
    train_folder = os.path.join(path, "mito-train-v2")
    val_folder = os.path.join(path, "mito-val-v2")

    print("Create train volume")
    train_path = os.path.join(path, f"{sample}_train.n5")
    _create_volume(train_path, im_folder, train_folder)

    print("Create validation volume")
    val_path = os.path.join(path, f"{sample}_val.n5")
    z = _create_volume(val_path, im_folder, val_folder)

    # NOTE(review): z is the id of the last validation slice, so the unlabeled test volume
    # starts at that slice and shares it with the validation volume. Confirm this is intended.
    print("Create test volume")
    test_path = os.path.join(path, f"{sample}_test.n5")
    _create_volume(test_path, im_folder, z_start=z)

    # The extracted slice folders are not needed once the volumes have been written.
    rmtree(im_folder)
    rmtree(train_folder)
    rmtree(val_folder)


def get_mitoem_data(
    path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool
) -> List[str]:
    """Download the MitoEM data and return the paths to the requested volumes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to download. The available samples are 'human' and 'rat'.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
            The 'test' volumes contain only raw data and no labels.
        download: Whether to download the data if it is not present.

    Returns:
        The paths to the downloaded and converted files.
    """
    # A bare string would otherwise be treated as a sequence of characters.
    if isinstance(splits, str):
        splits = [splits]
    if isinstance(samples, str):
        samples = [samples]
    assert len(set(splits) - {"train", "val", "test"}) == 0, f"{splits}"
    assert len(set(samples) - {"human", "rat"}) == 0, f"{samples}"
    os.makedirs(path, exist_ok=True)

    data_paths = []
    for sample in samples:
        if not _check_data(path, sample):
            print("The MitoEM data for sample", sample, "is not available yet and will be downloaded and created.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
            _require_mitoem_sample(path, sample, download)
            print("The MitoEM data for sample", sample, "has been created.")

        for split in splits:
            split_path = os.path.join(path, f"{sample}_{split}.n5")
            assert os.path.exists(split_path), split_path
            data_paths.append(split_path)
    return data_paths


def get_mitoem_dataset(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MitoEM dataset for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train' and 'val'.
            The 'test' split is not supported here, because its volumes do not contain labels.
        patch_shape: The patch shape to use for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3
    split_names = [splits] if isinstance(splits, str) else list(splits)
    # Fail before any download: the test volumes are written without a "labels" dataset.
    assert "test" not in split_names, \
        "The 'test' split has no labels and can't be used for a segmentation dataset."

    data_paths = get_mitoem_data(path, samples, split_names, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )
    raw_key = "raw"
    label_key = "labels"
    return torch_em.default_segmentation_dataset(data_paths, raw_key, data_paths, label_key, patch_shape, **kwargs)


def get_mitoem_loader(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MitoEM dataloader for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train' and 'val'.
            The 'test' split is not supported here, because its volumes do not contain labels.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_mitoem_dataset(
        path, splits, patch_shape,
        samples=samples, download=download,
        offsets=offsets, boundaries=boundaries, binary=binary,
        **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader
URLS =
{'raw': {'human': 'https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip'}, 'labels': {'human': 'https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/MitoEM/blob/main/EM30-R-mito-train-val-v2.zip'}}
CHECKSUMS =
{'raw': {'human': '98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041', 'rat': '6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe'}, 'labels': {'human': '0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b', 'rat': 'c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86'}}
def
get_slices(folder):
def
get_mitoem_data( path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool) -> List[str]:
def get_mitoem_data(
    path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool
) -> List[str]:
    """Download the MitoEM data and return the paths to the requested volumes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to download. The available samples are 'human' and 'rat'.
        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The paths to the downloaded and converted files.
    """
    # A bare string would otherwise be treated as a sequence of characters.
    if isinstance(splits, str):
        splits = [splits]
    if isinstance(samples, str):
        samples = [samples]
    # 'test' is a valid split: a test volume is created for every sample alongside train and val.
    assert len(set(splits) - {"train", "val", "test"}) == 0, f"{splits}"
    assert len(set(samples) - {"human", "rat"}) == 0, f"{samples}"
    os.makedirs(path, exist_ok=True)

    data_paths = []
    for sample in samples:
        # Download and convert the sample only if one of its volumes is missing.
        if not _check_data(path, sample):
            print("The MitoEM data for sample", sample, "is not available yet and will be downloaded and created.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
            _require_mitoem_sample(path, sample, download)
            print("The MitoEM data for sample", sample, "has been created.")

        for split in splits:
            split_path = os.path.join(path, f"{sample}_{split}.n5")
            assert os.path.exists(split_path), split_path
            data_paths.append(split_path)
    return data_paths
Download the MitoEM training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- samples: The samples to download. The available samples are 'human' and 'rat'.
- splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
- download: Whether to download the data if it is not present.
Returns:
The paths to the downloaded and converted files.
def
get_mitoem_dataset( path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_mitoem_dataset(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MitoEM dataset for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train' and 'val'.
            The 'test' split is not supported here, because this dataset reads a 'labels'
            key from every volume and the test volumes are created without labels.
        patch_shape: The patch shape to use for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3
    split_names = [splits] if isinstance(splits, str) else list(splits)
    # Fail before any download instead of requesting a label key that does not exist.
    assert "test" not in split_names, \
        "The 'test' split has no labels and can't be used for a segmentation dataset."

    data_paths = get_mitoem_data(path, samples, split_names, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )
    raw_key = "raw"
    label_key = "labels"
    # The raw data and the labels are stored in the same files, under different keys.
    return torch_em.default_segmentation_dataset(data_paths, raw_key, data_paths, label_key, patch_shape, **kwargs)
Get the MitoEM dataset for the segmentation of mitochondria in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
- patch_shape: The patch shape to use for training.
- samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
.
Returns:
The segmentation dataset.
def
get_mitoem_loader( path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_mitoem_loader(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MitoEM dataloader for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments of the dataset factory from the ones meant for the loader.
    dataset_args, dataloader_args = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    mitoem_dataset = get_mitoem_dataset(
        path,
        splits,
        patch_shape,
        samples=samples,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_args,
    )
    return torch_em.get_data_loader(mitoem_dataset, batch_size, **dataloader_args)
Get the MitoEM dataloader for the segmentation of mitochondria in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
or for the PyTorch DataLoader.
Returns:
The DataLoader.