torch_em.data.datasets.electron_microscopy.mitoemv2
MitoEM v2 is a benchmark collection for 3D mitochondria instance segmentation in electron microscopy.
It contains eight challenging datasets with expert-verified labels, covering biologically complex scenarios such as dense mitochondrial packing, hyperfused networks, thin-necked morphologies, and ultrastructurally ambiguous boundaries.
The data is located at https://doi.org/10.5281/zenodo.17635006. The dataset is from the publication https://doi.org/10.1101/2025.11.12.687478. Please cite it if you use this dataset in your research.
1"""MitoEM v2 is a benchmark collection for 3D mitochondria instance segmentation in electron microscopy. 2 3It contains eight challenging datasets with expert-verified labels, covering biologically complex 4scenarios such as dense mitochondrial packing, hyperfused networks, thin-necked morphologies, 5and ultrastructurally ambiguous boundaries. 6 7The data is located at https://doi.org/10.5281/zenodo.17635006. 8The dataset is from the publication https://doi.org/10.1101/2025.11.12.687478. 9Please cite it if you use this dataset in your research. 10""" 11 12import os 13from glob import glob 14from typing import Union, Literal, Optional, Tuple, List 15 16import numpy as np 17 18from torch.utils.data import Dataset, DataLoader 19 20import torch_em 21 22from .. import util 23 24 25BASE_URL = "https://zenodo.org/records/17635006/files" 26 27DATASETS = { 28 "beta": "Dataset001_ME2-Beta", 29 "jurkat": "Dataset002_ME2-Jurkat", 30 "macro": "Dataset003_ME2-Macro", 31 "mossy": "Dataset004_ME2-Mossy", 32 "podo": "Dataset005_ME2-Podo", 33 "pyra": "Dataset006_ME2-Pyra", 34 "sperm": "Dataset007_ME2-Sperm", 35 "stem": "Dataset008_ME2-Stem", 36} 37 38DATASET_NAMES = list(DATASETS.keys()) 39 40 41def _convert_nifti_to_n5(nifti_path, n5_path): 42 """Convert NIfTI file to n5 format for efficient access.""" 43 import nibabel as nib 44 import z5py 45 46 if os.path.exists(n5_path): 47 return 48 49 nii = nib.load(nifti_path) 50 data = np.asarray(nii.dataobj) 51 52 # NIfTI stores as (X, Y, Z), we want (Z, Y, X) 53 data = np.transpose(data, (2, 1, 0)) 54 55 chunks = (32, 256, 256) 56 with z5py.File(n5_path, "a") as f: 57 f.create_dataset("data", data=data, chunks=chunks, compression="gzip") 58 59 60def _preprocess_dataset(path, dataset_name, dataset_dir): 61 """Preprocess a single dataset: convert NIfTI to n5.""" 62 import json 63 64 n5_dir = os.path.join(path, "n5_data", dataset_name) 65 os.makedirs(n5_dir, exist_ok=True) 66 67 # Read split info 68 with open(os.path.join(dataset_dir, 
"split.json")) as f: 69 split_info = json.load(f)[0] 70 71 processed = {} 72 for split_name, split_key in [("train", "train"), ("val", "val"), ("test", "test")]: 73 samples = split_info.get(split_key, []) 74 if not samples: 75 continue 76 77 for sample in samples: 78 # Determine source directories based on split 79 if split_name == "test": 80 img_dir = "imagesTs" 81 lbl_dir = "labelsTs" 82 else: 83 img_dir = "imagesTr" 84 lbl_dir = "labelsTr" 85 86 img_nifti = os.path.join(dataset_dir, img_dir, f"{sample}_0000.nii.gz") 87 lbl_nifti = os.path.join(dataset_dir, lbl_dir, f"{sample}.nii.gz") 88 89 if not os.path.exists(img_nifti) or not os.path.exists(lbl_nifti): 90 continue 91 92 n5_path = os.path.join(n5_dir, f"{sample}.n5") 93 94 if not os.path.exists(n5_path): 95 print(f"Converting {sample} to n5...") 96 _convert_nifti_to_n5(img_nifti, os.path.join(n5_dir, f"{sample}_raw.n5")) 97 _convert_nifti_to_n5(lbl_nifti, os.path.join(n5_dir, f"{sample}_labels.n5")) 98 99 # Combine into single n5 file 100 import z5py 101 with z5py.File(os.path.join(n5_dir, f"{sample}_raw.n5"), "r") as f_raw: 102 raw = f_raw["data"][:] 103 with z5py.File(os.path.join(n5_dir, f"{sample}_labels.n5"), "r") as f_lbl: 104 labels = f_lbl["data"][:] 105 106 with z5py.File(n5_path, "a") as f: 107 f.create_dataset("raw", data=raw, chunks=(32, 256, 256), compression="gzip") 108 f.create_dataset("labels", data=labels.astype("uint64"), chunks=(32, 256, 256), compression="gzip") 109 110 # Clean up temp files 111 import shutil 112 shutil.rmtree(os.path.join(n5_dir, f"{sample}_raw.n5")) 113 shutil.rmtree(os.path.join(n5_dir, f"{sample}_labels.n5")) 114 115 if split_name not in processed: 116 processed[split_name] = [] 117 processed[split_name].append(n5_path) 118 119 return processed 120 121 122def get_mitoemv2_data( 123 path: Union[os.PathLike, str], 124 dataset: str, 125 download: bool = False, 126) -> str: 127 """Download and preprocess a MitoEM v2 dataset. 
128 129 Args: 130 path: Filepath to a folder where the downloaded data will be saved. 131 dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy', 132 'podo', 'pyra', 'sperm', or 'stem'. 133 download: Whether to download the data if it is not present. 134 135 Returns: 136 The filepath to the preprocessed n5 data directory. 137 """ 138 assert dataset in DATASETS, f"'{dataset}' is not valid. Choose from {DATASET_NAMES}." 139 140 dataset_folder = DATASETS[dataset] 141 n5_dir = os.path.join(path, "n5_data", dataset) 142 143 # Check if already preprocessed 144 if os.path.exists(n5_dir) and len(glob(os.path.join(n5_dir, "*.n5"))) > 0: 145 return n5_dir 146 147 # Download if needed 148 zip_path = os.path.join(path, f"{dataset_folder}.zip") 149 dataset_dir = os.path.join(path, dataset_folder) 150 151 if not os.path.exists(dataset_dir): 152 os.makedirs(path, exist_ok=True) 153 url = f"{BASE_URL}/{dataset_folder}.zip" 154 util.download_source(path=zip_path, url=url, download=download, checksum=None) 155 util.unzip(zip_path=zip_path, dst=path) 156 157 # Preprocess 158 _preprocess_dataset(path, dataset, dataset_dir) 159 160 return n5_dir 161 162 163def get_mitoemv2_paths( 164 path: Union[os.PathLike, str], 165 dataset: Optional[Union[str, List[str]]] = None, 166 split: Literal["train", "val", "test"] = "train", 167 download: bool = False, 168) -> List[str]: 169 """Get paths to the MitoEM v2 data. 170 171 Args: 172 path: Filepath to a folder where the downloaded data will be saved. 173 dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 174 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. 175 If None, all datasets will be used. 176 split: The data split to use. One of 'train', 'val', or 'test'. 177 download: Whether to download the data if it is not present. 178 179 Returns: 180 List of filepaths for the n5 data. 
181 """ 182 import json 183 from natsort import natsorted 184 185 assert split in ("train", "val", "test"), f"'{split}' is not a valid split." 186 187 if dataset is None: 188 dataset = DATASET_NAMES 189 elif isinstance(dataset, str): 190 dataset = [dataset] 191 192 all_n5_paths = [] 193 for ds in dataset: 194 n5_dir = get_mitoemv2_data(path, ds, download) 195 196 # Read split info to get correct samples 197 dataset_folder = DATASETS[ds] 198 dataset_dir = os.path.join(path, dataset_folder) 199 with open(os.path.join(dataset_dir, "split.json")) as f: 200 split_info = json.load(f)[0] 201 202 samples = split_info.get(split, []) 203 n5_paths = [os.path.join(n5_dir, f"{sample}.n5") for sample in samples] 204 n5_paths = [p for p in n5_paths if os.path.exists(p)] 205 all_n5_paths.extend(n5_paths) 206 207 assert len(all_n5_paths) > 0, f"No data found for {dataset}/{split}" 208 209 return natsorted(all_n5_paths) 210 211 212def get_mitoemv2_dataset( 213 path: Union[os.PathLike, str], 214 patch_shape: Tuple[int, int, int], 215 dataset: Optional[Union[str, List[str]]] = None, 216 split: Literal["train", "val", "test"] = "train", 217 download: bool = False, 218 offsets: Optional[List[List[int]]] = None, 219 boundaries: bool = False, 220 binary: bool = False, 221 **kwargs 222) -> Dataset: 223 """Get the MitoEM v2 dataset for mitochondria segmentation in EM. 224 225 Args: 226 path: Filepath to a folder where the downloaded data will be saved. 227 patch_shape: The patch shape to use for training. 228 dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 229 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. 230 If None, all datasets will be used. 231 split: The data split to use. One of 'train', 'val', or 'test'. 232 download: Whether to download the data if it is not present. 233 offsets: Offset values for affinity computation used as target. 234 boundaries: Whether to compute boundaries as the target. 
235 binary: Whether to return a binary segmentation target. 236 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 237 238 Returns: 239 The segmentation dataset. 240 """ 241 assert len(patch_shape) == 3 242 243 n5_paths = get_mitoemv2_paths(path, dataset, split, download) 244 245 kwargs, _ = util.add_instance_label_transform( 246 kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets 247 ) 248 249 return torch_em.default_segmentation_dataset( 250 raw_paths=n5_paths, 251 raw_key="raw", 252 label_paths=n5_paths, 253 label_key="labels", 254 patch_shape=patch_shape, 255 **kwargs 256 ) 257 258 259def get_mitoemv2_loader( 260 path: Union[os.PathLike, str], 261 batch_size: int, 262 patch_shape: Tuple[int, int, int], 263 dataset: Optional[Union[str, List[str]]] = None, 264 split: Literal["train", "val", "test"] = "train", 265 download: bool = False, 266 offsets: Optional[List[List[int]]] = None, 267 boundaries: bool = False, 268 binary: bool = False, 269 **kwargs 270) -> DataLoader: 271 """Get the MitoEM v2 dataloader for mitochondria segmentation in EM. 272 273 Args: 274 path: Filepath to a folder where the downloaded data will be saved. 275 batch_size: The batch size for training. 276 patch_shape: The patch shape to use for training. 277 dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 278 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. 279 If None, all datasets will be used. 280 split: The data split to use. One of 'train', 'val', or 'test'. 281 download: Whether to download the data if it is not present. 282 offsets: Offset values for affinity computation used as target. 283 boundaries: Whether to compute boundaries as the target. 284 binary: Whether to return a binary segmentation target. 285 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 286 287 Returns: 288 The DataLoader. 
289 """ 290 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 291 dataset_obj = get_mitoemv2_dataset( 292 path=path, 293 patch_shape=patch_shape, 294 dataset=dataset, 295 split=split, 296 download=download, 297 offsets=offsets, 298 boundaries=boundaries, 299 binary=binary, 300 **ds_kwargs, 301 ) 302 return torch_em.get_data_loader(dataset=dataset_obj, batch_size=batch_size, **loader_kwargs)
def get_mitoemv2_data(
    path: Union[os.PathLike, str],
    dataset: str,
    download: bool = False,
) -> str:
    """Download and preprocess a MitoEM v2 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed n5 data directory.

    Raises:
        ValueError: If `dataset` is not a valid dataset name.
    """
    # Use an explicit exception instead of `assert`: asserts are stripped under `python -O`,
    # which would let an invalid name slip through to a KeyError below.
    if dataset not in DATASETS:
        raise ValueError(f"'{dataset}' is not valid. Choose from {DATASET_NAMES}.")

    dataset_folder = DATASETS[dataset]
    n5_dir = os.path.join(path, "n5_data", dataset)

    # Nothing to do if the preprocessed n5 volumes already exist.
    if os.path.exists(n5_dir) and glob(os.path.join(n5_dir, "*.n5")):
        return n5_dir

    zip_path = os.path.join(path, f"{dataset_folder}.zip")
    dataset_dir = os.path.join(path, dataset_folder)

    # Download and unpack the raw NIfTI data if it is not present yet.
    if not os.path.exists(dataset_dir):
        os.makedirs(path, exist_ok=True)
        url = f"{BASE_URL}/{dataset_folder}.zip"
        util.download_source(path=zip_path, url=url, download=download, checksum=None)
        util.unzip(zip_path=zip_path, dst=path)

    # Convert the NIfTI volumes to n5 for efficient chunked access.
    _preprocess_dataset(path, dataset, dataset_dir)

    return n5_dir
Download and preprocess a MitoEM v2 dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the preprocessed n5 data directory.
def get_mitoemv2_paths(
    path: Union[os.PathLike, str],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the MitoEM v2 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the n5 data.

    Raises:
        ValueError: If `split` is not a valid split name.
        RuntimeError: If no data is found for the requested dataset(s) and split.
    """
    import json
    from natsort import natsorted

    # Explicit exceptions instead of `assert`: asserts are stripped under `python -O`.
    if split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split.")

    if dataset is None:
        dataset = DATASET_NAMES
    elif isinstance(dataset, str):
        dataset = [dataset]

    all_n5_paths = []
    for ds in dataset:
        n5_dir = get_mitoemv2_data(path, ds, download)

        # Read the split info to select the samples belonging to the requested split.
        dataset_dir = os.path.join(path, DATASETS[ds])
        with open(os.path.join(dataset_dir, "split.json")) as f:
            split_info = json.load(f)[0]

        samples = split_info.get(split, [])
        n5_paths = [os.path.join(n5_dir, f"{sample}.n5") for sample in samples]
        all_n5_paths.extend(p for p in n5_paths if os.path.exists(p))

    if not all_n5_paths:
        raise RuntimeError(f"No data found for {dataset}/{split}")

    return natsorted(all_n5_paths)
Get paths to the MitoEM v2 data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
- split: The data split to use. One of 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the n5 data.
def get_mitoemv2_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> Dataset:
    """Get the MitoEM v2 dataset for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If `patch_shape` is not 3-dimensional.
    """
    # Explicit exception instead of `assert`: asserts are stripped under `python -O`.
    if len(patch_shape) != 3:
        raise ValueError(f"Expected a 3d patch shape, got {patch_shape}.")

    n5_paths = get_mitoemv2_paths(path, dataset, split, download)

    # Derive the requested target transform (binary / boundaries / affinities) from the instance labels.
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=n5_paths,
        raw_key="raw",
        label_paths=n5_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs
    )
Get the MitoEM v2 dataset for mitochondria segmentation in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
- split: The data split to use. One of 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_mitoemv2_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> DataLoader:
    """Get the MitoEM v2 dataloader for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route each keyword argument to either the dataset constructor or the DataLoader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    seg_dataset = get_mitoemv2_dataset(
        path=path, patch_shape=patch_shape, dataset=dataset, split=split, download=download,
        offsets=offsets, boundaries=boundaries, binary=binary, **dataset_kwargs,
    )

    return torch_em.get_data_loader(dataset=seg_dataset, batch_size=batch_size, **dataloader_kwargs)
Get the MitoEM v2 dataloader for mitochondria segmentation in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
- split: The data split to use. One of 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.