torch_em.data.datasets.electron_microscopy.mitoemv2
MitoEM v2 is a benchmark collection for 3D mitochondria instance segmentation in electron microscopy.
It contains eight challenging datasets with expert-verified labels, covering biologically complex scenarios such as dense mitochondrial packing, hyperfused networks, thin-necked morphologies, and ultrastructurally ambiguous boundaries.
The data is located at https://doi.org/10.5281/zenodo.17635006. The dataset is from the publication https://doi.org/10.1101/2025.11.12.687478. Please cite it if you use this dataset in your research.
"""MitoEM v2 is a benchmark collection for 3D mitochondria instance segmentation in electron microscopy.

It contains eight challenging datasets with expert-verified labels, covering biologically complex
scenarios such as dense mitochondrial packing, hyperfused networks, thin-necked morphologies,
and ultrastructurally ambiguous boundaries.

The data is located at https://doi.org/10.5281/zenodo.17635006.
The dataset is from the publication https://doi.org/10.1101/2025.11.12.687478.
Please cite it if you use this dataset in your research.
"""

import os
import json
import shutil
from glob import glob
from typing import Union, Literal, Optional, Tuple, List

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# Base URL of the Zenodo record that hosts the MitoEM v2 archives.
BASE_URL = "https://zenodo.org/records/17635006/files"

# Maps the short dataset names used in this API to the folder names inside the Zenodo archives.
DATASETS = {
    "beta": "Dataset001_ME2-Beta",
    "jurkat": "Dataset002_ME2-Jurkat",
    "macro": "Dataset003_ME2-Macro",
    "mossy": "Dataset004_ME2-Mossy",
    "podo": "Dataset005_ME2-Podo",
    "pyra": "Dataset006_ME2-Pyra",
    "sperm": "Dataset007_ME2-Sperm",
    "stem": "Dataset008_ME2-Stem",
}

DATASET_NAMES = list(DATASETS.keys())

# Chunk shape used for all n5 datasets written by this module.
_CHUNKS = (32, 256, 256)


def _convert_nifti_to_n5(nifti_path, n5_path):
    """Convert a NIfTI file to n5 format for efficient chunked access.

    Does nothing if the target n5 container already exists.
    """
    # Third-party optional dependencies, imported lazily so the module can be
    # used without them as long as no conversion is needed.
    import nibabel as nib
    import z5py

    if os.path.exists(n5_path):
        return

    nii = nib.load(nifti_path)
    data = np.asarray(nii.dataobj)

    # NIfTI stores as (X, Y, Z), we want (Z, Y, X).
    data = np.transpose(data, (2, 1, 0))

    with z5py.File(n5_path, "a") as f:
        f.create_dataset("data", data=data, chunks=_CHUNKS, compression="gzip")


def _preprocess_dataset(path, dataset_name, dataset_dir):
    """Preprocess a single dataset: convert all NIfTI volumes to n5.

    Args:
        path: Root folder of the MitoEM v2 data.
        dataset_name: Short name of the dataset (a key of ``DATASETS``).
        dataset_dir: Folder with the extracted NIfTI data, containing 'split.json'.

    Returns:
        Dict mapping split names ('train' / 'val' / 'test') to lists of n5 filepaths.
    """
    import z5py  # Third-party optional dependency, imported lazily.

    n5_dir = os.path.join(path, "n5_data", dataset_name)
    os.makedirs(n5_dir, exist_ok=True)

    # Read the split info. The JSON file contains a list with a single dict.
    with open(os.path.join(dataset_dir, "split.json")) as f:
        split_info = json.load(f)[0]

    processed = {}
    for split_name in ("train", "val", "test"):
        samples = split_info.get(split_name, [])
        if not samples:
            continue

        # Test volumes live in imagesTs / labelsTs; train and val share imagesTr / labelsTr.
        if split_name == "test":
            img_dir, lbl_dir = "imagesTs", "labelsTs"
        else:
            img_dir, lbl_dir = "imagesTr", "labelsTr"

        for sample in samples:
            img_nifti = os.path.join(dataset_dir, img_dir, f"{sample}_0000.nii.gz")
            lbl_nifti = os.path.join(dataset_dir, lbl_dir, f"{sample}.nii.gz")

            # Skip samples whose source files are missing instead of failing.
            if not os.path.exists(img_nifti) or not os.path.exists(lbl_nifti):
                continue

            n5_path = os.path.join(n5_dir, f"{sample}.n5")

            if not os.path.exists(n5_path):
                print(f"Converting {sample} to n5...")
                raw_tmp = os.path.join(n5_dir, f"{sample}_raw.n5")
                lbl_tmp = os.path.join(n5_dir, f"{sample}_labels.n5")
                _convert_nifti_to_n5(img_nifti, raw_tmp)
                _convert_nifti_to_n5(lbl_nifti, lbl_tmp)

                # Combine raw and labels into a single n5 file.
                with z5py.File(raw_tmp, "r") as f_raw:
                    raw = f_raw["data"][:]
                with z5py.File(lbl_tmp, "r") as f_lbl:
                    labels = f_lbl["data"][:]

                # Known data issue: this one sample stores labels with flipped axes.
                if sample == "me2-jurkat_train02":
                    print("Label dimensions in nifti are stored the other way around for this sample, transposing labels...")  # noqa
                    labels = np.transpose(labels, (2, 1, 0))

                if raw.shape != labels.shape:
                    raise RuntimeError("There is a shape mismatch between raw and labels.")

                with z5py.File(n5_path, "a") as f:
                    f.create_dataset("raw", data=raw, chunks=_CHUNKS, compression="gzip")
                    f.create_dataset("labels", data=labels.astype("uint64"), chunks=_CHUNKS, compression="gzip")

                # Clean up the temporary single-volume containers.
                shutil.rmtree(raw_tmp)
                shutil.rmtree(lbl_tmp)

            processed.setdefault(split_name, []).append(n5_path)

    return processed


def get_mitoemv2_data(
    path: Union[os.PathLike, str],
    dataset: str,
    download: bool = False,
) -> str:
    """Download and preprocess a MitoEM v2 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed n5 data directory.
    """
    assert dataset in DATASETS, f"'{dataset}' is not valid. Choose from {DATASET_NAMES}."

    dataset_folder = DATASETS[dataset]
    n5_dir = os.path.join(path, "n5_data", dataset)

    # Check if already preprocessed.
    if os.path.exists(n5_dir) and len(glob(os.path.join(n5_dir, "*.n5"))) > 0:
        return n5_dir

    # Download and unpack if needed.
    zip_path = os.path.join(path, f"{dataset_folder}.zip")
    dataset_dir = os.path.join(path, dataset_folder)

    if not os.path.exists(dataset_dir):
        os.makedirs(path, exist_ok=True)
        url = f"{BASE_URL}/{dataset_folder}.zip"
        util.download_source(path=zip_path, url=url, download=download, checksum=None)
        util.unzip(zip_path=zip_path, dst=path)

    # Convert the NIfTI volumes to n5.
    _preprocess_dataset(path, dataset, dataset_dir)

    return n5_dir


def get_mitoemv2_paths(
    path: Union[os.PathLike, str],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the MitoEM v2 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the n5 data.
    """
    from natsort import natsorted

    assert split in ("train", "val", "test"), f"'{split}' is not a valid split."

    if dataset is None:
        dataset = DATASET_NAMES
    elif isinstance(dataset, str):
        dataset = [dataset]

    all_n5_paths = []
    for ds in dataset:
        n5_dir = get_mitoemv2_data(path, ds, download)

        # Read the split info to get the samples belonging to the requested split.
        dataset_dir = os.path.join(path, DATASETS[ds])
        with open(os.path.join(dataset_dir, "split.json")) as f:
            split_info = json.load(f)[0]

        samples = split_info.get(split, [])
        n5_paths = [os.path.join(n5_dir, f"{sample}.n5") for sample in samples]
        # Only keep volumes that were successfully preprocessed.
        n5_paths = [p for p in n5_paths if os.path.exists(p)]
        all_n5_paths.extend(n5_paths)

    assert len(all_n5_paths) > 0, f"No data found for {dataset}/{split}"

    return natsorted(all_n5_paths)


def get_mitoemv2_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> Dataset:
    """Get the MitoEM v2 dataset for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    n5_paths = get_mitoemv2_paths(path, dataset, split, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=n5_paths,
        raw_key="raw",
        label_paths=n5_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs
    )


def get_mitoemv2_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> DataLoader:
    """Get the MitoEM v2 dataloader for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset_obj = get_mitoemv2_dataset(
        path=path,
        patch_shape=patch_shape,
        dataset=dataset,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset_obj, batch_size=batch_size, **loader_kwargs)
def get_mitoemv2_data(
    path: Union[os.PathLike, str],
    dataset: str,
    download: bool = False,
) -> str:
    """Download a MitoEM v2 dataset and convert it to n5.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed n5 data directory.
    """
    assert dataset in DATASETS, f"'{dataset}' is not valid. Choose from {DATASET_NAMES}."

    folder_name = DATASETS[dataset]
    n5_dir = os.path.join(path, "n5_data", dataset)

    # Nothing to do if preprocessed n5 volumes are already present.
    if os.path.exists(n5_dir) and glob(os.path.join(n5_dir, "*.n5")):
        return n5_dir

    archive_path = os.path.join(path, f"{folder_name}.zip")
    extracted_dir = os.path.join(path, folder_name)

    # Fetch and unpack the Zenodo archive unless it was extracted before.
    if not os.path.exists(extracted_dir):
        os.makedirs(path, exist_ok=True)
        source_url = f"{BASE_URL}/{folder_name}.zip"
        util.download_source(path=archive_path, url=source_url, download=download, checksum=None)
        util.unzip(zip_path=archive_path, dst=path)

    # Convert the extracted NIfTI volumes to n5.
    _preprocess_dataset(path, dataset, extracted_dir)

    return n5_dir
Download and preprocess a MitoEM v2 dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the preprocessed n5 data directory.
def get_mitoemv2_paths(
    path: Union[os.PathLike, str],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Collect the n5 filepaths for the MitoEM v2 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the n5 data.
    """
    import json
    from natsort import natsorted

    assert split in ("train", "val", "test"), f"'{split}' is not a valid split."

    # Normalize the dataset argument to a list of dataset names.
    if dataset is None:
        dataset = DATASET_NAMES
    elif isinstance(dataset, str):
        dataset = [dataset]

    all_n5_paths = []
    for ds in dataset:
        n5_dir = get_mitoemv2_data(path, ds, download)

        # The split file lists which samples belong to each split.
        split_file = os.path.join(path, DATASETS[ds], "split.json")
        with open(split_file) as f:
            split_info = json.load(f)[0]

        # Keep only samples whose n5 volume was successfully preprocessed.
        for sample in split_info.get(split, []):
            candidate = os.path.join(n5_dir, f"{sample}.n5")
            if os.path.exists(candidate):
                all_n5_paths.append(candidate)

    assert len(all_n5_paths) > 0, f"No data found for {dataset}/{split}"

    return natsorted(all_n5_paths)
Get paths to the MitoEM v2 data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
- split: The data split to use. One of 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the n5 data.
def get_mitoemv2_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> Dataset:
    """Create the MitoEM v2 segmentation dataset for mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # The data is 3d, so the patch shape must be as well.
    assert len(patch_shape) == 3

    volume_paths = get_mitoemv2_paths(path, dataset, split, download)

    # Inject the requested label transform (binary / boundaries / affinities) into kwargs.
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    # Raw and labels live in the same n5 container under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths, raw_key="raw",
        label_paths=volume_paths, label_key="labels",
        patch_shape=patch_shape, **kwargs
    )
Get the MitoEM v2 dataset for mitochondria segmentation in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
- split: The data split to use. One of 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_mitoemv2_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> DataLoader:
    """Create the MitoEM v2 dataloader for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate dataset-level kwargs from DataLoader-level kwargs.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    seg_dataset = get_mitoemv2_dataset(
        path=path,
        patch_shape=patch_shape,
        dataset=dataset,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **ds_kwargs,
    )

    return torch_em.get_data_loader(dataset=seg_dataset, batch_size=batch_size, **loader_kwargs)
Get the MitoEM v2 dataloader for mitochondria segmentation in EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
- split: The data split to use. One of 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.