Module `torch_em.data.datasets.electron_microscopy.mitoemv2`

MitoEM v2 is a benchmark collection for 3D mitochondria instance segmentation in electron microscopy.

It contains eight challenging datasets with expert-verified labels, covering biologically complex scenarios such as dense mitochondrial packing, hyperfused networks, thin-necked morphologies, and ultrastructurally ambiguous boundaries.

The data is located at https://doi.org/10.5281/zenodo.17635006. The dataset is from the publication https://doi.org/10.1101/2025.11.12.687478. Please cite it if you use this dataset in your research.

  1"""MitoEM v2 is a benchmark collection for 3D mitochondria instance segmentation in electron microscopy.
  2
  3It contains eight challenging datasets with expert-verified labels, covering biologically complex
  4scenarios such as dense mitochondrial packing, hyperfused networks, thin-necked morphologies,
  5and ultrastructurally ambiguous boundaries.
  6
  7The data is located at https://doi.org/10.5281/zenodo.17635006.
  8The dataset is from the publication https://doi.org/10.1101/2025.11.12.687478.
  9Please cite it if you use this dataset in your research.
 10"""
 11
 12import os
 13from glob import glob
 14from typing import Union, Literal, Optional, Tuple, List
 15
 16import numpy as np
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
 25BASE_URL = "https://zenodo.org/records/17635006/files"
 26
 27DATASETS = {
 28    "beta": "Dataset001_ME2-Beta",
 29    "jurkat": "Dataset002_ME2-Jurkat",
 30    "macro": "Dataset003_ME2-Macro",
 31    "mossy": "Dataset004_ME2-Mossy",
 32    "podo": "Dataset005_ME2-Podo",
 33    "pyra": "Dataset006_ME2-Pyra",
 34    "sperm": "Dataset007_ME2-Sperm",
 35    "stem": "Dataset008_ME2-Stem",
 36}
 37
 38DATASET_NAMES = list(DATASETS.keys())
 39
 40
 41def _convert_nifti_to_n5(nifti_path, n5_path):
 42    """Convert NIfTI file to n5 format for efficient access."""
 43    import nibabel as nib
 44    import z5py
 45
 46    if os.path.exists(n5_path):
 47        return
 48
 49    nii = nib.load(nifti_path)
 50    data = np.asarray(nii.dataobj)
 51
 52    # NIfTI stores as (X, Y, Z), we want (Z, Y, X)
 53    data = np.transpose(data, (2, 1, 0))
 54
 55    chunks = (32, 256, 256)
 56    with z5py.File(n5_path, "a") as f:
 57        f.create_dataset("data", data=data, chunks=chunks, compression="gzip")
 58
 59
 60def _preprocess_dataset(path, dataset_name, dataset_dir):
 61    """Preprocess a single dataset: convert NIfTI to n5."""
 62    import json
 63
 64    n5_dir = os.path.join(path, "n5_data", dataset_name)
 65    os.makedirs(n5_dir, exist_ok=True)
 66
 67    # Read split info
 68    with open(os.path.join(dataset_dir, "split.json")) as f:
 69        split_info = json.load(f)[0]
 70
 71    processed = {}
 72    for split_name, split_key in [("train", "train"), ("val", "val"), ("test", "test")]:
 73        samples = split_info.get(split_key, [])
 74        if not samples:
 75            continue
 76
 77        for sample in samples:
 78            # Determine source directories based on split
 79            if split_name == "test":
 80                img_dir = "imagesTs"
 81                lbl_dir = "labelsTs"
 82            else:
 83                img_dir = "imagesTr"
 84                lbl_dir = "labelsTr"
 85
 86            img_nifti = os.path.join(dataset_dir, img_dir, f"{sample}_0000.nii.gz")
 87            lbl_nifti = os.path.join(dataset_dir, lbl_dir, f"{sample}.nii.gz")
 88
 89            if not os.path.exists(img_nifti) or not os.path.exists(lbl_nifti):
 90                continue
 91
 92            n5_path = os.path.join(n5_dir, f"{sample}.n5")
 93
 94            if not os.path.exists(n5_path):
 95                print(f"Converting {sample} to n5...")
 96                _convert_nifti_to_n5(img_nifti, os.path.join(n5_dir, f"{sample}_raw.n5"))
 97                _convert_nifti_to_n5(lbl_nifti, os.path.join(n5_dir, f"{sample}_labels.n5"))
 98
 99                # Combine into single n5 file
100                import z5py
101                with z5py.File(os.path.join(n5_dir, f"{sample}_raw.n5"), "r") as f_raw:
102                    raw = f_raw["data"][:]
103                with z5py.File(os.path.join(n5_dir, f"{sample}_labels.n5"), "r") as f_lbl:
104                    labels = f_lbl["data"][:]
105                
106                if sample == "me2-jurkat_train02":
107                    print("Label dimensions in nifti are stored the other way around for this sample, transposing labels...")
108                    labels = np.transpose(labels, (2, 1, 0))
109
110                if raw.shape != labels.shape:
111                    raise RuntimeError("There is a shape mismatch between raw and labels.")
112
113                with z5py.File(n5_path, "a") as f:
114                    f.create_dataset("raw", data=raw, chunks=(32, 256, 256), compression="gzip")
115                    f.create_dataset("labels", data=labels.astype("uint64"), chunks=(32, 256, 256), compression="gzip")
116
117                # Clean up temp files
118                import shutil
119                shutil.rmtree(os.path.join(n5_dir, f"{sample}_raw.n5"))
120                shutil.rmtree(os.path.join(n5_dir, f"{sample}_labels.n5"))
121
122            if split_name not in processed:
123                processed[split_name] = []
124            processed[split_name].append(n5_path)
125
126    return processed
127
128
def get_mitoemv2_data(
    path: Union[os.PathLike, str],
    dataset: str,
    download: bool = False,
) -> str:
    """Download and preprocess a MitoEM v2 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed n5 data directory.
    """
    assert dataset in DATASETS, f"'{dataset}' is not valid. Choose from {DATASET_NAMES}."

    folder = DATASETS[dataset]
    n5_dir = os.path.join(path, "n5_data", dataset)

    # Nothing to do if the preprocessed n5 volumes are already present.
    if os.path.exists(n5_dir) and len(glob(os.path.join(n5_dir, "*.n5"))) > 0:
        return n5_dir

    # Fetch and extract the archive if the raw data is not there yet.
    extracted_dir = os.path.join(path, folder)
    if not os.path.exists(extracted_dir):
        os.makedirs(path, exist_ok=True)
        archive_path = os.path.join(path, f"{folder}.zip")
        util.download_source(
            path=archive_path, url=f"{BASE_URL}/{folder}.zip", download=download, checksum=None
        )
        util.unzip(zip_path=archive_path, dst=path)

    # Convert the extracted NIfTI volumes into n5 containers.
    _preprocess_dataset(path, dataset, extracted_dir)

    return n5_dir
168
169
def get_mitoemv2_paths(
    path: Union[os.PathLike, str],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the MitoEM v2 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the n5 data.
    """
    import json
    from natsort import natsorted

    assert split in ("train", "val", "test"), f"'{split}' is not a valid split."

    # Normalize the dataset argument to a list of dataset names.
    if dataset is None:
        dataset = DATASET_NAMES
    elif isinstance(dataset, str):
        dataset = [dataset]

    collected = []
    for ds in dataset:
        n5_dir = get_mitoemv2_data(path, ds, download)

        # Look up which samples belong to the requested split.
        split_file = os.path.join(path, DATASETS[ds], "split.json")
        with open(split_file) as f:
            split_info = json.load(f)[0]

        # Keep only samples whose preprocessed n5 container exists.
        for sample in split_info.get(split, []):
            candidate = os.path.join(n5_dir, f"{sample}.n5")
            if os.path.exists(candidate):
                collected.append(candidate)

    assert len(collected) > 0, f"No data found for {dataset}/{split}"

    return natsorted(collected)
217
218
def get_mitoemv2_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> Dataset:
    """Get the MitoEM v2 dataset for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    volume_paths = get_mitoemv2_paths(path, dataset, split, download)

    # Attach the requested label transform (binary / boundaries / affinities).
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    # Raw and labels live in the same n5 container under separate keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs
    )
264
265
def get_mitoemv2_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> DataLoader:
    """Get the MitoEM v2 dataloader for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route keyword arguments to either the dataset factory or the DataLoader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    seg_dataset = get_mitoemv2_dataset(
        path=path,
        patch_shape=patch_shape,
        dataset=dataset,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs,
    )

    return torch_em.get_data_loader(dataset=seg_dataset, batch_size=batch_size, **loader_kwargs)
BASE_URL = 'https://zenodo.org/records/17635006/files'
DATASETS = {'beta': 'Dataset001_ME2-Beta', 'jurkat': 'Dataset002_ME2-Jurkat', 'macro': 'Dataset003_ME2-Macro', 'mossy': 'Dataset004_ME2-Mossy', 'podo': 'Dataset005_ME2-Podo', 'pyra': 'Dataset006_ME2-Pyra', 'sperm': 'Dataset007_ME2-Sperm', 'stem': 'Dataset008_ME2-Stem'}
DATASET_NAMES = ['beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', 'stem']
def get_mitoemv2_data( path: Union[os.PathLike, str], dataset: str, download: bool = False) -> str:
130def get_mitoemv2_data(
131    path: Union[os.PathLike, str],
132    dataset: str,
133    download: bool = False,
134) -> str:
135    """Download and preprocess a MitoEM v2 dataset.
136
137    Args:
138        path: Filepath to a folder where the downloaded data will be saved.
139        dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy',
140            'podo', 'pyra', 'sperm', or 'stem'.
141        download: Whether to download the data if it is not present.
142
143    Returns:
144        The filepath to the preprocessed n5 data directory.
145    """
146    assert dataset in DATASETS, f"'{dataset}' is not valid. Choose from {DATASET_NAMES}."
147
148    dataset_folder = DATASETS[dataset]
149    n5_dir = os.path.join(path, "n5_data", dataset)
150
151    # Check if already preprocessed
152    if os.path.exists(n5_dir) and len(glob(os.path.join(n5_dir, "*.n5"))) > 0:
153        return n5_dir
154
155    # Download if needed
156    zip_path = os.path.join(path, f"{dataset_folder}.zip")
157    dataset_dir = os.path.join(path, dataset_folder)
158
159    if not os.path.exists(dataset_dir):
160        os.makedirs(path, exist_ok=True)
161        url = f"{BASE_URL}/{dataset_folder}.zip"
162        util.download_source(path=zip_path, url=url, download=download, checksum=None)
163        util.unzip(zip_path=zip_path, dst=path)
164
165    # Preprocess
166    _preprocess_dataset(path, dataset, dataset_dir)
167
168    return n5_dir

Download and preprocess a MitoEM v2 dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the preprocessed n5 data directory.

def get_mitoemv2_paths( path: Union[os.PathLike, str], dataset: Union[List[str], str, NoneType] = None, split: Literal['train', 'val', 'test'] = 'train', download: bool = False) -> List[str]:
171def get_mitoemv2_paths(
172    path: Union[os.PathLike, str],
173    dataset: Optional[Union[str, List[str]]] = None,
174    split: Literal["train", "val", "test"] = "train",
175    download: bool = False,
176) -> List[str]:
177    """Get paths to the MitoEM v2 data.
178
179    Args:
180        path: Filepath to a folder where the downloaded data will be saved.
181        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
182            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
183            If None, all datasets will be used.
184        split: The data split to use. One of 'train', 'val', or 'test'.
185        download: Whether to download the data if it is not present.
186
187    Returns:
188        List of filepaths for the n5 data.
189    """
190    import json
191    from natsort import natsorted
192
193    assert split in ("train", "val", "test"), f"'{split}' is not a valid split."
194
195    if dataset is None:
196        dataset = DATASET_NAMES
197    elif isinstance(dataset, str):
198        dataset = [dataset]
199
200    all_n5_paths = []
201    for ds in dataset:
202        n5_dir = get_mitoemv2_data(path, ds, download)
203
204        # Read split info to get correct samples
205        dataset_folder = DATASETS[ds]
206        dataset_dir = os.path.join(path, dataset_folder)
207        with open(os.path.join(dataset_dir, "split.json")) as f:
208            split_info = json.load(f)[0]
209
210        samples = split_info.get(split, [])
211        n5_paths = [os.path.join(n5_dir, f"{sample}.n5") for sample in samples]
212        n5_paths = [p for p in n5_paths if os.path.exists(p)]
213        all_n5_paths.extend(n5_paths)
214
215    assert len(all_n5_paths) > 0, f"No data found for {dataset}/{split}"
216
217    return natsorted(all_n5_paths)

Get paths to the MitoEM v2 data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
  • split: The data split to use. One of 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the n5 data.

def get_mitoemv2_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], dataset: Union[List[str], str, NoneType] = None, split: Literal['train', 'val', 'test'] = 'train', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
220def get_mitoemv2_dataset(
221    path: Union[os.PathLike, str],
222    patch_shape: Tuple[int, int, int],
223    dataset: Optional[Union[str, List[str]]] = None,
224    split: Literal["train", "val", "test"] = "train",
225    download: bool = False,
226    offsets: Optional[List[List[int]]] = None,
227    boundaries: bool = False,
228    binary: bool = False,
229    **kwargs
230) -> Dataset:
231    """Get the MitoEM v2 dataset for mitochondria segmentation in EM.
232
233    Args:
234        path: Filepath to a folder where the downloaded data will be saved.
235        patch_shape: The patch shape to use for training.
236        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
237            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
238            If None, all datasets will be used.
239        split: The data split to use. One of 'train', 'val', or 'test'.
240        download: Whether to download the data if it is not present.
241        offsets: Offset values for affinity computation used as target.
242        boundaries: Whether to compute boundaries as the target.
243        binary: Whether to return a binary segmentation target.
244        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
245
246    Returns:
247        The segmentation dataset.
248    """
249    assert len(patch_shape) == 3
250
251    n5_paths = get_mitoemv2_paths(path, dataset, split, download)
252
253    kwargs, _ = util.add_instance_label_transform(
254        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
255    )
256
257    return torch_em.default_segmentation_dataset(
258        raw_paths=n5_paths,
259        raw_key="raw",
260        label_paths=n5_paths,
261        label_key="labels",
262        patch_shape=patch_shape,
263        **kwargs
264    )

Get the MitoEM v2 dataset for mitochondria segmentation in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
  • split: The data split to use. One of 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to return a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_mitoemv2_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int, int], dataset: Union[List[str], str, NoneType] = None, split: Literal['train', 'val', 'test'] = 'train', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
267def get_mitoemv2_loader(
268    path: Union[os.PathLike, str],
269    batch_size: int,
270    patch_shape: Tuple[int, int, int],
271    dataset: Optional[Union[str, List[str]]] = None,
272    split: Literal["train", "val", "test"] = "train",
273    download: bool = False,
274    offsets: Optional[List[List[int]]] = None,
275    boundaries: bool = False,
276    binary: bool = False,
277    **kwargs
278) -> DataLoader:
279    """Get the MitoEM v2 dataloader for mitochondria segmentation in EM.
280
281    Args:
282        path: Filepath to a folder where the downloaded data will be saved.
283        batch_size: The batch size for training.
284        patch_shape: The patch shape to use for training.
285        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
286            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
287            If None, all datasets will be used.
288        split: The data split to use. One of 'train', 'val', or 'test'.
289        download: Whether to download the data if it is not present.
290        offsets: Offset values for affinity computation used as target.
291        boundaries: Whether to compute boundaries as the target.
292        binary: Whether to return a binary segmentation target.
293        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
294
295    Returns:
296        The DataLoader.
297    """
298    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
299    dataset_obj = get_mitoemv2_dataset(
300        path=path,
301        patch_shape=patch_shape,
302        dataset=dataset,
303        split=split,
304        download=download,
305        offsets=offsets,
306        boundaries=boundaries,
307        binary=binary,
308        **ds_kwargs,
309    )
310    return torch_em.get_data_loader(dataset=dataset_obj, batch_size=batch_size, **loader_kwargs)

Get the MitoEM v2 dataloader for mitochondria segmentation in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
  • split: The data split to use. One of 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to return a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.