torch_em.data.datasets.electron_microscopy.mitoemv2

MitoEM v2 is a benchmark collection for 3D mitochondria instance segmentation in electron microscopy.

It contains eight challenging datasets with expert-verified labels, covering biologically complex scenarios such as dense mitochondrial packing, hyperfused networks, thin-necked morphologies, and ultrastructurally ambiguous boundaries.

The data is located at https://doi.org/10.5281/zenodo.17635006. The dataset is from the publication https://doi.org/10.1101/2025.11.12.687478. Please cite it if you use this dataset in your research.

  1"""MitoEM v2 is a benchmark collection for 3D mitochondria instance segmentation in electron microscopy.
  2
  3It contains eight challenging datasets with expert-verified labels, covering biologically complex
  4scenarios such as dense mitochondrial packing, hyperfused networks, thin-necked morphologies,
  5and ultrastructurally ambiguous boundaries.
  6
  7The data is located at https://doi.org/10.5281/zenodo.17635006.
  8The dataset is from the publication https://doi.org/10.1101/2025.11.12.687478.
  9Please cite it if you use this dataset in your research.
 10"""
 11
 12import os
 13from glob import glob
 14from typing import Union, Literal, Optional, Tuple, List
 15
 16import numpy as np
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
 25BASE_URL = "https://zenodo.org/records/17635006/files"
 26
 27DATASETS = {
 28    "beta": "Dataset001_ME2-Beta",
 29    "jurkat": "Dataset002_ME2-Jurkat",
 30    "macro": "Dataset003_ME2-Macro",
 31    "mossy": "Dataset004_ME2-Mossy",
 32    "podo": "Dataset005_ME2-Podo",
 33    "pyra": "Dataset006_ME2-Pyra",
 34    "sperm": "Dataset007_ME2-Sperm",
 35    "stem": "Dataset008_ME2-Stem",
 36}
 37
 38DATASET_NAMES = list(DATASETS.keys())
 39
 40
 41def _convert_nifti_to_n5(nifti_path, n5_path):
 42    """Convert NIfTI file to n5 format for efficient access."""
 43    import nibabel as nib
 44    import z5py
 45
 46    if os.path.exists(n5_path):
 47        return
 48
 49    nii = nib.load(nifti_path)
 50    data = np.asarray(nii.dataobj)
 51
 52    # NIfTI stores as (X, Y, Z), we want (Z, Y, X)
 53    data = np.transpose(data, (2, 1, 0))
 54
 55    chunks = (32, 256, 256)
 56    with z5py.File(n5_path, "a") as f:
 57        f.create_dataset("data", data=data, chunks=chunks, compression="gzip")
 58
 59
 60def _preprocess_dataset(path, dataset_name, dataset_dir):
 61    """Preprocess a single dataset: convert NIfTI to n5."""
 62    import json
 63
 64    n5_dir = os.path.join(path, "n5_data", dataset_name)
 65    os.makedirs(n5_dir, exist_ok=True)
 66
 67    # Read split info
 68    with open(os.path.join(dataset_dir, "split.json")) as f:
 69        split_info = json.load(f)[0]
 70
 71    processed = {}
 72    for split_name, split_key in [("train", "train"), ("val", "val"), ("test", "test")]:
 73        samples = split_info.get(split_key, [])
 74        if not samples:
 75            continue
 76
 77        for sample in samples:
 78            # Determine source directories based on split
 79            if split_name == "test":
 80                img_dir = "imagesTs"
 81                lbl_dir = "labelsTs"
 82            else:
 83                img_dir = "imagesTr"
 84                lbl_dir = "labelsTr"
 85
 86            img_nifti = os.path.join(dataset_dir, img_dir, f"{sample}_0000.nii.gz")
 87            lbl_nifti = os.path.join(dataset_dir, lbl_dir, f"{sample}.nii.gz")
 88
 89            if not os.path.exists(img_nifti) or not os.path.exists(lbl_nifti):
 90                continue
 91
 92            n5_path = os.path.join(n5_dir, f"{sample}.n5")
 93
 94            if not os.path.exists(n5_path):
 95                print(f"Converting {sample} to n5...")
 96                _convert_nifti_to_n5(img_nifti, os.path.join(n5_dir, f"{sample}_raw.n5"))
 97                _convert_nifti_to_n5(lbl_nifti, os.path.join(n5_dir, f"{sample}_labels.n5"))
 98
 99                # Combine into single n5 file
100                import z5py
101                with z5py.File(os.path.join(n5_dir, f"{sample}_raw.n5"), "r") as f_raw:
102                    raw = f_raw["data"][:]
103                with z5py.File(os.path.join(n5_dir, f"{sample}_labels.n5"), "r") as f_lbl:
104                    labels = f_lbl["data"][:]
105
106                with z5py.File(n5_path, "a") as f:
107                    f.create_dataset("raw", data=raw, chunks=(32, 256, 256), compression="gzip")
108                    f.create_dataset("labels", data=labels.astype("uint64"), chunks=(32, 256, 256), compression="gzip")
109
110                # Clean up temp files
111                import shutil
112                shutil.rmtree(os.path.join(n5_dir, f"{sample}_raw.n5"))
113                shutil.rmtree(os.path.join(n5_dir, f"{sample}_labels.n5"))
114
115            if split_name not in processed:
116                processed[split_name] = []
117            processed[split_name].append(n5_path)
118
119    return processed
120
121
def get_mitoemv2_data(
    path: Union[os.PathLike, str],
    dataset: str,
    download: bool = False,
) -> str:
    """Download and preprocess a MitoEM v2 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed n5 data directory.
    """
    assert dataset in DATASETS, f"'{dataset}' is not valid. Choose from {DATASET_NAMES}."

    folder = DATASETS[dataset]
    n5_dir = os.path.join(path, "n5_data", dataset)

    # Nothing to do if preprocessed n5 data already exists.
    # (glob returns an empty list for a non-existent directory.)
    # NOTE(review): a partially preprocessed dataset also returns early here;
    # delete the incomplete n5 folder to force reprocessing — TODO confirm intended.
    if glob(os.path.join(n5_dir, "*.n5")):
        return n5_dir

    # Download and extract the zip archive if the raw data is not present yet.
    dataset_dir = os.path.join(path, folder)
    if not os.path.exists(dataset_dir):
        os.makedirs(path, exist_ok=True)
        zip_path = os.path.join(path, f"{folder}.zip")
        util.download_source(path=zip_path, url=f"{BASE_URL}/{folder}.zip", download=download, checksum=None)
        util.unzip(zip_path=zip_path, dst=path)

    # Convert the extracted NIfTI volumes to n5.
    _preprocess_dataset(path, dataset, dataset_dir)

    return n5_dir
161
162
def get_mitoemv2_paths(
    path: Union[os.PathLike, str],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the MitoEM v2 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the n5 data.
    """
    import json
    from natsort import natsorted

    assert split in ("train", "val", "test"), f"'{split}' is not a valid split."

    # Normalize the dataset argument to a list of dataset names.
    if dataset is None:
        datasets = DATASET_NAMES
    else:
        datasets = [dataset] if isinstance(dataset, str) else dataset

    paths = []
    for ds in datasets:
        n5_dir = get_mitoemv2_data(path, ds, download)

        # The sample names belonging to each split are listed in split.json.
        # NOTE(review): split.json is assumed to be a list whose first entry is the split dict.
        with open(os.path.join(path, DATASETS[ds], "split.json")) as f:
            split_info = json.load(f)[0]

        for sample in split_info.get(split, []):
            n5_path = os.path.join(n5_dir, f"{sample}.n5")
            # Keep only samples that were actually converted to n5.
            if os.path.exists(n5_path):
                paths.append(n5_path)

    assert len(paths) > 0, f"No data found for {datasets}/{split}"

    return natsorted(paths)
210
211
def get_mitoemv2_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> Dataset:
    """Get the MitoEM v2 dataset for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # The data is volumetric, so a 3d patch shape is required.
    assert len(patch_shape) == 3

    volume_paths = get_mitoemv2_paths(path, dataset, split, download)

    # Translate the offsets / boundaries / binary options into label transforms.
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    # Raw and labels live in the same n5 file under the keys 'raw' and 'labels'.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )
257
258
def get_mitoemv2_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    dataset: Optional[Union[str, List[str]]] = None,
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs
) -> DataLoader:
    """Get the MitoEM v2 dataloader for mitochondria segmentation in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
            If None, all datasets will be used.
        split: The data split to use. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the kwargs meant for dataset construction from the DataLoader kwargs.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    seg_dataset = get_mitoemv2_dataset(
        path=path,
        patch_shape=patch_shape,
        dataset=dataset,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **ds_kwargs,
    )

    return torch_em.get_data_loader(dataset=seg_dataset, batch_size=batch_size, **loader_kwargs)
BASE_URL = 'https://zenodo.org/records/17635006/files'
DATASETS = {'beta': 'Dataset001_ME2-Beta', 'jurkat': 'Dataset002_ME2-Jurkat', 'macro': 'Dataset003_ME2-Macro', 'mossy': 'Dataset004_ME2-Mossy', 'podo': 'Dataset005_ME2-Podo', 'pyra': 'Dataset006_ME2-Pyra', 'sperm': 'Dataset007_ME2-Sperm', 'stem': 'Dataset008_ME2-Stem'}
DATASET_NAMES = ['beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', 'stem']
def get_mitoemv2_data( path: Union[os.PathLike, str], dataset: str, download: bool = False) -> str:
123def get_mitoemv2_data(
124    path: Union[os.PathLike, str],
125    dataset: str,
126    download: bool = False,
127) -> str:
128    """Download and preprocess a MitoEM v2 dataset.
129
130    Args:
131        path: Filepath to a folder where the downloaded data will be saved.
132        dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy',
133            'podo', 'pyra', 'sperm', or 'stem'.
134        download: Whether to download the data if it is not present.
135
136    Returns:
137        The filepath to the preprocessed n5 data directory.
138    """
139    assert dataset in DATASETS, f"'{dataset}' is not valid. Choose from {DATASET_NAMES}."
140
141    dataset_folder = DATASETS[dataset]
142    n5_dir = os.path.join(path, "n5_data", dataset)
143
144    # Check if already preprocessed
145    if os.path.exists(n5_dir) and len(glob(os.path.join(n5_dir, "*.n5"))) > 0:
146        return n5_dir
147
148    # Download if needed
149    zip_path = os.path.join(path, f"{dataset_folder}.zip")
150    dataset_dir = os.path.join(path, dataset_folder)
151
152    if not os.path.exists(dataset_dir):
153        os.makedirs(path, exist_ok=True)
154        url = f"{BASE_URL}/{dataset_folder}.zip"
155        util.download_source(path=zip_path, url=url, download=download, checksum=None)
156        util.unzip(zip_path=zip_path, dst=path)
157
158    # Preprocess
159    _preprocess_dataset(path, dataset, dataset_dir)
160
161    return n5_dir

Download and preprocess a MitoEM v2 dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • dataset: The dataset to download. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the preprocessed n5 data directory.

def get_mitoemv2_paths( path: Union[os.PathLike, str], dataset: Union[List[str], str, NoneType] = None, split: Literal['train', 'val', 'test'] = 'train', download: bool = False) -> List[str]:
164def get_mitoemv2_paths(
165    path: Union[os.PathLike, str],
166    dataset: Optional[Union[str, List[str]]] = None,
167    split: Literal["train", "val", "test"] = "train",
168    download: bool = False,
169) -> List[str]:
170    """Get paths to the MitoEM v2 data.
171
172    Args:
173        path: Filepath to a folder where the downloaded data will be saved.
174        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
175            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
176            If None, all datasets will be used.
177        split: The data split to use. One of 'train', 'val', or 'test'.
178        download: Whether to download the data if it is not present.
179
180    Returns:
181        List of filepaths for the n5 data.
182    """
183    import json
184    from natsort import natsorted
185
186    assert split in ("train", "val", "test"), f"'{split}' is not a valid split."
187
188    if dataset is None:
189        dataset = DATASET_NAMES
190    elif isinstance(dataset, str):
191        dataset = [dataset]
192
193    all_n5_paths = []
194    for ds in dataset:
195        n5_dir = get_mitoemv2_data(path, ds, download)
196
197        # Read split info to get correct samples
198        dataset_folder = DATASETS[ds]
199        dataset_dir = os.path.join(path, dataset_folder)
200        with open(os.path.join(dataset_dir, "split.json")) as f:
201            split_info = json.load(f)[0]
202
203        samples = split_info.get(split, [])
204        n5_paths = [os.path.join(n5_dir, f"{sample}.n5") for sample in samples]
205        n5_paths = [p for p in n5_paths if os.path.exists(p)]
206        all_n5_paths.extend(n5_paths)
207
208    assert len(all_n5_paths) > 0, f"No data found for {dataset}/{split}"
209
210    return natsorted(all_n5_paths)

Get paths to the MitoEM v2 data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
  • split: The data split to use. One of 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the n5 data.

def get_mitoemv2_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], dataset: Union[List[str], str, NoneType] = None, split: Literal['train', 'val', 'test'] = 'train', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
213def get_mitoemv2_dataset(
214    path: Union[os.PathLike, str],
215    patch_shape: Tuple[int, int, int],
216    dataset: Optional[Union[str, List[str]]] = None,
217    split: Literal["train", "val", "test"] = "train",
218    download: bool = False,
219    offsets: Optional[List[List[int]]] = None,
220    boundaries: bool = False,
221    binary: bool = False,
222    **kwargs
223) -> Dataset:
224    """Get the MitoEM v2 dataset for mitochondria segmentation in EM.
225
226    Args:
227        path: Filepath to a folder where the downloaded data will be saved.
228        patch_shape: The patch shape to use for training.
229        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
230            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
231            If None, all datasets will be used.
232        split: The data split to use. One of 'train', 'val', or 'test'.
233        download: Whether to download the data if it is not present.
234        offsets: Offset values for affinity computation used as target.
235        boundaries: Whether to compute boundaries as the target.
236        binary: Whether to return a binary segmentation target.
237        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
238
239    Returns:
240        The segmentation dataset.
241    """
242    assert len(patch_shape) == 3
243
244    n5_paths = get_mitoemv2_paths(path, dataset, split, download)
245
246    kwargs, _ = util.add_instance_label_transform(
247        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
248    )
249
250    return torch_em.default_segmentation_dataset(
251        raw_paths=n5_paths,
252        raw_key="raw",
253        label_paths=n5_paths,
254        label_key="labels",
255        patch_shape=patch_shape,
256        **kwargs
257    )

Get the MitoEM v2 dataset for mitochondria segmentation in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
  • split: The data split to use. One of 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to return a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_mitoemv2_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int, int], dataset: Union[List[str], str, NoneType] = None, split: Literal['train', 'val', 'test'] = 'train', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
260def get_mitoemv2_loader(
261    path: Union[os.PathLike, str],
262    batch_size: int,
263    patch_shape: Tuple[int, int, int],
264    dataset: Optional[Union[str, List[str]]] = None,
265    split: Literal["train", "val", "test"] = "train",
266    download: bool = False,
267    offsets: Optional[List[List[int]]] = None,
268    boundaries: bool = False,
269    binary: bool = False,
270    **kwargs
271) -> DataLoader:
272    """Get the MitoEM v2 dataloader for mitochondria segmentation in EM.
273
274    Args:
275        path: Filepath to a folder where the downloaded data will be saved.
276        batch_size: The batch size for training.
277        patch_shape: The patch shape to use for training.
278        dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy',
279            'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names.
280            If None, all datasets will be used.
281        split: The data split to use. One of 'train', 'val', or 'test'.
282        download: Whether to download the data if it is not present.
283        offsets: Offset values for affinity computation used as target.
284        boundaries: Whether to compute boundaries as the target.
285        binary: Whether to return a binary segmentation target.
286        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
287
288    Returns:
289        The DataLoader.
290    """
291    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
292    dataset_obj = get_mitoemv2_dataset(
293        path=path,
294        patch_shape=patch_shape,
295        dataset=dataset,
296        split=split,
297        download=download,
298        offsets=offsets,
299        boundaries=boundaries,
300        binary=binary,
301        **ds_kwargs,
302    )
303    return torch_em.get_data_loader(dataset=dataset_obj, batch_size=batch_size, **loader_kwargs)

Get the MitoEM v2 dataloader for mitochondria segmentation in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • dataset: The dataset(s) to use. One of 'beta', 'jurkat', 'macro', 'mossy', 'podo', 'pyra', 'sperm', or 'stem'. Can also be a list of dataset names. If None, all datasets will be used.
  • split: The data split to use. One of 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to return a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.