torch_em.data.datasets.electron_microscopy.probtem

ProbTEM dataset for mitochondria segmentation in 2D TEM images.

The dataset contains TEM images of skeletal muscle with binary semantic segmentation masks for mitochondria (0=background, 1=mitochondria). Images are 2560 x 2560 pixels at 65 nm sample thickness acquired with a JEM-1011 microscope at 80 kV.

The dataset has 21 training and 6 test images. There is no validation split.

Masks are stored as grayscale PNGs (0=background, 255=mitochondria) with slightly anti-aliased edges. They are thresholded to binary during preprocessing (values >= 127 become 1, all other values become 0).

This dataset is from the publication https://doi.org/10.1038/s41598-025-03311-1. Please cite it if you use this dataset in your research.

The data is available at https://yoonlab.unist.ac.kr/index.php/research/mitochondria-tem-dataset/. The automatic download fetches it from Google Drive and requires gdown (`pip install gdown`).

  1"""ProbTEM dataset for mitochondria segmentation in 2D TEM images.
  2
  3The dataset contains TEM images of skeletal muscle with binary semantic segmentation
  4masks for mitochondria (0=background, 1=mitochondria). Images are 2560 x 2560 pixels
  5at 65 nm sample thickness acquired with a JEM-1011 microscope at 80 kV.
  6
  7The dataset has 21 training and 6 test images. There is no validation split.
  8
  9Masks are stored as grayscale PNGs (0=background, 255=mitochondria) with slight
 10anti-aliased edges. They are thresholded to binary during preprocessing.
 11
 12This dataset is from the publication https://doi.org/10.1038/s41598-025-03311-1.
 13Please cite it if you use this dataset in your research.
 14
 15The data is available at https://yoonlab.unist.ac.kr/index.php/research/mitochondria-tem-dataset/
 16and requires a Google Drive download via gdown: pip install gdown.
 17"""
 18
 19import os
 20from glob import glob
 21from typing import List, Literal, Tuple, Union
 22
 23import h5py
 24import imageio.v3 as imageio
 25import numpy as np
 26
 27import torch_em
 28from torch.utils.data import Dataset, DataLoader
 29from .. import util
 30
 31
 32PROBTEM_GDRIVE_FOLDER = "1n2ZqbJEHPyMB_6a6OTBBACt5Jct2PZJc"
 33PROBTEM_DATA_ROOT = "Deeppi-EM/mitoseg_deploy/datasets/Skeletal_muscle"
 34
 35
 36def _preprocess_probtem(raw_dir, label_dir, out_dir):
 37    os.makedirs(out_dir, exist_ok=True)
 38    raw_paths = sorted(glob(os.path.join(raw_dir, "*.tif")) + glob(os.path.join(raw_dir, "*.tiff")))
 39    for rp in raw_paths:
 40        name = os.path.splitext(os.path.basename(rp))[0]
 41        out_path = os.path.join(out_dir, f"{name}.h5")
 42        if os.path.exists(out_path):
 43            continue
 44
 45        raw = imageio.imread(rp)
 46        if raw.ndim == 3:
 47            raw = raw[..., 0]
 48
 49        label_name = name.replace("x_", "y_")
 50        lp = os.path.join(label_dir, f"{label_name}.png")
 51        if not os.path.exists(lp):
 52            continue
 53
 54        labels = imageio.imread(lp)
 55        if labels.ndim == 3:
 56            labels = labels[..., 0]
 57        labels = (labels >= 127).astype(np.uint8)
 58
 59        with h5py.File(out_path, "w") as f:
 60            f.create_dataset("raw", data=raw, compression="gzip")
 61            f.create_dataset("labels", data=labels, compression="gzip")
 62
 63
 64def get_probtem_data(
 65    path: Union[os.PathLike, str],
 66    split: Literal["train", "test"] = "train",
 67    download: bool = False,
 68) -> str:
 69    """Download and preprocess the ProbTEM dataset.
 70
 71    Args:
 72        path: Filepath to a folder where the data will be saved.
 73        split: The data split to use, either "train" or "test".
 74        download: Whether to download the data if not present.
 75
 76    Returns:
 77        Path to the folder containing preprocessed HDF5 files.
 78    """
 79    processed_dir = os.path.join(str(path), "processed", split)
 80    if os.path.isdir(processed_dir) and len(glob(os.path.join(processed_dir, "*.h5"))) > 0:
 81        return processed_dir
 82
 83    raw_dir = os.path.join(str(path), PROBTEM_DATA_ROOT, split, "input")
 84    label_dir = os.path.join(str(path), PROBTEM_DATA_ROOT, split, "target")
 85
 86    if not os.path.isdir(raw_dir):
 87        if not download:
 88            raise RuntimeError(
 89                f"ProbTEM data not found at '{path}'. Set download=True or download manually from "
 90                "https://yoonlab.unist.ac.kr/index.php/research/mitochondria-tem-dataset/ "
 91                "and place in the given path."
 92            )
 93        try:
 94            import gdown
 95        except ImportError:
 96            raise ImportError("gdown is required to download ProbTEM: pip install gdown")
 97        gdown.download_folder(id=PROBTEM_GDRIVE_FOLDER, output=str(path), quiet=False)
 98
 99    _preprocess_probtem(raw_dir, label_dir, processed_dir)
100    return processed_dir
101
102
def get_probtem_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Collect the preprocessed HDF5 files of one ProbTEM split.

    The data folder is resolved via `get_probtem_data`, which downloads and
    preprocesses the data if required.

    Args:
        path: Filepath to a folder where the data will be saved.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.

    Returns:
        Sorted list of paths to the HDF5 files.
    """
    folder = get_probtem_data(path, split, download)
    h5_files = glob(os.path.join(folder, "*.h5"))
    h5_files.sort()
    return h5_files
120
121
def get_probtem_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Create the ProbTEM dataset for mitochondria segmentation in 2D TEM images.

    Args:
        path: Filepath to a folder where the data will be saved.
        patch_shape: The patch shape (H, W) for training. Must have exactly two entries.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 2
    h5_files = get_probtem_paths(path, split, download)

    # Image and mask are stored in the same HDF5 file, so the same paths are
    # passed for raw data and labels and only the keys differ.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_files,
        raw_key="raw",
        label_paths=h5_files,
        label_key="labels",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs,
    )
    return dataset
153
154
def get_probtem_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Create the DataLoader for mitochondria segmentation in the ProbTEM dataset.

    Args:
        path: Filepath to a folder where the data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (H, W) for training.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_probtem_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
PROBTEM_GDRIVE_FOLDER = '1n2ZqbJEHPyMB_6a6OTBBACt5Jct2PZJc'
PROBTEM_DATA_ROOT = 'Deeppi-EM/mitoseg_deploy/datasets/Skeletal_muscle'
def get_probtem_data( path: Union[os.PathLike, str], split: Literal['train', 'test'] = 'train', download: bool = False) -> str:
 65def get_probtem_data(
 66    path: Union[os.PathLike, str],
 67    split: Literal["train", "test"] = "train",
 68    download: bool = False,
 69) -> str:
 70    """Download and preprocess the ProbTEM dataset.
 71
 72    Args:
 73        path: Filepath to a folder where the data will be saved.
 74        split: The data split to use, either "train" or "test".
 75        download: Whether to download the data if not present.
 76
 77    Returns:
 78        Path to the folder containing preprocessed HDF5 files.
 79    """
 80    processed_dir = os.path.join(str(path), "processed", split)
 81    if os.path.isdir(processed_dir) and len(glob(os.path.join(processed_dir, "*.h5"))) > 0:
 82        return processed_dir
 83
 84    raw_dir = os.path.join(str(path), PROBTEM_DATA_ROOT, split, "input")
 85    label_dir = os.path.join(str(path), PROBTEM_DATA_ROOT, split, "target")
 86
 87    if not os.path.isdir(raw_dir):
 88        if not download:
 89            raise RuntimeError(
 90                f"ProbTEM data not found at '{path}'. Set download=True or download manually from "
 91                "https://yoonlab.unist.ac.kr/index.php/research/mitochondria-tem-dataset/ "
 92                "and place in the given path."
 93            )
 94        try:
 95            import gdown
 96        except ImportError:
 97            raise ImportError("gdown is required to download ProbTEM: pip install gdown")
 98        gdown.download_folder(id=PROBTEM_GDRIVE_FOLDER, output=str(path), quiet=False)
 99
100    _preprocess_probtem(raw_dir, label_dir, processed_dir)
101    return processed_dir

Download and preprocess the ProbTEM dataset.

Arguments:
  • path: Filepath to a folder where the data will be saved.
  • split: The data split to use, either "train" or "test".
  • download: Whether to download the data if not present.
Returns:

Path to the folder containing preprocessed HDF5 files.

def get_probtem_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'] = 'train', download: bool = False) -> List[str]:
def get_probtem_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Collect the preprocessed HDF5 files of one ProbTEM split.

    The data folder is resolved via `get_probtem_data`, which downloads and
    preprocesses the data if required.

    Args:
        path: Filepath to a folder where the data will be saved.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.

    Returns:
        Sorted list of paths to the HDF5 files.
    """
    folder = get_probtem_data(path, split, download)
    h5_files = glob(os.path.join(folder, "*.h5"))
    h5_files.sort()
    return h5_files

Get paths to ProbTEM HDF5 files.

Arguments:
  • path: Filepath to a folder where the data will be saved.
  • split: The data split to use, either "train" or "test".
  • download: Whether to download the data if not present.
Returns:

List of paths to HDF5 files.

def get_probtem_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'] = 'train', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_probtem_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Create the ProbTEM dataset for mitochondria segmentation in 2D TEM images.

    Args:
        path: Filepath to a folder where the data will be saved.
        patch_shape: The patch shape (H, W) for training. Must have exactly two entries.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 2
    h5_files = get_probtem_paths(path, split, download)

    # Image and mask are stored in the same HDF5 file, so the same paths are
    # passed for raw data and labels and only the keys differ.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_files,
        raw_key="raw",
        label_paths=h5_files,
        label_key="labels",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs,
    )
    return dataset

Get the ProbTEM dataset for mitochondria segmentation in 2D TEM images.

Arguments:
  • path: Filepath to a folder where the data will be saved.
  • patch_shape: The patch shape (H, W) for training.
  • split: The data split to use, either "train" or "test".
  • download: Whether to download the data if not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_probtem_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'] = 'train', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_probtem_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Create the DataLoader for mitochondria segmentation in the ProbTEM dataset.

    Args:
        path: Filepath to a folder where the data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (H, W) for training.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_probtem_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

Get the DataLoader for mitochondria segmentation in the ProbTEM dataset.

Arguments:
  • path: Filepath to a folder where the data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape (H, W) for training.
  • split: The data split to use, either "train" or "test".
  • download: Whether to download the data if not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.