torch_em.data.datasets.electron_microscopy.probtem
ProbTEM dataset for mitochondria segmentation in 2D TEM images.
The dataset contains TEM images of skeletal muscle with binary semantic segmentation masks for mitochondria (0=background, 1=mitochondria). Images are 2560 x 2560 pixels at 65 nm sample thickness acquired with a JEM-1011 microscope at 80 kV.
The dataset has 21 training and 6 test images. There is no validation split.
Masks are stored as grayscale PNGs (0=background, 255=mitochondria) with slight anti-aliased edges. They are thresholded to binary during preprocessing.
This dataset is from the publication https://doi.org/10.1038/s41598-025-03311-1. Please cite it if you use this dataset in your research.
The data is available at https://yoonlab.unist.ac.kr/index.php/research/mitochondria-tem-dataset/ and requires a Google Drive download via gdown: pip install gdown.
"""ProbTEM dataset for mitochondria segmentation in 2D TEM images.

The dataset contains TEM images of skeletal muscle with binary semantic segmentation
masks for mitochondria (0=background, 1=mitochondria). Images are 2560 x 2560 pixels
at 65 nm sample thickness acquired with a JEM-1011 microscope at 80 kV.

The dataset has 21 training and 6 test images. There is no validation split.

Masks are stored as grayscale PNGs (0=background, 255=mitochondria) with slight
anti-aliased edges. They are thresholded to binary during preprocessing.

This dataset is from the publication https://doi.org/10.1038/s41598-025-03311-1.
Please cite it if you use this dataset in your research.

The data is available at https://yoonlab.unist.ac.kr/index.php/research/mitochondria-tem-dataset/
and requires a Google Drive download via gdown: pip install gdown.
"""

import os
import warnings
from glob import glob
from typing import List, Literal, Tuple, Union

import h5py
import imageio.v3 as imageio
import numpy as np

import torch_em
from torch.utils.data import Dataset, DataLoader
from .. import util


PROBTEM_GDRIVE_FOLDER = "1n2ZqbJEHPyMB_6a6OTBBACt5Jct2PZJc"
PROBTEM_DATA_ROOT = "Deeppi-EM/mitoseg_deploy/datasets/Skeletal_muscle"


def _preprocess_probtem(raw_dir, label_dir, out_dir):
    """Convert paired raw images and masks into one HDF5 file per image.

    Every `*.tif` / `*.tiff` file in `raw_dir` is paired with the PNG in `label_dir` whose
    name is the image name with "x_" replaced by "y_". The pair is written to
    `<out_dir>/<image name>.h5` with the datasets "raw" and "labels".
    Images that already have an output file are skipped; images without a mask are
    skipped with a warning.

    Args:
        raw_dir: Folder with the raw TIFF images.
        label_dir: Folder with the PNG masks.
        out_dir: Folder where the HDF5 files are written. It is created if needed.
    """
    os.makedirs(out_dir, exist_ok=True)
    raw_paths = sorted(glob(os.path.join(raw_dir, "*.tif")) + glob(os.path.join(raw_dir, "*.tiff")))
    for raw_path in raw_paths:
        name = os.path.splitext(os.path.basename(raw_path))[0]
        out_path = os.path.join(out_dir, f"{name}.h5")
        if os.path.exists(out_path):
            continue

        # Look for the mask before reading the image, so that unpaired images cost no I/O.
        label_name = name.replace("x_", "y_")
        label_path = os.path.join(label_dir, f"{label_name}.png")
        if not os.path.exists(label_path):
            warnings.warn(f"No mask found for '{raw_path}' (expected '{label_path}'). The image is skipped.")
            continue

        raw = imageio.imread(raw_path)
        if raw.ndim == 3:  # Multi-channel image: keep only the first channel.
            raw = raw[..., 0]

        labels = imageio.imread(label_path)
        if labels.ndim == 3:
            labels = labels[..., 0]
        # The masks are 0 / 255 with anti-aliased edges; threshold them to a binary 0 / 1 mask.
        labels = (labels >= 127).astype(np.uint8)

        # Write to a temporary file and rename it afterwards, so that an interrupted run
        # never leaves a truncated file that would be mistaken for a finished one.
        tmp_path = f"{out_path}.tmp"
        with h5py.File(tmp_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels", data=labels, compression="gzip")
        os.replace(tmp_path, out_path)


def get_probtem_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    download: bool = False,
) -> str:
    """Download and preprocess the ProbTEM dataset.

    Args:
        path: Filepath to a folder where the data will be saved.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.

    Returns:
        Path to the folder containing preprocessed HDF5 files.

    Raises:
        ValueError: If `split` is neither "train" nor "test".
        RuntimeError: If the data is not present and `download` is False, if the expected
            folder is still missing after the download, or if preprocessing yields no HDF5 files.
        ImportError: If the data has to be downloaded but gdown is not installed.
    """
    if split not in ("train", "test"):
        raise ValueError(f"Invalid split '{split}'. Choose either 'train' or 'test'.")

    root = str(path)
    processed_dir = os.path.join(root, "processed", split)
    # Reuse the result of an earlier preprocessing run if there is one.
    if glob(os.path.join(processed_dir, "*.h5")):
        return processed_dir

    raw_dir = os.path.join(root, PROBTEM_DATA_ROOT, split, "input")
    label_dir = os.path.join(root, PROBTEM_DATA_ROOT, split, "target")

    if not os.path.isdir(raw_dir):
        if not download:
            raise RuntimeError(
                f"ProbTEM data not found at '{path}'. Set download=True or download manually from "
                "https://yoonlab.unist.ac.kr/index.php/research/mitochondria-tem-dataset/ "
                "and place in the given path."
            )
        try:
            import gdown
        except ImportError as err:
            raise ImportError("gdown is required to download ProbTEM: pip install gdown") from err
        gdown.download_folder(id=PROBTEM_GDRIVE_FOLDER, output=root, quiet=False)

        # Fail here with a clear message instead of silently preprocessing nothing.
        if not os.path.isdir(raw_dir):
            raise RuntimeError(
                f"The ProbTEM download finished, but the expected folder '{raw_dir}' does not exist."
            )

    _preprocess_probtem(raw_dir, label_dir, processed_dir)
    if not glob(os.path.join(processed_dir, "*.h5")):
        raise RuntimeError(
            f"Preprocessing ProbTEM did not produce any HDF5 file in '{processed_dir}'. "
            f"Check the contents of '{raw_dir}' and '{label_dir}'."
        )
    return processed_dir


def get_probtem_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to ProbTEM HDF5 files.

    Args:
        path: Filepath to a folder where the data will be saved.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.

    Returns:
        List of paths to HDF5 files.
    """
    processed_dir = get_probtem_data(path, split, download)
    return sorted(glob(os.path.join(processed_dir, "*.h5")))


def get_probtem_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the ProbTEM dataset for mitochondria segmentation in 2D TEM images.

    Args:
        path: Filepath to a folder where the data will be saved.
        patch_shape: The patch shape (H, W) for training.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If `patch_shape` does not have exactly two entries.
    """
    # Raise explicitly: an assert would be stripped when running with `python -O`.
    if len(patch_shape) != 2:
        raise ValueError(f"Expected a 2D patch shape (H, W), got {tuple(patch_shape)}.")
    data_paths = get_probtem_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key="labels",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs,
    )


def get_probtem_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for mitochondria segmentation in the ProbTEM dataset.

    Args:
        path: Filepath to a folder where the data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (H, W) for training.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_probtem_dataset(path=path, patch_shape=patch_shape, split=split, download=download, **ds_kwargs)
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
def get_probtem_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    download: bool = False,
) -> str:
    """Download and preprocess the ProbTEM dataset.

    Args:
        path: Filepath to a folder where the data will be saved.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.

    Returns:
        Path to the folder containing preprocessed HDF5 files.

    Raises:
        ValueError: If `split` is neither "train" nor "test".
        RuntimeError: If the data is not present and `download` is False, if the expected
            folder is still missing after the download, or if preprocessing yields no HDF5 files.
        ImportError: If the data has to be downloaded but gdown is not installed.
    """
    if split not in ("train", "test"):
        raise ValueError(f"Invalid split '{split}'. Choose either 'train' or 'test'.")

    root = str(path)
    processed_dir = os.path.join(root, "processed", split)
    # Reuse the result of an earlier preprocessing run if there is one.
    if glob(os.path.join(processed_dir, "*.h5")):
        return processed_dir

    raw_dir = os.path.join(root, PROBTEM_DATA_ROOT, split, "input")
    label_dir = os.path.join(root, PROBTEM_DATA_ROOT, split, "target")

    if not os.path.isdir(raw_dir):
        if not download:
            raise RuntimeError(
                f"ProbTEM data not found at '{path}'. Set download=True or download manually from "
                "https://yoonlab.unist.ac.kr/index.php/research/mitochondria-tem-dataset/ "
                "and place in the given path."
            )
        try:
            import gdown
        except ImportError as err:
            raise ImportError("gdown is required to download ProbTEM: pip install gdown") from err
        gdown.download_folder(id=PROBTEM_GDRIVE_FOLDER, output=root, quiet=False)

        # Fail here with a clear message instead of silently preprocessing nothing.
        if not os.path.isdir(raw_dir):
            raise RuntimeError(
                f"The ProbTEM download finished, but the expected folder '{raw_dir}' does not exist."
            )

    _preprocess_probtem(raw_dir, label_dir, processed_dir)
    if not glob(os.path.join(processed_dir, "*.h5")):
        raise RuntimeError(
            f"Preprocessing ProbTEM did not produce any HDF5 file in '{processed_dir}'. "
            f"Check the contents of '{raw_dir}' and '{label_dir}'."
        )
    return processed_dir
Download and preprocess the ProbTEM dataset.
Arguments:
- path: Filepath to a folder where the data will be saved.
- split: The data split to use, either "train" or "test".
- download: Whether to download the data if not present.
Returns:
Path to the folder containing preprocessed HDF5 files.
def get_probtem_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Return the sorted paths of the preprocessed ProbTEM HDF5 files for one split.

    Args:
        path: Filepath to a folder where the data will be saved.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.

    Returns:
        List of paths to HDF5 files.
    """
    # get_probtem_data returns the folder that holds the preprocessed files.
    h5_pattern = os.path.join(get_probtem_data(path, split, download), "*.h5")
    h5_files = glob(h5_pattern)
    h5_files.sort()
    return h5_files
Get paths to ProbTEM HDF5 files.
Arguments:
- path: Filepath to a folder where the data will be saved.
- split: The data split to use, either "train" or "test".
- download: Whether to download the data if not present.
Returns:
List of paths to HDF5 files.
def get_probtem_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the ProbTEM dataset for mitochondria segmentation in 2D TEM images.

    Args:
        path: Filepath to a folder where the data will be saved.
        patch_shape: The patch shape (H, W) for training.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If `patch_shape` does not have exactly two entries.
    """
    # Raise explicitly: an assert would be stripped when running with `python -O`.
    if len(patch_shape) != 2:
        raise ValueError(f"Expected a 2D patch shape (H, W), got {tuple(patch_shape)}.")
    data_paths = get_probtem_paths(path, split, download)

    # Raw data and labels are stored in the same HDF5 file under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key="labels",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs,
    )
Get the ProbTEM dataset for mitochondria segmentation in 2D TEM images.
Arguments:
- path: Filepath to a folder where the data will be saved.
- patch_shape: The patch shape (H, W) for training.
- split: The data split to use, either "train" or "test".
- download: Whether to download the data if not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_probtem_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build a DataLoader over the ProbTEM mitochondria segmentation dataset.

    Args:
        path: Filepath to a folder where the data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (H, W) for training.
        split: The data split to use, either "train" or "test".
        download: Whether to download the data if not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split kwargs into those for the dataset and those passed on to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_probtem_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
Get the DataLoader for mitochondria segmentation in the ProbTEM dataset.
Arguments:
- path: Filepath to a folder where the data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape (H, W) for training.
- split: The data split to use, either "train" or "test".
- download: Whether to download the data if not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.