torch_em.data.datasets.light_microscopy.mndino
The mnDINO dataset contains annotated micronuclei for training deep learning segmentation models.
The dataset comprises 232 fluorescence microscopy images of HeLa, U2OS, and RPE1 cell lines with 5,685 annotated micronuclei. Each image comes with two types of instance segmentation masks: nuclei masks (main nucleus bodies) and micronuclei masks (small nuclear fragments). Images were acquired on four different microscopy platforms.
The dataset is located at https://www.ebi.ac.uk/biostudies/bioimages/studies/S-BIAD2809. This dataset is from the publications https://doi.org/10.7554/elife.101579 and https://doi.org/10.64898/2026.03.09.710648.
Please cite them if you use this dataset for your research.
"""The mnDINO dataset contains annotated micronuclei for training deep learning segmentation models.

The dataset comprises 232 fluorescence microscopy images of HeLa, U2OS, and RPE1 cell lines
with 5,685 annotated micronuclei. Each image comes with two types of instance segmentation masks:
nuclei masks (main nucleus bodies) and micronuclei masks (small nuclear fragments).
Images were acquired on four different microscopy platforms.

The dataset is located at https://www.ebi.ac.uk/biostudies/bioimages/studies/S-BIAD2809.
This dataset is from the publication https://doi.org/10.7554/elife.101579
and https://doi.org/10.64898/2026.03.09.710648.

Please cite them if you use this dataset for your research.
"""

import os
import tarfile
from glob import glob
from typing import List, Literal, Optional, Tuple, Union

from natsort import natsorted
from tqdm import tqdm

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://www.ebi.ac.uk/biostudies/files/S-BIAD2809/annotated_mn_datasets.tar.gz"
CHECKSUM = None

# The archive folder is "validation" but we expose it as "val" to callers.
_SPLIT_DIR = {"train": "train", "val": "validation", "test": "test"}

# Maps the user-facing label choice to the HDF5 key holding that segmentation.
_LABEL_KEYS = {"nuclei": "labels/nuclei", "micronuclei": "labels/micronuclei"}


def _preprocess_data(path: str) -> None:
    """Convert the extracted tif/png images and masks into per-image HDF5 files.

    For every image in '<path>/annotated_mn_datasets/<split>/images' a file
    '<path>/h5_data/<split_key>/<name>.h5' is created containing the dataset
    'raw' and, where the corresponding mask file exists, 'labels/nuclei'
    and 'labels/micronuclei'. Existing HDF5 files are left untouched, so the
    conversion can be resumed after an interruption.

    Args:
        path: Root folder that contains the extracted 'annotated_mn_datasets' directory.
    """
    # Local imports keep these optional dependencies out of module import time.
    import h5py
    import imageio.v3 as imageio

    extracted_root = os.path.join(path, "annotated_mn_datasets")

    for split_key, split_dir in _SPLIT_DIR.items():
        h5_dir = os.path.join(path, "h5_data", split_key)
        os.makedirs(h5_dir, exist_ok=True)

        image_paths = natsorted(glob(os.path.join(extracted_root, split_dir, "images", "*.tif")))
        if not image_paths:
            continue

        for img_path in tqdm(image_paths, desc=f"Preprocessing '{split_key}'"):
            fname = os.path.splitext(os.path.basename(img_path))[0]
            h5_path = os.path.join(h5_dir, f"{fname}.h5")
            # Skip images already converted in a previous (possibly interrupted) run.
            if os.path.exists(h5_path):
                continue

            # NOTE: nuclei masks are stored as tif, micronuclei masks as png in the archive.
            nuclei_path = os.path.join(extracted_root, split_dir, "nuclei_masks", f"{fname}.tif")
            mn_path = os.path.join(extracted_root, split_dir, "mn_masks", f"{fname}.png")

            raw = imageio.imread(img_path)
            nuclei_labels = imageio.imread(nuclei_path) if os.path.exists(nuclei_path) else None
            mn_labels = imageio.imread(mn_path) if os.path.exists(mn_path) else None

            with h5py.File(h5_path, "w") as f:
                f.create_dataset("raw", data=raw, compression="gzip")
                if nuclei_labels is not None:
                    f.create_dataset("labels/nuclei", data=nuclei_labels.astype("int64"), compression="gzip")
                if mn_labels is not None:
                    f.create_dataset("labels/micronuclei", data=mn_labels.astype("int64"), compression="gzip")


def get_mndino_data(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> str:
    """Download the mnDINO dataset.

    Downloads and extracts the archive on first use and converts the images
    and masks to HDF5; subsequent calls are cheap no-ops.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder with the downloaded data.
    """
    path = str(path)
    os.makedirs(path, exist_ok=True)

    extracted_root = os.path.join(path, "annotated_mn_datasets")
    if not os.path.exists(extracted_root):
        tar_path = os.path.join(path, "annotated_mn_datasets.tar.gz")
        util.download_source(path=tar_path, url=URL, download=download, checksum=CHECKSUM)

        # The file is a plain tar archive despite the .tar.gz extension,
        # so we rely on mode "r" with transparent compression detection.
        with tarfile.open(tar_path, "r") as tf:
            # 'filter="data"' rejects path-traversal and other unsafe members
            # (CVE-2007-4559). The argument only exists from Python 3.12 on,
            # so fall back to the unfiltered extraction on older versions.
            try:
                tf.extractall(path, filter="data")
            except TypeError:
                tf.extractall(path)
        os.remove(tar_path)

    h5_root = os.path.join(path, "h5_data")
    if not os.path.exists(h5_root):
        _preprocess_data(path)

    return path


def get_mndino_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    download: bool = False,
) -> List[str]:
    """Get paths to the mnDINO HDF5 files.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the HDF5 files for the given split.

    Raises:
        ValueError: If 'split' is not one of 'train', 'val', or 'test'.
    """
    if split not in _SPLIT_DIR:
        raise ValueError(f"'{split}' is not a valid split. Choose from {list(_SPLIT_DIR)}.")

    get_mndino_data(path, download)

    h5_dir = os.path.join(path, "h5_data", split)
    # Re-run the preprocessing if the split folder is missing or empty,
    # e.g. after a partially completed earlier conversion.
    if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
        _preprocess_data(str(path))

    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
    assert len(h5_paths) > 0, f"No data found for split '{split}' at '{h5_dir}'."
    return h5_paths


def get_mndino_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    label_choice: Literal["nuclei", "micronuclei"] = "micronuclei",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the mnDINO dataset for nucleus / micronucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape (height, width) to use for training.
        split: The data split. One of 'train', 'val', or 'test'.
        label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus
            instance masks) or 'micronuclei' (micronucleus instance masks).
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If 'label_choice' is neither 'nuclei' nor 'micronuclei'.
    """
    if label_choice not in _LABEL_KEYS:
        raise ValueError(f"'{label_choice}' is not a valid label_choice. Choose 'nuclei' or 'micronuclei'.")
    label_key = _LABEL_KEYS[label_choice]

    h5_paths = get_mndino_paths(path, split, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=binary, boundaries=boundaries, offsets=offsets,
    )
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        ndim=2,
        **kwargs,
    )


def get_mndino_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    label_choice: Literal["nuclei", "micronuclei"] = "micronuclei",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for the mnDINO dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (height, width) to use for training.
        split: The data split. One of 'train', 'val', or 'test'.
        label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus
            instance masks) or 'micronuclei' (micronucleus instance masks).
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate dataset-construction kwargs from DataLoader kwargs.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_mndino_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        label_choice=label_choice,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
def get_mndino_data(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> str:
    """Download and unpack the mnDINO dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder with the downloaded data.
    """
    root = str(path)
    os.makedirs(root, exist_ok=True)

    # Fetch and unpack the archive only when it has not been extracted yet.
    if not os.path.exists(os.path.join(root, "annotated_mn_datasets")):
        archive = os.path.join(root, "annotated_mn_datasets.tar.gz")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)

        # Despite its .tar.gz name the file is an uncompressed tar archive;
        # mode "r" handles either transparently.
        with tarfile.open(archive, "r") as archive_handle:
            archive_handle.extractall(root)
        os.remove(archive)

    # Convert images and masks to HDF5 the first time the data is used.
    if not os.path.exists(os.path.join(root, "h5_data")):
        _preprocess_data(root)

    return root
Download the mnDINO dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the folder with the downloaded data.
def get_mndino_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    download: bool = False,
) -> List[str]:
    """Get paths to the mnDINO HDF5 files.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. One of 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the HDF5 files for the given split.
    """
    if split not in _SPLIT_DIR:
        raise ValueError(f"'{split}' is not a valid split. Choose from {list(_SPLIT_DIR)}.")

    get_mndino_data(path, download)

    h5_dir = os.path.join(path, "h5_data", split)
    pattern = os.path.join(h5_dir, "*.h5")

    # Regenerate the HDF5 files when the split folder is missing or empty.
    if not (os.path.exists(h5_dir) and len(glob(pattern)) > 0):
        _preprocess_data(str(path))

    file_paths = natsorted(glob(pattern))
    assert len(file_paths) > 0, f"No data found for split '{split}' at '{h5_dir}'."
    return file_paths
Get paths to the mnDINO HDF5 files.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split. One of 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the HDF5 files for the given split.
def get_mndino_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    label_choice: Literal["nuclei", "micronuclei"] = "micronuclei",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the mnDINO dataset for nucleus / micronucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape (height, width) to use for training.
        split: The data split. One of 'train', 'val', or 'test'.
        label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus
            instance masks) or 'micronuclei' (micronucleus instance masks).
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Resolve the requested target to its HDF5 dataset key.
    label_keys = {"nuclei": "labels/nuclei", "micronuclei": "labels/micronuclei"}
    if label_choice not in label_keys:
        raise ValueError(f"'{label_choice}' is not a valid label_choice. Choose 'nuclei' or 'micronuclei'.")
    label_key = label_keys[label_choice]

    data_paths = get_mndino_paths(path, split, download)

    # Wire up the requested instance-segmentation targets and default transforms.
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=binary, boundaries=boundaries, offsets=offsets,
    )
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        ndim=2,
        **kwargs,
    )
Get the mnDINO dataset for nucleus / micronucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape (height, width) to use for training.
- split: The data split. One of 'train', 'val', or 'test'.
- label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus instance masks) or 'micronuclei' (micronucleus instance masks).
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_mndino_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    label_choice: Literal["nuclei", "micronuclei"] = "micronuclei",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for the mnDINO dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (height, width) to use for training.
        split: The data split. One of 'train', 'val', or 'test'.
        label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus
            instance masks) or 'micronuclei' (micronucleus instance masks).
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route each extra kwarg to either the dataset constructor or the DataLoader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_mndino_dataset(
        path=path, patch_shape=patch_shape, split=split, label_choice=label_choice,
        download=download, offsets=offsets, boundaries=boundaries, binary=binary,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(dataset=ds, batch_size=batch_size, **loader_kwargs)
Get the DataLoader for the mnDINO dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape (height, width) to use for training.
- split: The data split. One of 'train', 'val', or 'test'.
- label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus instance masks) or 'micronuclei' (micronucleus instance masks).
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.