torch_em.data.datasets.light_microscopy.mndino

The mnDINO dataset contains annotated micronuclei for training deep learning segmentation models.

The dataset comprises 232 fluorescence microscopy images of HeLa, U2OS, and RPE1 cell lines with 5,685 annotated micronuclei. Each image comes with two types of instance segmentation masks: nuclei masks (main nucleus bodies) and micronuclei masks (small nuclear fragments). Images were acquired on four different microscopy platforms.

The dataset is located at https://www.ebi.ac.uk/biostudies/bioimages/studies/S-BIAD2809. This dataset is from the publications https://doi.org/10.7554/elife.101579 and https://doi.org/10.64898/2026.03.09.710648.

Please cite them if you use this dataset for your research.

  1"""The mnDINO dataset contains annotated micronuclei for training deep learning segmentation models.
  2
  3The dataset comprises 232 fluorescence microscopy images of HeLa, U2OS, and RPE1 cell lines
  4with 5,685 annotated micronuclei. Each image comes with two types of instance segmentation masks:
  5nuclei masks (main nucleus bodies) and micronuclei masks (small nuclear fragments).
  6Images were acquired on four different microscopy platforms.
  7
  8The dataset is located at https://www.ebi.ac.uk/biostudies/bioimages/studies/S-BIAD2809.
  9This dataset is from the publication https://doi.org/10.7554/elife.101579
 10and https://doi.org/10.64898/2026.03.09.710648.
 11
 12Please cite them if you use this dataset for your research.
 13"""
 14
 15import os
 16import tarfile
 17from glob import glob
 18from typing import List, Literal, Optional, Tuple, Union
 19
 20from natsort import natsorted
 21from tqdm import tqdm
 22
 23from torch.utils.data import Dataset, DataLoader
 24
 25import torch_em
 26
 27from .. import util
 28
 29
 30URL = "https://www.ebi.ac.uk/biostudies/files/S-BIAD2809/annotated_mn_datasets.tar.gz"
 31# No checksum is published for this archive, so download verification is skipped.
 32CHECKSUM = None
 33
 34# The archive folder is "validation" but we expose it as "val" to callers.
 35_SPLIT_DIR = {"train": "train", "val": "validation", "test": "test"}
 35
 36
 37def _preprocess_data(path: str) -> None:
 38    import h5py
 39    import imageio.v3 as imageio
 40
 41    extracted_root = os.path.join(path, "annotated_mn_datasets")
 42
 43    for split_key, split_dir in _SPLIT_DIR.items():
 44        h5_dir = os.path.join(path, "h5_data", split_key)
 45        os.makedirs(h5_dir, exist_ok=True)
 46
 47        image_paths = natsorted(glob(os.path.join(extracted_root, split_dir, "images", "*.tif")))
 48        if not image_paths:
 49            continue
 50
 51        for img_path in tqdm(image_paths, desc=f"Preprocessing '{split_key}'"):
 52            fname = os.path.splitext(os.path.basename(img_path))[0]
 53            h5_path = os.path.join(h5_dir, f"{fname}.h5")
 54            if os.path.exists(h5_path):
 55                continue
 56
 57            nuclei_path = os.path.join(extracted_root, split_dir, "nuclei_masks", f"{fname}.tif")
 58            mn_path = os.path.join(extracted_root, split_dir, "mn_masks", f"{fname}.png")
 59
 60            raw = imageio.imread(img_path)
 61            nuclei_labels = imageio.imread(nuclei_path) if os.path.exists(nuclei_path) else None
 62            mn_labels = imageio.imread(mn_path) if os.path.exists(mn_path) else None
 63
 64            with h5py.File(h5_path, "w") as f:
 65                f.create_dataset("raw", data=raw, compression="gzip")
 66                if nuclei_labels is not None:
 67                    f.create_dataset("labels/nuclei", data=nuclei_labels.astype("int64"), compression="gzip")
 68                if mn_labels is not None:
 69                    f.create_dataset("labels/micronuclei", data=mn_labels.astype("int64"), compression="gzip")
 70
 71
 72def get_mndino_data(
 73    path: Union[os.PathLike, str],
 74    download: bool = False,
 75) -> str:
 76    """Download the mnDINO dataset.
 77
 78    Args:
 79        path: Filepath to a folder where the downloaded data will be saved.
 80        download: Whether to download the data if it is not present.
 81
 82    Returns:
 83        The filepath to the folder with the downloaded data.
 84    """
 85    path = str(path)
 86    os.makedirs(path, exist_ok=True)
 87
 88    extracted_root = os.path.join(path, "annotated_mn_datasets")
 89    if not os.path.exists(extracted_root):
 90        tar_path = os.path.join(path, "annotated_mn_datasets.tar.gz")
 91        util.download_source(path=tar_path, url=URL, download=download, checksum=CHECKSUM)
 92
 93        # The file is a plain tar archive despite the .tar.gz extension.
 94        with tarfile.open(tar_path, "r") as tf:
 95            tf.extractall(path)
 96        os.remove(tar_path)
 97
 98    h5_root = os.path.join(path, "h5_data")
 99    if not os.path.exists(h5_root):
100        _preprocess_data(path)
101
102    return path
103
104
105def get_mndino_paths(
106    path: Union[os.PathLike, str],
107    split: Literal["train", "val", "test"],
108    download: bool = False,
109) -> List[str]:
110    """Get paths to the mnDINO HDF5 files.
111
112    Args:
113        path: Filepath to a folder where the downloaded data will be saved.
114        split: The data split. One of 'train', 'val', or 'test'.
115        download: Whether to download the data if it is not present.
116
117    Returns:
118        List of filepaths to the HDF5 files for the given split.
119    """
120    if split not in _SPLIT_DIR:
121        raise ValueError(f"'{split}' is not a valid split. Choose from {list(_SPLIT_DIR)}.")
122
123    get_mndino_data(path, download)
124
125    h5_dir = os.path.join(path, "h5_data", split)
126    if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
127        _preprocess_data(str(path))
128
129    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
130    assert len(h5_paths) > 0, f"No data found for split '{split}' at '{h5_dir}'."
131    return h5_paths
132
133
134def get_mndino_dataset(
135    path: Union[os.PathLike, str],
136    patch_shape: Tuple[int, int],
137    split: Literal["train", "val", "test"],
138    label_choice: Literal["nuclei", "micronuclei"] = "micronuclei",
139    download: bool = False,
140    offsets: Optional[List[List[int]]] = None,
141    boundaries: bool = False,
142    binary: bool = False,
143    **kwargs,
144) -> Dataset:
145    """Get the mnDINO dataset for nucleus / micronucleus segmentation.
146
147    Args:
148        path: Filepath to a folder where the downloaded data will be saved.
149        patch_shape: The patch shape (height, width) to use for training.
150        split: The data split. One of 'train', 'val', or 'test'.
151        label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus
152            instance masks) or 'micronuclei' (micronucleus instance masks).
153        download: Whether to download the data if it is not present.
154        offsets: Offset values for affinity computation used as target.
155        boundaries: Whether to compute boundaries as the target.
156        binary: Whether to use a binary segmentation target.
157        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
158
159    Returns:
160        The segmentation dataset.
161    """
162    if label_choice == "nuclei":
163        label_key = "labels/nuclei"
164    elif label_choice == "micronuclei":
165        label_key = "labels/micronuclei"
166    else:
167        raise ValueError(f"'{label_choice}' is not a valid label_choice. Choose 'nuclei' or 'micronuclei'.")
168
169    h5_paths = get_mndino_paths(path, split, download)
170
171    kwargs, _ = util.add_instance_label_transform(
172        kwargs, add_binary_target=binary, boundaries=boundaries, offsets=offsets,
173    )
174    kwargs = util.ensure_transforms(ndim=2, **kwargs)
175
176    return torch_em.default_segmentation_dataset(
177        raw_paths=h5_paths,
178        raw_key="raw",
179        label_paths=h5_paths,
180        label_key=label_key,
181        patch_shape=patch_shape,
182        ndim=2,
183        **kwargs,
184    )
185
186
187def get_mndino_loader(
188    path: Union[os.PathLike, str],
189    batch_size: int,
190    patch_shape: Tuple[int, int],
191    split: Literal["train", "val", "test"],
192    label_choice: Literal["nuclei", "micronuclei"] = "micronuclei",
193    download: bool = False,
194    offsets: Optional[List[List[int]]] = None,
195    boundaries: bool = False,
196    binary: bool = False,
197    **kwargs,
198) -> DataLoader:
199    """Get the DataLoader for the mnDINO dataset.
200
201    Args:
202        path: Filepath to a folder where the downloaded data will be saved.
203        batch_size: The batch size for training.
204        patch_shape: The patch shape (height, width) to use for training.
205        split: The data split. One of 'train', 'val', or 'test'.
206        label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus
207            instance masks) or 'micronuclei' (micronucleus instance masks).
208        download: Whether to download the data if it is not present.
209        offsets: Offset values for affinity computation used as target.
210        boundaries: Whether to compute boundaries as the target.
211        binary: Whether to use a binary segmentation target.
212        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
213            or for the PyTorch DataLoader.
214
215    Returns:
216        The DataLoader.
217    """
218    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
219    dataset = get_mndino_dataset(
220        path=path,
221        patch_shape=patch_shape,
222        split=split,
223        label_choice=label_choice,
224        download=download,
225        offsets=offsets,
226        boundaries=boundaries,
227        binary=binary,
228        **ds_kwargs,
229    )
230    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
URL = 'https://www.ebi.ac.uk/biostudies/files/S-BIAD2809/annotated_mn_datasets.tar.gz'
CHECKSUM = None
def get_mndino_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 73def get_mndino_data(
 74    path: Union[os.PathLike, str],
 75    download: bool = False,
 76) -> str:
 77    """Download the mnDINO dataset.
 78
 79    Args:
 80        path: Filepath to a folder where the downloaded data will be saved.
 81        download: Whether to download the data if it is not present.
 82
 83    Returns:
 84        The filepath to the folder with the downloaded data.
 85    """
 86    path = str(path)
 87    os.makedirs(path, exist_ok=True)
 88
 89    extracted_root = os.path.join(path, "annotated_mn_datasets")
 90    if not os.path.exists(extracted_root):
 91        tar_path = os.path.join(path, "annotated_mn_datasets.tar.gz")
 92        util.download_source(path=tar_path, url=URL, download=download, checksum=CHECKSUM)
 93
 94        # The file is a plain tar archive despite the .tar.gz extension.
 95        with tarfile.open(tar_path, "r") as tf:
 96            tf.extractall(path)
 97        os.remove(tar_path)
 98
 99    h5_root = os.path.join(path, "h5_data")
100    if not os.path.exists(h5_root):
101        _preprocess_data(path)
102
103    return path

Download the mnDINO dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the folder with the downloaded data.

def get_mndino_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:
106def get_mndino_paths(
107    path: Union[os.PathLike, str],
108    split: Literal["train", "val", "test"],
109    download: bool = False,
110) -> List[str]:
111    """Get paths to the mnDINO HDF5 files.
112
113    Args:
114        path: Filepath to a folder where the downloaded data will be saved.
115        split: The data split. One of 'train', 'val', or 'test'.
116        download: Whether to download the data if it is not present.
117
118    Returns:
119        List of filepaths to the HDF5 files for the given split.
120    """
121    if split not in _SPLIT_DIR:
122        raise ValueError(f"'{split}' is not a valid split. Choose from {list(_SPLIT_DIR)}.")
123
124    get_mndino_data(path, download)
125
126    h5_dir = os.path.join(path, "h5_data", split)
127    if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
128        _preprocess_data(str(path))
129
130    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
131    assert len(h5_paths) > 0, f"No data found for split '{split}' at '{h5_dir}'."
132    return h5_paths

Get paths to the mnDINO HDF5 files.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split. One of 'train', 'val', or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths to the HDF5 files for the given split.

def get_mndino_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], label_choice: Literal['nuclei', 'micronuclei'] = 'micronuclei', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
135def get_mndino_dataset(
136    path: Union[os.PathLike, str],
137    patch_shape: Tuple[int, int],
138    split: Literal["train", "val", "test"],
139    label_choice: Literal["nuclei", "micronuclei"] = "micronuclei",
140    download: bool = False,
141    offsets: Optional[List[List[int]]] = None,
142    boundaries: bool = False,
143    binary: bool = False,
144    **kwargs,
145) -> Dataset:
146    """Get the mnDINO dataset for nucleus / micronucleus segmentation.
147
148    Args:
149        path: Filepath to a folder where the downloaded data will be saved.
150        patch_shape: The patch shape (height, width) to use for training.
151        split: The data split. One of 'train', 'val', or 'test'.
152        label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus
153            instance masks) or 'micronuclei' (micronucleus instance masks).
154        download: Whether to download the data if it is not present.
155        offsets: Offset values for affinity computation used as target.
156        boundaries: Whether to compute boundaries as the target.
157        binary: Whether to use a binary segmentation target.
158        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
159
160    Returns:
161        The segmentation dataset.
162    """
163    if label_choice == "nuclei":
164        label_key = "labels/nuclei"
165    elif label_choice == "micronuclei":
166        label_key = "labels/micronuclei"
167    else:
168        raise ValueError(f"'{label_choice}' is not a valid label_choice. Choose 'nuclei' or 'micronuclei'.")
169
170    h5_paths = get_mndino_paths(path, split, download)
171
172    kwargs, _ = util.add_instance_label_transform(
173        kwargs, add_binary_target=binary, boundaries=boundaries, offsets=offsets,
174    )
175    kwargs = util.ensure_transforms(ndim=2, **kwargs)
176
177    return torch_em.default_segmentation_dataset(
178        raw_paths=h5_paths,
179        raw_key="raw",
180        label_paths=h5_paths,
181        label_key=label_key,
182        patch_shape=patch_shape,
183        ndim=2,
184        **kwargs,
185    )

Get the mnDINO dataset for nucleus / micronucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape (height, width) to use for training.
  • split: The data split. One of 'train', 'val', or 'test'.
  • label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus instance masks) or 'micronuclei' (micronucleus instance masks).
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_mndino_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], label_choice: Literal['nuclei', 'micronuclei'] = 'micronuclei', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
188def get_mndino_loader(
189    path: Union[os.PathLike, str],
190    batch_size: int,
191    patch_shape: Tuple[int, int],
192    split: Literal["train", "val", "test"],
193    label_choice: Literal["nuclei", "micronuclei"] = "micronuclei",
194    download: bool = False,
195    offsets: Optional[List[List[int]]] = None,
196    boundaries: bool = False,
197    binary: bool = False,
198    **kwargs,
199) -> DataLoader:
200    """Get the DataLoader for the mnDINO dataset.
201
202    Args:
203        path: Filepath to a folder where the downloaded data will be saved.
204        batch_size: The batch size for training.
205        patch_shape: The patch shape (height, width) to use for training.
206        split: The data split. One of 'train', 'val', or 'test'.
207        label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus
208            instance masks) or 'micronuclei' (micronucleus instance masks).
209        download: Whether to download the data if it is not present.
210        offsets: Offset values for affinity computation used as target.
211        boundaries: Whether to compute boundaries as the target.
212        binary: Whether to use a binary segmentation target.
213        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
214            or for the PyTorch DataLoader.
215
216    Returns:
217        The DataLoader.
218    """
219    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
220    dataset = get_mndino_dataset(
221        path=path,
222        patch_shape=patch_shape,
223        split=split,
224        label_choice=label_choice,
225        download=download,
226        offsets=offsets,
227        boundaries=boundaries,
228        binary=binary,
229        **ds_kwargs,
230    )
231    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)

Get the DataLoader for the mnDINO dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape (height, width) to use for training.
  • split: The data split. One of 'train', 'val', or 'test'.
  • label_choice: Which segmentation target to use. Either 'nuclei' (main nucleus instance masks) or 'micronuclei' (micronucleus instance masks).
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.