torch_em.data.datasets.light_microscopy.morphoseg

The MorphoSeg dataset contains bright-field microscopy images of NTERA-2 (NT2) human preneuronal embryonic cells at day 11 and day 12 of all-trans-retinoic acid differentiation, annotated for cell instance segmentation.

Images were acquired with a Leica DM IRB bright-field microscope (10x and 20x) and a Google Pixel 4 mobile phone camera. The dataset has 36 annotated training images and an unannotated test set.

Note: annotations are sparse - only a subset of the visible cells in each image is labeled (the labels cover ~10% of the pixels, although ~20% of the image content is cell-like).

The dataset is located at https://doi.org/10.15131/shef.data.25604421. This dataset is from the following publication:

Zhang et al. (2025): https://doi.org/10.1016/j.neucom.2025.130511. Please cite it if you use this dataset in your research.

View Source

  1"""The MorphoSeg dataset contains bright-field microscopy images of NTERA-2 (NT2)
  2human preneuronal embryonic cells at day 11 and day 12 of all-trans-retinoic acid
  3differentiation, annotated for cell instance segmentation.
  4
  5Images were acquired with a Leica DM IRB bright-field microscope (10x and 20x) and
  6a Google Pixel 4 mobile phone camera. The dataset has 36 annotated training images
  7and an unannotated test set.
  8
  9Note: annotations are sparse - only a subset of the visible cells in each image
 10are labeled (~10% pixel coverage despite ~20% cell-like content).
 11
 12The dataset is located at https://doi.org/10.15131/shef.data.25604421.
 13This dataset is from the following publication:
 14- Zhang et al. (2025): https://doi.org/10.1016/j.neucom.2025.130511
 15Please cite it if you use this dataset in your research.
 16"""
 17
 18import os
 19import json
 20from glob import glob
 21from natsort import natsorted
 22from typing import List, Tuple, Union
 23
 24import numpy as np
 25
 26from torch.utils.data import Dataset, DataLoader
 27
 28import torch_em
 29
 30from .. import util
 31
 32
 33URLS = {
 34    "train": "https://ndownloader.figshare.com/files/45654198",
 35    "test": "https://ndownloader.figshare.com/files/45654201",
 36    "rois": "https://ndownloader.figshare.com/files/45654207",
 37}
 38CHECKSUMS = {
 39    "train": None,
 40    "test": None,
 41    "rois": None,
 42}
 43
 44
 45def _rois_to_masks(data_dir: str) -> None:
 46    """Convert polygon ROI JSON files to per-image instance segmentation TIF masks."""
 47    import imageio.v3 as imageio
 48    from skimage.draw import polygon as draw_polygon
 49
 50    roi_dir = os.path.join(data_dir, "roi_jsons_combined")
 51    mask_dir = os.path.join(data_dir, "masks")
 52    os.makedirs(mask_dir, exist_ok=True)
 53
 54    img_dir = os.path.join(data_dir, "training_dataset")
 55    for json_path in natsorted(glob(os.path.join(roi_dir, "*_ROI.json"))):
 56        stem = os.path.basename(json_path).replace("_ROI.json", "")
 57        img_path = os.path.join(img_dir, stem + ".tif")
 58        if not os.path.exists(img_path):
 59            # Try .MP.tif variant.
 60            img_path = os.path.join(img_dir, stem + ".MP.tif")
 61            if not os.path.exists(img_path):
 62                continue
 63
 64        img = imageio.imread(img_path)
 65        h, w = img.shape[:2]
 66
 67        with open(json_path) as f:
 68            rois = json.load(f)
 69
 70        mask = np.zeros((h, w), dtype=np.int32)
 71        for instance_id, roi in enumerate(rois, start=1):
 72            pts = np.array(roi["points"])  # [[x, y], ...]
 73            rr, cc = draw_polygon(pts[:, 1], pts[:, 0], shape=(h, w))
 74            mask[rr, cc] = instance_id
 75
 76        imageio.imwrite(os.path.join(mask_dir, stem + "_mask.tif"), mask)
 77
 78
 79def get_morphoseg_data(path: Union[os.PathLike, str], split: str, download: bool = False) -> str:
 80    """Download the MorphoSeg (NTERA-2) dataset.
 81
 82    Args:
 83        path: Filepath to a folder where the downloaded data will be saved.
 84        split: The data split. Either 'train' or 'test'.
 85        download: Whether to download the data if it is not present.
 86
 87    Returns:
 88        The filepath to the extracted data directory.
 89    """
 90    assert split in ("train", "test"), f"'{split}' is not a valid split. Choose 'train' or 'test'."
 91
 92    data_dir = os.path.join(path, split)
 93    if os.path.exists(data_dir):
 94        return data_dir
 95
 96    os.makedirs(path, exist_ok=True)
 97    zip_path = os.path.join(path, f"{split}_dataset.zip")
 98    util.download_source(zip_path, URLS[split], download, checksum=CHECKSUMS[split])
 99    util.unzip(zip_path, data_dir)
100
101    if split == "train":
102        roi_zip = os.path.join(path, "Training_ROIs_json.zip")
103        util.download_source(roi_zip, URLS["rois"], download, checksum=CHECKSUMS["rois"])
104        util.unzip(roi_zip, data_dir)
105        _rois_to_masks(data_dir)
106
107    return data_dir
108
109
def get_morphoseg_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the MorphoSeg (NTERA-2) data.

    NOTE: Only the training split has segmentation masks (36 annotated images).
    The test split contains images without annotations, so requesting it raises an error.

    Each mask `<stem>_mask.tif` in the 'masks' folder is paired with the image `<stem>.tif`
    from the 'training_dataset' folder, or with `<stem>.MP.tif` if the former does not exist.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        RuntimeError: If the test split is requested, if no mask files are found,
            or if the image for at least one mask is missing.
    """
    if split == "test":
        raise RuntimeError(
            "The MorphoSeg test split does not contain segmentation masks - only images are available."
        )

    data_dir = get_morphoseg_data(path, split, download)
    mask_dir = os.path.join(data_dir, "masks")
    img_dir = os.path.join(data_dir, "training_dataset")

    found_masks = glob(os.path.join(mask_dir, "*_mask.tif")) if os.path.isdir(mask_dir) else []
    if not found_masks:
        raise RuntimeError(
            f"No mask files found in {mask_dir}. Check the dataset structure after downloading."
        )
    label_paths = natsorted(found_masks)

    def _image_for(mask_path: str) -> str:
        # Prefer '<stem>.tif' and fall back to the '<stem>.MP.tif' naming variant.
        stem = os.path.basename(mask_path).replace("_mask.tif", "")
        plain = os.path.join(img_dir, stem + ".tif")
        return plain if os.path.exists(plain) else os.path.join(img_dir, stem + ".MP.tif")

    raw_paths = [_image_for(mask_path) for mask_path in label_paths]

    # The fallback name is not checked above, so verify all images in one pass.
    missing = [image_path for image_path in raw_paths if not os.path.exists(image_path)]
    if missing:
        raise RuntimeError(
            f"{len(missing)} image file(s) not found for their masks. First missing: {missing[0]}"
        )

    return raw_paths, label_paths
160
161
def get_morphoseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: str = "train",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MorphoSeg dataset for bright-field NTERA-2 cell instance segmentation.

    The image and mask paths are resolved with `get_morphoseg_paths` and handed to
    `torch_em.default_segmentation_dataset`; both are addressed as plain files (no internal key).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_morphoseg_paths(path, split, download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
191
192
def get_morphoseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: str = "train",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MorphoSeg dataloader for bright-field NTERA-2 cell instance segmentation.

    The keyword arguments are divided with `util.split_kwargs`: one part is passed on to
    `get_morphoseg_dataset`, the remainder to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_morphoseg_dataset(path, patch_shape, split, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs,
    )

URLS = {'train': 'https://ndownloader.figshare.com/files/45654198', 'test': 'https://ndownloader.figshare.com/files/45654201', 'rois': 'https://ndownloader.figshare.com/files/45654207'}

CHECKSUMS = {'train': None, 'test': None, 'rois': None}

def get_morphoseg_data(path: Union[os.PathLike, str], split: str, download: bool = False) -> str: View Source

 80def get_morphoseg_data(path: Union[os.PathLike, str], split: str, download: bool = False) -> str:
 81    """Download the MorphoSeg (NTERA-2) dataset.
 82
 83    Args:
 84        path: Filepath to a folder where the downloaded data will be saved.
 85        split: The data split. Either 'train' or 'test'.
 86        download: Whether to download the data if it is not present.
 87
 88    Returns:
 89        The filepath to the extracted data directory.
 90    """
 91    assert split in ("train", "test"), f"'{split}' is not a valid split. Choose 'train' or 'test'."
 92
 93    data_dir = os.path.join(path, split)
 94    if os.path.exists(data_dir):
 95        return data_dir
 96
 97    os.makedirs(path, exist_ok=True)
 98    zip_path = os.path.join(path, f"{split}_dataset.zip")
 99    util.download_source(zip_path, URLS[split], download, checksum=CHECKSUMS[split])
100    util.unzip(zip_path, data_dir)
101
102    if split == "train":
103        roi_zip = os.path.join(path, "Training_ROIs_json.zip")
104        util.download_source(roi_zip, URLS["rois"], download, checksum=CHECKSUMS["rois"])
105        util.unzip(roi_zip, data_dir)
106        _rois_to_masks(data_dir)
107
108    return data_dir

Download the MorphoSeg (NTERA-2) dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split. Either 'train' or 'test'.
download: Whether to download the data if it is not present.

Returns:

The filepath to the extracted data directory.

def get_morphoseg_paths( path: Union[os.PathLike, str], split: str, download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_morphoseg_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the MorphoSeg (NTERA-2) data.

    NOTE: Only the training split has segmentation masks (36 annotated images).
    The test split contains images without annotations, so requesting it raises an error.

    Each mask `<stem>_mask.tif` in the 'masks' folder is paired with the image `<stem>.tif`
    from the 'training_dataset' folder, or with `<stem>.MP.tif` if the former does not exist.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        RuntimeError: If the test split is requested, if no mask files are found,
            or if the image for at least one mask is missing.
    """
    if split == "test":
        raise RuntimeError(
            "The MorphoSeg test split does not contain segmentation masks - only images are available."
        )

    data_dir = get_morphoseg_data(path, split, download)
    mask_dir = os.path.join(data_dir, "masks")
    img_dir = os.path.join(data_dir, "training_dataset")

    found_masks = glob(os.path.join(mask_dir, "*_mask.tif")) if os.path.isdir(mask_dir) else []
    if not found_masks:
        raise RuntimeError(
            f"No mask files found in {mask_dir}. Check the dataset structure after downloading."
        )
    label_paths = natsorted(found_masks)

    def _image_for(mask_path: str) -> str:
        # Prefer '<stem>.tif' and fall back to the '<stem>.MP.tif' naming variant.
        stem = os.path.basename(mask_path).replace("_mask.tif", "")
        plain = os.path.join(img_dir, stem + ".tif")
        return plain if os.path.exists(plain) else os.path.join(img_dir, stem + ".MP.tif")

    raw_paths = [_image_for(mask_path) for mask_path in label_paths]

    # The fallback name is not checked above, so verify all images in one pass.
    missing = [image_path for image_path in raw_paths if not os.path.exists(image_path)]
    if missing:
        raise RuntimeError(
            f"{len(missing)} image file(s) not found for their masks. First missing: {missing[0]}"
        )

    return raw_paths, label_paths

Get paths to the MorphoSeg (NTERA-2) data.

NOTE: Only the training split has segmentation masks (36 annotated images). The test split contains images without annotations.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split. Either 'train' or 'test'.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_morphoseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: str = 'train', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_morphoseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: str = "train",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MorphoSeg dataset for bright-field NTERA-2 cell instance segmentation.

    The image and mask paths are resolved with `get_morphoseg_paths` and handed to
    `torch_em.default_segmentation_dataset`; both are addressed as plain files (no internal key).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_morphoseg_paths(path, split, download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset

Get the MorphoSeg dataset for bright-field NTERA-2 cell instance segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The data split. Either 'train' or 'test'.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_morphoseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: str = 'train', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_morphoseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: str = "train",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MorphoSeg dataloader for bright-field NTERA-2 cell instance segmentation.

    The keyword arguments are divided with `util.split_kwargs`: one part is passed on to
    `get_morphoseg_dataset`, the remainder to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_morphoseg_dataset(path, patch_shape, split, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs,
    )

Get the MorphoSeg dataloader for bright-field NTERA-2 cell instance segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The data split. Either 'train' or 'test'.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.