torch_em.data.datasets.light_microscopy.morphoseg
The MorphoSeg dataset contains bright-field microscopy images of NTERA-2 (NT2) human preneuronal embryonic cells at day 11 and day 12 of all-trans-retinoic acid differentiation, annotated for cell instance segmentation.
Images were acquired with a Leica DM IRB bright-field microscope (10x and 20x) and a Google Pixel 4 mobile phone camera. The dataset has 36 annotated training images and an unannotated test set.
Note: annotations are sparse - only a subset of the visible cells in each image are labeled (~10% pixel coverage despite ~20% cell-like content).
The dataset is located at https://doi.org/10.15131/shef.data.25604421. This dataset is from the following publication:
- Zhang et al. (2025): https://doi.org/10.1016/j.neucom.2025.130511 Please cite it if you use this dataset in your research.
"""The MorphoSeg dataset contains bright-field microscopy images of NTERA-2 (NT2)
human preneuronal embryonic cells at day 11 and day 12 of all-trans-retinoic acid
differentiation, annotated for cell instance segmentation.

Images were acquired with a Leica DM IRB bright-field microscope (10x and 20x) and
a Google Pixel 4 mobile phone camera. The dataset has 36 annotated training images
and an unannotated test set.

Note: annotations are sparse - only a subset of the visible cells in each image
are labeled (~10% pixel coverage despite ~20% cell-like content).

The dataset is located at https://doi.org/10.15131/shef.data.25604421.
This dataset is from the following publication:
- Zhang et al. (2025): https://doi.org/10.1016/j.neucom.2025.130511
Please cite it if you use this dataset in your research.
"""

import os
import json
from glob import glob
from natsort import natsorted
from typing import List, Tuple, Union

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URLS = {
    "train": "https://ndownloader.figshare.com/files/45654198",
    "test": "https://ndownloader.figshare.com/files/45654201",
    "rois": "https://ndownloader.figshare.com/files/45654207",
}
CHECKSUMS = {
    "train": None,
    "test": None,
    "rois": None,
}


def _rois_to_masks(data_dir: str) -> None:
    """Convert polygon ROI JSON files to per-image instance segmentation TIF masks.

    Every `*_ROI.json` file in `<data_dir>/roi_jsons_combined` is matched to the image
    `<stem>.tif` (or `<stem>.MP.tif`) in `<data_dir>/training_dataset`. ROI files without
    a matching image are skipped. Each ROI is expected to provide a "points" entry holding
    its polygon vertices as [x, y] pairs. The polygons are rasterized into one label image
    per input image, which is written to `<data_dir>/masks/<stem>_mask.tif`.

    Instance ids follow the ROI order in the JSON file, starting at 1. Where polygons
    overlap, the later ROI overwrites the earlier one.

    Args:
        data_dir: The folder containing the extracted training images and ROI files.
    """
    # Imported lazily so that these packages are only needed for the one-time conversion.
    import imageio.v3 as imageio
    from skimage.draw import polygon as draw_polygon

    roi_dir = os.path.join(data_dir, "roi_jsons_combined")
    mask_dir = os.path.join(data_dir, "masks")
    os.makedirs(mask_dir, exist_ok=True)

    img_dir = os.path.join(data_dir, "training_dataset")
    for json_path in natsorted(glob(os.path.join(roi_dir, "*_ROI.json"))):
        stem = os.path.basename(json_path).replace("_ROI.json", "")
        img_path = os.path.join(img_dir, stem + ".tif")
        if not os.path.exists(img_path):
            # Try .MP.tif variant.
            img_path = os.path.join(img_dir, stem + ".MP.tif")
            if not os.path.exists(img_path):
                continue

        # The image is only read to obtain the spatial shape of the mask.
        img = imageio.imread(img_path)
        h, w = img.shape[:2]

        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(json_path, encoding="utf-8") as f:
            rois = json.load(f)

        mask = np.zeros((h, w), dtype=np.int32)
        for instance_id, roi in enumerate(rois, start=1):
            pts = np.asarray(roi["points"])  # [[x, y], ...]
            if pts.ndim != 2 or pts.shape[0] == 0:
                # A ROI without vertices cannot be indexed as (N, 2) and covers no pixels.
                continue
            # Rows are the y coordinates and columns the x coordinates;
            # passing the shape keeps the rasterized pixels inside the image.
            rr, cc = draw_polygon(pts[:, 1], pts[:, 0], shape=(h, w))
            mask[rr, cc] = instance_id

        imageio.imwrite(os.path.join(mask_dir, stem + "_mask.tif"), mask)


def get_morphoseg_data(path: Union[os.PathLike, str], split: str, download: bool = False) -> str:
    """Download the MorphoSeg (NTERA-2) dataset.

    The images are extracted to `<path>/<split>`. For the 'train' split the polygon ROI
    annotations are fetched as well and converted to instance masks in `<path>/train/masks`.
    Steps whose outputs are already on disk are skipped, so a setup that was interrupted
    after the image extraction is completed by the next call.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    assert split in ("train", "test"), f"'{split}' is not a valid split. Choose 'train' or 'test'."

    data_dir = os.path.join(path, split)

    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        zip_path = os.path.join(path, f"{split}_dataset.zip")
        util.download_source(zip_path, URLS[split], download, checksum=CHECKSUMS[split])
        util.unzip(zip_path, data_dir)

    # Only the training images are annotated. The masks are checked separately from the
    # image folder, because the folder exists as soon as the images have been extracted.
    if split == "train" and not glob(os.path.join(data_dir, "masks", "*_mask.tif")):
        if not os.path.isdir(os.path.join(data_dir, "roi_jsons_combined")):
            roi_zip = os.path.join(path, "Training_ROIs_json.zip")
            util.download_source(roi_zip, URLS["rois"], download, checksum=CHECKSUMS["rois"])
            util.unzip(roi_zip, data_dir)
        _rois_to_masks(data_dir)

    return data_dir


def get_morphoseg_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the MorphoSeg (NTERA-2) data.

    NOTE: Only the training split has segmentation masks (36 annotated images).
    The test split contains images without annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        RuntimeError: If the 'test' split is requested, if no masks are found,
            or if the image belonging to a mask is missing.
    """
    if split == "test":
        raise RuntimeError(
            "The MorphoSeg test split does not contain segmentation masks - only images are available."
        )

    data_dir = get_morphoseg_data(path, split, download)
    mask_dir = os.path.join(data_dir, "masks")

    # glob yields an empty list for a missing folder too, so a single lookup covers both cases.
    label_paths = natsorted(glob(os.path.join(mask_dir, "*_mask.tif")))
    if not label_paths:
        raise RuntimeError(
            f"No mask files found in {mask_dir}. Check the dataset structure after downloading."
        )

    img_dir = os.path.join(data_dir, "training_dataset")

    raw_paths, missing = [], []
    for label_path in label_paths:
        stem = os.path.basename(label_path).replace("_mask.tif", "")
        candidate = os.path.join(img_dir, stem + ".tif")
        if not os.path.exists(candidate):
            # Some images are stored with the '.MP.tif' suffix instead.
            candidate = os.path.join(img_dir, stem + ".MP.tif")
            if not os.path.exists(candidate):
                missing.append(candidate)
        raw_paths.append(candidate)

    if missing:
        raise RuntimeError(
            f"{len(missing)} image file(s) not found for their masks. First missing: {missing[0]}"
        )

    return raw_paths, label_paths


def get_morphoseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: str = "train",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MorphoSeg dataset for bright-field NTERA-2 cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_morphoseg_paths(path, split, download)

    # Images and masks are individual TIF files, hence no internal dataset key is passed.
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )


def get_morphoseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: str = "train",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MorphoSeg dataloader for bright-field NTERA-2 cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_morphoseg_dataset(path, patch_shape, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_morphoseg_data(path: Union[os.PathLike, str], split: str, download: bool = False) -> str:
    """Download the MorphoSeg (NTERA-2) dataset.

    The images are extracted to `<path>/<split>`. For the 'train' split the polygon ROI
    annotations are fetched as well and converted to instance masks in `<path>/train/masks`.
    Steps whose outputs are already on disk are skipped, so a setup that was interrupted
    after the image extraction is completed by the next call.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    assert split in ("train", "test"), f"'{split}' is not a valid split. Choose 'train' or 'test'."

    data_dir = os.path.join(path, split)

    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        zip_path = os.path.join(path, f"{split}_dataset.zip")
        util.download_source(zip_path, URLS[split], download, checksum=CHECKSUMS[split])
        util.unzip(zip_path, data_dir)

    # Only the training images are annotated. The masks are checked separately from the
    # image folder, because the folder exists as soon as the images have been extracted.
    if split == "train" and not glob(os.path.join(data_dir, "masks", "*_mask.tif")):
        if not os.path.isdir(os.path.join(data_dir, "roi_jsons_combined")):
            roi_zip = os.path.join(path, "Training_ROIs_json.zip")
            util.download_source(roi_zip, URLS["rois"], download, checksum=CHECKSUMS["rois"])
            util.unzip(roi_zip, data_dir)
        _rois_to_masks(data_dir)

    return data_dir
Download the MorphoSeg (NTERA-2) dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split. Either 'train' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the extracted data directory.
def get_morphoseg_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the MorphoSeg (NTERA-2) data.

    NOTE: Only the training split has segmentation masks (36 annotated images).
    The test split contains images without annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        RuntimeError: If the 'test' split is requested, if no masks are found,
            or if the image belonging to a mask is missing.
    """
    if split == "test":
        raise RuntimeError(
            "The MorphoSeg test split does not contain segmentation masks - only images are available."
        )

    data_dir = get_morphoseg_data(path, split, download)
    mask_dir = os.path.join(data_dir, "masks")

    # glob yields an empty list for a missing folder too, so a single lookup covers both cases.
    label_paths = natsorted(glob(os.path.join(mask_dir, "*_mask.tif")))
    if not label_paths:
        raise RuntimeError(
            f"No mask files found in {mask_dir}. Check the dataset structure after downloading."
        )

    img_dir = os.path.join(data_dir, "training_dataset")

    raw_paths, missing = [], []
    for label_path in label_paths:
        stem = os.path.basename(label_path).replace("_mask.tif", "")
        candidate = os.path.join(img_dir, stem + ".tif")
        if not os.path.exists(candidate):
            # Some images are stored with the '.MP.tif' suffix instead.
            candidate = os.path.join(img_dir, stem + ".MP.tif")
            if not os.path.exists(candidate):
                missing.append(candidate)
        raw_paths.append(candidate)

    if missing:
        raise RuntimeError(
            f"{len(missing)} image file(s) not found for their masks. First missing: {missing[0]}"
        )

    return raw_paths, label_paths
Get paths to the MorphoSeg (NTERA-2) data.
NOTE: Only the training split has segmentation masks (36 annotated images). The test split contains images without annotations.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split. Either 'train' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def get_morphoseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: str = "train",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MorphoSeg dataset for bright-field NTERA-2 cell instance segmentation.

    The image and mask paths are resolved via `get_morphoseg_paths` and handed to
    `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_morphoseg_paths(path, split, download)

    # No internal keys are passed for the raw data and the labels.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
Get the MorphoSeg dataset for bright-field NTERA-2 cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split. Either 'train' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_morphoseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: str = "train",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MorphoSeg dataloader for bright-field NTERA-2 cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the remaining ones, which go to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_morphoseg_dataset(path, patch_shape, split, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs,
    )
Get the MorphoSeg dataloader for bright-field NTERA-2 cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split. Either 'train' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.