torch_em.data.datasets.light_microscopy.nuc_morph

The NucMorph dataset contains 3D fluorescence microscopy images of hiPSC nuclei with instance segmentation annotations.

NOTE: The annotation quality is limited. Only the high-resolution regions around the middle slice are annotated reliably; elsewhere, the annotations are poor.

The dataset provides 410 paired 100x 3D images and watershed-based nuclear instance segmentation masks from human induced pluripotent stem cells (hiPSCs). It includes train (372), validation (20), and test (18) splits.

The dataset is located at https://open.quiltdata.com/b/allencell/tree/aics/nuc-morph-dataset/. This dataset is from the publication https://doi.org/10.1016/j.cels.2025.101265. Please cite it if you use this dataset in your research.
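
A minimal usage sketch (the download folder and patch shape below are illustrative placeholders, not values prescribed by the dataset):

    from torch_em.data.datasets.light_microscopy.nuc_morph import get_nuc_morph_loader

    # Downloads the train split on first use and yields batches of 3D patches.
    loader = get_nuc_morph_loader(
        path="./data/nuc_morph",     # placeholder download folder
        batch_size=1,
        patch_shape=(32, 256, 256),  # illustrative ZYX patch shape
        split="train",
        download=True,
    )
    raw, labels = next(iter(loader))
    print(raw.shape, labels.shape)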

  1"""The NucMorph dataset contains 3D fluorescence microscopy images of hiPSC nuclei
  2with instance segmentation annotations.
  3
  4NOTE: The annotations are super strange and bad. Only the high-resolution regions
  5around the middle slice are marked well. Otherwise, the annotations are like bad.
  6
  7The dataset provides 410 paired 100x 3D images and watershed-based nuclear instance
  8segmentation masks from human induced pluripotent stem cells (hiPSCs). It includes
  9train (372), validation (20), and test (18) splits.
 10
 11The dataset is located at https://open.quiltdata.com/b/allencell/tree/aics/nuc-morph-dataset/.
 12This dataset is from the publication https://doi.org/10.1016/j.cels.2025.101265.
 13Please cite it if you use this dataset in your research.
 14"""

import os
from glob import glob
from typing import Union, Tuple, List, Literal

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


S3_BASE = (
    "https://allencell.s3.amazonaws.com/aics/nuc-morph-dataset/"
    "hipsc_nuclei_image_datasets_for_training_deep_learning_models/"
    "segmentation_decoder_training_fov_dataset"
)

NUM_FILES = 410
VALID_SPLITS = ["train", "val", "test"]


def _download_manifest(path):
    """Download the training data manifest CSV."""
    manifest_path = os.path.join(path, "training_data_manifest.csv")
    if not os.path.exists(manifest_path):
        url = f"{S3_BASE}/training_data_manifest.csv"
        util.download_source(path=manifest_path, url=url, download=True, checksum=None)
    return manifest_path


def _get_split_indices(path, split):
    """Get file indices for a given split from the manifest."""
    import pandas as pd

    manifest_path = _download_manifest(path)
    df = pd.read_csv(manifest_path)

    # Map split names: manifest uses "valid" but we expose "val".
    manifest_split = "valid" if split == "val" else split
    indices = df[df["mode"] == manifest_split].iloc[:, 0].tolist()
    return sorted(indices)


def _download_files(path, split, download):
    """Download raw and segmentation files for a given split."""
    from tqdm import tqdm

    raw_dir = os.path.join(path, "high_res_100x")
    seg_dir = os.path.join(path, "watershed_segmentation_100x")
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(seg_dir, exist_ok=True)

    indices = _get_split_indices(path, split)

    for idx in tqdm(indices, desc=f"Downloading {split} data"):
        fname = f"IMG_{idx:04d}.tif"

        raw_path = os.path.join(raw_dir, fname)
        if not os.path.exists(raw_path):
            url = f"{S3_BASE}/high_res_100x/{fname}"
            util.download_source(path=raw_path, url=url, download=download, checksum=None)

        seg_path = os.path.join(seg_dir, fname)
        if not os.path.exists(seg_path):
            url = f"{S3_BASE}/watershed_segmentation_100x/{fname}"
            util.download_source(path=seg_path, url=url, download=download, checksum=None)


def _create_h5_data(path, split):
    """Create h5 files with raw images and nuclear instance labels."""
    import h5py
    import imageio.v3 as imageio
    from tqdm import tqdm

    h5_dir = os.path.join(path, "h5_data", split)
    os.makedirs(h5_dir, exist_ok=True)

    indices = _get_split_indices(path, split)

    for idx in tqdm(indices, desc=f"Creating h5 for '{split}'"):
        fname = f"IMG_{idx:04d}"
        h5_path = os.path.join(h5_dir, f"{fname}.h5")

        if os.path.exists(h5_path):
            continue

        raw_path = os.path.join(path, "high_res_100x", f"{fname}.tif")
        seg_path = os.path.join(path, "watershed_segmentation_100x", f"{fname}.tif")

        raw = imageio.imread(raw_path)
        seg = imageio.imread(seg_path)

        # Crop to the minimum shape along each axis to handle off-by-one mismatches
        # (one-pixel interpolation shifts along a single axis).
        min_shape = tuple(min(r, s) for r, s in zip(raw.shape, seg.shape))
        raw = raw[:min_shape[0], :min_shape[1], :min_shape[2]]
        seg = seg[:min_shape[0], :min_shape[1], :min_shape[2]]

        with h5py.File(h5_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels", data=seg.astype("int64"), compression="gzip")

    return h5_dir


def get_nuc_morph_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> str:
    """Download the NucMorph dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."
    _download_files(path, split, download)
    return path


def get_nuc_morph_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the NucMorph data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.
    """
    from natsort import natsorted

    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."

    get_nuc_morph_data(path, split, download)

    h5_dir = os.path.join(path, "h5_data", split)
    if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
        _create_h5_data(path, split)

    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
    assert len(h5_paths) > 0, f"No data found for split '{split}'"

    return h5_paths


def get_nuc_morph_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the NucMorph dataset for 3D nuclear instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_nuc_morph_paths(path, split, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True,
    )
    kwargs = util.ensure_transforms(ndim=3, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=3,
        **kwargs
    )


def get_nuc_morph_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMorph dataloader for 3D nuclear instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_nuc_morph_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)

S3_BASE = 'https://allencell.s3.amazonaws.com/aics/nuc-morph-dataset/hipsc_nuclei_image_datasets_for_training_deep_learning_models/segmentation_decoder_training_fov_dataset'
NUM_FILES = 410
VALID_SPLITS = ['train', 'val', 'test']
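
As a point of reference, `_download_files` in the source above resolves individual files against S3_BASE; a sketch with an illustrative file index:

    # Raw image and matching watershed segmentation for file index 0 (illustrative).
    raw_url = f"{S3_BASE}/high_res_100x/IMG_0000.tif"
    seg_url = f"{S3_BASE}/watershed_segmentation_100x/IMG_0000.tif"
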
def get_nuc_morph_data(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'] = 'train', download: bool = False) -> str:

Download the NucMorph dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. One of 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:
  • The filepath to the directory with the data.
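
A short sketch of downloading a single split (the target folder is a placeholder):

    # Fetches the manifest plus all raw and segmentation tifs for the test split.
    data_dir = get_nuc_morph_data("./data/nuc_morph", split="test", download=True)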

def get_nuc_morph_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'] = 'train', download: bool = False) -> List[str]:

Get paths to the NucMorph data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. One of 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:
  • List of filepaths for the h5 data.
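
A sketch of inspecting the converted h5 files (the folder is a placeholder; the "raw" and "labels" keys match those written by `_create_h5_data` in the source above):

    import h5py

    paths = get_nuc_morph_paths("./data/nuc_morph", split="val", download=True)
    with h5py.File(paths[0], "r") as f:
        # Image and instance labels are cropped to a common shape during conversion.
        print(f["raw"].shape, f["labels"].shape)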

def get_nuc_morph_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], split: Literal['train', 'val', 'test'] = 'train', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the NucMorph dataset for 3D nuclear instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. One of 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  • The segmentation dataset.
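
A sketch of constructing the dataset directly (path and patch shape are placeholder values):

    dataset = get_nuc_morph_dataset(
        path="./data/nuc_morph",
        patch_shape=(32, 256, 256),
        split="train",
        download=True,
    )
    print(len(dataset))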

def get_nuc_morph_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int, int], split: Literal['train', 'val', 'test'] = 'train', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the NucMorph dataloader for 3D nuclear instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. One of 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  • The DataLoader.
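
Because `util.split_kwargs` routes extra keyword arguments either to `torch_em.default_segmentation_dataset` or to the PyTorch DataLoader, loader options such as num_workers and shuffle can be passed straight through; a sketch with placeholder values:

    loader = get_nuc_morph_loader(
        path="./data/nuc_morph",
        batch_size=2,
        patch_shape=(32, 256, 256),
        split="train",
        download=True,
        num_workers=4,  # forwarded to the PyTorch DataLoader
        shuffle=True,   # forwarded to the PyTorch DataLoader
    )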