torch_em.data.datasets.light_microscopy.nuc_morph
The NucMorph dataset contains 3D fluorescence microscopy images of hiPSC nuclei with instance segmentation annotations.
NOTE: The annotation quality is inconsistent. Only the high-resolution regions around the middle slice are annotated well; elsewhere the annotations are of poor quality.
The dataset provides 410 paired 100x 3D images and watershed-based nuclear instance segmentation masks from human induced pluripotent stem cells (hiPSCs). It includes train (372), validation (20), and test (18) splits.
The dataset is located at https://open.quiltdata.com/b/allencell/tree/aics/nuc-morph-dataset/. This dataset is from the publication https://doi.org/10.1016/j.cels.2025.101265. Please cite it if you use this dataset in your research.
"""The NucMorph dataset contains 3D fluorescence microscopy images of hiPSC nuclei
with instance segmentation annotations.

NOTE: The annotation quality is inconsistent. Only the high-resolution regions
around the middle slice are annotated well; elsewhere the annotations are of poor quality.

The dataset provides 410 paired 100x 3D images and watershed-based nuclear instance
segmentation masks from human induced pluripotent stem cells (hiPSCs). It includes
train (372), validation (20), and test (18) splits.

The dataset is located at https://open.quiltdata.com/b/allencell/tree/aics/nuc-morph-dataset/.
This dataset is from the publication https://doi.org/10.1016/j.cels.2025.101265.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from typing import Union, Tuple, List, Literal

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# Base S3 prefix for the segmentation-decoder training FOV dataset on the Allen Cell bucket.
S3_BASE = (
    "https://allencell.s3.amazonaws.com/aics/nuc-morph-dataset/"
    "hipsc_nuclei_image_datasets_for_training_deep_learning_models/"
    "segmentation_decoder_training_fov_dataset"
)

NUM_FILES = 410
VALID_SPLITS = ["train", "val", "test"]


def _download_manifest(path, download=True):
    """Download the training data manifest CSV and return its local filepath.

    Args:
        path: Folder where the manifest is stored.
        download: Whether to download the manifest if it is not present.
            FIX: previously this was hard-coded to True, so the manifest was
            fetched even when the caller had requested no downloads.
    """
    manifest_path = os.path.join(path, "training_data_manifest.csv")
    if not os.path.exists(manifest_path):
        url = f"{S3_BASE}/training_data_manifest.csv"
        util.download_source(path=manifest_path, url=url, download=download, checksum=None)
    return manifest_path


def _get_split_indices(path, split, download=True):
    """Get file indices for a given split from the manifest."""
    import pandas as pd

    manifest_path = _download_manifest(path, download)
    df = pd.read_csv(manifest_path)

    # Map split names: manifest uses "valid" but we expose "val".
    manifest_split = "valid" if split == "val" else split
    # The first column of the manifest holds the file index.
    indices = df[df["mode"] == manifest_split].iloc[:, 0].tolist()
    return sorted(indices)


def _download_files(path, split, download):
    """Download raw and segmentation files for a given split."""
    from tqdm import tqdm

    raw_dir = os.path.join(path, "high_res_100x")
    seg_dir = os.path.join(path, "watershed_segmentation_100x")
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(seg_dir, exist_ok=True)

    # Thread the download flag through so that 'download=False' is respected.
    indices = _get_split_indices(path, split, download)

    for idx in tqdm(indices, desc=f"Downloading {split} data"):
        fname = f"IMG_{idx:04d}.tif"

        raw_path = os.path.join(raw_dir, fname)
        if not os.path.exists(raw_path):
            url = f"{S3_BASE}/high_res_100x/{fname}"
            util.download_source(path=raw_path, url=url, download=download, checksum=None)

        seg_path = os.path.join(seg_dir, fname)
        if not os.path.exists(seg_path):
            url = f"{S3_BASE}/watershed_segmentation_100x/{fname}"
            util.download_source(path=seg_path, url=url, download=download, checksum=None)


def _create_h5_data(path, split):
    """Create h5 files with raw images and nuclear instance labels.

    Returns the directory containing the created h5 files.
    """
    import h5py
    import imageio.v3 as imageio
    from tqdm import tqdm

    h5_dir = os.path.join(path, "h5_data", split)
    os.makedirs(h5_dir, exist_ok=True)

    # The manifest is guaranteed to be present here (downloaded by '_download_files').
    indices = _get_split_indices(path, split)

    for idx in tqdm(indices, desc=f"Creating h5 for '{split}'"):
        fname = f"IMG_{idx:04d}"
        h5_path = os.path.join(h5_dir, f"{fname}.h5")

        if os.path.exists(h5_path):
            continue

        raw_path = os.path.join(path, "high_res_100x", f"{fname}.tif")
        seg_path = os.path.join(path, "watershed_segmentation_100x", f"{fname}.tif")

        raw = imageio.imread(raw_path)
        seg = imageio.imread(seg_path)

        # Crop to the minimum shape along each axis to handle off-by-one mismatches
        # (weird one-pixel interpolation shifts across one axis).
        min_shape = tuple(min(r, s) for r, s in zip(raw.shape, seg.shape))
        raw = raw[:min_shape[0], :min_shape[1], :min_shape[2]]
        seg = seg[:min_shape[0], :min_shape[1], :min_shape[2]]

        with h5py.File(h5_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels", data=seg.astype("int64"), compression="gzip")

    return h5_dir


def get_nuc_morph_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> str:
    """Download the NucMorph dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."
    _download_files(path, split, download)
    return path


def get_nuc_morph_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the NucMorph data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.
    """
    from natsort import natsorted

    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."

    get_nuc_morph_data(path, split, download)

    # Convert the downloaded tifs to h5 if that has not happened yet.
    h5_dir = os.path.join(path, "h5_data", split)
    if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
        _create_h5_data(path, split)

    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
    assert len(h5_paths) > 0, f"No data found for split '{split}'"

    return h5_paths


def get_nuc_morph_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the NucMorph dataset for 3D nuclear instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_nuc_morph_paths(path, split, download)

    # Derive a binary target from the instance labels and ensure default transforms.
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True,
    )
    kwargs = util.ensure_transforms(ndim=3, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=3,
        **kwargs
    )


def get_nuc_morph_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMorph dataloader for 3D nuclear instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route kwargs either to the dataset constructor or to the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_nuc_morph_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
def get_nuc_morph_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> str:
    """Ensure that the NucMorph data for the requested split is available locally.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    # Validate the split before touching the filesystem.
    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."

    _download_files(path, split, download)

    return path
Download the NucMorph dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. One of 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the directory with the data.
def get_nuc_morph_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Collect the filepaths of the preprocessed NucMorph h5 volumes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.
    """
    from natsort import natsorted

    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."

    get_nuc_morph_data(path, split, download)

    data_dir = os.path.join(path, "h5_data", split)
    pattern = os.path.join(data_dir, "*.h5")
    # (Re-)create the h5 volumes whenever the directory is missing or empty.
    if not (os.path.exists(data_dir) and glob(pattern)):
        _create_h5_data(path, split)

    volume_paths = natsorted(glob(pattern))
    assert len(volume_paths) > 0, f"No data found for split '{split}'"

    return volume_paths
Get paths to the NucMorph data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. One of 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the h5 data.
def get_nuc_morph_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the NucMorph dataset for 3D nuclear instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_nuc_morph_paths(path, split, download)

    # Derive a binary target from the instance labels and fill in default transforms.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    kwargs = util.ensure_transforms(ndim=3, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths, raw_key="raw",
        label_paths=volume_paths, label_key="labels",
        patch_shape=patch_shape, ndim=3, **kwargs
    )
Get the NucMorph dataset for 3D nuclear instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_nuc_morph_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMorph dataloader for 3D nuclear instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route each keyword argument either to the dataset constructor or to the DataLoader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_nuc_morph_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs,
    )

    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **dataloader_kwargs)
Get the NucMorph dataloader for 3D nuclear instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.