torch_em.data.datasets.electron_microscopy.nisb
NISB is a large-scale synthetic benchmark for neuron instance segmentation in connectomics.
It comprises 9 settings with varying difficulty and imaging conditions, each providing 5 training cubes, 1 validation cube, and 1 test cube. The train_100 setting is an exception with 100 training cubes for scaling analysis. Cubes are 27µm side length at 9x9x20 nm voxel size (liconn: 9x9x12 nm). The multichannel setting stores 8-channel embeddings instead of a single grayscale image.
Data is streamed directly from S3 via s3fs and written to local zarr v3 stores (chunk 64^3, shard 512^3, zstd compression) with (z, y, x) axis order. The source is zarr v2 with (x, y, z) axis order; spatial axes are transposed and the trailing singleton channel dim on img is squeezed during the write. Requires s3fs (pip install s3fs).
The data is described in https://doi.org/10.17617/1.r2mm-1h33. Please cite it if you use this dataset for a publication.
"""NISB is a large-scale synthetic benchmark for neuron instance segmentation in connectomics.

It comprises 9 settings with varying difficulty and imaging conditions, each providing 5 training
cubes, 1 validation cube, and 1 test cube. The train_100 setting is an exception with 100 training
cubes for scaling analysis. Cubes are 27µm side length at 9x9x20 nm voxel size (liconn: 9x9x12 nm).
The multichannel setting stores 8-channel embeddings instead of a single grayscale image.

Data is streamed directly from S3 via s3fs and written to local zarr v3 stores (chunk 64^3,
shard 512^3, zstd compression) with (z, y, x) axis order. The source is zarr v2 with (x, y, z)
axis order; spatial axes are transposed and the trailing singleton channel dim on img is squeezed
during the write. Requires s3fs (pip install s3fs).

The data is described in https://doi.org/10.17617/1.r2mm-1h33.
Please cite it if you use this dataset for a publication.
"""

import os
import shutil
import warnings
from glob import glob
from typing import List, Literal, Optional, Tuple, Union

import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

import torch_em
from .. import util


NISB_S3_ENDPOINT = "https://s3.nexus.mpcdf.mpg.de:443"
NISB_S3_BUCKET = "nisb"

NISB_SETTINGS = [
    "base", "train_100", "slice_perturbed", "pos_guidance", "neg_guidance",
    "no_touch_thick", "touching_thin", "liconn", "multichannel",
]

NISB_CHUNK_SHAPE = (64, 64, 64)
NISB_SHARD_SHAPE = (512, 512, 512)


def _nisb_n_seeds(setting: str, split: str) -> int:
    """Return the number of seed cubes for a setting/split.

    The 'val' and 'test' splits have a single cube; the training split has 100 cubes
    for the 'train_100' setting and 5 cubes otherwise.
    """
    if split in ("val", "test"):
        return 1
    return 100 if setting == "train_100" else 5


def _nisb_zarr_complete(zarr_path: str) -> bool:
    """Check whether a local store has a 'zarr.json' file and 'img' and 'seg' directories."""
    return (
        os.path.isfile(os.path.join(zarr_path, "zarr.json"))
        and os.path.isdir(os.path.join(zarr_path, "img"))
        and os.path.isdir(os.path.join(zarr_path, "seg"))
    )


def _nisb_seed_sort_key(zarr_path: str) -> Tuple[int, int, str]:
    """Sort key that orders 'seed<N>/data.zarr' paths by the numeric seed index N."""
    seed_name = os.path.basename(os.path.dirname(zarr_path))
    index = seed_name[len("seed"):]
    # Directories without a purely numeric suffix are placed after the numbered seeds.
    if index.isdecimal():
        return (0, int(index), seed_name)
    return (1, 0, seed_name)


def _nisb_create_v3_array(root, name, shape, dtype, is_label):
    """Create a sharded, Blosc/zstd-compressed array called `name` in the group `root`.

    The chunk and shard shapes are the module-level 3D shapes, extended by any trailing
    (non-spatial) dimensions of `shape`. Integer label arrays use bit-shuffling, all
    other arrays use byte-shuffling.
    """
    from zarr.codecs import BloscCodec
    shuffle = "bitshuffle" if (np.issubdtype(np.dtype(dtype), np.integer) and is_label) else "shuffle"
    chunks = NISB_CHUNK_SHAPE + tuple(shape[3:])
    shards = NISB_SHARD_SHAPE + tuple(shape[3:])
    return root.create_array(
        name, shape=shape, chunks=chunks, shards=shards, dtype=dtype,
        compressors=BloscCodec(cname="zstd", clevel=6, shuffle=shuffle),
    )


def _nisb_write_cube_v3(src, v3_path: str) -> None:
    """Stream a NISB cube from a zarr v2 source to a local zarr v3 store.

    Transposes axes from (x, y, z) to (z, y, x) and squeezes the trailing singleton
    channel dimension on the image array. The image is written as uint8 and the
    segmentation as uint16.

    The data is first written to `v3_path + ".tmp"` and only moved to `v3_path` once all
    blocks have been copied, so an interrupted transfer never leaves a store at `v3_path`
    that looks complete. Anything already present at `v3_path` is replaced.

    Args:
        src: The opened source group, providing the arrays 'img' and 'seg'.
        v3_path: The filepath of the local zarr v3 store to create.
    """
    import zarr

    img_v2 = src["img"]
    seg_v2 = src["seg"]

    squeeze_img = img_v2.ndim == 4 and img_v2.shape[-1] == 1
    if squeeze_img:
        img_shape_v3 = (img_v2.shape[2], img_v2.shape[1], img_v2.shape[0])
    else:
        img_shape_v3 = (img_v2.shape[2], img_v2.shape[1], img_v2.shape[0], img_v2.shape[3])
    seg_shape_v3 = (seg_v2.shape[2], seg_v2.shape[1], seg_v2.shape[0])

    # Discard the leftovers of a previously interrupted transfer.
    tmp_path = v3_path + ".tmp"
    if os.path.exists(tmp_path):
        shutil.rmtree(tmp_path)

    root = zarr.open_group(tmp_path, mode="w", zarr_format=3)
    img_v3 = _nisb_create_v3_array(root, "img", img_shape_v3, np.dtype("uint8"), False)
    seg_v3 = _nisb_create_v3_array(root, "seg", seg_shape_v3, np.dtype("uint16"), True)

    # Copy shard-sized blocks, so that each write covers whole shards of the output arrays.
    Z, Y, X = seg_shape_v3
    sz, sy, sx = NISB_SHARD_SHAPE
    for z0 in range(0, Z, sz):
        for y0 in range(0, Y, sy):
            for x0 in range(0, X, sx):
                z1, y1, x1 = min(z0 + sz, Z), min(y0 + sy, Y), min(x0 + sx, X)
                block_img = np.asarray(img_v2[x0:x1, y0:y1, z0:z1])
                if squeeze_img:
                    block_img = block_img[..., 0]
                # Swap the first and third axis; a trailing channel axis stays in place.
                img_v3[z0:z1, y0:y1, x0:x1] = np.moveaxis(block_img, [0, 2], [2, 0])
                block_seg = np.asarray(seg_v2[x0:x1, y0:y1, z0:z1])
                seg_v3[z0:z1, y0:y1, x0:x1] = block_seg.transpose(2, 1, 0)

    # shutil.move places the source *inside* an existing destination directory, which
    # would leave the store permanently incomplete. Remove any leftover at v3_path first.
    if os.path.isdir(v3_path) and not os.path.islink(v3_path):
        shutil.rmtree(v3_path)
    elif os.path.lexists(v3_path):
        os.remove(v3_path)
    shutil.move(tmp_path, v3_path)


def _nisb_open_remote(setting: str, split: str, seed_idx: int):
    """Open a NISB seed cube from S3 as a zarr v2 group via s3fs.

    Raises:
        ImportError: If the 's3fs' package is not installed.
    """
    try:
        import s3fs
    except ImportError as err:
        raise ImportError(
            "The 's3fs' package is required to download NISB data. Install it with: pip install s3fs"
        ) from err
    import zarr

    fs = s3fs.S3FileSystem(anon=True, endpoint_url=NISB_S3_ENDPOINT)
    s3_path = f"{NISB_S3_BUCKET}/{setting}/{split}/seed{seed_idx}/data.zarr"
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=".*asynchronous.*")
        store = zarr.storage.FsspecStore(fs=fs, path=s3_path)
    return zarr.open_group(store, mode="r", zarr_format=2)


def get_nisb_data(path: Union[os.PathLike, str], setting: str, split: str, download: bool) -> str:
    """Stream and cache NISB data for a given setting and split from S3.

    Data is read from S3 via s3fs and written to local zarr v3 stores with (z, y, x) axis
    order, sharding (chunk 64^3, shard 512^3), and zstd compression. Already-cached seeds
    are skipped on subsequent calls.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        setting: The NISB setting. One of NISB_SETTINGS.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.

    Returns:
        The filepath to the split directory containing seed subdirectories.

    Raises:
        RuntimeError: If a seed is not cached locally and `download` is False.
    """
    assert setting in NISB_SETTINGS, f"Invalid setting '{setting}'. Choose from {NISB_SETTINGS}."
    assert split in ("train", "val", "test"), f"Invalid split '{split}'. Choose 'train', 'val', or 'test'."

    split_dir = os.path.join(str(path), setting, split)
    n = _nisb_n_seeds(setting, split)

    for i in tqdm(range(n), desc=f"NISB {setting}/{split}", leave=False):
        seed_dir = os.path.join(split_dir, f"seed{i}")
        zarr_path = os.path.join(seed_dir, "data.zarr")

        if _nisb_zarr_complete(zarr_path):
            continue

        if not download:
            raise RuntimeError(
                f"No NISB data for setting '{setting}' split '{split}' seed {i} at '{zarr_path}'. "
                "Set download=True to stream it from S3."
            )

        os.makedirs(seed_dir, exist_ok=True)
        print(f"Streaming NISB {setting}/{split}/seed{i} from S3 ...")
        src = _nisb_open_remote(setting, split, i)
        _nisb_write_cube_v3(src, zarr_path)

    return split_dir


def get_nisb_paths(
    path: Union[os.PathLike, str],
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to NISB zarr stores for a given setting and split.

    Args:
        path: Filepath to a folder where the cached data is saved.
        setting: The NISB setting. One of NISB_SETTINGS.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.

    Returns:
        List of filepaths to the zarr stores, one per cube/seed, ordered by seed index.

    Raises:
        RuntimeError: If no zarr store is found in the split directory.
    """
    split_dir = get_nisb_data(path, setting, split, download)
    # Sort numerically: a plain string sort would put 'seed10' before 'seed2' for 'train_100'.
    paths = sorted(glob(os.path.join(split_dir, "seed*", "data.zarr")), key=_nisb_seed_sort_key)
    if not paths:
        raise RuntimeError(
            f"No zarr files found in '{split_dir}'. The download may have failed or the directory is empty."
        )
    return paths


def get_nisb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the NISB dataset for neuron instance segmentation in EM.

    NISB provides 9 settings of varying difficulty, each with multiple cubes at 27µm side length.
    Image data is stored under the zarr key 'img' with shape (z, y, x) and segmentation under 'seg'.
    The multichannel setting stores 8-channel data with shape (z, y, x, 8).

    Args:
        path: Filepath to a folder where the cached data will be saved.
        patch_shape: The patch shape to use for training.
        setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.
            Requires s3fs (pip install s3fs).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    paths = get_nisb_paths(path, setting, split, download)

    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=paths,
        raw_key="img",
        label_paths=paths,
        label_key="seg",
        patch_shape=patch_shape,
        **kwargs,
    )


def get_nisb_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for neuron instance segmentation in the NISB dataset.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.
            Requires s3fs (pip install s3fs).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_nisb_dataset(
        path=path,
        patch_shape=patch_shape,
        setting=setting,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
def get_nisb_data(path: Union[os.PathLike, str], setting: str, split: str, download: bool) -> str:
    """Make sure all cubes of a NISB setting/split are cached locally, streaming them from S3 if needed.

    Each cube is expected at `<path>/<setting>/<split>/seed<i>/data.zarr`. Cubes whose local
    store is already complete are left untouched; missing cubes are opened remotely and
    written to a local zarr v3 store.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        setting: The NISB setting. One of NISB_SETTINGS.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.

    Returns:
        The filepath to the split directory containing seed subdirectories.

    Raises:
        RuntimeError: If a cube is not cached locally and `download` is False.
    """
    assert setting in NISB_SETTINGS, f"Invalid setting '{setting}'. Choose from {NISB_SETTINGS}."
    assert split in ("train", "val", "test"), f"Invalid split '{split}'. Choose 'train', 'val', or 'test'."

    split_root = os.path.join(str(path), setting, split)
    seed_indices = range(_nisb_n_seeds(setting, split))

    for seed_idx in tqdm(seed_indices, desc=f"NISB {setting}/{split}", leave=False):
        cube_dir = os.path.join(split_root, f"seed{seed_idx}")
        store_path = os.path.join(cube_dir, "data.zarr")

        if not _nisb_zarr_complete(store_path):
            # The cube is not cached yet: either fail loudly or fetch it.
            if not download:
                raise RuntimeError(
                    f"No NISB data for setting '{setting}' split '{split}' seed {seed_idx} at '{store_path}'. "
                    "Set download=True to stream it from S3."
                )

            os.makedirs(cube_dir, exist_ok=True)
            print(f"Streaming NISB {setting}/{split}/seed{seed_idx} from S3 ...")
            remote_group = _nisb_open_remote(setting, split, seed_idx)
            _nisb_write_cube_v3(remote_group, store_path)

    return split_root
Stream and cache NISB data for a given setting and split from S3.
Data is read from S3 via s3fs and written to local zarr v3 stores with (z, y, x) axis order, sharding (chunk 64^3, shard 512^3), and zstd compression. Already-cached seeds are skipped on subsequent calls.
Arguments:
- path: Filepath to a folder where the cached data will be saved.
- setting: The NISB setting. One of NISB_SETTINGS.
- split: The data split, one of 'train', 'val', 'test'.
- download: Whether to stream and cache the data if it is not present.
Returns:
The filepath to the split directory containing seed subdirectories.
def _nisb_seed_sort_key(zarr_path: str) -> Tuple[int, int, str]:
    """Sort key that orders 'seed<N>/data.zarr' paths by the numeric seed index N."""
    seed_name = os.path.basename(os.path.dirname(zarr_path))
    index = seed_name[len("seed"):]
    # Directories without a purely numeric suffix are placed after the numbered seeds.
    if index.isdecimal():
        return (0, int(index), seed_name)
    return (1, 0, seed_name)


def get_nisb_paths(
    path: Union[os.PathLike, str],
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to NISB zarr stores for a given setting and split.

    Args:
        path: Filepath to a folder where the cached data is saved.
        setting: The NISB setting. One of NISB_SETTINGS.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.

    Returns:
        List of filepaths to the zarr stores, one per cube/seed, ordered by seed index.

    Raises:
        RuntimeError: If no zarr store is found in the split directory.
    """
    split_dir = get_nisb_data(path, setting, split, download)
    # Sort numerically: a plain string sort would put 'seed10' before 'seed2', which
    # scrambles the cube order as soon as a split has more than ten seeds.
    paths = sorted(glob(os.path.join(split_dir, "seed*", "data.zarr")), key=_nisb_seed_sort_key)
    if not paths:
        raise RuntimeError(
            f"No zarr files found in '{split_dir}'. The download may have failed or the directory is empty."
        )
    return paths
Get paths to NISB zarr stores for a given setting and split.
Arguments:
- path: Filepath to a folder where the cached data is saved.
- setting: The NISB setting. One of NISB_SETTINGS.
- split: The data split, one of 'train', 'val', 'test'.
- download: Whether to stream and cache the data if it is not present.
Returns:
Sorted list of filepaths to the zarr stores, one per cube/seed.
def get_nisb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Build a segmentation dataset from the cached NISB cubes of one setting and split.

    Every cube is a zarr store that serves both as raw source (key 'img') and as
    label source (key 'seg'). The label transform is configured from `offsets` and
    `boundaries`.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.
            Requires s3fs (pip install s3fs).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    zarr_stores = get_nisb_paths(path, setting, split, download)

    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels live in the same stores, under different keys.
    source_args = dict(
        raw_paths=zarr_stores,
        raw_key="img",
        label_paths=zarr_stores,
        label_key="seg",
        patch_shape=patch_shape,
    )
    return torch_em.default_segmentation_dataset(**source_args, **kwargs)
Get the NISB dataset for neuron instance segmentation in EM.
NISB provides 9 settings of varying difficulty, each with multiple cubes at 27µm side length. Image data is stored under the zarr key 'img' with shape (z, y, x) and segmentation under 'seg'. The multichannel setting stores 8-channel data with shape (z, y, x, 8).
Arguments:
- path: Filepath to a folder where the cached data will be saved.
- patch_shape: The patch shape to use for training.
- setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
- split: The data split, one of 'train', 'val', 'test'.
- download: Whether to stream and cache the data if it is not present. Requires s3fs (pip install s3fs).
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_nisb_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Create a DataLoader over the NISB dataset of one setting and split.

    The extra keyword arguments are divided into those accepted by
    `torch_em.default_segmentation_dataset` and the remaining ones, which are
    forwarded to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.
            Requires s3fs (pip install s3fs).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_nisb_dataset(
        path=path, patch_shape=patch_shape, setting=setting, split=split, download=download,
        offsets=offsets, boundaries=boundaries, **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
Get the DataLoader for neuron instance segmentation in the NISB dataset.
Arguments:
- path: Filepath to a folder where the cached data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
- split: The data split, one of 'train', 'val', 'test'.
- download: Whether to stream and cache the data if it is not present. Requires s3fs (pip install s3fs).
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.