torch_em.data.datasets.electron_microscopy.nisb

NISB is a large-scale synthetic benchmark for neuron instance segmentation in connectomics.

It comprises 9 settings with varying difficulty and imaging conditions, each providing 5 training cubes, 1 validation cube, and 1 test cube. The train_100 setting is an exception with 100 training cubes for scaling analysis. Cubes are 27µm side length at 9x9x20 nm voxel size (liconn: 9x9x12 nm). The multichannel setting stores 8-channel embeddings instead of a single grayscale image.

Data is streamed directly from S3 via s3fs and written to local zarr v3 stores (chunk 64^3, shard 512^3, zstd compression) with (z, y, x) axis order. The source is zarr v2 with (x, y, z) axis order; spatial axes are transposed and the trailing singleton channel dim on img is squeezed during the write. Requires s3fs (pip install s3fs).

The data is described in https://doi.org/10.17617/1.r2mm-1h33. Please cite it if you use this dataset for a publication.

  1"""NISB is a large-scale synthetic benchmark for neuron instance segmentation in connectomics.
  2
  3It comprises 9 settings with varying difficulty and imaging conditions, each providing 5 training
  4cubes, 1 validation cube, and 1 test cube. The train_100 setting is an exception with 100 training
  5cubes for scaling analysis. Cubes are 27µm side length at 9x9x20 nm voxel size (liconn: 9x9x12 nm).
  6The multichannel setting stores 8-channel embeddings instead of a single grayscale image.
  7
  8Data is streamed directly from S3 via s3fs and written to local zarr v3 stores (chunk 64^3,
  9shard 512^3, zstd compression) with (z, y, x) axis order. The source is zarr v2 with (x, y, z)
 10axis order; spatial axes are transposed and the trailing singleton channel dim on img is squeezed
 11during the write. Requires s3fs (pip install s3fs).
 12
 13The data is described in https://doi.org/10.17617/1.r2mm-1h33.
 14Please cite it if you use this dataset for a publication.
 15"""
 16
 17import os
 18import shutil
 19import warnings
 20from glob import glob
 21from typing import List, Literal, Optional, Tuple, Union
 22
 23import numpy as np
 24from tqdm import tqdm
 25from torch.utils.data import DataLoader, Dataset
 26
 27import torch_em
 28from .. import util
 29
 30
 31NISB_S3_ENDPOINT = "https://s3.nexus.mpcdf.mpg.de:443"
 32NISB_S3_BUCKET = "nisb"
 33
 34NISB_SETTINGS = [
 35    "base", "train_100", "slice_perturbed", "pos_guidance", "neg_guidance",
 36    "no_touch_thick", "touching_thin", "liconn", "multichannel",
 37]
 38
 39NISB_CHUNK_SHAPE = (64, 64, 64)
 40NISB_SHARD_SHAPE = (512, 512, 512)
 41
 42
 43def _nisb_n_seeds(setting: str, split: str) -> int:
 44    if split in ("val", "test"):
 45        return 1
 46    return 100 if setting == "train_100" else 5
 47
 48
 49def _nisb_zarr_complete(zarr_path: str) -> bool:
 50    return (
 51        os.path.isfile(os.path.join(zarr_path, "zarr.json"))
 52        and os.path.isdir(os.path.join(zarr_path, "img"))
 53        and os.path.isdir(os.path.join(zarr_path, "seg"))
 54    )
 55
 56
 57def _nisb_create_v3_array(root, name, shape, dtype, is_label):
 58    from zarr.codecs import BloscCodec
 59    shuffle = "bitshuffle" if (np.issubdtype(np.dtype(dtype), np.integer) and is_label) else "shuffle"
 60    chunks = NISB_CHUNK_SHAPE + tuple(shape[3:])
 61    shards = NISB_SHARD_SHAPE + tuple(shape[3:])
 62    return root.create_array(
 63        name, shape=shape, chunks=chunks, shards=shards, dtype=dtype,
 64        compressors=BloscCodec(cname="zstd", clevel=6, shuffle=shuffle),
 65    )
 66
 67
 68def _nisb_write_cube_v3(src, v3_path: str) -> None:
 69    """Stream a NISB cube from a zarr v2 source to a local zarr v3 store.
 70
 71    Transposes axes from (x, y, z) to (z, y, x) and squeezes the trailing singleton
 72    channel dimension on the image array.
 73    """
 74    import zarr
 75
 76    img_v2 = src["img"]
 77    seg_v2 = src["seg"]
 78
 79    squeeze_img = img_v2.ndim == 4 and img_v2.shape[-1] == 1
 80    if squeeze_img:
 81        img_shape_v3 = (img_v2.shape[2], img_v2.shape[1], img_v2.shape[0])
 82    else:
 83        img_shape_v3 = (img_v2.shape[2], img_v2.shape[1], img_v2.shape[0], img_v2.shape[3])
 84    seg_shape_v3 = (seg_v2.shape[2], seg_v2.shape[1], seg_v2.shape[0])
 85
 86    tmp_path = v3_path + ".tmp"
 87    if os.path.exists(tmp_path):
 88        shutil.rmtree(tmp_path)
 89
 90    root = zarr.open_group(tmp_path, mode="w", zarr_format=3)
 91    img_v3 = _nisb_create_v3_array(root, "img", img_shape_v3, np.dtype("uint8"), False)
 92    seg_v3 = _nisb_create_v3_array(root, "seg", seg_shape_v3, np.dtype("uint16"), True)
 93
 94    Z, Y, X = seg_shape_v3
 95    sz, sy, sx = NISB_SHARD_SHAPE
 96    for z0 in range(0, Z, sz):
 97        for y0 in range(0, Y, sy):
 98            for x0 in range(0, X, sx):
 99                z1, y1, x1 = min(z0 + sz, Z), min(y0 + sy, Y), min(x0 + sx, X)
100                block_img = np.asarray(img_v2[x0:x1, y0:y1, z0:z1])
101                if squeeze_img:
102                    block_img = block_img[..., 0]
103                img_v3[z0:z1, y0:y1, x0:x1] = np.moveaxis(block_img, [0, 2], [2, 0])
104                block_seg = np.asarray(seg_v2[x0:x1, y0:y1, z0:z1])
105                seg_v3[z0:z1, y0:y1, x0:x1] = block_seg.transpose(2, 1, 0)
106
107    shutil.move(tmp_path, v3_path)
108
109
def _nisb_open_remote(setting: str, split: str, seed_idx: int):
    """Open one remote NISB seed cube read-only as a zarr v2 group.

    Args:
        setting: The NISB setting, used as the first path component below the bucket.
        split: The data split, used as the second path component.
        seed_idx: The index of the seed cube.

    Returns:
        The zarr group opened on top of an anonymous s3fs filesystem.

    Raises:
        ImportError: If the 's3fs' package is not installed.
    """
    try:
        import s3fs
    except ImportError:
        raise ImportError("The 's3fs' package is required to download NISB data. Install it with: pip install s3fs")
    import zarr

    remote_fs = s3fs.S3FileSystem(anon=True, endpoint_url=NISB_S3_ENDPOINT)
    seed_prefix = f"{NISB_S3_BUCKET}/{setting}/{split}/seed{seed_idx}"
    cube_path = f"{seed_prefix}/data.zarr"

    with warnings.catch_warnings():
        # Suppress warnings whose message mentions "asynchronous" while opening the store.
        warnings.filterwarnings("ignore", message=".*asynchronous.*")
        remote_store = zarr.storage.FsspecStore(fs=remote_fs, path=cube_path)
        remote_group = zarr.open_group(remote_store, mode="r", zarr_format=2)
    return remote_group
124
125
def get_nisb_data(path: Union[os.PathLike, str], setting: str, split: str, download: bool) -> str:
    """Make sure all seed cubes of a NISB setting and split are cached locally.

    Each seed is stored at `<path>/<setting>/<split>/seed<i>/data.zarr`. Seeds whose
    store is already complete are skipped; missing ones are streamed from S3 and
    written as local zarr v3 stores with (z, y, x) axis order, sharding
    (chunk 64^3, shard 512^3), and zstd compression.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        setting: The NISB setting. One of NISB_SETTINGS.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.

    Returns:
        The filepath to the split directory containing seed subdirectories.

    Raises:
        RuntimeError: If a seed is not cached and `download` is False.
    """
    assert setting in NISB_SETTINGS, f"Invalid setting '{setting}'. Choose from {NISB_SETTINGS}."
    assert split in ("train", "val", "test"), f"Invalid split '{split}'. Choose 'train', 'val', or 'test'."

    split_dir = os.path.join(str(path), setting, split)
    seed_ids = range(_nisb_n_seeds(setting, split))

    for seed_id in tqdm(seed_ids, desc=f"NISB {setting}/{split}", leave=False):
        seed_dir = os.path.join(split_dir, f"seed{seed_id}")
        zarr_path = os.path.join(seed_dir, "data.zarr")

        if not _nisb_zarr_complete(zarr_path):
            if not download:
                raise RuntimeError(
                    f"No NISB data for setting '{setting}' split '{split}' seed {seed_id} at '{zarr_path}'. "
                    "Set download=True to stream it from S3."
                )

            os.makedirs(seed_dir, exist_ok=True)
            print(f"Streaming NISB {setting}/{split}/seed{seed_id} from S3 ...")
            remote_cube = _nisb_open_remote(setting, split, seed_id)
            _nisb_write_cube_v3(remote_cube, zarr_path)

    return split_dir
167
168
def get_nisb_paths(
    path: Union[os.PathLike, str],
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to NISB zarr stores for a given setting and split.

    Args:
        path: Filepath to a folder where the cached data is saved.
        setting: The NISB setting. One of NISB_SETTINGS.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.

    Returns:
        List of filepaths to the zarr stores, one per cube/seed, ordered by seed
        index (so 'seed2' comes before 'seed10').

    Raises:
        RuntimeError: If no zarr store is found in the split directory.
    """
    split_dir = get_nisb_data(path, setting, split, download)

    def _seed_order(zarr_path: str):
        # Sort by the integer after 'seed'. A plain string sort would put 'seed10'
        # before 'seed2'. Folders without a numeric suffix are sorted by name, last.
        seed_name = os.path.basename(os.path.dirname(zarr_path))
        suffix = seed_name[len("seed"):]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), seed_name)
        return (1, 0, seed_name)

    paths = sorted(glob(os.path.join(split_dir, "seed*", "data.zarr")), key=_seed_order)
    if not paths:
        raise RuntimeError(
            f"No zarr files found in '{split_dir}'. The download may have failed or the directory is empty."
        )
    return paths
193
194
def get_nisb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the NISB dataset for neuron instance segmentation in EM.

    Every cached zarr store of the chosen setting and split serves as both the raw
    source (key 'img') and the label source (key 'seg'). Image data has the shape
    (z, y, x); the multichannel setting stores 8-channel data with shape (z, y, x, 8).

    Args:
        path: Filepath to a folder where the cached data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.
            Requires s3fs (pip install s3fs).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    zarr_paths = get_nisb_paths(path, setting, split, download)

    # Force `is_seg_dataset=True`, then let the helper register the label transform
    # that matches the requested target (boundaries and / or offsets).
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _unused = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=zarr_paths,
        raw_key="img",
        label_paths=zarr_paths,
        label_key="seg",
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
242
243
def get_nisb_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for neuron instance segmentation in the NISB dataset.

    The extra keyword arguments are split into those for
    `torch_em.default_segmentation_dataset` and the remainder, which is passed on to
    `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.
            Requires s3fs (pip install s3fs).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_nisb_dataset(
        path,
        patch_shape,
        setting=setting,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **loader_kwargs)
    return loader
# S3 endpoint and bucket name used to locate the remote NISB cubes.
NISB_S3_ENDPOINT = "https://s3.nexus.mpcdf.mpg.de:443"
NISB_S3_BUCKET = "nisb"
# Names of the NISB settings accepted by the functions in this module.
NISB_SETTINGS = [
    "base",
    "train_100",
    "slice_perturbed",
    "pos_guidance",
    "neg_guidance",
    "no_touch_thick",
    "touching_thin",
    "liconn",
    "multichannel",
]
# Chunk and shard shape (one entry per spatial axis) of the local zarr v3 arrays.
NISB_CHUNK_SHAPE = (64,) * 3
NISB_SHARD_SHAPE = (512,) * 3
def get_nisb_data( path: Union[os.PathLike, str], setting: str, split: str, download: bool) -> str:
def get_nisb_data(path: Union[os.PathLike, str], setting: str, split: str, download: bool) -> str:
    """Make sure all seed cubes of a NISB setting and split are cached locally.

    Each seed is stored at `<path>/<setting>/<split>/seed<i>/data.zarr`. Seeds whose
    store is already complete are skipped; missing ones are streamed from S3 and
    written as local zarr v3 stores with (z, y, x) axis order, sharding
    (chunk 64^3, shard 512^3), and zstd compression.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        setting: The NISB setting. One of NISB_SETTINGS.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.

    Returns:
        The filepath to the split directory containing seed subdirectories.

    Raises:
        RuntimeError: If a seed is not cached and `download` is False.
    """
    assert setting in NISB_SETTINGS, f"Invalid setting '{setting}'. Choose from {NISB_SETTINGS}."
    assert split in ("train", "val", "test"), f"Invalid split '{split}'. Choose 'train', 'val', or 'test'."

    split_dir = os.path.join(str(path), setting, split)
    seed_ids = range(_nisb_n_seeds(setting, split))

    for seed_id in tqdm(seed_ids, desc=f"NISB {setting}/{split}", leave=False):
        seed_dir = os.path.join(split_dir, f"seed{seed_id}")
        zarr_path = os.path.join(seed_dir, "data.zarr")

        if not _nisb_zarr_complete(zarr_path):
            if not download:
                raise RuntimeError(
                    f"No NISB data for setting '{setting}' split '{split}' seed {seed_id} at '{zarr_path}'. "
                    "Set download=True to stream it from S3."
                )

            os.makedirs(seed_dir, exist_ok=True)
            print(f"Streaming NISB {setting}/{split}/seed{seed_id} from S3 ...")
            remote_cube = _nisb_open_remote(setting, split, seed_id)
            _nisb_write_cube_v3(remote_cube, zarr_path)

    return split_dir

Stream and cache NISB data for a given setting and split from S3.

Data is read from S3 via s3fs and written to local zarr v3 stores with (z, y, x) axis order, sharding (chunk 64^3, shard 512^3), and zstd compression. Already-cached seeds are skipped on subsequent calls.

Arguments:
  • path: Filepath to a folder where the cached data will be saved.
  • setting: The NISB setting. One of NISB_SETTINGS.
  • split: The data split, one of 'train', 'val', 'test'.
  • download: Whether to stream and cache the data if it is not present.
Returns:

The filepath to the split directory containing seed subdirectories.

def get_nisb_paths( path: Union[os.PathLike, str], setting: str = 'base', split: Literal['train', 'val', 'test'] = 'train', download: bool = False) -> List[str]:
def get_nisb_paths(
    path: Union[os.PathLike, str],
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to NISB zarr stores for a given setting and split.

    Args:
        path: Filepath to a folder where the cached data is saved.
        setting: The NISB setting. One of NISB_SETTINGS.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.

    Returns:
        List of filepaths to the zarr stores, one per cube/seed, ordered by seed
        index (so 'seed2' comes before 'seed10').

    Raises:
        RuntimeError: If no zarr store is found in the split directory.
    """
    split_dir = get_nisb_data(path, setting, split, download)

    def _seed_order(zarr_path: str):
        # Sort by the integer after 'seed'. A plain string sort would put 'seed10'
        # before 'seed2'. Folders without a numeric suffix are sorted by name, last.
        seed_name = os.path.basename(os.path.dirname(zarr_path))
        suffix = seed_name[len("seed"):]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), seed_name)
        return (1, 0, seed_name)

    paths = sorted(glob(os.path.join(split_dir, "seed*", "data.zarr")), key=_seed_order)
    if not paths:
        raise RuntimeError(
            f"No zarr files found in '{split_dir}'. The download may have failed or the directory is empty."
        )
    return paths

Get paths to NISB zarr stores for a given setting and split.

Arguments:
  • path: Filepath to a folder where the cached data is saved.
  • setting: The NISB setting. One of NISB_SETTINGS.
  • split: The data split, one of 'train', 'val', 'test'.
  • download: Whether to stream and cache the data if it is not present.
Returns:

Sorted list of filepaths to the zarr stores, one per cube/seed.

def get_nisb_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], setting: str = 'base', split: Literal['train', 'val', 'test'] = 'train', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_nisb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the NISB dataset for neuron instance segmentation in EM.

    Every cached zarr store of the chosen setting and split serves as both the raw
    source (key 'img') and the label source (key 'seg'). Image data has the shape
    (z, y, x); the multichannel setting stores 8-channel data with shape (z, y, x, 8).

    Args:
        path: Filepath to a folder where the cached data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.
            Requires s3fs (pip install s3fs).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    zarr_paths = get_nisb_paths(path, setting, split, download)

    # Force `is_seg_dataset=True`, then let the helper register the label transform
    # that matches the requested target (boundaries and / or offsets).
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _unused = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=zarr_paths,
        raw_key="img",
        label_paths=zarr_paths,
        label_key="seg",
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset

Get the NISB dataset for neuron instance segmentation in EM.

NISB provides 9 settings of varying difficulty, each with multiple cubes at 27µm side length. Image data is stored under the zarr key 'img' with shape (z, y, x) and segmentation under 'seg'. The multichannel setting stores 8-channel data with shape (z, y, x, 8).

Arguments:
  • path: Filepath to a folder where the cached data will be saved.
  • patch_shape: The patch shape to use for training.
  • setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
  • split: The data split, one of 'train', 'val', 'test'.
  • download: Whether to stream and cache the data if it is not present. Requires s3fs (pip install s3fs).
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_nisb_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], batch_size: int, setting: str = 'base', split: Literal['train', 'val', 'test'] = 'train', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_nisb_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    setting: str = "base",
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for neuron instance segmentation in the NISB dataset.

    The extra keyword arguments are split into those for
    `torch_em.default_segmentation_dataset` and the remainder, which is passed on to
    `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the cached data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
        split: The data split, one of 'train', 'val', 'test'.
        download: Whether to stream and cache the data if it is not present.
            Requires s3fs (pip install s3fs).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_nisb_dataset(
        path,
        patch_shape,
        setting=setting,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **loader_kwargs)
    return loader

Get the DataLoader for neuron instance segmentation in the NISB dataset.

Arguments:
  • path: Filepath to a folder where the cached data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • setting: The NISB setting. One of NISB_SETTINGS. Default 'base'.
  • split: The data split, one of 'train', 'val', 'test'.
  • download: Whether to stream and cache the data if it is not present. Requires s3fs (pip install s3fs).
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.