torch_em.data.datasets.electron_microscopy.wildenberg

Wildenberg 2023 dataset for synaptic structure segmentation in 3DEM.

The dataset contains two FIB-SEM volumes from mouse primary visual cortex (V1) layer 4, acquired at 6 x 6 x 40 nm native resolution. Synaptic structures are annotated at 12 x 12 x 40 nm resolution across three auto-segmentation channels:

psd: postsynaptic density (binary, uint8)
vesicle_cloud: presynaptic vesicle cloud (binary, uint8)
saturated: saturated synapse mask (instance, uint32)

Two experiments are available:

p105: postnatal day 105 mouse (adult, fully developed cortex)
p14: postnatal day 14 mouse (early developmental stage)

Data is streamed from the BossDB public S3 bucket via cloud-volume and cached locally as zarr v3 stores in (z, y, x) axis order.

This dataset is from the publication https://doi.org/10.1038/s41467-023-43088-3. Please cite it if you use this dataset in your research.

The dataset is publicly available at https://bossdb.org/project/wildenberg2023. Requires cloud-volume: pip install cloud-volume.

View Source

"""Wildenberg 2023 dataset for synaptic structure segmentation in 3DEM.

The dataset contains two FIB-SEM volumes from mouse primary visual cortex (V1) layer 4,
acquired at 6 x 6 x 40 nm native resolution. Synaptic structures are annotated at
12 x 12 x 40 nm resolution across three auto-segmentation channels:
- psd: postsynaptic density (binary, uint8)
- vesicle_cloud: presynaptic vesicle cloud (binary, uint8)
- saturated: saturated synapse mask (instance, uint32)

Two experiments are available:
- p105: postnatal day 105 mouse (adult, fully developed cortex)
- p14: postnatal day 14 mouse (early developmental stage)

Data is streamed from the BossDB public S3 bucket via cloud-volume and cached locally as
zarr v3 stores in (z, y, x) axis order.

This dataset is from the publication https://doi.org/10.1038/s41467-023-43088-3.
Please cite it if you use this dataset in your research.

The dataset is publicly available at https://bossdb.org/project/wildenberg2023.
Requires cloud-volume: pip install cloud-volume.
"""
 23
 24import hashlib
 25import os
 26from concurrent.futures import ThreadPoolExecutor, as_completed
 27from typing import Dict, List, Literal, Optional, Sequence, Tuple, Union
 28
 29import numpy as np
 30from tqdm import tqdm
 31from torch.utils.data import Dataset, DataLoader
 32
 33import torch_em
 34from .. import util
 35
 36
 37WILDENBERG_S3_BASE = "precomputed://https://bossdb-open-data.s3.amazonaws.com/wildenberg2023"
 38
 39# Per-experiment metadata: BossDB experiment name, EM channel, annotation bounding box in nm
 40# (x_min, x_max, y_min, y_max, z_min, z_max) covering the full annotated region.
 41WILDENBERG_EXPERIMENTS: Dict[str, dict] = {
 42    "p105": {
 43        "exp_name": "mouse_v1_l4_p105",
 44        "em_channel": "em",
 45        # The p105 EM channel on BossDB has cv_x=physical_y and cv_y=physical_x (axes transposed
 46        # relative to the annotation channels). The download code corrects for this.
 47        "em_axes_swapped": True,
 48        "bbox_nm": (576, 120576, 576, 136512, 160, 36320),
 49    },
 50    "p14": {
 51        "exp_name": "mouse_v1_l4_p14",
 52        "em_channel": "em_aligned",
 53        "em_axes_swapped": False,
 54        "bbox_nm": (0, 80256, 0, 115200, 0, 52320),
 55    },
 56}
 57
 58# (channel_name, numpy_dtype, use_bitshuffle_for_compression)
 59WILDENBERG_LABEL_CHANNELS: Dict[str, tuple] = {
 60    "psd": ("psd_autoseg", np.dtype("uint8"), False),
 61    "vesicle_cloud": ("vesicle_autoseg", np.dtype("uint8"), False),
 62    "saturated": ("saturated_autoseg", np.dtype("uint32"), True),
 63}
 64
 65WILDENBERG_CHUNK_SHAPE = (64, 128, 128)
 66WILDENBERG_SHARD_SHAPE = (128, 512, 512)
 67
 68
 69def _wildenberg_bbox_to_str(bbox):
 70    return hashlib.md5("_".join(str(v) for v in bbox).encode()).hexdigest()[:12]
 71
 72
 73def _wildenberg_create_array(root, name, shape, dtype, is_label):
 74    from zarr.codecs import BloscCodec
 75    shuffle = "bitshuffle" if (np.issubdtype(dtype, np.integer) and is_label) else "shuffle"
 76    return root.create_array(
 77        name,
 78        shape=shape,
 79        chunks=WILDENBERG_CHUNK_SHAPE,
 80        shards=WILDENBERG_SHARD_SHAPE,
 81        dtype=dtype,
 82        compressors=BloscCodec(cname="zstd", clevel=6, shuffle=shuffle),
 83    )
 84
 85
 86def _wildenberg_bbox_voxels(cv, x_min_nm, x_max_nm, y_min_nm, y_max_nm, z_min_nm, z_max_nm):
 87    scale = np.array(cv.resolution)
 88    x0 = int(np.floor(x_min_nm / scale[0]))
 89    x1 = int(np.ceil(x_max_nm / scale[0]))
 90    y0 = int(np.floor(y_min_nm / scale[1]))
 91    y1 = int(np.ceil(y_max_nm / scale[1]))
 92    z0 = int(np.floor(z_min_nm / scale[2]))
 93    z1 = int(np.ceil(z_max_nm / scale[2]))
 94    return x0, x1, y0, y1, z0, z1, (z1 - z0, y1 - y0, x1 - x0)
 95
 96
 97def _wildenberg_download_to_zarr(cv, ds, x0g, y0g, z0g, name, swap_xy=False):
 98    shape = ds.shape  # (z, y, x) in physical space
 99    sz, sy, sx = WILDENBERG_SHARD_SHAPE
100
101    tasks = []
102    for z0_ in range(0, shape[0], sz):
103        for y0_ in range(0, shape[1], sy):
104            for x0_ in range(0, shape[2], sx):
105                z1_ = min(z0_ + sz, shape[0])
106                y1_ = min(y0_ + sy, shape[1])
107                x1_ = min(x0_ + sx, shape[2])
108                gz0, gz1 = z0g + z0_, z0g + z1_
109                if swap_xy:
110                    # em cv_x=physical_y, cv_y=physical_x: map physical y->cv_x, physical x->cv_y
111                    gx0, gx1 = y0g + y0_, y0g + y1_
112                    gy0, gy1 = x0g + x0_, x0g + x1_
113                else:
114                    gx0, gx1 = x0g + x0_, x0g + x1_
115                    gy0, gy1 = y0g + y0_, y0g + y1_
116                tasks.append(((z0_, z1_), (y0_, y1_), (x0_, x1_), (gx0, gx1, gy0, gy1, gz0, gz1)))
117
118    target_dtype = np.dtype(ds.dtype)
119    # swap_xy: block axes are (phys_y, phys_x, phys_z) -> transpose(2,0,1) -> (z,y,x)
120    # normal: block axes are (phys_x, phys_y, phys_z) -> transpose(2,1,0) -> (z,y,x)
121    transpose_order = (2, 0, 1) if swap_xy else (2, 1, 0)
122
123    def worker(item):
124        (z0_, z1_), (y0_, y1_), (x0_, x1_), (gx0, gx1, gy0, gy1, gz0, gz1) = item
125        block = np.asarray(cv[gx0:gx1, gy0:gy1, gz0:gz1])
126        if block.ndim == 4:
127            block = block[..., 0]
128        ds[z0_:z1_, y0_:y1_, x0_:x1_] = block.transpose(*transpose_order).astype(target_dtype)
129
130    with ThreadPoolExecutor(max_workers=8) as ex:
131        futures = [ex.submit(worker, t) for t in tasks]
132        for fut in tqdm(as_completed(futures), total=len(futures), desc=f"Downloading '{name}'", smoothing=0.05):
133            fut.result()
134
135
def get_wildenberg_data(
    path: Union[os.PathLike, str],
    experiment: Literal["p105", "p14"],
    label_choice: Literal["psd", "vesicle_cloud", "saturated"],
    bounding_box: Optional[Tuple[float, ...]] = None,
    em_mip: int = 1,
    seg_mip: int = 0,
    download: bool = False,
) -> str:
    """Stream and cache one Wildenberg experiment as a zarr v3 store.

    The zarr store contains:
      - raw: EM grayscale (uint8, z/y/x)
      - labels: synaptic annotation (uint8 for psd/vesicle_cloud, uint32 for saturated, z/y/x)

    The store is named after the experiment, the label choice and a hash of the bounding box.
    The MIP levels are not part of the name: if a store for the same experiment, label choice
    and bounding box exists, it is returned even if it was created with other MIP levels.

    A store counts as cached if it contains both a 'raw' and a 'labels' folder. If downloading
    one of the arrays fails, that array is removed again so that it is downloaded anew by the
    next call instead of being mistaken for complete data.

    Args:
        path: Filepath to a folder where the cached zarr store will be saved.
        experiment: Which experiment to load. Either 'p105' (adult) or 'p14' (developing).
        label_choice: Which annotation channel to use. One of 'psd', 'vesicle_cloud', or 'saturated'.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
            Defaults to the full annotation extent of the chosen experiment.
        em_mip: MIP level for the EM image. Default mip=1 gives 12 x 12 x 40 nm resolution.
        seg_mip: MIP level for the annotation. Default mip=0 gives 12 x 12 x 40 nm resolution.
        download: Whether to stream and cache the data if not present.

    Returns:
        Filepath to the cached zarr store.

    Raises:
        ValueError: If the experiment or label choice is unknown, if the bounding box does not
            have six values, or if it does not overlap the EM and annotation volumes.
        RuntimeError: If the data is not cached and `download` is False.
        ImportError: If the data has to be downloaded and 'cloud-volume' is not installed.
    """
    import zarr

    if experiment not in WILDENBERG_EXPERIMENTS:
        raise ValueError(f"Invalid experiment: '{experiment}'. Choose from {list(WILDENBERG_EXPERIMENTS.keys())}.")
    if label_choice not in WILDENBERG_LABEL_CHANNELS:
        raise ValueError(
            f"Invalid label_choice: '{label_choice}'. Choose from {list(WILDENBERG_LABEL_CHANNELS.keys())}."
        )

    exp_info = WILDENBERG_EXPERIMENTS[experiment]
    bbox = tuple(bounding_box) if bounding_box is not None else exp_info["bbox_nm"]
    if len(bbox) != 6:
        raise ValueError(
            f"Invalid bounding_box: expected (x_min, x_max, y_min, y_max, z_min, z_max), got {len(bbox)} values."
        )

    os.makedirs(str(path), exist_ok=True)
    bbox_hash = _wildenberg_bbox_to_str(bbox)
    zarr_path = os.path.join(str(path), f"{experiment}_{label_choice}_{bbox_hash}.zarr")

    if all(os.path.isdir(os.path.join(zarr_path, key)) for key in ("raw", "labels")):
        return zarr_path
    if not download:
        raise RuntimeError(
            f"No cached data at '{zarr_path}'. Set download=True to stream from BossDB."
        )

    try:
        from cloudvolume import CloudVolume
    except ImportError as err:
        raise ImportError("The 'cloud-volume' package is required: pip install cloud-volume") from err

    exp_name = exp_info["exp_name"]
    em_channel = exp_info["em_channel"]
    em_axes_swapped = exp_info.get("em_axes_swapped", False)
    label_channel, label_dtype, label_compress = WILDENBERG_LABEL_CHANNELS[label_choice]

    print(f"Streaming Wildenberg2023 {experiment}/{label_choice} at em_mip={em_mip}, seg_mip={seg_mip} ...")

    em_url = f"{WILDENBERG_S3_BASE}/{exp_name}/{em_channel}"
    seg_url = f"{WILDENBERG_S3_BASE}/{exp_name}/{label_channel}"

    em_cv = CloudVolume(em_url, use_https=True, mip=em_mip, progress=False, fill_missing=True)
    seg_cv = CloudVolume(seg_url, use_https=True, mip=seg_mip, progress=False, fill_missing=True)

    # Clip the requested nm bbox to the intersection of both volumes' actual extents so that
    # the EM and label arrays start at the same physical coordinate (avoiding spatial offsets).
    em_scale = np.array(em_cv.resolution, dtype=float)
    seg_scale = np.array(seg_cv.resolution, dtype=float)
    em_bb = em_cv.meta.bbox(em_mip)
    seg_bb = seg_cv.meta.bbox(seg_mip)
    em_min_nm = np.array(em_bb.minpt[:3], dtype=float) * em_scale
    em_max_nm = np.array(em_bb.maxpt[:3], dtype=float) * em_scale
    seg_min_nm = np.array(seg_bb.minpt[:3], dtype=float) * seg_scale
    seg_max_nm = np.array(seg_bb.maxpt[:3], dtype=float) * seg_scale
    if em_axes_swapped:
        # The first two axes of the EM volume are (physical y, physical x). Reorder its
        # extent to physical (x, y, z), so that each axis is clipped against the right bounds.
        em_min_nm = em_min_nm[[1, 0, 2]]
        em_max_nm = em_max_nm[[1, 0, 2]]

    # Clipped bbox as [x_min, x_max, y_min, y_max, z_min, z_max].
    clipped = []
    for axis in range(3):
        lower = max(bbox[2 * axis], float(em_min_nm[axis]), float(seg_min_nm[axis]))
        upper = min(bbox[2 * axis + 1], float(em_max_nm[axis]), float(seg_max_nm[axis]))
        clipped.extend((lower, upper))

    # NOTE(review): the nm-to-voxel conversion of the EM volume uses its resolution in native
    # axis order; for swapped axes this assumes equal x and y resolution. TODO confirm.
    ex0, _, ey0, _, ez0, _, em_shape = _wildenberg_bbox_voxels(em_cv, *clipped)
    sx0, _, sy0, _, sz0, _, seg_shape = _wildenberg_bbox_voxels(seg_cv, *clipped)

    shape = tuple(min(e, s) for e, s in zip(em_shape, seg_shape))
    if min(shape) <= 0:
        raise ValueError(
            f"The bounding box {bbox} does not overlap the EM and annotation volumes of '{experiment}'."
        )

    root = zarr.open_group(zarr_path, mode="a")
    root.attrs["experiment"] = experiment
    root.attrs["label_choice"] = label_choice
    root.attrs["bounding_box_nm"] = list(bbox)
    root.attrs["em_mip"] = em_mip
    root.attrs["seg_mip"] = seg_mip

    # (array name, source volume, dtype, use bitshuffle, voxel offset (x, y, z), swapped axes)
    jobs = (
        ("raw", em_cv, np.dtype("uint8"), False, (ex0, ey0, ez0), em_axes_swapped),
        ("labels", seg_cv, label_dtype, label_compress, (sx0, sy0, sz0), False),
    )
    for name, cv, dtype, use_bitshuffle, offset, swap_xy in jobs:
        if name in root:
            continue
        ds = _wildenberg_create_array(root, name, shape, dtype, is_label=use_bitshuffle)
        try:
            _wildenberg_download_to_zarr(cv, ds, *offset, name=name, swap_xy=swap_xy)
        except BaseException:
            # Remove the partially written array. Otherwise the next call would find it in
            # the store and skip the download, leaving incomplete data in the cache.
            del root[name]
            raise

    print(f"Cached to {zarr_path} (shape {shape})")
    return zarr_path
253
254
def get_wildenberg_paths(
    path: Union[os.PathLike, str],
    experiments: Optional[Sequence[str]] = None,
    label_choice: Literal["psd", "vesicle_cloud", "saturated"] = "psd",
    bounding_box: Optional[Tuple[float, ...]] = None,
    em_mip: int = 1,
    seg_mip: int = 0,
    download: bool = False,
) -> List[str]:
    """Get paths to cached Wildenberg zarr stores.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        experiments: Experiments to load. Defaults to both ('p105', 'p14').
            A single experiment name may also be passed as a string.
        label_choice: Which annotation channel to use. One of 'psd', 'vesicle_cloud', or 'saturated'.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
            Defaults to the full annotation extent per experiment.
        em_mip: MIP level for the EM image.
        seg_mip: MIP level for the annotation.
        download: Whether to stream and cache the data if not present.

    Returns:
        Filepaths to the cached zarr stores, one per experiment.
    """
    if experiments is None:
        exps = list(WILDENBERG_EXPERIMENTS.keys())
    elif isinstance(experiments, str):
        # A bare string would otherwise be split into its characters.
        exps = [experiments]
    else:
        exps = list(experiments)
    return [get_wildenberg_data(path, exp, label_choice, bounding_box, em_mip, seg_mip, download) for exp in exps]
281
282
def get_wildenberg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    experiments: Optional[Sequence[str]] = None,
    label_choice: Literal["psd", "vesicle_cloud", "saturated"] = "psd",
    bounding_box: Optional[Tuple[float, ...]] = None,
    em_mip: int = 1,
    seg_mip: int = 0,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the Wildenberg 2023 dataset for synaptic structure segmentation in 3DEM.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        patch_shape: The patch shape (z, y, x) to use for training.
        experiments: Experiments to load. Defaults to both ('p105', 'p14').
        label_choice: Which annotation channel to use. 'psd' for postsynaptic density,
            'vesicle_cloud' for presynaptic vesicle cloud, or 'saturated' for instance-labeled saturated synapses.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
            Defaults to the full annotation extent per experiment.
        em_mip: MIP level for the EM image. Default mip=1 gives 12 x 12 x 40 nm.
        seg_mip: MIP level for the annotation. Default mip=0 gives 12 x 12 x 40 nm.
        download: Whether to stream and cache data if not already present.
        offsets: Offset values for affinity computation (only applied when label_choice='saturated').
        boundaries: Whether to compute boundaries (only applied when label_choice='saturated').
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3, f"Expected a 3D patch shape (z, y, x), got {patch_shape}."
    paths = get_wildenberg_paths(path, experiments, label_choice, bounding_box, em_mip, seg_mip, download)

    # Only the instance labels get a label transform; `offsets` and `boundaries`
    # are not used for the other label choices.
    if label_choice == "saturated":
        kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
        kwargs, _ = util.add_instance_label_transform(
            kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
        )

    # Raw data and labels are stored in the same zarr store, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=paths,
        raw_key="raw",
        label_paths=paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )
333
334
def get_wildenberg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    experiments: Optional[Sequence[str]] = None,
    label_choice: Literal["psd", "vesicle_cloud", "saturated"] = "psd",
    bounding_box: Optional[Tuple[float, ...]] = None,
    em_mip: int = 1,
    seg_mip: int = 0,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for synaptic structure segmentation in Wildenberg 2023 3DEM data.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (z, y, x) to use for training.
        experiments: Experiments to load. Defaults to both ('p105', 'p14').
        label_choice: Which annotation channel to use. 'psd' for postsynaptic density,
            'vesicle_cloud' for presynaptic vesicle cloud, or 'saturated' for instance-labeled saturated synapses.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
            Defaults to the full annotation extent per experiment.
        em_mip: MIP level for the EM image. Default mip=1 gives 12 x 12 x 40 nm.
        seg_mip: MIP level for the annotation. Default mip=0 gives 12 x 12 x 40 nm.
        download: Whether to stream and cache data if not already present.
        offsets: Offset values for affinity computation (only applied when label_choice='saturated').
        boundaries: Whether to compute boundaries (only applied when label_choice='saturated').
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the keyword arguments into those for the dataset and those for the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_wildenberg_dataset(
        path=path,
        patch_shape=patch_shape,
        experiments=experiments,
        label_choice=label_choice,
        bounding_box=bounding_box,
        em_mip=em_mip,
        seg_mip=seg_mip,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)

WILDENBERG_S3_BASE = 'precomputed://https://bossdb-open-data.s3.amazonaws.com/wildenberg2023'

WILDENBERG_EXPERIMENTS: Dict[str, dict] = {'p105': {'exp_name': 'mouse_v1_l4_p105', 'em_channel': 'em', 'em_axes_swapped': True, 'bbox_nm': (576, 120576, 576, 136512, 160, 36320)}, 'p14': {'exp_name': 'mouse_v1_l4_p14', 'em_channel': 'em_aligned', 'em_axes_swapped': False, 'bbox_nm': (0, 80256, 0, 115200, 0, 52320)}}

WILDENBERG_LABEL_CHANNELS: Dict[str, tuple] = {'psd': ('psd_autoseg', dtype('uint8'), False), 'vesicle_cloud': ('vesicle_autoseg', dtype('uint8'), False), 'saturated': ('saturated_autoseg', dtype('uint32'), True)}

WILDENBERG_CHUNK_SHAPE = (64, 128, 128)

WILDENBERG_SHARD_SHAPE = (128, 512, 512)

def get_wildenberg_data( path: Union[os.PathLike, str], experiment: Literal['p105', 'p14'], label_choice: Literal['psd', 'vesicle_cloud', 'saturated'], bounding_box: Optional[Tuple[float, ...]] = None, em_mip: int = 1, seg_mip: int = 0, download: bool = False) -> str: View Source

def get_wildenberg_data(
    path: Union[os.PathLike, str],
    experiment: Literal["p105", "p14"],
    label_choice: Literal["psd", "vesicle_cloud", "saturated"],
    bounding_box: Optional[Tuple[float, ...]] = None,
    em_mip: int = 1,
    seg_mip: int = 0,
    download: bool = False,
) -> str:
    """Stream and cache one Wildenberg experiment as a zarr v3 store.

    The zarr store contains:
      - raw: EM grayscale (uint8, z/y/x)
      - labels: synaptic annotation (uint8 for psd/vesicle_cloud, uint32 for saturated, z/y/x)

    The store is named after the experiment, the label choice and a hash of the bounding box.
    The MIP levels are not part of the name: if a store for the same experiment, label choice
    and bounding box exists, it is returned even if it was created with other MIP levels.

    A store counts as cached if it contains both a 'raw' and a 'labels' folder. If downloading
    one of the arrays fails, that array is removed again so that it is downloaded anew by the
    next call instead of being mistaken for complete data.

    Args:
        path: Filepath to a folder where the cached zarr store will be saved.
        experiment: Which experiment to load. Either 'p105' (adult) or 'p14' (developing).
        label_choice: Which annotation channel to use. One of 'psd', 'vesicle_cloud', or 'saturated'.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
            Defaults to the full annotation extent of the chosen experiment.
        em_mip: MIP level for the EM image. Default mip=1 gives 12 x 12 x 40 nm resolution.
        seg_mip: MIP level for the annotation. Default mip=0 gives 12 x 12 x 40 nm resolution.
        download: Whether to stream and cache the data if not present.

    Returns:
        Filepath to the cached zarr store.

    Raises:
        ValueError: If the experiment or label choice is unknown, if the bounding box does not
            have six values, or if it does not overlap the EM and annotation volumes.
        RuntimeError: If the data is not cached and `download` is False.
        ImportError: If the data has to be downloaded and 'cloud-volume' is not installed.
    """
    import zarr

    if experiment not in WILDENBERG_EXPERIMENTS:
        raise ValueError(f"Invalid experiment: '{experiment}'. Choose from {list(WILDENBERG_EXPERIMENTS.keys())}.")
    if label_choice not in WILDENBERG_LABEL_CHANNELS:
        raise ValueError(
            f"Invalid label_choice: '{label_choice}'. Choose from {list(WILDENBERG_LABEL_CHANNELS.keys())}."
        )

    exp_info = WILDENBERG_EXPERIMENTS[experiment]
    bbox = tuple(bounding_box) if bounding_box is not None else exp_info["bbox_nm"]
    if len(bbox) != 6:
        raise ValueError(
            f"Invalid bounding_box: expected (x_min, x_max, y_min, y_max, z_min, z_max), got {len(bbox)} values."
        )

    os.makedirs(str(path), exist_ok=True)
    bbox_hash = _wildenberg_bbox_to_str(bbox)
    zarr_path = os.path.join(str(path), f"{experiment}_{label_choice}_{bbox_hash}.zarr")

    if all(os.path.isdir(os.path.join(zarr_path, key)) for key in ("raw", "labels")):
        return zarr_path
    if not download:
        raise RuntimeError(
            f"No cached data at '{zarr_path}'. Set download=True to stream from BossDB."
        )

    try:
        from cloudvolume import CloudVolume
    except ImportError as err:
        raise ImportError("The 'cloud-volume' package is required: pip install cloud-volume") from err

    exp_name = exp_info["exp_name"]
    em_channel = exp_info["em_channel"]
    em_axes_swapped = exp_info.get("em_axes_swapped", False)
    label_channel, label_dtype, label_compress = WILDENBERG_LABEL_CHANNELS[label_choice]

    print(f"Streaming Wildenberg2023 {experiment}/{label_choice} at em_mip={em_mip}, seg_mip={seg_mip} ...")

    em_url = f"{WILDENBERG_S3_BASE}/{exp_name}/{em_channel}"
    seg_url = f"{WILDENBERG_S3_BASE}/{exp_name}/{label_channel}"

    em_cv = CloudVolume(em_url, use_https=True, mip=em_mip, progress=False, fill_missing=True)
    seg_cv = CloudVolume(seg_url, use_https=True, mip=seg_mip, progress=False, fill_missing=True)

    # Clip the requested nm bbox to the intersection of both volumes' actual extents so that
    # the EM and label arrays start at the same physical coordinate (avoiding spatial offsets).
    em_scale = np.array(em_cv.resolution, dtype=float)
    seg_scale = np.array(seg_cv.resolution, dtype=float)
    em_bb = em_cv.meta.bbox(em_mip)
    seg_bb = seg_cv.meta.bbox(seg_mip)
    em_min_nm = np.array(em_bb.minpt[:3], dtype=float) * em_scale
    em_max_nm = np.array(em_bb.maxpt[:3], dtype=float) * em_scale
    seg_min_nm = np.array(seg_bb.minpt[:3], dtype=float) * seg_scale
    seg_max_nm = np.array(seg_bb.maxpt[:3], dtype=float) * seg_scale
    if em_axes_swapped:
        # The first two axes of the EM volume are (physical y, physical x). Reorder its
        # extent to physical (x, y, z), so that each axis is clipped against the right bounds.
        em_min_nm = em_min_nm[[1, 0, 2]]
        em_max_nm = em_max_nm[[1, 0, 2]]

    # Clipped bbox as [x_min, x_max, y_min, y_max, z_min, z_max].
    clipped = []
    for axis in range(3):
        lower = max(bbox[2 * axis], float(em_min_nm[axis]), float(seg_min_nm[axis]))
        upper = min(bbox[2 * axis + 1], float(em_max_nm[axis]), float(seg_max_nm[axis]))
        clipped.extend((lower, upper))

    # NOTE(review): the nm-to-voxel conversion of the EM volume uses its resolution in native
    # axis order; for swapped axes this assumes equal x and y resolution. TODO confirm.
    ex0, _, ey0, _, ez0, _, em_shape = _wildenberg_bbox_voxels(em_cv, *clipped)
    sx0, _, sy0, _, sz0, _, seg_shape = _wildenberg_bbox_voxels(seg_cv, *clipped)

    shape = tuple(min(e, s) for e, s in zip(em_shape, seg_shape))
    if min(shape) <= 0:
        raise ValueError(
            f"The bounding box {bbox} does not overlap the EM and annotation volumes of '{experiment}'."
        )

    root = zarr.open_group(zarr_path, mode="a")
    root.attrs["experiment"] = experiment
    root.attrs["label_choice"] = label_choice
    root.attrs["bounding_box_nm"] = list(bbox)
    root.attrs["em_mip"] = em_mip
    root.attrs["seg_mip"] = seg_mip

    # (array name, source volume, dtype, use bitshuffle, voxel offset (x, y, z), swapped axes)
    jobs = (
        ("raw", em_cv, np.dtype("uint8"), False, (ex0, ey0, ez0), em_axes_swapped),
        ("labels", seg_cv, label_dtype, label_compress, (sx0, sy0, sz0), False),
    )
    for name, cv, dtype, use_bitshuffle, offset, swap_xy in jobs:
        if name in root:
            continue
        ds = _wildenberg_create_array(root, name, shape, dtype, is_label=use_bitshuffle)
        try:
            _wildenberg_download_to_zarr(cv, ds, *offset, name=name, swap_xy=swap_xy)
        except BaseException:
            # Remove the partially written array. Otherwise the next call would find it in
            # the store and skip the download, leaving incomplete data in the cache.
            del root[name]
            raise

    print(f"Cached to {zarr_path} (shape {shape})")
    return zarr_path

Stream and cache one Wildenberg experiment as a zarr v3 store.

The zarr store contains:

raw: EM grayscale (uint8, z/y/x)

labels: synaptic annotation (uint8 for psd/vesicle_cloud, uint32 for saturated, z/y/x)

Arguments:

path: Filepath to a folder where the cached zarr store will be saved.
experiment: Which experiment to load. Either 'p105' (adult) or 'p14' (developing).
label_choice: Which annotation channel to use. One of 'psd', 'vesicle_cloud', or 'saturated'.
bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max). Defaults to the full annotation extent of the chosen experiment.
em_mip: MIP level for the EM image. Default mip=1 gives 12 x 12 x 40 nm resolution.
seg_mip: MIP level for the annotation. Default mip=0 gives 12 x 12 x 40 nm resolution.
download: Whether to stream and cache the data if not present.

Returns:

Filepath to the cached zarr store.

def get_wildenberg_paths( path: Union[os.PathLike, str], experiments: Optional[Sequence[str]] = None, label_choice: Literal['psd', 'vesicle_cloud', 'saturated'] = 'psd', bounding_box: Optional[Tuple[float, ...]] = None, em_mip: int = 1, seg_mip: int = 0, download: bool = False) -> List[str]: View Source

def get_wildenberg_paths(
    path: Union[os.PathLike, str],
    experiments: Optional[Sequence[str]] = None,
    label_choice: Literal["psd", "vesicle_cloud", "saturated"] = "psd",
    bounding_box: Optional[Tuple[float, ...]] = None,
    em_mip: int = 1,
    seg_mip: int = 0,
    download: bool = False,
) -> List[str]:
    """Get paths to cached Wildenberg zarr stores.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        experiments: Experiments to load. Defaults to both ('p105', 'p14').
            A single experiment name may also be passed as a string.
        label_choice: Which annotation channel to use. One of 'psd', 'vesicle_cloud', or 'saturated'.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
            Defaults to the full annotation extent per experiment.
        em_mip: MIP level for the EM image.
        seg_mip: MIP level for the annotation.
        download: Whether to stream and cache the data if not present.

    Returns:
        Filepaths to the cached zarr stores, one per experiment.
    """
    if experiments is None:
        exps = list(WILDENBERG_EXPERIMENTS.keys())
    elif isinstance(experiments, str):
        # A bare string would otherwise be split into its characters.
        exps = [experiments]
    else:
        exps = list(experiments)
    return [get_wildenberg_data(path, exp, label_choice, bounding_box, em_mip, seg_mip, download) for exp in exps]

Get paths to cached Wildenberg zarr stores.

Arguments:

path: Filepath to a folder where the cached zarr stores will be saved.
experiments: Experiments to load. Defaults to both ('p105', 'p14').
label_choice: Which annotation channel to use. One of 'psd', 'vesicle_cloud', or 'saturated'.
bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max). Defaults to the full annotation extent per experiment.
em_mip: MIP level for the EM image.
seg_mip: MIP level for the annotation.
download: Whether to stream and cache the data if not present.

Returns:

Filepaths to the cached zarr stores.

def get_wildenberg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], experiments: Optional[Sequence[str]] = None, label_choice: Literal['psd', 'vesicle_cloud', 'saturated'] = 'psd', bounding_box: Optional[Tuple[float, ...]] = None, em_mip: int = 1, seg_mip: int = 0, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_wildenberg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    experiments: Optional[Sequence[str]] = None,
    label_choice: Literal["psd", "vesicle_cloud", "saturated"] = "psd",
    bounding_box: Optional[Tuple[float, ...]] = None,
    em_mip: int = 1,
    seg_mip: int = 0,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the Wildenberg 2023 dataset for synaptic structure segmentation in 3DEM.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        patch_shape: The patch shape (z, y, x) to use for training.
        experiments: Experiments to load. Defaults to both ('p105', 'p14').
        label_choice: Which annotation channel to use. 'psd' for postsynaptic density,
            'vesicle_cloud' for presynaptic vesicle cloud, or 'saturated' for instance-labeled saturated synapses.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
            Defaults to the full annotation extent per experiment.
        em_mip: MIP level for the EM image. Default mip=1 gives 12 x 12 x 40 nm.
        seg_mip: MIP level for the annotation. Default mip=0 gives 12 x 12 x 40 nm.
        download: Whether to stream and cache data if not already present.
        offsets: Offset values for affinity computation (only applied when label_choice='saturated').
        boundaries: Whether to compute boundaries (only applied when label_choice='saturated').
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3, f"Expected a 3D patch shape (z, y, x), got {patch_shape}."
    paths = get_wildenberg_paths(path, experiments, label_choice, bounding_box, em_mip, seg_mip, download)

    # Only the instance labels get a label transform; `offsets` and `boundaries`
    # are not used for the other label choices.
    if label_choice == "saturated":
        kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
        kwargs, _ = util.add_instance_label_transform(
            kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
        )

    # Raw data and labels are stored in the same zarr store, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=paths,
        raw_key="raw",
        label_paths=paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )

Get the Wildenberg 2023 dataset for synaptic structure segmentation in 3DEM.

Arguments:

path: Filepath to a folder where the cached zarr stores will be saved.
patch_shape: The patch shape (z, y, x) to use for training.
experiments: Experiments to load. Defaults to both ('p105', 'p14').
label_choice: Which annotation channel to use. 'psd' for postsynaptic density, 'vesicle_cloud' for presynaptic vesicle cloud, or 'saturated' for instance-labeled saturated synapses.
bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max). Defaults to the full annotation extent per experiment.
em_mip: MIP level for the EM image. Default mip=1 gives 12 x 12 x 40 nm.
seg_mip: MIP level for the annotation. Default mip=0 gives 12 x 12 x 40 nm.
download: Whether to stream and cache data if not already present.
offsets: Offset values for affinity computation (only applied when label_choice='saturated').
boundaries: Whether to compute boundaries (only applied when label_choice='saturated').
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_wildenberg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int, int], experiments: Optional[Sequence[str]] = None, label_choice: Literal['psd', 'vesicle_cloud', 'saturated'] = 'psd', bounding_box: Optional[Tuple[float, ...]] = None, em_mip: int = 1, seg_mip: int = 0, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_wildenberg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    experiments: Optional[Sequence[str]] = None,
    label_choice: Literal["psd", "vesicle_cloud", "saturated"] = "psd",
    bounding_box: Optional[Tuple[float, ...]] = None,
    em_mip: int = 1,
    seg_mip: int = 0,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Build a DataLoader over the Wildenberg 2023 3DEM synaptic structure segmentation data.

    This is a convenience wrapper: it creates the dataset via `get_wildenberg_dataset`
    and hands it to `torch_em.get_data_loader`.

    Args:
        path: Folder in which the cached zarr stores are kept.
        batch_size: Number of samples per batch.
        patch_shape: Training patch shape, given as (z, y, x).
        experiments: Which experiments to load. If None, both ('p105', 'p14') are used.
        label_choice: The annotation channel to train on: 'psd' (postsynaptic density),
            'vesicle_cloud' (presynaptic vesicle cloud) or 'saturated' (instance-labeled saturated synapses).
        bounding_box: Region in nm, ordered as (x_min, x_max, y_min, y_max, z_min, z_max).
            If None, the full annotation extent of each experiment is used.
        em_mip: MIP level of the EM image. The default mip=1 corresponds to 12 x 12 x 40 nm.
        seg_mip: MIP level of the annotation. The default mip=0 corresponds to 12 x 12 x 40 nm.
        download: Whether data that is not cached yet should be streamed and cached.
        offsets: Offsets for affinity computation (only used for label_choice='saturated').
        boundaries: Whether to compute boundaries (only used for label_choice='saturated').
        kwargs: Further keyword arguments; they are split between
            `torch_em.default_segmentation_dataset` and the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    # Everything the dataset getter receives explicitly, forwarded by keyword.
    dataset_config = {
        "path": path,
        "patch_shape": patch_shape,
        "experiments": experiments,
        "label_choice": label_choice,
        "bounding_box": bounding_box,
        "em_mip": em_mip,
        "seg_mip": seg_mip,
        "download": download,
        "offsets": offsets,
        "boundaries": boundaries,
    }
    dataset = get_wildenberg_dataset(**dataset_config, **dataset_kwargs)

    return torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)

Get the DataLoader for synaptic structure segmentation in Wildenberg 2023 3DEM data.

Arguments:

path: Filepath to a folder where the cached zarr stores will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape (z, y, x) to use for training.
experiments: Experiments to load. Defaults to both ('p105', 'p14').
label_choice: Which annotation channel to use. 'psd' for postsynaptic density, 'vesicle_cloud' for presynaptic vesicle cloud, or 'saturated' for instance-labeled saturated synapses.
bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max). Defaults to the full annotation extent per experiment.
em_mip: MIP level for the EM image. Default mip=1 gives 12 x 12 x 40 nm.
seg_mip: MIP level for the annotation. Default mip=0 gives 12 x 12 x 40 nm.
download: Whether to stream and cache data if not already present.
offsets: Offset values for affinity computation (only applied when label_choice='saturated').
boundaries: Whether to compute boundaries (only applied when label_choice='saturated').
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.