torch_em.data.datasets.electron_microscopy.microns

Datasets from the MICrONS (Machine Intelligence from Cortical Networks) project.

Two sources are provided:

Zenodo training volumes (basil, minnie, pinky) - https://doi.org/10.5281/zenodo.5760218 Three EM volumes with sparse neuron instance segmentation and (pinky only) sparse mitochondria labels. Most patches contain no annotations. Downloaded as tar.gz archives and cached as HDF5 files.

minnie65 cubic millimeter - https://doi.org/10.1038/s41586-025-08790-w Full ~1.75 x 1.29 x 1.11 mm volume of mouse primary visual cortex with ~200,000 annotated cells. The EM data and the neuron segmentation (version m1300, Jan 2025) are both used at their native 8x8x40 nm resolution (mip=0) by default. Data is streamed from public cloud storage using cloud-volume and cached locally as zarr v3 stores (one store of roughly 512x4096x4096 voxels per bounding box) with sharding and zstd compression.

Please cite the relevant publication if you use either dataset in your research.

  1"""Datasets from the MICrONS (Machine Intelligence from Cortical Networks) project.
  2
  3Two sources are provided:
  4
  5**Zenodo training volumes** (basil, minnie, pinky) - https://doi.org/10.5281/zenodo.5760218
  6  Three EM volumes with sparse neuron instance segmentation and (pinky only) sparse mitochondria
  7  labels. Most patches contain no annotations. Downloaded as tar.gz archives and cached as HDF5 files.
  8
  9**minnie65 cubic millimeter** - https://doi.org/10.1038/s41586-025-08790-w
 10  Full ~1.75 x 1.29 x 1.11 mm volume of mouse primary visual cortex with ~200,000 annotated
 11  cells. EM and neuron segmentation (version m1300, Jan 2025) are both at 8x8x40 nm native
 12  (mip=0). Data is streamed from public cloud storage using cloud-volume and cached locally as
 13  zarr v3 stores (512x4096x4096 vox per box) with sharding and zstd compression.
 14
 15Please cite the relevant publication if you use either dataset in your research.
 16"""
 17
 18import glob
 19import hashlib
 20import os
 21from concurrent.futures import ThreadPoolExecutor, as_completed
 22from typing import List, Literal, Optional, Sequence, Tuple, Union
 23
 24import numpy as np
 25from tqdm import tqdm
 26
 27import torch_em
 28from torch.utils.data import DataLoader, Dataset
 29
 30from .. import util
 31
 32
 33ZENODO_URLS = {
 34    "basil": "https://zenodo.org/records/5760218/files/basil.tar.gz?download=1",
 35    "minnie": "https://zenodo.org/records/5760218/files/minnie.tar.gz?download=1",
 36    "pinky": "https://zenodo.org/records/5760218/files/pinky.tar.gz?download=1",
 37}
 38
 39# SHA256 checksums are not yet available; download will warn but will not fail.
 40ZENODO_CHECKSUMS = {
 41    "basil": None,
 42    "minnie": None,
 43    "pinky": None,
 44}
 45
 46ZENODO_LABEL_KEYS = {
 47    "neuron": "volumes/segmentation",
 48    "mitochondria": "volumes/mitochondria",
 49}
 50
 51# Mitochondria labels are only present in the pinky volume.
 52ZENODO_MITO_VOLUMES = ["pinky"]
 53
 54MINNIE65_EM_URL = "precomputed://https://bossdb-open-data.s3.amazonaws.com/iarpa_microns/minnie/minnie65/em"
 55MINNIE65_SEG_URL = "precomputed://https://storage.googleapis.com/iarpa_microns/minnie/minnie65/seg_m1300/"
 56
 57# Pre-defined bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max).
 58# Derived from the same cortical regions used in microns_nuclei (same center coordinates).
 59# Block size 32768 x 32768 x 20480 nm = 4096x4096x512 vox at 8x8x40 nm (mip=0).
 60MINNIE65_BOUNDING_BOXES = {
 61    "train": [
 62        (384792, 417560, 549540, 582308, 833880, 854360),
 63        (263368, 296136, 489060, 521828, 836200, 856680),
 64        (268376, 301144, 562448, 595216, 829560, 850040),
 65        (279428, 312196, 802124, 834892, 796920, 817400),
 66        (365248, 398016, 1005708, 1038476, 796920, 817400),
 67        (462808, 495576, 1054232, 1087000, 796920, 817400),
 68        (506668, 539436, 1006572, 1039340, 781720, 802200),
 69        (588344, 621112, 973072, 1005840, 781720, 802200),
 70    ],
 71    "val": [
 72        (733048, 765816, 525324, 558092, 787320, 807800),
 73        (1131936, 1164704, 618204, 650972, 723880, 744360),
 74    ],
 75    "test": [
 76        (822668, 855436, 414812, 447580, 787320, 807800),
 77        (986364, 1019132, 398236, 431004, 789320, 809800),
 78        (1101648, 1134416, 563036, 595804, 789320, 809800),
 79        (1152312, 1185080, 453124, 485892, 784280, 804760),
 80    ],
 81}
 82
 83MINNIE65_SHARD_SHAPE = (128, 512, 512)
 84MINNIE65_CHUNK_SHAPE = (64, 128, 128)
 85
 86
 87def get_microns_data(path: Union[os.PathLike, str], volume: str, download: bool) -> str:
 88    """Download and extract a single MICrONS Zenodo volume.
 89
 90    Args:
 91        path: Filepath to a folder where the downloaded data will be saved.
 92        volume: The volume to download. One of 'basil', 'minnie', 'pinky'.
 93        download: Whether to download the data if it is not present.
 94
 95    Returns:
 96        The filepath to the directory containing the extracted HDF5 files.
 97    """
 98    assert volume in ZENODO_URLS, f"Invalid volume '{volume}'. Choose from {list(ZENODO_URLS.keys())}."
 99    os.makedirs(path, exist_ok=True)
100    volume_dir = os.path.join(path, volume)
101    if not os.path.exists(volume_dir):
102        tar_path = os.path.join(path, f"{volume}.tar.gz")
103        util.download_source(tar_path, ZENODO_URLS[volume], download, ZENODO_CHECKSUMS[volume])
104        util.unzip_tarfile(tar_path, path, remove=True)
105    return volume_dir
106
107
def get_microns_paths(
    path: Union[os.PathLike, str],
    volumes: Optional[Sequence[str]],
    download: bool,
    label_key: str = "volumes/segmentation",
) -> List[str]:
    """Get paths to MICrONS Zenodo volume HDF5 files.

    Every `*.h5` file found directly inside a volume's directory is inspected:
    files that do not contain `label_key` are skipped silently, and files where the
    shapes of 'volumes/image' and `label_key` differ are skipped with a printed message.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'.
            A single volume name may be passed as a plain string.
            Pass None to use all three volumes.
        download: Whether to download the data if it is not present.
        label_key: HDF5 key for the label array, used to validate shape consistency.

    Returns:
        The filepaths to the stored HDF5 files, sorted per volume.
    """
    import h5py

    if volumes is None:
        volumes = list(ZENODO_URLS.keys())
    elif isinstance(volumes, str):
        # A bare string is itself a sequence of strings; without this it would be
        # iterated character by character and rejected as invalid volume names.
        volumes = [volumes]

    paths = []
    for vol in volumes:
        vol_dir = get_microns_data(path, vol, download)
        for fpath in sorted(glob.glob(os.path.join(vol_dir, "*.h5"))):
            # Only the shapes are read here; the file is closed before the comparison.
            with h5py.File(fpath, "r") as f:
                if label_key not in f:
                    continue
                img_shape = f["volumes/image"].shape
                lbl_shape = f[label_key].shape
            if img_shape == lbl_shape:
                paths.append(fpath)
            else:
                print(
                    f"Skipping {os.path.basename(fpath)}: image {img_shape} != {label_key} {lbl_shape}"
                )
    return paths
149
150
def get_microns_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    volumes: Optional[Sequence[str]] = None,
    label_choice: str = "neuron",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MICrONS Zenodo dataset for the segmentation of neurons or mitochondria in EM.

    Note: annotations are sparse - most patches contain no labels. Mitochondria labels
    are only available in the pinky volume and are also sparsely annotated.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'.
            A single volume name may be passed as a plain string.
            Pass None to use all three volumes.
        label_choice: Which labels to segment. One of 'neuron' or 'mitochondria'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If mitochondria labels are requested but none of the selected
            volumes provides them.
    """
    assert len(patch_shape) == 3
    assert label_choice in ZENODO_LABEL_KEYS, \
        f"Invalid label_choice '{label_choice}'. Choose from {list(ZENODO_LABEL_KEYS.keys())}."

    if isinstance(volumes, str):
        # A bare string would otherwise be filtered / iterated character by character.
        volumes = [volumes]

    if label_choice == "mitochondria":
        # Restrict the selection to the volumes that actually carry mitochondria labels.
        volumes = [v for v in (volumes or list(ZENODO_URLS.keys())) if v in ZENODO_MITO_VOLUMES]
        if not volumes:
            raise ValueError(f"Mitochondria labels are only available in: {ZENODO_MITO_VOLUMES}.")
    label_key = ZENODO_LABEL_KEYS[label_choice]
    h5_paths = get_microns_paths(path, volumes, download, label_key=label_key)

    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels live in the same HDF5 files under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="volumes/image",
        label_paths=h5_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        **kwargs,
    )
203
204
def get_microns_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    volumes: Optional[Sequence[str]] = None,
    label_choice: str = "neuron",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Build a DataLoader over the MICrONS Zenodo volumes for neuron or mitochondria segmentation.

    The extra keyword arguments are divided with `util.split_kwargs`: one part is forwarded
    to `get_microns_dataset`, the other part to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'.
            Pass None to use all three volumes.
        label_choice: Which labels to segment. One of 'neuron' or 'mitochondria'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_microns_dataset(
        path=path,
        patch_shape=patch_shape,
        volumes=volumes,
        label_choice=label_choice,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
245
246
def _minnie65_bbox_to_str(bbox: tuple) -> str:
    """Return a 12-character hexadecimal identifier derived from a bounding box.

    The entries of `bbox` are converted with `str`, joined by underscores and hashed
    with MD5; the first 12 hex digits of the digest are returned.
    """
    joined = "_".join(map(str, bbox))
    digest = hashlib.md5(joined.encode()).hexdigest()
    return digest[:12]
249
250
def _minnie65_create_array(root, name: str, shape: tuple, dtype, is_label: bool):
    """Create a sharded, Blosc/zstd-compressed array called `name` inside the zarr group `root`.

    Integer label arrays are compressed with bit-shuffling, everything else with
    byte-shuffling. Chunk and shard shapes come from the module-level constants.
    """
    from zarr.codecs import BloscCodec

    if np.issubdtype(dtype, np.integer) and is_label:
        shuffle_mode = "bitshuffle"
    else:
        shuffle_mode = "shuffle"
    codec = BloscCodec(cname="zstd", clevel=6, shuffle=shuffle_mode)

    return root.create_array(
        name,
        shape=shape,
        chunks=MINNIE65_CHUNK_SHAPE,
        shards=MINNIE65_SHARD_SHAPE,
        dtype=dtype,
        compressors=codec,
    )
262
263
def _minnie65_bbox_voxels(cv, x_min_nm, x_max_nm, y_min_nm, y_max_nm, z_min_nm, z_max_nm):
    """Convert a bounding box in nm into voxel bounds for the resolution of `cv`.

    Lower bounds are rounded down and upper bounds are rounded up, so a box whose limits
    are not multiples of the voxel size grows outward (its extent along that axis can be
    one voxel larger than the nm size divided by the resolution).

    Returns:
        The tuple (x0, x1, y0, y1, z0, z1, (nz, ny, nx)), where the last entry is the
        voxel shape in z/y/x order.
    """
    resolution = np.array(cv.resolution)
    limits_nm = ((x_min_nm, x_max_nm), (y_min_nm, y_max_nm), (z_min_nm, z_max_nm))

    starts, stops = [], []
    for axis, (low_nm, high_nm) in enumerate(limits_nm):
        starts.append(int(np.floor(low_nm / resolution[axis])))
        stops.append(int(np.ceil(high_nm / resolution[axis])))

    (x0, y0, z0), (x1, y1, z1) = starts, stops
    extent_zyx = (z1 - z0, y1 - y0, x1 - x0)
    return x0, x1, y0, y1, z0, z1, extent_zyx
274
275
def _minnie65_download_to_zarr(cv, ds, x0g, y0g, z0g, name: str) -> None:
    """Download a bbox shard-by-shard into a zarr array using a thread pool.

    The region to fetch is defined by the shape of `ds`; it is split into blocks of
    MINNIE65_SHARD_SHAPE and each block is fetched and written by one task.
    If any task fails (or the download is interrupted), the tasks that have not started
    yet are cancelled and the error is re-raised.

    Args:
        cv: Source volume, indexed as cv[x0:x1, y0:y1, z0:z1].
        ds: Target array, indexed as ds[z, y, x].
        x0g: Voxel offset of the region along x in the source volume.
        y0g: Voxel offset of the region along y in the source volume.
        z0g: Voxel offset of the region along z in the source volume.
        name: Name shown in the progress bar.
    """
    shape = ds.shape  # (z, y, x)
    sz, sy, sx = MINNIE65_SHARD_SHAPE

    # One task per shard: local (z, y, x) slices plus the matching global x/y/z bounds.
    tasks = []
    for z0_ in range(0, shape[0], sz):
        for y0_ in range(0, shape[1], sy):
            for x0_ in range(0, shape[2], sx):
                z1_ = min(z0_ + sz, shape[0])
                y1_ = min(y0_ + sy, shape[1])
                x1_ = min(x0_ + sx, shape[2])
                tasks.append((
                    (z0_, z1_), (y0_, y1_), (x0_, x1_),
                    (x0g + x0_, x0g + x1_, y0g + y0_, y0g + y1_, z0g + z0_, z0g + z1_),
                ))

    max_workers = 8  # network-bound; more workers with large shards causes OOM

    target_dtype = np.dtype(ds.dtype)

    def worker(item):
        (z0_, z1_), (y0_, y1_), (x0_, x1_), (gx0, gx1, gy0, gy1, gz0, gz1) = item
        block = np.asarray(cv[gx0:gx1, gy0:gy1, gz0:gz1])
        if block.ndim == 4:
            # Drop the trailing axis of a 4D result, keeping its first entry.
            block = block[..., 0]
        # Source data is x/y/z, the target array is z/y/x.
        ds[z0_:z1_, y0_:y1_, x0_:x1_] = block.transpose(2, 1, 0).astype(target_dtype)

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(worker, t) for t in tasks]
        try:
            for fut in tqdm(
                as_completed(futures), total=len(futures), desc=f"Downloading '{name}'", smoothing=0.05
            ):
                fut.result()
        except BaseException:
            # Fail fast: without this, every queued shard would still be downloaded
            # before the error (or a KeyboardInterrupt) reaches the caller.
            for fut in futures:
                fut.cancel()
            raise
308
309
def get_microns_minnie65_data(
    path: Union[os.PathLike, str],
    bounding_box: Tuple[float, ...],
    em_mip: int = 0,
    seg_mip: int = 0,
    download: bool = False,
) -> str:
    """Stream and cache one minnie65 bounding box as a zarr v3 store.

    The zarr store contains:
      - raw: EM grayscale (uint8, z/y/x)
      - labels: neuron instance segmentation (uint32, z/y/x)

    Both arrays use sharding (shard shape MINNIE65_SHARD_SHAPE, inner chunk shape
    MINNIE65_CHUNK_SHAPE) with zstd+blosc compression. Download is parallelised
    over shards using a thread pool.

    The store name is derived from the bounding box; for non-default MIP levels the
    levels are appended so that different resolutions do not share one store. While an
    array is being written, the group attribute '<name>_complete' is False; it is set to
    True once the array is fully downloaded. An array that is flagged as incomplete is
    discarded and downloaded again. Stores without these flags (written before the flags
    existed) are treated as complete.

    Args:
        path: Filepath to a folder where the cached zarr store will be saved.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
        em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
        seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
        download: Whether to stream and cache the data if not present.

    Returns:
        Filepath to the cached zarr store.

    Raises:
        RuntimeError: If the store is not cached and `download` is False.
        ImportError: If data has to be streamed but 'cloud-volume' is not installed.
    """
    import zarr

    array_names = ("raw", "labels")

    os.makedirs(path, exist_ok=True)
    stem = _minnie65_bbox_to_str(bounding_box)
    if em_mip != 0 or seg_mip != 0:
        # The bbox hash alone would make different MIP levels collide on the same store.
        # Default-MIP stores keep the plain hash so that existing caches stay valid.
        stem = f"{stem}_em{em_mip}_seg{seg_mip}"
    zarr_path = os.path.join(str(path), f"{stem}.zarr")

    def _complete(zp):
        if not all(os.path.isdir(os.path.join(zp, key)) for key in array_names):
            return False
        attrs = zarr.open_group(zp, mode="r").attrs
        # A missing flag means the store predates the flags; trust it as before.
        return all(attrs.get(f"{key}_complete", True) for key in array_names)

    if _complete(zarr_path):
        return zarr_path
    if not download:
        raise RuntimeError(
            f"No cached data at '{zarr_path}'. Set download=True to stream it from cloud storage."
        )

    try:
        from cloudvolume import CloudVolume
    except ImportError as err:
        raise ImportError(
            "The 'cloud-volume' package is required to access the minnie65 dataset. "
            "Install it with: pip install cloud-volume"
        ) from err

    x_min_nm, x_max_nm, y_min_nm, y_max_nm, z_min_nm, z_max_nm = bounding_box
    print(f"Streaming minnie65 bbox {bounding_box} at em_mip={em_mip}, seg_mip={seg_mip} ...")

    em_cv = CloudVolume(MINNIE65_EM_URL, use_https=True, mip=em_mip, progress=False, fill_missing=True)
    seg_cv = CloudVolume(MINNIE65_SEG_URL, use_https=True, mip=seg_mip, progress=False, fill_missing=True)

    ex0, ex1, ey0, ey1, ez0, ez1, em_shape = _minnie65_bbox_voxels(
        em_cv, x_min_nm, x_max_nm, y_min_nm, y_max_nm, z_min_nm, z_max_nm
    )
    sx0, sx1, sy0, sy1, sz0, sz1, seg_shape = _minnie65_bbox_voxels(
        seg_cv, x_min_nm, x_max_nm, y_min_nm, y_max_nm, z_min_nm, z_max_nm
    )

    # Use the minimum shape along each axis to handle ceiling-rounding differences.
    shape = tuple(min(e, s) for e, s in zip(em_shape, seg_shape))

    root = zarr.open_group(zarr_path, mode="a")
    root.attrs["bounding_box_nm"] = list(bounding_box)
    root.attrs["em_mip"] = em_mip
    root.attrs["seg_mip"] = seg_mip

    # (array name, source volume, voxel origin (x, y, z), dtype, is_label)
    jobs = (
        ("raw", em_cv, (ex0, ey0, ez0), np.dtype("uint8"), False),
        ("labels", seg_cv, (sx0, sy0, sz0), np.dtype("uint32"), True),
    )
    for key, cv, (gx0, gy0, gz0), dtype, is_label in jobs:
        flag = f"{key}_complete"
        if key in root:
            if root.attrs.get(flag, True):
                continue  # downloaded by an earlier call
            # Left behind by an interrupted download: discard and start over.
            del root[key]
        # The array exists on disk as soon as it is created, so record that it is
        # not filled yet; the flag is only flipped after the last shard was written.
        root.attrs[flag] = False
        ds = _minnie65_create_array(root, key, shape, dtype, is_label=is_label)
        _minnie65_download_to_zarr(cv, ds, gx0, gy0, gz0, name=key)
        root.attrs[flag] = True

    print(f"Cached to {zarr_path} (shape {shape})")
    return zarr_path
395
396
def get_microns_minnie65_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None,
    em_mip: int = 0,
    seg_mip: int = 0,
    download: bool = False,
) -> List[str]:
    """Resolve the requested bounding boxes and return the path of each cached zarr store.

    Box selection, in order of precedence: explicit `bounding_boxes`, then the boxes of
    the given `split`, otherwise the boxes of all pre-defined splits.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        split: Which pre-defined split to use - 'train', 'val', or 'test'.
            Ignored if bounding_boxes is provided. Pass None with no bounding_boxes to use all boxes.
        bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max).
            Overrides split-based selection when provided.
        em_mip: MIP level for the EM volume.
        seg_mip: MIP level for the segmentation.
        download: Whether to stream and cache the data if not present.

    Returns:
        Filepaths to the cached zarr stores.
    """
    if bounding_boxes is not None:
        selected = list(bounding_boxes)
    elif split is None:
        # No restriction given: concatenate the boxes of every split.
        selected = [bb for split_boxes in MINNIE65_BOUNDING_BOXES.values() for bb in split_boxes]
    else:
        assert split in MINNIE65_BOUNDING_BOXES, \
            f"Invalid split '{split}'. Choose from {list(MINNIE65_BOUNDING_BOXES.keys())}."
        selected = MINNIE65_BOUNDING_BOXES[split]

    store_paths = []
    for bb in selected:
        store_paths.append(get_microns_minnie65_data(path, bb, em_mip, seg_mip, download))
    return store_paths
429
430
def get_microns_minnie65_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None,
    em_mip: int = 0,
    seg_mip: int = 0,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Build the minnie65 dataset for neuron instance segmentation in EM.

    Each selected bounding box is one zarr store that provides the image data under
    the key 'raw' and the segmentation under the key 'labels'.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        patch_shape: The patch shape (z, y, x) to use for training.
        split: Which pre-defined split to use - 'train', 'val', or 'test'.
            Ignored if bounding_boxes is provided.
        bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max).
            Overrides split-based selection when provided.
        em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
        seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
        download: Whether to stream and cache data if not already present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    store_paths = get_microns_minnie65_paths(path, split, bounding_boxes, em_mip, seg_mip, download)

    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=store_paths,
        raw_key="raw",
        label_paths=store_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
479
480
def get_microns_minnie65_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None,
    em_mip: int = 0,
    seg_mip: int = 0,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Build a DataLoader for neuron instance segmentation in the minnie65 dataset.

    The extra keyword arguments are divided with `util.split_kwargs`: one part is forwarded
    to `get_microns_minnie65_dataset`, the other part to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (z, y, x) to use for training.
        split: Which pre-defined split to use - 'train', 'val', or 'test'.
            Ignored if bounding_boxes is provided.
        bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max).
            Overrides split-based selection when provided.
        em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
        seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
        download: Whether to stream and cache data if not already present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_microns_minnie65_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        bounding_boxes=bounding_boxes,
        em_mip=em_mip,
        seg_mip=seg_mip,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
# Download links of the tar.gz archives for the three Zenodo training volumes.
ZENODO_URLS = {'basil': 'https://zenodo.org/records/5760218/files/basil.tar.gz?download=1', 'minnie': 'https://zenodo.org/records/5760218/files/minnie.tar.gz?download=1', 'pinky': 'https://zenodo.org/records/5760218/files/pinky.tar.gz?download=1'}
# SHA256 checksums are not yet available, hence every entry is None.
ZENODO_CHECKSUMS = {'basil': None, 'minnie': None, 'pinky': None}
# Maps the label choice ('neuron' / 'mitochondria') to the HDF5 key of the label array.
ZENODO_LABEL_KEYS = {'neuron': 'volumes/segmentation', 'mitochondria': 'volumes/mitochondria'}
# Mitochondria labels are only present in the pinky volume.
ZENODO_MITO_VOLUMES = ['pinky']
# Precomputed (cloud-volume) source of the minnie65 EM data.
MINNIE65_EM_URL = 'precomputed://https://bossdb-open-data.s3.amazonaws.com/iarpa_microns/minnie/minnie65/em'
# Precomputed (cloud-volume) source of the minnie65 neuron segmentation.
MINNIE65_SEG_URL = 'precomputed://https://storage.googleapis.com/iarpa_microns/minnie/minnie65/seg_m1300/'
# Pre-defined bounding boxes per split, in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
MINNIE65_BOUNDING_BOXES = {'train': [(384792, 417560, 549540, 582308, 833880, 854360), (263368, 296136, 489060, 521828, 836200, 856680), (268376, 301144, 562448, 595216, 829560, 850040), (279428, 312196, 802124, 834892, 796920, 817400), (365248, 398016, 1005708, 1038476, 796920, 817400), (462808, 495576, 1054232, 1087000, 796920, 817400), (506668, 539436, 1006572, 1039340, 781720, 802200), (588344, 621112, 973072, 1005840, 781720, 802200)], 'val': [(733048, 765816, 525324, 558092, 787320, 807800), (1131936, 1164704, 618204, 650972, 723880, 744360)], 'test': [(822668, 855436, 414812, 447580, 787320, 807800), (986364, 1019132, 398236, 431004, 789320, 809800), (1101648, 1134416, 563036, 595804, 789320, 809800), (1152312, 1185080, 453124, 485892, 784280, 804760)]}
# Shard shape (z, y, x) of the cached zarr arrays.
MINNIE65_SHARD_SHAPE = (128, 512, 512)
# Inner chunk shape (z, y, x) of the cached zarr arrays.
MINNIE65_CHUNK_SHAPE = (64, 128, 128)
def get_microns_data(path: Union[os.PathLike, str], volume: str, download: bool) -> str:
 88def get_microns_data(path: Union[os.PathLike, str], volume: str, download: bool) -> str:
 89    """Download and extract a single MICrONS Zenodo volume.
 90
 91    Args:
 92        path: Filepath to a folder where the downloaded data will be saved.
 93        volume: The volume to download. One of 'basil', 'minnie', 'pinky'.
 94        download: Whether to download the data if it is not present.
 95
 96    Returns:
 97        The filepath to the directory containing the extracted HDF5 files.
 98    """
 99    assert volume in ZENODO_URLS, f"Invalid volume '{volume}'. Choose from {list(ZENODO_URLS.keys())}."
100    os.makedirs(path, exist_ok=True)
101    volume_dir = os.path.join(path, volume)
102    if not os.path.exists(volume_dir):
103        tar_path = os.path.join(path, f"{volume}.tar.gz")
104        util.download_source(tar_path, ZENODO_URLS[volume], download, ZENODO_CHECKSUMS[volume])
105        util.unzip_tarfile(tar_path, path, remove=True)
106    return volume_dir

Download and extract a single MICrONS Zenodo volume.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • volume: The volume to download. One of 'basil', 'minnie', 'pinky'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the directory containing the extracted HDF5 files.

def get_microns_paths( path: Union[os.PathLike, str], volumes: Optional[Sequence[str]], download: bool, label_key: str = 'volumes/segmentation') -> List[str]:
def get_microns_paths(
    path: Union[os.PathLike, str],
    volumes: Optional[Sequence[str]],
    download: bool,
    label_key: str = "volumes/segmentation",
) -> List[str]:
    """Get paths to MICrONS Zenodo volume HDF5 files.

    Every `*.h5` file found directly inside a volume's directory is inspected:
    files that do not contain `label_key` are skipped silently, and files where the
    shapes of 'volumes/image' and `label_key` differ are skipped with a printed message.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'.
            A single volume name may be passed as a plain string.
            Pass None to use all three volumes.
        download: Whether to download the data if it is not present.
        label_key: HDF5 key for the label array, used to validate shape consistency.

    Returns:
        The filepaths to the stored HDF5 files, sorted per volume.
    """
    import h5py

    if volumes is None:
        volumes = list(ZENODO_URLS.keys())
    elif isinstance(volumes, str):
        # A bare string is itself a sequence of strings; without this it would be
        # iterated character by character and rejected as invalid volume names.
        volumes = [volumes]

    paths = []
    for vol in volumes:
        vol_dir = get_microns_data(path, vol, download)
        for fpath in sorted(glob.glob(os.path.join(vol_dir, "*.h5"))):
            # Only the shapes are read here; the file is closed before the comparison.
            with h5py.File(fpath, "r") as f:
                if label_key not in f:
                    continue
                img_shape = f["volumes/image"].shape
                lbl_shape = f[label_key].shape
            if img_shape == lbl_shape:
                paths.append(fpath)
            else:
                print(
                    f"Skipping {os.path.basename(fpath)}: image {img_shape} != {label_key} {lbl_shape}"
                )
    return paths

Get paths to MICrONS Zenodo volume HDF5 files.

Each volume's tar.gz extracts to a subdirectory containing multiple per-volume HDF5 files. Files that do not contain the requested label key are skipped silently; files where the image and label shapes do not match are skipped and a message is printed.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'. Pass None to use all three volumes.
  • download: Whether to download the data if it is not present.
  • label_key: HDF5 key for the label array, used to validate shape consistency.
Returns:

The filepaths to the stored HDF5 files.

def get_microns_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], volumes: Optional[Sequence[str]] = None, label_choice: str = 'neuron', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_microns_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    volumes: Optional[Sequence[str]] = None,
    label_choice: str = "neuron",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MICrONS Zenodo dataset for the segmentation of neurons or mitochondria in EM.

    Note: annotations are sparse - most patches contain no labels. Mitochondria labels
    are only available in the pinky volume and are also sparsely annotated.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'.
            A single volume name may be passed as a plain string.
            Pass None to use all three volumes.
        label_choice: Which labels to segment. One of 'neuron' or 'mitochondria'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If mitochondria labels are requested but none of the selected
            volumes provides them.
    """
    assert len(patch_shape) == 3
    assert label_choice in ZENODO_LABEL_KEYS, \
        f"Invalid label_choice '{label_choice}'. Choose from {list(ZENODO_LABEL_KEYS.keys())}."

    if isinstance(volumes, str):
        # A bare string would otherwise be filtered / iterated character by character.
        volumes = [volumes]

    if label_choice == "mitochondria":
        # Restrict the selection to the volumes that actually carry mitochondria labels.
        volumes = [v for v in (volumes or list(ZENODO_URLS.keys())) if v in ZENODO_MITO_VOLUMES]
        if not volumes:
            raise ValueError(f"Mitochondria labels are only available in: {ZENODO_MITO_VOLUMES}.")
    label_key = ZENODO_LABEL_KEYS[label_choice]
    h5_paths = get_microns_paths(path, volumes, download, label_key=label_key)

    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels live in the same HDF5 files under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="volumes/image",
        label_paths=h5_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        **kwargs,
    )

Get the MICrONS Zenodo dataset for the segmentation of neurons or mitochondria in EM.

Note: annotations are sparse - most patches contain no labels. Mitochondria labels are only available in the pinky volume and are also sparsely annotated.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'. Pass None to use all three volumes.
  • label_choice: Which labels to segment. One of 'neuron' or 'mitochondria'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_microns_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int, int], volumes: Optional[Sequence[str]] = None, label_choice: str = 'neuron', download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_microns_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    volumes: Optional[Sequence[str]] = None,
    label_choice: str = "neuron",
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Create a DataLoader over the MICrONS Zenodo volumes for neuron or mitochondria segmentation.

    The dataset is built with `get_microns_dataset` and then wrapped with `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'.
            Pass None to use all three volumes.
        label_choice: Which labels to segment. One of 'neuron' or 'mitochondria'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the extra keywords: one part goes to the dataset factory, the remainder to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_microns_dataset(
        path=path,
        patch_shape=patch_shape,
        volumes=volumes,
        label_choice=label_choice,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

Get the DataLoader for EM neuron or mitochondria segmentation for the MICrONS Zenodo dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • volumes: The volumes to use. One or more of 'basil', 'minnie', 'pinky'. Pass None to use all three volumes.
  • label_choice: Which labels to segment. One of 'neuron' or 'mitochondria'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.

def get_microns_minnie65_data( path: Union[os.PathLike, str], bounding_box: Tuple[float, ...], em_mip: int = 0, seg_mip: int = 0, download: bool = False) -> str:
def get_microns_minnie65_data(
    path: Union[os.PathLike, str],
    bounding_box: Tuple[float, ...],
    em_mip: int = 0,
    seg_mip: int = 0,
    download: bool = False,
) -> str:
    """Stream and cache one minnie65 bounding box as a zarr v3 store.

    The zarr store contains:
      - raw: EM grayscale (uint8, z/y/x)
      - labels: neuron instance segmentation (uint32, z/y/x)

    Both arrays use sharding (shard shape MINNIE65_SHARD_SHAPE, inner chunk shape
    MINNIE65_CHUNK_SHAPE) with zstd+blosc compression. Download is parallelised
    over shards using a thread pool.

    The name of the cached store is derived from the bounding box only. The MIP levels
    used for the download are recorded as attributes of the store; if an existing store
    was written with MIP levels that differ from the requested ones, it is still returned,
    but a warning is issued.

    Args:
        path: Filepath to a folder where the cached zarr store will be saved.
        bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
        em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
        seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
        download: Whether to stream and cache the data if not present.

    Returns:
        Filepath to the cached zarr store.

    Raises:
        RuntimeError: If the store is not cached and `download` is False.
        ImportError: If the data has to be streamed and 'cloud-volume' is not installed.
    """
    import warnings

    import zarr

    os.makedirs(path, exist_ok=True)
    stem = _minnie65_bbox_to_str(bounding_box)
    zarr_path = os.path.join(str(path), f"{stem}.zarr")

    def _complete(zp):
        # A store counts as cached once both array directories exist.
        return (
            os.path.isdir(os.path.join(zp, "raw"))
            and os.path.isdir(os.path.join(zp, "labels"))
        )

    if _complete(zarr_path):
        # The store name does not encode the MIP levels, so a store written at other
        # levels would otherwise be reused without any notice. Compare against the
        # levels recorded at download time and tell the user about a mismatch.
        cached_attrs = zarr.open_group(zarr_path, mode="r").attrs
        for key, requested in (("em_mip", em_mip), ("seg_mip", seg_mip)):
            cached = cached_attrs.get(key)
            if cached is not None and cached != requested:
                warnings.warn(
                    f"Cached store '{zarr_path}' was written with {key}={cached}, but {key}={requested} "
                    "was requested. The cached data is returned unchanged; remove the store or use a "
                    "different 'path' to download it at the requested level.",
                    stacklevel=2,
                )
        return zarr_path

    if not download:
        raise RuntimeError(
            f"No cached data at '{zarr_path}'. Set download=True to stream it from cloud storage."
        )

    try:
        from cloudvolume import CloudVolume
    except ImportError as err:
        # Keep the original failure attached so the root cause stays visible in the traceback.
        raise ImportError(
            "The 'cloud-volume' package is required to access the minnie65 dataset. "
            "Install it with: pip install cloud-volume"
        ) from err

    x_min_nm, x_max_nm, y_min_nm, y_max_nm, z_min_nm, z_max_nm = bounding_box
    print(f"Streaming minnie65 bbox {bounding_box} at em_mip={em_mip}, seg_mip={seg_mip} ...")

    em_cv = CloudVolume(MINNIE65_EM_URL, use_https=True, mip=em_mip, progress=False, fill_missing=True)
    seg_cv = CloudVolume(MINNIE65_SEG_URL, use_https=True, mip=seg_mip, progress=False, fill_missing=True)

    # Convert the nm bounding box to voxel start/stop coordinates and a shape for each volume.
    ex0, ex1, ey0, ey1, ez0, ez1, em_shape = _minnie65_bbox_voxels(
        em_cv, x_min_nm, x_max_nm, y_min_nm, y_max_nm, z_min_nm, z_max_nm
    )
    sx0, sx1, sy0, sy1, sz0, sz1, seg_shape = _minnie65_bbox_voxels(
        seg_cv, x_min_nm, x_max_nm, y_min_nm, y_max_nm, z_min_nm, z_max_nm
    )

    # Use the minimum shape along each axis to handle ceiling-rounding differences.
    shape = tuple(min(e, s) for e, s in zip(em_shape, seg_shape))

    # Mode "a" keeps an existing store, so an array that is already present is not recreated below.
    root = zarr.open_group(zarr_path, mode="a")
    root.attrs["bounding_box_nm"] = list(bounding_box)
    root.attrs["em_mip"] = em_mip
    root.attrs["seg_mip"] = seg_mip

    if "raw" not in root:
        ds_raw = _minnie65_create_array(root, "raw", shape, np.dtype("uint8"), is_label=False)
        _minnie65_download_to_zarr(em_cv, ds_raw, ex0, ey0, ez0, name="raw")

    if "labels" not in root:
        ds_lbl = _minnie65_create_array(root, "labels", shape, np.dtype("uint32"), is_label=True)
        _minnie65_download_to_zarr(seg_cv, ds_lbl, sx0, sy0, sz0, name="labels")

    print(f"Cached to {zarr_path} (shape {shape})")
    return zarr_path

Stream and cache one minnie65 bounding box as a zarr v3 store.

The zarr store contains:
  • raw: EM grayscale (uint8, z/y/x)
  • labels: neuron instance segmentation (uint32, z/y/x)

Both arrays use sharding (shard shape MINNIE65_SHARD_SHAPE, inner chunk shape MINNIE65_CHUNK_SHAPE) with zstd+blosc compression. Download is parallelised over shards using a thread pool.

Arguments:
  • path: Filepath to a folder where the cached zarr store will be saved.
  • bounding_box: Region in nm as (x_min, x_max, y_min, y_max, z_min, z_max).
  • em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
  • seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
  • download: Whether to stream and cache the data if not present.
Returns:

Filepath to the cached zarr store.

def get_microns_minnie65_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'val', 'test']] = None, bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None, em_mip: int = 0, seg_mip: int = 0, download: bool = False) -> List[str]:
def get_microns_minnie65_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None,
    em_mip: int = 0,
    seg_mip: int = 0,
    download: bool = False,
) -> List[str]:
    """Resolve the selected minnie65 bounding boxes to their cached zarr stores.

    The boxes are chosen in this order of precedence: explicit `bounding_boxes`,
    then the pre-defined boxes of `split`, then the boxes of all pre-defined splits.
    Each box is passed to `get_microns_minnie65_data`.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        split: Which pre-defined split to use - 'train', 'val', or 'test'.
            Ignored if bounding_boxes is provided. Pass None with no bounding_boxes to use all boxes.
        bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max).
            Overrides split-based selection when provided.
        em_mip: MIP level for the EM volume.
        seg_mip: MIP level for the segmentation.
        download: Whether to stream and cache the data if not present.

    Returns:
        Filepaths to the cached zarr stores.
    """
    if bounding_boxes is not None:
        # Custom boxes take precedence over any split.
        selected = list(bounding_boxes)
    elif split is None:
        # Neither boxes nor split given: gather the boxes of every pre-defined split.
        selected = []
        for split_boxes in MINNIE65_BOUNDING_BOXES.values():
            selected.extend(split_boxes)
    else:
        assert split in MINNIE65_BOUNDING_BOXES, \
            f"Invalid split '{split}'. Choose from {list(MINNIE65_BOUNDING_BOXES.keys())}."
        selected = MINNIE65_BOUNDING_BOXES[split]

    return [
        get_microns_minnie65_data(path, box, em_mip, seg_mip, download)
        for box in selected
    ]

Get paths to cached minnie65 zarr stores.

Arguments:
  • path: Filepath to a folder where the cached zarr stores will be saved.
  • split: Which pre-defined split to use - 'train', 'val', or 'test'. Ignored if bounding_boxes is provided. Pass None with no bounding_boxes to use all boxes.
  • bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max). Overrides split-based selection when provided.
  • em_mip: MIP level for the EM volume.
  • seg_mip: MIP level for the segmentation.
  • download: Whether to stream and cache the data if not present.
Returns:

Filepaths to the cached zarr stores.

def get_microns_minnie65_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], split: Optional[Literal['train', 'val', 'test']] = None, bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None, em_mip: int = 0, seg_mip: int = 0, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_microns_minnie65_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None,
    em_mip: int = 0,
    seg_mip: int = 0,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Create the minnie65 dataset for neuron instance segmentation in EM.

    Raw data is read from the 'raw' array and labels from the 'labels' array of
    each cached zarr store returned by `get_microns_minnie65_paths`.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        patch_shape: The patch shape (z, y, x) to use for training.
        split: Which pre-defined split to use - 'train', 'val', or 'test'.
            Ignored if bounding_boxes is provided.
        bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max).
            Overrides split-based selection when provided.
        em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
        seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
        download: Whether to stream and cache data if not already present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    store_paths = get_microns_minnie65_paths(
        path,
        split=split,
        bounding_boxes=bounding_boxes,
        em_mip=em_mip,
        seg_mip=seg_mip,
        download=download,
    )

    # Set up the dataset keywords, including the label transform selected by `boundaries` / `offsets`.
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    # Raw data and labels are stored in the same zarr stores under different keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=store_paths,
        raw_key="raw",
        label_paths=store_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset

Get the minnie65 dataset for neuron instance segmentation in EM.

Arguments:
  • path: Filepath to a folder where the cached zarr stores will be saved.
  • patch_shape: The patch shape (z, y, x) to use for training.
  • split: Which pre-defined split to use - 'train', 'val', or 'test'. Ignored if bounding_boxes is provided.
  • bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max). Overrides split-based selection when provided.
  • em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
  • seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
  • download: Whether to stream and cache data if not already present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_microns_minnie65_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int, int], split: Optional[Literal['train', 'val', 'test']] = None, bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None, em_mip: int = 0, seg_mip: int = 0, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_microns_minnie65_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    bounding_boxes: Optional[Sequence[Tuple[float, ...]]] = None,
    em_mip: int = 0,
    seg_mip: int = 0,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Create a DataLoader for neuron instance segmentation in the minnie65 dataset.

    The dataset is built with `get_microns_minnie65_dataset` and then wrapped with
    `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the cached zarr stores will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape (z, y, x) to use for training.
        split: Which pre-defined split to use - 'train', 'val', or 'test'.
            Ignored if bounding_boxes is provided.
        bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max).
            Overrides split-based selection when provided.
        em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
        seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
        download: Whether to stream and cache data if not already present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the extra keywords: one part goes to the dataset factory, the remainder to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_microns_minnie65_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        bounding_boxes=bounding_boxes,
        em_mip=em_mip,
        seg_mip=seg_mip,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

Get the DataLoader for neuron instance segmentation in the minnie65 dataset.

Arguments:
  • path: Filepath to a folder where the cached zarr stores will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape (z, y, x) to use for training.
  • split: Which pre-defined split to use - 'train', 'val', or 'test'. Ignored if bounding_boxes is provided.
  • bounding_boxes: Custom bounding boxes in nm (x_min, x_max, y_min, y_max, z_min, z_max). Overrides split-based selection when provided.
  • em_mip: MIP level for the EM volume. Default mip=0 gives 8x8x40 nm native resolution.
  • seg_mip: MIP level for the segmentation. Default mip=0 gives 8x8x40 nm native resolution.
  • download: Whether to stream and cache data if not already present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.