torch_em.data.datasets.electron_microscopy.fib25

FIB-25 is a dataset for neuron segmentation in EM.

It contains FIB-SEM data and segmentation ground truth from the Drosophila medulla, as part of the FlyEM project at Janelia Research Campus.

The dataset is from the publication https://doi.org/10.1073/pnas.1509820112. Please cite this publication if you use the dataset in your research.

The data is distributed through https://github.com/google/ffn and hosted on Google Cloud Storage.

  1"""FIB-25 is a dataset for neuron segmentation in EM.
  2
  3It contains FIB-SEM data and segmentation ground truth from the Drosophila medulla,
  4as part of the FlyEM project at Janelia Research Campus.
  5
  6The dataset is from the publication https://doi.org/10.1073/pnas.1509820112.
  7Please cite this publication if you use the dataset in your research.
  8
  9The data is hosted at https://github.com/google/ffn via Google Cloud Storage.
 10"""
 11
 12import os
 13from typing import List, Optional, Tuple, Union
 14
 15import numpy as np
 16
 17import torch_em
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21from .. import util
 22
 23
 # Base URL of the Google Cloud Storage bucket that contains all samples.
 GCS_BUCKET = "https://storage.googleapis.com/ffn-flyem-fib25"

 # Name of the raw data file in the bucket folder of each sample.
 # The label file is called 'groundtruth.h5' for every sample.
 _RAW_FILENAMES = {
     "training_sample2": "grayscale_maps.h5",
     "validation_sample": "grayscale_maps.h5",
     "tstvol-520-1": "raw.h5",
 }

 # Download URLs for the raw data and the labels, keyed by sample name.
 URLS = {
     sample: {
         "raw": f"{GCS_BUCKET}/{sample}/{raw_filename}",
         "labels": f"{GCS_BUCKET}/{sample}/groundtruth.h5",
     }
     for sample, raw_filename in _RAW_FILENAMES.items()
 }

 # Expected checksums of the downloaded files, with the same layout as URLS.
 CHECKSUMS = {
     "training_sample2": {
         "raw": "ea031c98ee2de778a9a3a1e6d410df5de73e4ac28022df8e7255d84e3394cafa",
         "labels": "fd508e7aee1fe51ac9ae0460db4a841d275236f013c1f2552314b4f21b1010ea",
     },
     "validation_sample": {
         "raw": "400ccb2a7268a3880c63656e0d794f8e6252e62031869455cc8caeef245b2a83",
         "labels": "2c5e31af0af5476bc9669b88d01a4570a26eb020799eaf6131aa75f2f7d92e98",
     },
     "tstvol-520-1": {
         "raw": "0667e701c8b4464003d8a6cb0cf9deb2aa79fb415ec51deeac92e5f9c67a5a66",
         "labels": "ae61ae78a9874eb35ae8e5ed29b4cbfe7bbd07a61789ddb70aef4deb2532eb4e",
     },
 }

 # Names of all available samples, in the order in which they are listed above.
 SAMPLES = list(URLS)
 57
 58
 def _apply_transforms(groundtruth_path):
     """Apply the supervoxel-to-neuron mapping from the 'transforms' dataset.

     The groundtruth h5 files contain a 'stack' dataset with supervoxel IDs
     and a 'transforms' dataset that maps supervoxels to neuron body IDs.
     This function applies the mapping and saves the result as 'neuron_ids'
     in the same file. Supervoxels without an entry in 'transforms' are mapped to 0.
     If 'neuron_ids' already exists, the file is not modified.

     Args:
         groundtruth_path: Filepath to the groundtruth h5 file.
     """
     import h5py

     # Check in read-only mode first, so that an already processed file
     # can be used without write access to it.
     with h5py.File(groundtruth_path, "r") as f:
         if "neuron_ids" in f:
             return

     with h5py.File(groundtruth_path, "a") as f:
         # Re-check, in case the dataset was written since the read-only check.
         if "neuron_ids" in f:
             return

         stack = f["stack"][:]
         transforms = f["transforms"][:]

         # Build the mapping from supervoxel IDs to neuron body IDs.
         # The maximum is converted to a python int so that the table size
         # stays an integer for every numpy version and label dtype.
         mapping = np.zeros(int(stack.max()) + 1, dtype=stack.dtype)
         if transforms.size > 0:
             # Each row of 'transforms' is a pair (supervoxel ID, neuron body ID).
             src, dst = transforms.T
             # Entries for supervoxels beyond the largest ID in the volume cannot
             # change the result and would index outside of the lookup table.
             in_volume = src < mapping.size
             mapping[src[in_volume]] = dst[in_volume]
         neuron_ids = mapping[stack]

         f.create_dataset("neuron_ids", data=neuron_ids, compression="gzip")
 82
 83
 def get_fib25_data(
     path: Union[os.PathLike, str], samples: Tuple[str, ...], download: bool = False
 ):
     """Download the FIB-25 dataset.

     For each sample the raw data is saved as 'raw.h5' and the labels as 'groundtruth.h5'
     in the sub-folder `path/<sample>`. Afterwards the supervoxel-to-neuron mapping
     is applied to the label file.

     Args:
         path: Filepath to a folder where the downloaded data will be saved.
         samples: The samples to download. Available samples are
             'training_sample2', 'validation_sample', and 'tstvol-520-1'.
         download: Whether to download the data if it is not present.

     Raises:
         ValueError: If one of the requested samples is not available.
     """
     # Validate all sample names before touching the filesystem, so that an invalid
     # request does not leave behind folders or partial downloads. An explicit
     # exception is raised because assert statements are removed when running with -O.
     for sample in samples:
         if sample not in URLS:
             raise ValueError(f"Invalid sample: {sample}. Choose from {SAMPLES}.")

     os.makedirs(path, exist_ok=True)
     for sample in samples:
         urls = URLS[sample]
         checksums = CHECKSUMS[sample]

         sample_dir = os.path.join(path, sample)
         os.makedirs(sample_dir, exist_ok=True)

         # The files are stored under fixed local names, independent of their names in the bucket.
         raw_path = os.path.join(sample_dir, "raw.h5")
         labels_path = os.path.join(sample_dir, "groundtruth.h5")

         util.download_source(raw_path, urls["raw"], download, checksum=checksums["raw"])
         util.download_source(labels_path, urls["labels"], download, checksum=checksums["labels"])

         # Apply the supervoxel-to-neuron mapping.
         _apply_transforms(labels_path)
112
113
def get_fib25_paths(
    path: Union[os.PathLike, str],
    samples: Tuple[str, ...] = ("training_sample2",),
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the FIB-25 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to use. Available samples are
            'training_sample2', 'validation_sample', and 'tstvol-520-1'.
            A single sample name given as a string is also accepted.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the raw data and the label data.
    """
    # A plain string would be iterated character by character and a one-shot iterable
    # would be exhausted by the download step, so normalize to a tuple first.
    samples = (samples,) if isinstance(samples, str) else tuple(samples)

    get_fib25_data(path, samples, download)
    raw_paths = [os.path.join(path, sample, "raw.h5") for sample in samples]
    label_paths = [os.path.join(path, sample, "groundtruth.h5") for sample in samples]
    return raw_paths, label_paths
134
135
def get_fib25_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    samples: Tuple[str, ...] = ("training_sample2",),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the FIB-25 dataset for the segmentation of neurons in EM.

    The raw data is read from the key 'raw' and the labels from the key 'neuron_ids'
    of the files returned by `get_fib25_paths`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        samples: The samples to use. Available samples are
            'training_sample2', 'validation_sample', and 'tstvol-520-1'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    image_paths, gt_paths = get_fib25_paths(path, samples, download)

    # Set up the dataset arguments: segmentation dataset and instance label transform.
    dataset_kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    dataset_kwargs, _unused = util.add_instance_label_transform(
        dataset_kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key="raw",
        label_paths=gt_paths,
        label_key="neuron_ids",
        patch_shape=patch_shape,
        **dataset_kwargs,
    )
    return dataset
177
178
def get_fib25_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Tuple[str, ...] = ("training_sample2",),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for EM neuron segmentation in the FIB-25 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use. Available samples are
            'training_sample2', 'validation_sample', and 'tstvol-520-1'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the arguments for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_fib25_dataset(
        path,
        patch_shape,
        samples=samples,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
GCS_BUCKET = 'https://storage.googleapis.com/ffn-flyem-fib25'
URLS = {'training_sample2': {'raw': 'https://storage.googleapis.com/ffn-flyem-fib25/training_sample2/grayscale_maps.h5', 'labels': 'https://storage.googleapis.com/ffn-flyem-fib25/training_sample2/groundtruth.h5'}, 'validation_sample': {'raw': 'https://storage.googleapis.com/ffn-flyem-fib25/validation_sample/grayscale_maps.h5', 'labels': 'https://storage.googleapis.com/ffn-flyem-fib25/validation_sample/groundtruth.h5'}, 'tstvol-520-1': {'raw': 'https://storage.googleapis.com/ffn-flyem-fib25/tstvol-520-1/raw.h5', 'labels': 'https://storage.googleapis.com/ffn-flyem-fib25/tstvol-520-1/groundtruth.h5'}}
CHECKSUMS = {'training_sample2': {'raw': 'ea031c98ee2de778a9a3a1e6d410df5de73e4ac28022df8e7255d84e3394cafa', 'labels': 'fd508e7aee1fe51ac9ae0460db4a841d275236f013c1f2552314b4f21b1010ea'}, 'validation_sample': {'raw': '400ccb2a7268a3880c63656e0d794f8e6252e62031869455cc8caeef245b2a83', 'labels': '2c5e31af0af5476bc9669b88d01a4570a26eb020799eaf6131aa75f2f7d92e98'}, 'tstvol-520-1': {'raw': '0667e701c8b4464003d8a6cb0cf9deb2aa79fb415ec51deeac92e5f9c67a5a66', 'labels': 'ae61ae78a9874eb35ae8e5ed29b4cbfe7bbd07a61789ddb70aef4deb2532eb4e'}}
SAMPLES = ['training_sample2', 'validation_sample', 'tstvol-520-1']
def get_fib25_data( path: Union[os.PathLike, str], samples: Tuple[str, ...], download: bool = False):
 def get_fib25_data(
     path: Union[os.PathLike, str], samples: Tuple[str, ...], download: bool = False
 ):
     """Download the FIB-25 dataset.

     For each sample the raw data is saved as 'raw.h5' and the labels as 'groundtruth.h5'
     in the sub-folder `path/<sample>`. Afterwards the supervoxel-to-neuron mapping
     is applied to the label file.

     Args:
         path: Filepath to a folder where the downloaded data will be saved.
         samples: The samples to download. Available samples are
             'training_sample2', 'validation_sample', and 'tstvol-520-1'.
         download: Whether to download the data if it is not present.

     Raises:
         ValueError: If one of the requested samples is not available.
     """
     # Validate all sample names before touching the filesystem, so that an invalid
     # request does not leave behind folders or partial downloads. An explicit
     # exception is raised because assert statements are removed when running with -O.
     for sample in samples:
         if sample not in URLS:
             raise ValueError(f"Invalid sample: {sample}. Choose from {SAMPLES}.")

     os.makedirs(path, exist_ok=True)
     for sample in samples:
         urls = URLS[sample]
         checksums = CHECKSUMS[sample]

         sample_dir = os.path.join(path, sample)
         os.makedirs(sample_dir, exist_ok=True)

         # The files are stored under fixed local names, independent of their names in the bucket.
         raw_path = os.path.join(sample_dir, "raw.h5")
         labels_path = os.path.join(sample_dir, "groundtruth.h5")

         util.download_source(raw_path, urls["raw"], download, checksum=checksums["raw"])
         util.download_source(labels_path, urls["labels"], download, checksum=checksums["labels"])

         # Apply the supervoxel-to-neuron mapping.
         _apply_transforms(labels_path)

Download the FIB-25 dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The samples to download. Available samples are 'training_sample2', 'validation_sample', and 'tstvol-520-1'.
  • download: Whether to download the data if it is not present.
def get_fib25_paths( path: Union[os.PathLike, str], samples: Tuple[str, ...] = ('training_sample2',), download: bool = False) -> Tuple[List[str], List[str]]:
def get_fib25_paths(
    path: Union[os.PathLike, str],
    samples: Tuple[str, ...] = ("training_sample2",),
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the FIB-25 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to use. Available samples are
            'training_sample2', 'validation_sample', and 'tstvol-520-1'.
            A single sample name given as a string is also accepted.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the raw data and the label data.
    """
    # A plain string would be iterated character by character and a one-shot iterable
    # would be exhausted by the download step, so normalize to a tuple first.
    samples = (samples,) if isinstance(samples, str) else tuple(samples)

    get_fib25_data(path, samples, download)
    raw_paths = [os.path.join(path, sample, "raw.h5") for sample in samples]
    label_paths = [os.path.join(path, sample, "groundtruth.h5") for sample in samples]
    return raw_paths, label_paths

Get paths to the FIB-25 data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The samples to use. Available samples are 'training_sample2', 'validation_sample', and 'tstvol-520-1'.
  • download: Whether to download the data if it is not present.
Returns:

The filepaths to the raw data and the label data.

def get_fib25_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], samples: Tuple[str, ...] = ('training_sample2',), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_fib25_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    samples: Tuple[str, ...] = ("training_sample2",),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> Dataset:
    """Get the FIB-25 dataset for the segmentation of neurons in EM.

    The raw data is read from the key 'raw' and the labels from the key 'neuron_ids'
    of the files returned by `get_fib25_paths`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        samples: The samples to use. Available samples are
            'training_sample2', 'validation_sample', and 'tstvol-520-1'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    image_paths, gt_paths = get_fib25_paths(path, samples, download)

    # Set up the dataset arguments: segmentation dataset and instance label transform.
    dataset_kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    dataset_kwargs, _unused = util.add_instance_label_transform(
        dataset_kwargs, add_binary_target=False, boundaries=boundaries, offsets=offsets
    )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key="raw",
        label_paths=gt_paths,
        label_key="neuron_ids",
        patch_shape=patch_shape,
        **dataset_kwargs,
    )
    return dataset

Get the FIB-25 dataset for the segmentation of neurons in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • samples: The samples to use. Available samples are 'training_sample2', 'validation_sample', and 'tstvol-520-1'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_fib25_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Tuple[str, ...] = ('training_sample2',), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_fib25_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Tuple[str, ...] = ("training_sample2",),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for EM neuron segmentation in the FIB-25 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use. Available samples are
            'training_sample2', 'validation_sample', and 'tstvol-520-1'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the arguments for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_fib25_dataset(
        path,
        patch_shape,
        samples=samples,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

Get the DataLoader for EM neuron segmentation in the FIB-25 dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • samples: The samples to use. Available samples are 'training_sample2', 'validation_sample', and 'tstvol-520-1'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.