torch_em.data.datasets.electron_microscopy.betaseg

The BetaSeg dataset contains annotations for organelle segmentation in FIB-SEM data.

More information about this dataset is available at https://betaseg.github.io/. The original publication in which the data is presented is https://arxiv.org/abs/2303.03876. Please cite it if you use this dataset in your research.

  1"""The BetaSeg dataset contains annotations for organelle segmentation in FIB-SEM data.
  2
  3More information for this dataset is located at https://betaseg.github.io/.
  4And the original publication where this entire data is presented is https://arxiv.org/abs/2303.03876.
  5Please cite it if you use this dataset for your research.
  6"""
  7
  8import os
  9import shutil
 10from glob import glob
 11from tqdm import tqdm
 12from pathlib import Path
 13from typing import Union, Tuple, List
 14
 15import imageio.v3 as imageio
 16
 17from torch.utils.data import Dataset, DataLoader
 18
 19import torch_em
 20
 21from .. import util
 22
 23
 24URL = "https://cloud.mpi-cbg.de/index.php/s/UJopHTRuh6f4wR8/download"
 25CHECKSUM = "4872eec0211721dc224acee319c27c4f51c190adc36004e3d5bb60dfcd67eb7b"
 26
 27
 28def get_betaseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 29    """Download the BetaSeg dataset.
 30
 31    Args:
 32        path: Filepath to a folder where the data will be downloaded for further processing.
 33        download: Whether to download the data if it is not present.
 34
 35    Returns:
 36        Filepath where the data is stored.
 37    """
 38    data_dir = os.path.join(path, "data")
 39    if os.path.exists(data_dir):
 40        return data_dir
 41
 42    os.makedirs(data_dir)
 43
 44    zip_path = os.path.join(path, "data.zip")
 45    print("The BetaSeg dataset is quite large. It might take a couple of hours depending on your internet connection.")
 46    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
 47    util.unzip(zip_path=zip_path, dst=data_dir)
 48
 49    # Group all files into h5 files.
 50    vol_dirs = glob(os.path.join(data_dir, "download", "*"))
 51    for vol_dir in tqdm(vol_dirs, desc="Preprocessing volumes"):
 52        # Get the image path.
 53        raw_path = os.path.join(vol_dir, f"{os.path.basename(vol_dir)}_source.tif")
 54        assert os.path.exists(raw_path), raw_path
 55
 56        # Get the corresponding labels which would always exist.
 57        label_paths = {
 58            "centriole": os.path.join(vol_dir, f"{os.path.basename(vol_dir)}_centrioles.tif"),
 59            "golgi": os.path.join(vol_dir, f"{os.path.basename(vol_dir)}_golgi_corrected.tif"),
 60            "granules": os.path.join(vol_dir, f"{os.path.basename(vol_dir)}_granules.tif"),
 61            "membrane": os.path.join(vol_dir, f"{os.path.basename(vol_dir)}_membrane_full_mask.tif"),
 62            "microtubules": os.path.join(vol_dir, f"{os.path.basename(vol_dir)}_microtubules.tif"),
 63            "mitochondria": os.path.join(vol_dir, f"{os.path.basename(vol_dir)}_mitochondria_mask.tif"),
 64            "nucleus": os.path.join(vol_dir, f"{os.path.basename(vol_dir)}_nucleus_mask.tif")
 65        }
 66        for p in label_paths.values():
 67            assert os.path.exists(p), p
 68
 69        # Load all images.
 70        raw = imageio.imread(raw_path)
 71        labels = {k: imageio.imread(v) for k, v in label_paths.items()}
 72
 73        # Now, let's get all in an h5 file.
 74        import h5py
 75        vol_path = os.path.join(data_dir, Path(os.path.basename(raw_path)).with_suffix(".h5"))
 76        with h5py.File(vol_path, "w") as f:
 77            f.create_dataset("raw", data=raw, dtype=raw.dtype, compression="gzip")
 78            for label_key, label in labels.items():
 79                f.create_dataset(f"labels/{label_key}", data=label, dtype=label.dtype, compression="gzip")
 80
 81    # Remove all other stuff
 82    shutil.rmtree(os.path.join(data_dir, "download"))
 83
 84    return data_dir
 85
 86
 87def get_betaseg_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
 88    """Get filepaths to the BetaSeg data.
 89
 90    Args:
 91        path: Filepath to a folder where the data will be downloaded for further processing.
 92        download: Whether to download the data if it is not present.
 93
 94    Returns:
 95        List of filepaths for the volumetric data.
 96    """
 97    data_dir = get_betaseg_data(path, download)
 98    volume_paths = glob(os.path.join(data_dir, "*.h5"))
 99    return volume_paths
100
101
def get_betaseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    label_choice: Union[str, List[str]],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BetaSeg dataset for organelle segmentation.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        patch_shape: The patch shape to use for training.
        label_choice: The choice of label, either a single name or a list of names.
            The choices available are: 'centriole', 'golgi', 'granules', 'membrane',
            'microtubules', 'mitochondria', 'nucleus'.
            If a list is given, `with_label_channels=True` is set in the dataset arguments.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_betaseg_paths(path, download)

    # Map the organelle name(s) to the corresponding dataset key(s) inside the h5 files.
    if isinstance(label_choice, str):
        label_key = f"labels/{label_choice}"
    else:
        # Iterate over the `label_choice` argument here: the previous code looped over
        # the not-yet-assigned result variable and failed for every list input.
        label_key = [f"labels/{organelle}" for organelle in label_choice]
        kwargs = util.update_kwargs(kwargs, "with_label_channels", True)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs,
    )
140
141
def get_betaseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    label_choice: Union[str, List[str]],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build a DataLoader over the BetaSeg organelle segmentation dataset.

    The keyword arguments are divided with `util.split_kwargs` into those for
    `torch_em.default_segmentation_dataset` and the remaining ones, which are passed
    on to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        label_choice: The choice of label. The choices available are: 'centriole',
            'golgi', 'granules', 'membrane', 'microtubules', 'mitochondria', 'nucleus'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_betaseg_dataset(path, patch_shape, label_choice, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs,
    )
URL = 'https://cloud.mpi-cbg.de/index.php/s/UJopHTRuh6f4wR8/download'
CHECKSUM = '4872eec0211721dc224acee319c27c4f51c190adc36004e3d5bb60dfcd67eb7b'
def get_betaseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_betaseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the BetaSeg dataset and convert every volume into one HDF5 file.

    If `<path>/data` already exists it is returned right away, without inspecting its content.
    Otherwise the archive is downloaded to `<path>/data.zip`, unpacked into `<path>/data`, and
    for each folder found in `<path>/data/download` one `.h5` file is written to `<path>/data`.
    Each file stores the image under the key "raw" and the organelle annotations under
    "labels/<organelle>", all gzip-compressed. The unpacked "download" folder is deleted afterwards.

    If any step fails, the freshly created `<path>/data` folder is removed again, so that a
    later call starts from scratch instead of returning an empty or partial folder.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is stored.

    Raises:
        FileNotFoundError: If the image or one of the label files is missing in a volume folder.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(data_dir)

    try:
        zip_path = os.path.join(path, "data.zip")
        print("The BetaSeg dataset is quite large. It might take a couple of hours depending on your internet connection.")
        util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=zip_path, dst=data_dir)

        # Imported lazily, so that h5py is only needed when the data is actually converted.
        import h5py

        # Group all files into h5 files.
        vol_dirs = glob(os.path.join(data_dir, "download", "*"))
        for vol_dir in tqdm(vol_dirs, desc="Preprocessing volumes"):
            vol_name = os.path.basename(vol_dir)

            # Get the image path.
            raw_path = os.path.join(vol_dir, f"{vol_name}_source.tif")

            # Get the corresponding labels, which are expected to exist for every volume.
            label_paths = {
                "centriole": os.path.join(vol_dir, f"{vol_name}_centrioles.tif"),
                "golgi": os.path.join(vol_dir, f"{vol_name}_golgi_corrected.tif"),
                "granules": os.path.join(vol_dir, f"{vol_name}_granules.tif"),
                "membrane": os.path.join(vol_dir, f"{vol_name}_membrane_full_mask.tif"),
                "microtubules": os.path.join(vol_dir, f"{vol_name}_microtubules.tif"),
                "mitochondria": os.path.join(vol_dir, f"{vol_name}_mitochondria_mask.tif"),
                "nucleus": os.path.join(vol_dir, f"{vol_name}_nucleus_mask.tif"),
            }

            # Validate all inputs before anything is read or written.
            # An explicit raise is used because assert statements are stripped under `python -O`.
            for file_path in (raw_path, *label_paths.values()):
                if not os.path.exists(file_path):
                    raise FileNotFoundError(file_path)

            # Now, let's get all in an h5 file.
            vol_path = os.path.join(data_dir, Path(os.path.basename(raw_path)).with_suffix(".h5"))
            with h5py.File(vol_path, "w") as f:
                raw = imageio.imread(raw_path)
                f.create_dataset("raw", data=raw, dtype=raw.dtype, compression="gzip")
                del raw

                # Read and write the labels one at a time, so that only a single
                # volume has to be held in memory.
                for label_key, label_path in label_paths.items():
                    label = imageio.imread(label_path)
                    f.create_dataset(f"labels/{label_key}", data=label, dtype=label.dtype, compression="gzip")

        # Remove all other stuff
        shutil.rmtree(os.path.join(data_dir, "download"))

    except BaseException:
        # Do not leave a half-populated folder behind: its mere existence
        # would make the next call return early without any usable data.
        shutil.rmtree(data_dir, ignore_errors=True)
        raise

    return data_dir

Download the BetaSeg dataset.

Arguments:
  • path: Filepath to a folder where the data will be downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is stored.

def get_betaseg_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
 88def get_betaseg_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
 89    """Get filepaths to the BetaSeg data.
 90
 91    Args:
 92        path: Filepath to a folder where the data will be downloaded for further processing.
 93        download: Whether to download the data if it is not present.
 94
 95    Returns:
 96        List of filepaths for the volumetric data.
 97    """
 98    data_dir = get_betaseg_data(path, download)
 99    volume_paths = glob(os.path.join(data_dir, "*.h5"))
100    return volume_paths

Get filepaths to the BetaSeg data.

Arguments:
  • path: Filepath to a folder where the data will be downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the volumetric data.

def get_betaseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], label_choice: Union[str, List[str]], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_betaseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    label_choice: Union[str, List[str]],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BetaSeg dataset for organelle segmentation.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        patch_shape: The patch shape to use for training.
        label_choice: The choice of label, either a single name or a list of names.
            The choices available are: 'centriole', 'golgi', 'granules', 'membrane',
            'microtubules', 'mitochondria', 'nucleus'.
            If a list is given, `with_label_channels=True` is set in the dataset arguments.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_betaseg_paths(path, download)

    # Map the organelle name(s) to the corresponding dataset key(s) inside the h5 files.
    if isinstance(label_choice, str):
        label_key = f"labels/{label_choice}"
    else:
        # Iterate over the `label_choice` argument here: the previous code looped over
        # the not-yet-assigned result variable and failed for every list input.
        label_key = [f"labels/{organelle}" for organelle in label_choice]
        kwargs = util.update_kwargs(kwargs, "with_label_channels", True)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs,
    )

Get the BetaSeg dataset for organelle segmentation.

Arguments:
  • path: Filepath to a folder where the data will be downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • label_choice: The choice of label. The choices available are: 'centriole', 'golgi', 'granules', 'membrane', 'microtubules', 'mitochondria', 'nucleus'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_betaseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], label_choice: Union[str, List[str]], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_betaseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    label_choice: Union[str, List[str]],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build a DataLoader over the BetaSeg organelle segmentation dataset.

    The keyword arguments are divided with `util.split_kwargs` into those for
    `torch_em.default_segmentation_dataset` and the remaining ones, which are passed
    on to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        label_choice: The choice of label. The choices available are: 'centriole',
            'golgi', 'granules', 'membrane', 'microtubules', 'mitochondria', 'nucleus'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_betaseg_dataset(path, patch_shape, label_choice, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs,
    )

Get the BetaSeg dataloader for organelle segmentation.

Arguments:
  • path: Filepath to a folder where the data will be downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • label_choice: The choice of label. The choices available are: 'centriole', 'golgi', 'granules', 'membrane', 'microtubules', 'mitochondria', 'nucleus'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.