torch_em.data.datasets.electron_microscopy.betaseg
The BetaSeg dataset contains annotations for organelle segmentation in FIB-SEM data.
More information about this dataset is available at https://betaseg.github.io/. The original publication presenting the data is https://arxiv.org/abs/2303.03876. Please cite it if you use this dataset for your research.
"""The BetaSeg dataset contains annotations for organelle segmentation in FIB-SEM data.

More information for this dataset is located at https://betaseg.github.io/.
And the original publication where this entire data is presented is https://arxiv.org/abs/2303.03876.
Please cite it if you use this dataset for your research.
"""

import os
import shutil
from glob import glob
from tqdm import tqdm
from pathlib import Path
from typing import Union, Tuple, List

import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://cloud.mpi-cbg.de/index.php/s/UJopHTRuh6f4wR8/download"
CHECKSUM = "4872eec0211721dc224acee319c27c4f51c190adc36004e3d5bb60dfcd67eb7b"


def get_betaseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the BetaSeg dataset and convert every volume into one HDF5 file.

    The archive is unpacked into `<path>/data`. For each volume folder found under
    `<path>/data/download`, the raw image and its seven organelle label images are written
    to a single h5 file (datasets `raw` and `labels/<organelle>`). The unpacked tif folder
    is removed afterwards.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is stored.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    # Only the parent folder is required to store the archive. The data folder itself is
    # created after the download step: if that step raises, no empty 'data' folder is left
    # behind that would make every later call return early without any data.
    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "data.zip")
    print("The BetaSeg dataset is quite large. It might take a couple of hours depending on your internet connection.")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    os.makedirs(data_dir, exist_ok=True)
    util.unzip(zip_path=zip_path, dst=data_dir)

    # Imported lazily so that h5py is only needed when the data is actually converted.
    import h5py

    # Maps the label name used in the h5 file to the suffix of the corresponding tif file.
    label_suffixes = {
        "centriole": "centrioles",
        "golgi": "golgi_corrected",
        "granules": "granules",
        "membrane": "membrane_full_mask",
        "microtubules": "microtubules",
        "mitochondria": "mitochondria_mask",
        "nucleus": "nucleus_mask",
    }

    # Group all files into h5 files.
    vol_dirs = glob(os.path.join(data_dir, "download", "*"))
    for vol_dir in tqdm(vol_dirs, desc="Preprocessing volumes"):
        vol_name = os.path.basename(vol_dir)

        # Get the image path.
        raw_path = os.path.join(vol_dir, f"{vol_name}_source.tif")
        assert os.path.exists(raw_path), raw_path

        # Get the corresponding labels which would always exist.
        label_paths = {
            label_name: os.path.join(vol_dir, f"{vol_name}_{suffix}.tif")
            for label_name, suffix in label_suffixes.items()
        }
        for p in label_paths.values():
            assert os.path.exists(p), p

        # Load all images.
        raw = imageio.imread(raw_path)
        labels = {k: imageio.imread(v) for k, v in label_paths.items()}

        # Now, let's get all in an h5 file.
        vol_path = os.path.join(data_dir, Path(os.path.basename(raw_path)).with_suffix(".h5"))
        with h5py.File(vol_path, "w") as f:
            f.create_dataset("raw", data=raw, dtype=raw.dtype, compression="gzip")
            for label_key, label in labels.items():
                f.create_dataset(f"labels/{label_key}", data=label, dtype=label.dtype, compression="gzip")

    # Remove all other stuff
    shutil.rmtree(os.path.join(data_dir, "download"))

    return data_dir


def get_betaseg_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
    """Get filepaths to the BetaSeg data.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the volumetric data, in sorted order.
    """
    data_dir = get_betaseg_data(path, download)
    # glob yields paths in arbitrary order; sort them so that the volume order is reproducible.
    volume_paths = sorted(glob(os.path.join(data_dir, "*.h5")))
    return volume_paths


def get_betaseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    label_choice: Union[str, List[str]],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BetaSeg dataset for organelle segmentation.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        patch_shape: The patch shape to use for training.
        label_choice: The choice of label, either a single name or a list of names.
            The choices available are: 'centriole', 'golgi', 'granules', 'membrane',
            'microtubules', 'mitochondria', 'nucleus'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_betaseg_paths(path, download)

    # Arrange the organelle choices as expected for loading labels.
    if isinstance(label_choice, str):
        label_choices = f"labels/{label_choice}"
    else:
        # Iterate over the 'label_choice' argument: 'label_choices' is only being defined here.
        label_choices = [f"labels/{organelle}" for organelle in label_choice]
        # Several label keys are requested, so 'with_label_channels' is set for the dataset.
        kwargs = util.update_kwargs(kwargs, "with_label_channels", True)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=label_choices,
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs,
    )


def get_betaseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    label_choice: Union[str, List[str]],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the BetaSeg dataloader for organelle segmentation.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        label_choice: The choice of label, either a single name or a list of names.
            The choices available are: 'centriole', 'golgi', 'granules', 'membrane',
            'microtubules', 'mitochondria', 'nucleus'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_betaseg_dataset(path, patch_shape, label_choice, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
# Download link of the zip archive that contains the BetaSeg data.
URL = "https://cloud.mpi-cbg.de/index.php/s/UJopHTRuh6f4wR8/download"
# Checksum passed to the download helper together with URL to verify the archive.
CHECKSUM = "4872eec0211721dc224acee319c27c4f51c190adc36004e3d5bb60dfcd67eb7b"
def
get_betaseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_betaseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the BetaSeg dataset and convert every volume into one HDF5 file.

    The archive is unpacked into `<path>/data`. For each volume folder found under
    `<path>/data/download`, the raw image and its seven organelle label images are written
    to a single h5 file (datasets `raw` and `labels/<organelle>`). The unpacked tif folder
    is removed afterwards.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is stored.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    # Only the parent folder is required to store the archive. The data folder itself is
    # created after the download step: if that step raises, no empty 'data' folder is left
    # behind that would make every later call return early without any data.
    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "data.zip")
    print("The BetaSeg dataset is quite large. It might take a couple of hours depending on your internet connection.")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    os.makedirs(data_dir, exist_ok=True)
    util.unzip(zip_path=zip_path, dst=data_dir)

    # Imported lazily so that h5py is only needed when the data is actually converted.
    import h5py

    # Maps the label name used in the h5 file to the suffix of the corresponding tif file.
    label_suffixes = {
        "centriole": "centrioles",
        "golgi": "golgi_corrected",
        "granules": "granules",
        "membrane": "membrane_full_mask",
        "microtubules": "microtubules",
        "mitochondria": "mitochondria_mask",
        "nucleus": "nucleus_mask",
    }

    # Group all files into h5 files.
    vol_dirs = glob(os.path.join(data_dir, "download", "*"))
    for vol_dir in tqdm(vol_dirs, desc="Preprocessing volumes"):
        vol_name = os.path.basename(vol_dir)

        # Get the image path.
        raw_path = os.path.join(vol_dir, f"{vol_name}_source.tif")
        assert os.path.exists(raw_path), raw_path

        # Get the corresponding labels which would always exist.
        label_paths = {
            label_name: os.path.join(vol_dir, f"{vol_name}_{suffix}.tif")
            for label_name, suffix in label_suffixes.items()
        }
        for p in label_paths.values():
            assert os.path.exists(p), p

        # Load all images.
        raw = imageio.imread(raw_path)
        labels = {k: imageio.imread(v) for k, v in label_paths.items()}

        # Now, let's get all in an h5 file.
        vol_path = os.path.join(data_dir, Path(os.path.basename(raw_path)).with_suffix(".h5"))
        with h5py.File(vol_path, "w") as f:
            f.create_dataset("raw", data=raw, dtype=raw.dtype, compression="gzip")
            for label_key, label in labels.items():
                f.create_dataset(f"labels/{label_key}", data=label, dtype=label.dtype, compression="gzip")

    # Remove all other stuff
    shutil.rmtree(os.path.join(data_dir, "download"))

    return data_dir
Download the BetaSeg dataset.
Arguments:
- path: Filepath to a folder where the data will be downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is stored.
def
get_betaseg_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
def get_betaseg_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
    """Get filepaths to the BetaSeg data.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the volumetric data, in sorted order.
    """
    data_dir = get_betaseg_data(path, download)
    # glob yields paths in arbitrary order; sort them so that the volume order is reproducible.
    volume_paths = sorted(glob(os.path.join(data_dir, "*.h5")))
    return volume_paths
Get filepaths to the BetaSeg data.
Arguments:
- path: Filepath to a folder where the data will be downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the volumetric data.
def
get_betaseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], label_choice: Union[str, List[str]], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_betaseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    label_choice: Union[str, List[str]],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BetaSeg dataset for organelle segmentation.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        patch_shape: The patch shape to use for training.
        label_choice: The choice of label, either a single name or a list of names.
            The choices available are: 'centriole', 'golgi', 'granules', 'membrane',
            'microtubules', 'mitochondria', 'nucleus'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_betaseg_paths(path, download)

    # Arrange the organelle choices as expected for loading labels.
    if isinstance(label_choice, str):
        label_choices = f"labels/{label_choice}"
    else:
        # Iterate over the 'label_choice' argument: 'label_choices' is only being defined here.
        label_choices = [f"labels/{organelle}" for organelle in label_choice]
        # Several label keys are requested, so 'with_label_channels' is set for the dataset.
        kwargs = util.update_kwargs(kwargs, "with_label_channels", True)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=label_choices,
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs,
    )
Get the BetaSeg dataset for organelle segmentation.
Arguments:
- path: Filepath to a folder where the data will be downloaded for further processing.
- patch_shape: The patch shape to use for training.
- label_choice: The choice of label. The choices available are: 'centriole', 'golgi', 'granules', 'membrane', 'microtubules', 'mitochondria', 'nucleus'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_betaseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], label_choice: Union[str, List[str]], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_betaseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    label_choice: Union[str, List[str]],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create a PyTorch DataLoader that serves patches from the BetaSeg organelle segmentation data.

    Args:
        path: Filepath to a folder where the data will be downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        label_choice: The choice of label. The choices available are: 'centriole',
            'golgi', 'granules', 'membrane', 'microtubules', 'mitochondria', 'nucleus'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the extra keyword arguments into those for the dataset and those for the loader.
    dataset_options, loader_options = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    segmentation_dataset = get_betaseg_dataset(
        path=path,
        patch_shape=patch_shape,
        label_choice=label_choice,
        download=download,
        **dataset_options,
    )

    loader = torch_em.get_data_loader(segmentation_dataset, batch_size, **loader_options)
    return loader
Get the BetaSeg dataloader for organelle segmentation.
Arguments:
- path: Filepath to a folder where the data will be downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- label_choice: The choice of label. The choices available are: 'centriole', 'golgi', 'granules', 'membrane', 'microtubules', 'mitochondria', 'nucleus'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.