torch_em.data.datasets.electron_microscopy.asem
ASEM is a dataset for segmentation of cellular structures in FIB-SEM.
The dataset was published in https://doi.org/10.1083/jcb.202208005. Please cite this publication if you use the dataset in your research.
1"""ASEM is a dataset for segmentation of cellular structures in FIB-SEM. 2 3The dataset was publised in https://doi.org/10.1083/jcb.202208005. 4Please cite this publication if you use the dataset in your research. 5""" 6 7import os 8from typing import Union, Tuple, Optional, List 9 10import numpy as np 11 12from torch.utils.data import Dataset, DataLoader 13 14import torch_em 15 16from .. import util 17from ... import ConcatDataset 18 19try: 20 import quilt3 as q3 21 have_quilt = True 22except ModuleNotFoundError: 23 have_quilt = False 24 25 26# The following volumes do not have labels: 27# - cell_8, cell_14, cell_15, cell_16, cell_17 28 29# (E): "RuntimeError: Exception during zlib decompression: (-5)" (with `z5py`) 30# (Y): similar shapes 31# (N): dissimilar shapes (e.g. raw: (1000, 1100, 1200), labels: (200, 300, 400)) 32 33INCONSISTENT_VOLUMES = { 34 "mito": ["cell_6.zarr", "cell_13.zarr", "cell_13a.zarr"], 35 "golgi": ["cell_3.zarr", "cell_6.zarr"], 36 "er": ["cell_3.zarr", "cell_6.zarr", "cell_13.zarr"], 37} 38 39VOLUMES = { 40 "cell_1": "cell_1/cell_1.zarr", # mito (Y) golgi (Y) er (Y) 41 "cell_2": "cell_2/cell_2.zarr", # mito (Y) golgi (Y) er (Y) 42 "cell_3": "cell_3/cell_3.zarr", # mito (Y) golgi (N) er (N) 43 "cell_6": "cell_6/cell_6.zarr", # mito (N) golgi (N) er (N) 44 "cell_12": "cell_12/cell_12.zarr", # ccp (Y) 45 "cell_13": "cell_13/cell_13.zarr", # ccp (Y) er (E) mito (N) 46 "cell_13a": "cell_13a/cell_13a.zarr", # np (Y) np_bottom (Y) mito (N) 47} 48 49ORGANELLES = { 50 "mito": ["cell_1", "cell_2", "cell_3", "cell_6", "cell_13", "cell_13a"], 51 "golgi": ["cell_1", "cell_2", "cell_3", "cell_6",], 52 "er": ["cell_1", "cell_2", "cell_3", "cell_6",], 53 "ccp": ["cell_12", "cell_13"], 54 "np": ["cell_13a"], 55 "np_bottom": ["cell_13a"] 56} 57 58 59def get_asem_data(path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False): 60 """Download the ASEM dataset. 61 62 The dataset is located at https://open.quiltdata.com/b/asem-project. 63 64 Args: 65 path: Filepath to a folder where the downloaded data will be saved. 66 volume_ids: List of volumes to download. 67 download: Whether to download the data if it is not present. 68 """ 69 if download and not have_quilt: 70 raise ModuleNotFoundError("Please install quilt3: 'pip install quilt3'.") 71 72 b = q3.Bucket("s3://asem-project") 73 74 for volume_id in volume_ids: 75 volume_path = os.path.join(path, VOLUMES[volume_id]) 76 if os.path.exists(volume_path): 77 continue 78 79 if not download: 80 raise FileNotFoundError(f"{VOLUMES[volume_id]} is not found, and 'download' is set to False.") 81 82 print(f"The ASEM dataset for sample '{volume_id}' is not available yet and will be downloaded and created.") 83 print("Note that this dataset is large, so this step can take several hours (depending on your internet).") 84 b.fetch( 85 key=f"datasets/{VOLUMES[volume_id]}/volumes/labels/", 86 path=os.path.join(volume_path, "volumes", "labels/") 87 ) 88 b.fetch( 89 key=f"datasets/{VOLUMES[volume_id]}/volumes/raw/", 90 path=os.path.join(volume_path, "volumes", "raw/") 91 ) 92 # let's get the group metadata keyfiles 93 b.fetch(key=f"datasets/{VOLUMES[volume_id]}/.zgroup", path=f"{volume_path}/") 94 b.fetch(key=f"datasets/{VOLUMES[volume_id]}/volumes/.zgroup", path=f"{volume_path}/volumes/") 95 96 97def get_asem_paths(path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False) -> List[str]: 98 """Get paths to the ASEM data. 99 100 Args: 101 path: Filepath to a folder where the downloaded data will be saved. 
102 volume_ids: List of volumes to download. 103 download: Whether to download the data if it is not present. 104 105 Returns: 106 List of paths for all volume ids. 107 """ 108 get_asem_data(path, volume_ids, download) 109 volume_paths = [os.path.join(path, VOLUMES[vol_id]) for vol_id in volume_ids] 110 return volume_paths 111 112 113def _make_volumes_consistent(volume_path, organelle): 114 import zarr 115 116 have_inconsistent_volumes = False 117 118 # we shouldn't load the volumes which are already consistent 119 volume_name = os.path.split(volume_path)[-1] 120 # there are organelles which aren't inconsistent at all, we ignore them. 121 inc_vols = INCONSISTENT_VOLUMES.get(organelle) 122 if inc_vols is None: # i.e. the organelles have no inconsistency 123 return have_inconsistent_volumes 124 else: # i.e. the organelles have some known inconsistency 125 if volume_name not in inc_vols: # if the current volume has inconsistency in the desired organelle or not 126 return have_inconsistent_volumes 127 128 with zarr.open(volume_path, "r+") as f: 129 all_keys = list(f["volumes"].keys()) 130 # we shouldn't load the volume to make checks in case the processing has taken place already 131 for this_key in all_keys: 132 if this_key == f"raw_{organelle}": 133 return True 134 135 raw = f["volumes/raw"][:] 136 137 this_key = f"volumes/labels/{organelle}" 138 labels = f[this_key][:] 139 140 if labels.shape != raw.shape: 141 print("Found inconsistent volumes. Will save the desired crops of the volume.") 142 have_inconsistent_volumes = True 143 img_offset = np.array( 144 np.array(f["volumes/raw"].attrs["offset"]) // np.array(f["volumes/raw"].attrs["resolution"]) 145 ) 146 label_offset = np.array( 147 np.array(f[this_key].attrs["offset"]) // np.array(f[this_key].attrs["resolution"]) 148 ) 149 offset = label_offset - img_offset 150 desired_slices = tuple(slice(o, s) for o, s in zip(offset, offset + labels.shape)) 151 new_raw = raw[desired_slices] 152 153 assert new_raw.shape == labels.shape 154 155 # HACK: current way-to-go is to create a new hierarchy where we store the desired volume patches 156 # TODO: we want to integrate this so that this slicing can be done just by passing the offsets 157 f.create_dataset(f"volumes/raw_{organelle}", data=new_raw, chunks=new_raw.shape) 158 159 return have_inconsistent_volumes 160 161 162def _check_input_args(input_arg, default_values): 163 if input_arg is None: 164 input_arg = default_values 165 else: 166 if isinstance(input_arg, str): 167 assert input_arg in default_values 168 input_arg = [input_arg] 169 170 return input_arg 171 172 173def get_asem_dataset( 174 path: Union[os.PathLike, str], 175 patch_shape: Tuple[int, ...], 176 download: bool = False, 177 organelles: Optional[Union[List[str], str]] = None, 178 volume_ids: Optional[Union[List[str], str]] = None, 179 **kwargs 180) -> Dataset: 181 """Get dataset for segmentation of organelles in FIB-SEM cells. 182 183 Args: 184 path: Filepath to a folder where the downloaded data will be saved. 185 patch_shape: The patch shape to use for training. 186 download: Whether to download the data if it is not present. 187 organelles: The choice of organelles. 188 volume_ids: The choice of volumes. 189 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 190 191 Returns: 192 The segmentation dataset. 
193 """ 194 # let's get the choice of organelles sorted 195 organelles = _check_input_args(organelles, ORGANELLES) 196 197 # now let's get the chosen volumes have the chosen organelles 198 all_datasets = [] 199 for organelle in organelles: 200 if volume_ids is None: 201 volume_ids = ORGANELLES[organelle] 202 else: 203 if isinstance(volume_ids, str): 204 volume_ids = [volume_ids] 205 206 for volume_id in volume_ids: 207 assert volume_id in ORGANELLES[organelle], \ 208 f"The chosen volume and organelle combination does not match: '{volume_id}' & '{organelle}'" 209 210 volume_paths = get_asem_paths(path, volume_ids, download) 211 212 for volume_path in volume_paths: 213 have_volumes_inconsistent = _make_volumes_consistent(volume_path, organelle) 214 215 dataset = torch_em.default_segmentation_dataset( 216 raw_paths=volume_path, 217 raw_key=f"volumes/raw_{organelle}" if have_volumes_inconsistent else "volumes/raw", 218 label_paths=volume_path, 219 label_key=f"volumes/labels/{organelle}", 220 patch_shape=patch_shape, 221 is_seg_dataset=True, 222 **kwargs 223 ) 224 dataset.max_sampling_attempts = 5000 225 all_datasets.append(dataset) 226 227 return ConcatDataset(*all_datasets) 228 229 230def get_asem_loader( 231 path: Union[os.PathLike, str], 232 patch_shape: Tuple[int, ...], 233 batch_size: int, 234 download: bool = False, 235 organelles: Optional[Union[List[str], str]] = None, 236 volume_ids: Optional[Union[List[str], str]] = None, 237 **kwargs 238) -> DataLoader: 239 """Get dataloader for the segmentation of organelles in FIB-SEM cells. 240 241 Args: 242 path: Filepath to a folder where the downloaded data will be saved. 243 patch_shape: The patch shape to use for training. 244 batch_size: The batch size for training. 245 download: Whether to download the data if it is not present. 246 organelles: The choice of organelles. 247 volume_ids: The choice of volumes. 248 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 249 250 Returns: 251 The DataLoader. 252 """ 253 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 254 ds = get_asem_dataset(path, patch_shape, download, organelles, volume_ids, **ds_kwargs) 255 loader = torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs) 256 return loader
```python
INCONSISTENT_VOLUMES = {
    "mito": ["cell_6.zarr", "cell_13.zarr", "cell_13a.zarr"],
    "golgi": ["cell_3.zarr", "cell_6.zarr"],
    "er": ["cell_3.zarr", "cell_6.zarr", "cell_13.zarr"],
}

VOLUMES = {
    "cell_1": "cell_1/cell_1.zarr",
    "cell_2": "cell_2/cell_2.zarr",
    "cell_3": "cell_3/cell_3.zarr",
    "cell_6": "cell_6/cell_6.zarr",
    "cell_12": "cell_12/cell_12.zarr",
    "cell_13": "cell_13/cell_13.zarr",
    "cell_13a": "cell_13a/cell_13a.zarr",
}

ORGANELLES = {
    "mito": ["cell_1", "cell_2", "cell_3", "cell_6", "cell_13", "cell_13a"],
    "golgi": ["cell_1", "cell_2", "cell_3", "cell_6"],
    "er": ["cell_1", "cell_2", "cell_3", "cell_6"],
    "ccp": ["cell_12", "cell_13"],
    "np": ["cell_13a"],
    "np_bottom": ["cell_13a"],
}
```
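These mappings can be queried directly to check which volumes carry labels for a given organelle, for example:

```python
from torch_em.data.datasets.electron_microscopy.asem import ORGANELLES

# Volumes with endoplasmic reticulum (er) labels.
print(ORGANELLES["er"])  # ['cell_1', 'cell_2', 'cell_3', 'cell_6']
```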
```python
def get_asem_data(path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False):
```
Download the ASEM dataset.
The dataset is located at https://open.quiltdata.com/b/asem-project.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- volume_ids: List of volumes to download.
- download: Whether to download the data if it is not present.
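A minimal usage sketch; the target folder is illustrative, and downloading requires quilt3 ('pip install quilt3') and can take several hours for the larger volumes:

```python
from torch_em.data.datasets.electron_microscopy.asem import get_asem_data

# Download raw data and labels for two of the annotated volumes.
get_asem_data(path="./data/asem", volume_ids=["cell_1", "cell_2"], download=True)
```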
```python
def get_asem_paths(path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False) -> List[str]:
```
Get paths to the ASEM data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- volume_ids: List of volumes to download.
- download: Whether to download the data if it is not present.
Returns:
List of paths for all volume ids.
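For example (the target folder is again illustrative), the returned list contains one zarr path per requested volume, following the layout in `VOLUMES`:

```python
from torch_em.data.datasets.electron_microscopy.asem import get_asem_paths

paths = get_asem_paths(path="./data/asem", volume_ids=["cell_1"], download=True)
print(paths)  # ['./data/asem/cell_1/cell_1.zarr']
```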
```python
def get_asem_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    download: bool = False,
    organelles: Optional[Union[List[str], str]] = None,
    volume_ids: Optional[Union[List[str], str]] = None,
    **kwargs
) -> Dataset:
```
Get dataset for segmentation of organelles in FIB-SEM cells.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- organelles: The choice of organelles.
- volume_ids: The choice of volumes.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
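A usage sketch; the patch shape here is an arbitrary example and should be chosen to fit your model and memory budget:

```python
from torch_em.data.datasets.electron_microscopy.asem import get_asem_dataset

# 3D patches for mitochondria segmentation from two volumes with 'mito' labels.
ds = get_asem_dataset(
    path="./data/asem",
    patch_shape=(32, 256, 256),
    download=True,
    organelles="mito",
    volume_ids=["cell_1", "cell_2"],
)
```

Note that every requested volume must appear under the chosen organelle in `ORGANELLES`; an invalid combination raises an assertion error.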
```python
def get_asem_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    download: bool = False,
    organelles: Optional[Union[List[str], str]] = None,
    volume_ids: Optional[Union[List[str], str]] = None,
    **kwargs
) -> DataLoader:
```
Get dataloader for the segmentation of organelles in FIB-SEM cells.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- organelles: The choice of organelles.
- volume_ids: The choice of volumes.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.
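A usage sketch (patch shape and batch size are illustrative):

```python
from torch_em.data.datasets.electron_microscopy.asem import get_asem_loader

loader = get_asem_loader(
    path="./data/asem",
    patch_shape=(32, 256, 256),
    batch_size=2,
    download=True,
    organelles="er",
)
x, y = next(iter(loader))  # one batch of raw patches and label patches
```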