torch_em.data.datasets.electron_microscopy.asem

ASEM is a dataset for segmentation of cellular structures in FIB-SEM.

The dataset was published in https://doi.org/10.1083/jcb.202208005. Please cite this publication if you use the dataset in your research.

View Source

  1"""ASEM is a dataset for segmentation of cellular structures in FIB-SEM.
  2
  3The dataset was published in https://doi.org/10.1083/jcb.202208005.
  4Please cite this publication if you use the dataset in your research.
  5"""
  6
  7import os
  8from typing import Union, Tuple, Optional, List
  9
 10import numpy as np
 11
 12from torch.utils.data import Dataset, DataLoader
 13
 14import torch_em
 15
 16from .. import util
 17from ... import ConcatDataset
 18
 19try:
 20    import quilt3 as q3
 21    have_quilt = True
 22except ModuleNotFoundError:
 23    have_quilt = False
 24
 25
 26# The following volumes do not have labels:
 27#   - cell_8, cell_14, cell_15, cell_16, cell_17
 28
 29# (E): "RuntimeError: Exception during zlib decompression: (-5)" (with `z5py`)
 30# (Y): similar shapes
 31# (N): dissimilar shapes (e.g. raw: (1000, 1100, 1200), labels: (200, 300, 400))
 32
   # Organelle name -> zarr folder names that `_make_volumes_consistent` inspects for a
   # shape mismatch between raw data and labels. Volumes not listed here are used as they are.
 33INCONSISTENT_VOLUMES = {
 34    "mito": ["cell_6.zarr", "cell_13.zarr", "cell_13a.zarr"],
 35    "golgi": ["cell_3.zarr", "cell_6.zarr"],
 36    "er": ["cell_3.zarr", "cell_6.zarr", "cell_13.zarr"],
 37}
 38
   # Volume id -> location of the zarr container relative to the data folder.
   # The same relative location is used to build the download keys in `get_asem_data`.
 39VOLUMES = {
 40    "cell_1": "cell_1/cell_1.zarr",  # mito (Y) golgi (Y) er (Y)
 41    "cell_2": "cell_2/cell_2.zarr",  # mito (Y) golgi (Y) er (Y)
 42    "cell_3": "cell_3/cell_3.zarr",  # mito (Y) golgi (N) er (N)
 43    "cell_6": "cell_6/cell_6.zarr",  # mito (N) golgi (N) er (N)
 44    "cell_12": "cell_12/cell_12.zarr",  # ccp (Y)
 45    "cell_13": "cell_13/cell_13.zarr",  # ccp (Y) er (E) mito (N)
 46    "cell_13a": "cell_13a/cell_13a.zarr",  # np (Y) np_bottom (Y) mito (N)
 47}
 48
   # Organelle name -> volume ids accepted for that organelle. `get_asem_dataset` uses this
   # both as the default choice of volumes and to validate user-selected volumes.
 49ORGANELLES = {
 50    "mito": ["cell_1", "cell_2", "cell_3", "cell_6", "cell_13", "cell_13a"],
 51    "golgi": ["cell_1", "cell_2", "cell_3", "cell_6",],
 52    "er": ["cell_1", "cell_2", "cell_3", "cell_6",],
 53    "ccp": ["cell_12", "cell_13"],
 54    "np": ["cell_13a"],
 55    "np_bottom": ["cell_13a"]
 56}
 57
 58
 59def get_asem_data(path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False):
 60    """Download the ASEM dataset.
 61
 62    The dataset is located at https://open.quiltdata.com/b/asem-project.
 63
 64    Args:
 65        path: Filepath to a folder where the downloaded data will be saved.
 66        volume_ids: List of volumes to download.
 67        download: Whether to download the data if it is not present.
 68    """
 69    if download and not have_quilt:
 70        raise ModuleNotFoundError("Please install quilt3: 'pip install quilt3'.")
 71
 72    b = q3.Bucket("s3://asem-project")
 73
 74    for volume_id in volume_ids:
 75        volume_path = os.path.join(path, VOLUMES[volume_id])
 76        if os.path.exists(volume_path):
 77            continue
 78
 79        if not download:
 80            raise FileNotFoundError(f"{VOLUMES[volume_id]} is not found, and 'download' is set to False.")
 81
 82        print(f"The ASEM dataset for sample '{volume_id}' is not available yet and will be downloaded and created.")
 83        print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
 84        b.fetch(
 85            key=f"datasets/{VOLUMES[volume_id]}/volumes/labels/",
 86            path=os.path.join(volume_path, "volumes", "labels/")
 87        )
 88        b.fetch(
 89            key=f"datasets/{VOLUMES[volume_id]}/volumes/raw/",
 90            path=os.path.join(volume_path, "volumes", "raw/")
 91        )
 92        # let's get the group metadata keyfiles
 93        b.fetch(key=f"datasets/{VOLUMES[volume_id]}/.zgroup", path=f"{volume_path}/")
 94        b.fetch(key=f"datasets/{VOLUMES[volume_id]}/volumes/.zgroup", path=f"{volume_path}/volumes/")
 95
 96
 97def get_asem_paths(path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False) -> List[str]:
 98    """Get paths to the ASEM data.
 99
100    Args:
101        path: Filepath to a folder where the downloaded data will be saved.
102        volume_ids: List of volumes to download.
103        download: Whether to download the data if it is not present.
104
105    Returns:
106        List of paths for all volume ids.
107    """
108    get_asem_data(path, volume_ids, download)
109    volume_paths = [os.path.join(path, VOLUMES[vol_id]) for vol_id in volume_ids]
110    return volume_paths
111
112
def _make_volumes_consistent(volume_path, organelle):
    """Make sure that raw data matching the label shape of `organelle` is available.

    Only volumes listed for `organelle` in `INCONSISTENT_VOLUMES` are inspected. If the shapes
    of `volumes/raw` and `volumes/labels/{organelle}` differ, the part of the raw data that
    corresponds to the labels (derived from the 'offset' and 'resolution' attributes of both
    arrays) is stored once as `volumes/raw_{organelle}` in the same zarr container.

    Args:
        volume_path: Filepath to the zarr container of the volume.
        organelle: Name of the organelle whose labels will be used.

    Returns:
        True if the raw data has to be read from `volumes/raw_{organelle}`,
        False if `volumes/raw` can be used directly.

    Raises:
        ValueError: If the crop computed from the offsets does not have the shape of the labels.
    """
    # Volumes without a known inconsistency for this organelle do not have to be opened at all.
    volume_name = os.path.split(volume_path)[-1]
    if volume_name not in INCONSISTENT_VOLUMES.get(organelle, ()):
        return False

    # Imported only when it is actually needed.
    import zarr

    label_key = f"volumes/labels/{organelle}"

    with zarr.open(volume_path, "r+") as f:
        # The crop was already created by a previous call, nothing has to be loaded.
        if f"raw_{organelle}" in list(f["volumes"].keys()):
            return True

        raw = f["volumes/raw"]
        labels = f[label_key]

        # Shapes come from the array metadata, so the (large) volumes are not read here.
        label_shape = tuple(labels.shape)
        if label_shape == tuple(raw.shape):
            return False

        print("Found inconsistent volumes. Will save the desired crops of the volume.")
        img_offset = np.array(raw.attrs["offset"]) // np.array(raw.attrs["resolution"])
        label_offset = np.array(labels.attrs["offset"]) // np.array(labels.attrs["resolution"])

        # Position of the labels inside the raw volume, in voxel coordinates.
        start = label_offset - img_offset
        stop = start + np.array(label_shape)
        desired_slices = tuple(slice(int(begin), int(end)) for begin, end in zip(start, stop))

        # Read only the region covered by the labels instead of the full raw volume.
        new_raw = raw[desired_slices]
        if new_raw.shape != label_shape:
            raise ValueError(
                f"The raw crop for '{organelle}' has shape {new_raw.shape}, expected {label_shape}."
            )

        # HACK: current way-to-go is to create a new hierarchy where we store the desired volume patches
        # TODO: we want to integrate this so that this slicing can be done just by passing the offsets
        f.create_dataset(f"volumes/raw_{organelle}", data=new_raw, chunks=new_raw.shape)

    return True
160
161
162def _check_input_args(input_arg, default_values):
163    if input_arg is None:
164        input_arg = default_values
165    else:
166        if isinstance(input_arg, str):
167            assert input_arg in default_values
168            input_arg = [input_arg]
169
170    return input_arg
171
172
def get_asem_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    download: bool = False,
    organelles: Optional[Union[List[str], str]] = None,
    volume_ids: Optional[Union[List[str], str]] = None,
    **kwargs
) -> Dataset:
    """Get dataset for segmentation of organelles in FIB-SEM cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        organelles: The choice of organelles. By default all organelles are used.
        volume_ids: The choice of volumes. By default all volumes listed for the
            respective organelle are used.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If a chosen volume is not listed for one of the chosen organelles.
    """
    # let's get the choice of organelles sorted
    organelles = _check_input_args(organelles, ORGANELLES)

    if isinstance(volume_ids, str):
        volume_ids = [volume_ids]

    # now let's get the chosen volumes have the chosen organelles
    all_datasets = []
    for organelle in organelles:
        # The volumes are resolved per organelle in a separate variable. Overwriting
        # `volume_ids` here would carry the defaults of the first organelle over to all
        # following organelles and make their validation fail.
        if volume_ids is None:
            organelle_volume_ids = ORGANELLES[organelle]
        else:
            for volume_id in volume_ids:
                if volume_id not in ORGANELLES[organelle]:
                    raise ValueError(
                        f"The chosen volume and organelle combination does not match: '{volume_id}' & '{organelle}'"
                    )
            organelle_volume_ids = volume_ids

        volume_paths = get_asem_paths(path, organelle_volume_ids, download)

        for volume_path in volume_paths:
            have_volumes_inconsistent = _make_volumes_consistent(volume_path, organelle)

            dataset = torch_em.default_segmentation_dataset(
                raw_paths=volume_path,
                raw_key=f"volumes/raw_{organelle}" if have_volumes_inconsistent else "volumes/raw",
                label_paths=volume_path,
                label_key=f"volumes/labels/{organelle}",
                patch_shape=patch_shape,
                is_seg_dataset=True,
                **kwargs
            )
            dataset.max_sampling_attempts = 5000
            all_datasets.append(dataset)

    return ConcatDataset(*all_datasets)
228
229
def get_asem_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    download: bool = False,
    organelles: Optional[Union[List[str], str]] = None,
    volume_ids: Optional[Union[List[str], str]] = None,
    **kwargs
) -> DataLoader:
    """Get dataloader for the segmentation of organelles in FIB-SEM cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        organelles: The choice of organelles.
        volume_ids: The choice of volumes.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_asem_dataset(
        path=path,
        patch_shape=patch_shape,
        download=download,
        organelles=organelles,
        volume_ids=volume_ids,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)

INCONSISTENT_VOLUMES = {'mito': ['cell_6.zarr', 'cell_13.zarr', 'cell_13a.zarr'], 'golgi': ['cell_3.zarr', 'cell_6.zarr'], 'er': ['cell_3.zarr', 'cell_6.zarr', 'cell_13.zarr']}

VOLUMES = {'cell_1': 'cell_1/cell_1.zarr', 'cell_2': 'cell_2/cell_2.zarr', 'cell_3': 'cell_3/cell_3.zarr', 'cell_6': 'cell_6/cell_6.zarr', 'cell_12': 'cell_12/cell_12.zarr', 'cell_13': 'cell_13/cell_13.zarr', 'cell_13a': 'cell_13a/cell_13a.zarr'}

ORGANELLES = {'mito': ['cell_1', 'cell_2', 'cell_3', 'cell_6', 'cell_13', 'cell_13a'], 'golgi': ['cell_1', 'cell_2', 'cell_3', 'cell_6'], 'er': ['cell_1', 'cell_2', 'cell_3', 'cell_6'], 'ccp': ['cell_12', 'cell_13'], 'np': ['cell_13a'], 'np_bottom': ['cell_13a']}

def get_asem_data( path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False): View Source

def get_asem_data(path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False):
    """Download the ASEM dataset.

    The dataset is located at https://open.quiltdata.com/b/asem-project.

    Volumes whose target folder already exists below `path` are skipped.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        volume_ids: List of volumes to download.
        download: Whether to download the data if it is not present.

    Raises:
        ModuleNotFoundError: If `download` is True and `quilt3` is not installed.
        FileNotFoundError: If a volume is not present and `download` is False.
    """
    if download and not have_quilt:
        raise ModuleNotFoundError("Please install quilt3: 'pip install quilt3'.")

    # The bucket handle is created lazily. `q3` is only bound when quilt3 could be imported,
    # so creating the handle up front would raise a NameError for users without quilt3,
    # even when all requested volumes are already available on disk.
    bucket = None

    for volume_id in volume_ids:
        relative_path = VOLUMES[volume_id]
        volume_path = os.path.join(path, relative_path)
        if os.path.exists(volume_path):
            continue

        if not download:
            raise FileNotFoundError(f"{relative_path} is not found, and 'download' is set to False.")

        if bucket is None:
            bucket = q3.Bucket("s3://asem-project")

        print(f"The ASEM dataset for sample '{volume_id}' is not available yet and will be downloaded and created.")
        print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
        bucket.fetch(
            key=f"datasets/{relative_path}/volumes/labels/",
            path=os.path.join(volume_path, "volumes", "labels/")
        )
        bucket.fetch(
            key=f"datasets/{relative_path}/volumes/raw/",
            path=os.path.join(volume_path, "volumes", "raw/")
        )
        # let's get the group metadata keyfiles
        bucket.fetch(key=f"datasets/{relative_path}/.zgroup", path=f"{volume_path}/")
        bucket.fetch(key=f"datasets/{relative_path}/volumes/.zgroup", path=f"{volume_path}/volumes/")

Download the ASEM dataset.

The dataset is located at https://open.quiltdata.com/b/asem-project.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
volume_ids: List of volumes to download.
download: Whether to download the data if it is not present.

def get_asem_paths( path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False) -> List[str]: View Source

 98def get_asem_paths(path: Union[os.PathLike, str], volume_ids: List[str], download: bool = False) -> List[str]:
 99    """Get paths to the ASEM data.
100
101    Args:
102        path: Filepath to a folder where the downloaded data will be saved.
103        volume_ids: List of volumes to download.
104        download: Whether to download the data if it is not present.
105
106    Returns:
107        List of paths for all volume ids.
108    """
109    get_asem_data(path, volume_ids, download)
110    volume_paths = [os.path.join(path, VOLUMES[vol_id]) for vol_id in volume_ids]
111    return volume_paths

Get paths to the ASEM data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
volume_ids: List of volumes to download.
download: Whether to download the data if it is not present.

Returns:

List of paths for all volume ids.

def get_asem_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], download: bool = False, organelles: Union[List[str], str, NoneType] = None, volume_ids: Union[List[str], str, NoneType] = None, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_asem_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    download: bool = False,
    organelles: Optional[Union[List[str], str]] = None,
    volume_ids: Optional[Union[List[str], str]] = None,
    **kwargs
) -> Dataset:
    """Get dataset for segmentation of organelles in FIB-SEM cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        organelles: The choice of organelles. By default all organelles are used.
        volume_ids: The choice of volumes. By default all volumes listed for the
            respective organelle are used.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If a chosen volume is not listed for one of the chosen organelles.
    """
    # let's get the choice of organelles sorted
    organelles = _check_input_args(organelles, ORGANELLES)

    if isinstance(volume_ids, str):
        volume_ids = [volume_ids]

    # now let's get the chosen volumes have the chosen organelles
    all_datasets = []
    for organelle in organelles:
        # The volumes are resolved per organelle in a separate variable. Overwriting
        # `volume_ids` here would carry the defaults of the first organelle over to all
        # following organelles and make their validation fail.
        if volume_ids is None:
            organelle_volume_ids = ORGANELLES[organelle]
        else:
            for volume_id in volume_ids:
                if volume_id not in ORGANELLES[organelle]:
                    raise ValueError(
                        f"The chosen volume and organelle combination does not match: '{volume_id}' & '{organelle}'"
                    )
            organelle_volume_ids = volume_ids

        volume_paths = get_asem_paths(path, organelle_volume_ids, download)

        for volume_path in volume_paths:
            have_volumes_inconsistent = _make_volumes_consistent(volume_path, organelle)

            dataset = torch_em.default_segmentation_dataset(
                raw_paths=volume_path,
                raw_key=f"volumes/raw_{organelle}" if have_volumes_inconsistent else "volumes/raw",
                label_paths=volume_path,
                label_key=f"volumes/labels/{organelle}",
                patch_shape=patch_shape,
                is_seg_dataset=True,
                **kwargs
            )
            dataset.max_sampling_attempts = 5000
            all_datasets.append(dataset)

    return ConcatDataset(*all_datasets)

Get dataset for segmentation of organelles in FIB-SEM cells.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
download: Whether to download the data if it is not present.
organelles: The choice of organelles.
volume_ids: The choice of volumes.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_asem_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], batch_size: int, download: bool = False, organelles: Union[List[str], str, NoneType] = None, volume_ids: Union[List[str], str, NoneType] = None, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_asem_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    download: bool = False,
    organelles: Optional[Union[List[str], str]] = None,
    volume_ids: Optional[Union[List[str], str]] = None,
    **kwargs
) -> DataLoader:
    """Get dataloader for the segmentation of organelles in FIB-SEM cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        organelles: The choice of organelles.
        volume_ids: The choice of volumes.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_asem_dataset(
        path=path,
        patch_shape=patch_shape,
        download=download,
        organelles=organelles,
        volume_ids=volume_ids,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)

Get dataloader for the segmentation of organelles in FIB-SEM cells.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
batch_size: The batch size for training.
download: Whether to download the data if it is not present.
organelles: The choice of organelles.
volume_ids: The choice of volumes.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.