torch_em.data.datasets.electron_microscopy.asem

import os
import numpy as np

import zarr

import torch_em

from .. import util
from ... import ConcatDataset

try:
    import quilt3 as q3
    have_quilt = True
except ModuleNotFoundError:
    have_quilt = False


# The following volumes do not have labels:
#   - cell_8, cell_14, cell_15, cell_16, cell_17

# Annotations used in the volume comments below:
# (E): "RuntimeError: Exception during zlib decompression: (-5)" (with `z5py`)
# (Y): raw and label volumes have similar shapes
# (N): raw and label volumes have dissimilar shapes (e.g. raw: (1000, 1100, 1200), labels: (200, 300, 400))

INCONSISTENT_VOLUMES = {
    "mito": ["cell_6.zarr", "cell_13.zarr", "cell_13a.zarr"],
    "golgi": ["cell_3.zarr", "cell_6.zarr"],
    "er": ["cell_3.zarr", "cell_6.zarr", "cell_13.zarr"],
}


VOLUMES = {
    "cell_1": "cell_1/cell_1.zarr",  # mito (Y) golgi (Y) er (Y)
    "cell_2": "cell_2/cell_2.zarr",  # mito (Y) golgi (Y) er (Y)
    "cell_3": "cell_3/cell_3.zarr",  # mito (Y) golgi (N) er (N)
    "cell_6": "cell_6/cell_6.zarr",  # mito (N) golgi (N) er (N)
    "cell_12": "cell_12/cell_12.zarr",  # ccp (Y)
    "cell_13": "cell_13/cell_13.zarr",  # ccp (Y) er (E) mito (N)
    "cell_13a": "cell_13a/cell_13a.zarr",  # np (Y) np_bottom (Y) mito (N)
}

ORGANELLES = {
    "mito": ["cell_1", "cell_2", "cell_3", "cell_6", "cell_13", "cell_13a"],
    "golgi": ["cell_1", "cell_2", "cell_3", "cell_6"],
    "er": ["cell_1", "cell_2", "cell_3", "cell_6"],
    "ccp": ["cell_12", "cell_13"],
    "np": ["cell_13a"],
    "np_bottom": ["cell_13a"],
}
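
# Illustration (not part of the original module): the two lookup tables above can be combined
# to find the zarr paths which carry labels for a given organelle, e.g.
#   >>> [VOLUMES[vol] for vol in ORGANELLES["ccp"]]
#   ['cell_12/cell_12.zarr', 'cell_13/cell_13.zarr']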


def _download_asem_dataset(path, volume_ids, download):
    """https://open.quiltdata.com/b/asem-project"""
    if download and not have_quilt:
        raise ModuleNotFoundError("Please install quilt3: 'pip install quilt3'.")

    # The bucket handle is only needed for downloading; quilt3 may not be installed
    # when all requested volumes are already available locally.
    b = q3.Bucket("s3://asem-project") if have_quilt else None

    volume_paths = []
    for volume_id in volume_ids:
        volume_path = os.path.join(path, VOLUMES[volume_id])
        if not os.path.exists(volume_path):
            if not download:
                raise FileNotFoundError(f"{VOLUMES[volume_id]} was not found and 'download' is set to False.")

            print(f"The ASEM dataset for sample '{volume_id}' is not available yet and will be downloaded.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet connection).")
            b.fetch(
                key=f"datasets/{VOLUMES[volume_id]}/volumes/labels/",
                path=os.path.join(volume_path, "volumes", "labels/")
            )
            b.fetch(
                key=f"datasets/{VOLUMES[volume_id]}/volumes/raw/",
                path=os.path.join(volume_path, "volumes", "raw/")
            )
            # fetch the zarr group metadata files as well
            b.fetch(key=f"datasets/{VOLUMES[volume_id]}/.zgroup", path=f"{volume_path}/")
            b.fetch(key=f"datasets/{VOLUMES[volume_id]}/volumes/.zgroup", path=f"{volume_path}/volumes/")

        volume_paths.append(volume_path)

    return volume_paths
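
# Minimal download sketch (assumes quilt3 is installed and "./data/asem" is a writable local path):
# the same quilt3 calls used above can fetch a single volume directly, e.g. for "cell_12":
#
#   import quilt3
#   bucket = quilt3.Bucket("s3://asem-project")
#   bucket.fetch(key="datasets/cell_12/cell_12.zarr/volumes/raw/", path="./data/asem/cell_12/cell_12.zarr/volumes/raw/")
#   bucket.fetch(key="datasets/cell_12/cell_12.zarr/volumes/labels/", path="./data/asem/cell_12/cell_12.zarr/volumes/labels/")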


def _make_volumes_consistent(volume_path, organelle):
    have_inconsistent_volumes = False

    # We should not load volumes which are already consistent.
    volume_name = os.path.split(volume_path)[-1]
    # Some organelles have no inconsistent volumes at all; those can be ignored directly.
    inc_vols = INCONSISTENT_VOLUMES.get(organelle)
    if inc_vols is None:  # this organelle has no known inconsistencies
        return have_inconsistent_volumes
    elif volume_name not in inc_vols:  # this volume is consistent for the chosen organelle
        return have_inconsistent_volumes

    with zarr.open(volume_path, "r+") as f:
        # If the cropped raw volume was already created in a previous run, we can skip the checks below.
        if f"raw_{organelle}" in f["volumes"]:
            return True

        raw = f["volumes/raw"][:]

        label_key = f"volumes/labels/{organelle}"
        labels = f[label_key][:]

        if labels.shape != raw.shape:
            print("Found inconsistent volumes. Will save the desired crops of the volume.")
            have_inconsistent_volumes = True
            # The offsets are stored in physical units; dividing by the resolution converts them to voxel coordinates.
            img_offset = np.array(f["volumes/raw"].attrs["offset"]) // np.array(f["volumes/raw"].attrs["resolution"])
            label_offset = np.array(f[label_key].attrs["offset"]) // np.array(f[label_key].attrs["resolution"])
            offset = label_offset - img_offset
            desired_slices = tuple(slice(o, o + s) for o, s in zip(offset, labels.shape))
            new_raw = raw[desired_slices]

            assert new_raw.shape == labels.shape

            # HACK: the current approach is to create a new dataset in the hierarchy that stores the cropped raw data.
            # TODO: integrate this so that the slicing can be done by just passing the offsets.
            f.create_dataset(f"volumes/raw_{organelle}", data=new_raw, chunks=new_raw.shape)

    return have_inconsistent_volumes
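
# Worked example of the cropping logic above (made-up numbers): if the raw volume starts at physical
# offset (0, 0, 0) and the labels at (400, 600, 800), both with a resolution of (4, 4, 4), then the
# label offset in voxels is (100, 150, 200). For labels of shape (200, 300, 400) the crop taken from
# the raw data is raw[100:300, 150:450, 200:600], which matches the label shape.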


def _check_input_args(input_arg, default_values):
    if input_arg is None:
        input_arg = default_values
    elif isinstance(input_arg, str):
        assert input_arg in default_values
        input_arg = [input_arg]

    return input_arg
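
# Behaviour of `_check_input_args` (illustration):
#   _check_input_args(None, ORGANELLES)    -> ORGANELLES, i.e. all default keys are used
#   _check_input_args("mito", ORGANELLES)  -> ["mito"]
#   _check_input_args(["er"], ORGANELLES)  -> ["er"], lists are passed through unchecked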


# TODO: implement the data download here and re-use this function in `get_asem_dataset`.
def get_asem_data(path):
    pass



def get_asem_dataset(
    path, patch_shape, ndim, download, organelles=None, volume_ids=None, **kwargs
):
    """Dataset for the segmentation of organelles in FIB-SEM cells.

    This dataset provides access to 3d images of organelles (mitochondria, golgi, endoplasmic reticulum)
    segmented in cells. If you use this data in your research, please cite: https://doi.org/10.1083/jcb.202208005
    """
    # Resolve the choice of organelles: None selects all available organelles, a string selects a single one.
    organelles = _check_input_args(organelles, ORGANELLES)

    # Check that the chosen volumes contain the chosen organelles and build one dataset per combination.
    all_datasets = []
    for organelle in organelles:
        if volume_ids is None:
            # Use all volumes that have labels for this organelle.
            # (A per-organelle variable is used so that the user input is not overwritten across iterations.)
            this_volume_ids = ORGANELLES[organelle]
        else:
            this_volume_ids = [volume_ids] if isinstance(volume_ids, str) else volume_ids
            for volume_id in this_volume_ids:
                assert volume_id in ORGANELLES[organelle], \
                    f"The chosen volume and organelle combination does not match: '{volume_id}' & '{organelle}'"

        volume_paths = _download_asem_dataset(path, this_volume_ids, download)

        for volume_path in volume_paths:
            have_volumes_inconsistent = _make_volumes_consistent(volume_path, organelle)

            raw_key = f"volumes/raw_{organelle}" if have_volumes_inconsistent else "volumes/raw"
            dataset = torch_em.default_segmentation_dataset(
                volume_path, raw_key,
                volume_path, f"volumes/labels/{organelle}",
                patch_shape, ndim=ndim, is_seg_dataset=True,
                **kwargs
            )
            dataset.max_sampling_attempts = 5000
            all_datasets.append(dataset)

    return ConcatDataset(*all_datasets)
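
# Usage sketch (assumptions: a local data root "./data/asem" and permission to download):
#
#   dataset = get_asem_dataset(
#       "./data/asem", patch_shape=(32, 256, 256), ndim=3, download=True, organelles="er"
#   )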


def get_asem_loader(
    path, patch_shape, batch_size, ndim, download=False, organelles=None, volume_ids=None, **kwargs
):
    """Dataloader for organelle segmentation in FIB-SEM cells. See `get_asem_dataset` for details."""
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_asem_dataset(path, patch_shape, ndim, download, organelles, volume_ids, **ds_kwargs)
    loader = torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
    return loader
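
# Usage sketch for the loader (assumptions: same data root as above, 3d patches, batch size 2):
#
#   loader = get_asem_loader(
#       "./data/asem", patch_shape=(32, 256, 256), batch_size=2, ndim=3,
#       download=True, organelles="mito", volume_ids="cell_1",
#   )
#   for x, y in loader:
#       ...  # x: raw FIB-SEM patches, y: the corresponding mito labels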