torch_em.data.datasets.electron_microscopy.asem
import os
import numpy as np

import zarr

import torch_em

from .. import util
from ... import ConcatDataset

try:
    import quilt3 as q3
    have_quilt = True
except ModuleNotFoundError:
    have_quilt = False


# The following volumes do not have labels:
# - cell_8, cell_14, cell_15, cell_16, cell_17

# Legend for the per-volume annotations below:
# (E): "RuntimeError: Exception during zlib decompression: (-5)" (with `z5py`)
# (Y): similar shapes
# (N): dissimilar shapes (e.g. raw: (1000, 1100, 1200), labels: (200, 300, 400))

# Per organelle: zarr volumes whose raw and label arrays have mismatching shapes
# and therefore need the consistency fix in `_make_volumes_consistent`.
INCONSISTENT_VOLUMES = {
    "mito": ["cell_6.zarr", "cell_13.zarr", "cell_13a.zarr"],
    "golgi": ["cell_3.zarr", "cell_6.zarr"],
    "er": ["cell_3.zarr", "cell_6.zarr", "cell_13.zarr"],
}


# Mapping from volume id to the relative path of its zarr container below `path`.
VOLUMES = {
    "cell_1": "cell_1/cell_1.zarr",  # mito (Y) golgi (Y) er (Y)
    "cell_2": "cell_2/cell_2.zarr",  # mito (Y) golgi (Y) er (Y)
    "cell_3": "cell_3/cell_3.zarr",  # mito (Y) golgi (N) er (N)
    "cell_6": "cell_6/cell_6.zarr",  # mito (N) golgi (N) er (N)
    "cell_12": "cell_12/cell_12.zarr",  # ccp (Y)
    "cell_13": "cell_13/cell_13.zarr",  # ccp (Y) er (E) mito (N)
    "cell_13a": "cell_13a/cell_13a.zarr",  # np (Y) np_bottom (Y) mito (N)
}

# Which volumes provide labels for which organelle.
ORGANELLES = {
    "mito": ["cell_1", "cell_2", "cell_3", "cell_6", "cell_13", "cell_13a"],
    "golgi": ["cell_1", "cell_2", "cell_3", "cell_6",],
    "er": ["cell_1", "cell_2", "cell_3", "cell_6",],
    "ccp": ["cell_12", "cell_13"],
    "np": ["cell_13a"],
    "np_bottom": ["cell_13a"]
}


def _download_asem_dataset(path, volume_ids, download):
    """Fetch the requested ASEM volumes from https://open.quiltdata.com/b/asem-project.

    Volumes that already exist below `path` are not downloaded again.
    Returns the list of local paths to the zarr containers, in the order of `volume_ids`.
    Raises `ModuleNotFoundError` if downloading is requested without quilt3 installed,
    and `FileNotFoundError` if a volume is missing and `download` is False.
    """
    if download and not have_quilt:
        raise ModuleNotFoundError("Please install quilt3: 'pip install quilt3'.")

    # Only open the bucket when we may actually download something. Opening it
    # unconditionally would raise a NameError for `q3` when quilt3 is not
    # installed, even though all volumes might already be available locally.
    bucket = q3.Bucket("s3://asem-project") if (download and have_quilt) else None

    volume_paths = []
    for volume_id in volume_ids:
        volume_path = os.path.join(path, VOLUMES[volume_id])
        if not os.path.exists(volume_path):
            if not download:
                raise FileNotFoundError(f"{VOLUMES[volume_id]} is not found, and 'download' is set to False.")

            print(f"The ASEM dataset for sample '{volume_id}' is not available yet and will be downloaded and created.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
            bucket.fetch(
                key=f"datasets/{VOLUMES[volume_id]}/volumes/labels/",
                path=os.path.join(volume_path, "volumes", "labels/")
            )
            bucket.fetch(
                key=f"datasets/{VOLUMES[volume_id]}/volumes/raw/",
                path=os.path.join(volume_path, "volumes", "raw/")
            )
            # let's get the group metadata keyfiles
            bucket.fetch(key=f"datasets/{VOLUMES[volume_id]}/.zgroup", path=f"{volume_path}/")
            bucket.fetch(key=f"datasets/{VOLUMES[volume_id]}/volumes/.zgroup", path=f"{volume_path}/volumes/")

        volume_paths.append(volume_path)

    return volume_paths


def _make_volumes_consistent(volume_path, organelle):
    """Align raw and label volumes for the known inconsistent (raw, organelle) pairs.

    If the label volume is a crop of the raw volume (mismatching shapes), the
    matching raw crop is computed from the offset/resolution attributes and saved
    as a new dataset "volumes/raw_<organelle>" inside the zarr container.
    Returns True if this volume needs (or already has) the cropped raw dataset,
    False if raw and labels are already consistent.
    """
    have_inconsistent_volumes = False

    # we shouldn't load the volumes which are already consistent
    volume_name = os.path.split(volume_path)[-1]
    # there are organelles which aren't inconsistent at all, we ignore them.
    inc_vols = INCONSISTENT_VOLUMES.get(organelle)
    if inc_vols is None:  # i.e. the organelles have no inconsistency
        return have_inconsistent_volumes
    else:  # i.e. the organelles have some known inconsistency
        if volume_name not in inc_vols:  # if the current volume has inconsistency in the desired organelle or not
            return have_inconsistent_volumes

    with zarr.open(volume_path, "r+") as f:
        all_keys = list(f["volumes"].keys())
        # we shouldn't load the volume to make checks in case the processing has taken place already
        for this_key in all_keys:
            if this_key == f"raw_{organelle}":
                return True

        raw = f["volumes/raw"][:]

        this_key = f"volumes/labels/{organelle}"
        labels = f[this_key][:]

        if labels.shape != raw.shape:
            print("Found inconsistent volumes. Will save the desired crops of the volume.")
            have_inconsistent_volumes = True
            # convert the physical offsets (stored in world units) to voxel coordinates
            img_offset = np.array(
                np.array(f["volumes/raw"].attrs["offset"]) // np.array(f["volumes/raw"].attrs["resolution"])
            )
            label_offset = np.array(
                np.array(f[this_key].attrs["offset"]) // np.array(f[this_key].attrs["resolution"])
            )
            offset = label_offset - img_offset
            desired_slices = tuple(slice(o, s) for o, s in zip(offset, offset + labels.shape))
            new_raw = raw[desired_slices]

            assert new_raw.shape == labels.shape

            # HACK: current way-to-go is to create a new hierarchy where we store the desired volume patches
            # TODO: we want to integrate this so that this slicing can be done just by passing the offsets
            f.create_dataset(f"volumes/raw_{organelle}", data=new_raw, chunks=new_raw.shape)

    return have_inconsistent_volumes


def _check_input_args(input_arg, default_values):
    """Normalize an argument to a list: None -> all defaults, str -> [str] (must be a valid choice)."""
    if input_arg is None:
        input_arg = default_values
    else:
        if isinstance(input_arg, str):
            assert input_arg in default_values
            input_arg = [input_arg]

    return input_arg


# TODO download the asem data and re-use this function in get_asem_data
def get_asem_data(path):
    pass


def get_asem_dataset(
    path, patch_shape, ndim, download, organelles=None, volume_ids=None, **kwargs
):
    """Dataset for the segmentation of organelles in FIB-SEM cells.

    This dataset provides access to 3d images of organelles (mitochondria, golgi, endoplasmic reticulum)
    segmented in cells. If you use this data in your research, please cite: https://doi.org/10.1083/jcb.202208005
    """
    # let's get the choice of organelles sorted
    organelles = _check_input_args(organelles, ORGANELLES)

    # now let's get the chosen volumes have the chosen organelles
    all_datasets = []
    for organelle in organelles:
        # BUGFIX: resolve the volumes into a per-organelle local instead of
        # overwriting the `volume_ids` parameter. Overwriting it made the volume
        # list of the first organelle leak into all following iterations, so
        # requesting multiple organelles with `volume_ids=None` used wrong volumes.
        if volume_ids is None:
            this_volume_ids = ORGANELLES[organelle]
        else:
            this_volume_ids = [volume_ids] if isinstance(volume_ids, str) else volume_ids
            for volume_id in this_volume_ids:
                assert volume_id in ORGANELLES[organelle], \
                    f"The chosen volume and organelle combination does not match: '{volume_id}' & '{organelle}'"

        volume_paths = _download_asem_dataset(path, this_volume_ids, download)

        for volume_path in volume_paths:
            have_volumes_inconsistent = _make_volumes_consistent(volume_path, organelle)

            # use the cropped raw dataset for volumes with inconsistent raw/label shapes
            raw_key = f"volumes/raw_{organelle}" if have_volumes_inconsistent else "volumes/raw"
            dataset = torch_em.default_segmentation_dataset(
                volume_path, raw_key,
                volume_path, f"volumes/labels/{organelle}",
                patch_shape, ndim=ndim, is_seg_dataset=True,
                **kwargs
            )
            dataset.max_sampling_attempts = 5000
            all_datasets.append(dataset)

    return ConcatDataset(*all_datasets)


def get_asem_loader(
    path, patch_shape, batch_size, ndim, download=False, organelles=None, volume_ids=None, **kwargs
):
    """Dataloader for organelle segmentation in FIB-SEM cells. See `get_asem_dataset` for details."""
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_asem_dataset(path, patch_shape, ndim, download, organelles, volume_ids, **ds_kwargs)
    loader = torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
    return loader
INCONSISTENT_VOLUMES =
{'mito': ['cell_6.zarr', 'cell_13.zarr', 'cell_13a.zarr'], 'golgi': ['cell_3.zarr', 'cell_6.zarr'], 'er': ['cell_3.zarr', 'cell_6.zarr', 'cell_13.zarr']}
VOLUMES =
{'cell_1': 'cell_1/cell_1.zarr', 'cell_2': 'cell_2/cell_2.zarr', 'cell_3': 'cell_3/cell_3.zarr', 'cell_6': 'cell_6/cell_6.zarr', 'cell_12': 'cell_12/cell_12.zarr', 'cell_13': 'cell_13/cell_13.zarr', 'cell_13a': 'cell_13a/cell_13a.zarr'}
ORGANELLES =
{'mito': ['cell_1', 'cell_2', 'cell_3', 'cell_6', 'cell_13', 'cell_13a'], 'golgi': ['cell_1', 'cell_2', 'cell_3', 'cell_6'], 'er': ['cell_1', 'cell_2', 'cell_3', 'cell_6'], 'ccp': ['cell_12', 'cell_13'], 'np': ['cell_13a'], 'np_bottom': ['cell_13a']}
def
get_asem_data(path):
def
get_asem_dataset( path, patch_shape, ndim, download, organelles=None, volume_ids=None, **kwargs):
149def get_asem_dataset( 150 path, patch_shape, ndim, download, organelles=None, volume_ids=None, **kwargs 151): 152 """Dataset for the segmentation of organelles in FIB-SEM cells. 153 154 This dataset provides access to 3d images of organelles (mitochondria, golgi, endoplasmic reticulum) 155 segmented in cells. If you use this data in your research, please cite: https://doi.org/10.1083/jcb.202208005 156 """ 157 # let's get the choice of organelles sorted 158 organelles = _check_input_args(organelles, ORGANELLES) 159 160 # now let's get the chosen volumes have the chosen organelles 161 all_datasets = [] 162 for organelle in organelles: 163 if volume_ids is None: 164 volume_ids = ORGANELLES[organelle] 165 else: 166 if isinstance(volume_ids, str): 167 volume_ids = [volume_ids] 168 169 for volume_id in volume_ids: 170 assert volume_id in ORGANELLES[organelle], \ 171 f"The chosen volume and organelle combination does not match: '{volume_id}' & '{organelle}'" 172 173 volume_paths = _download_asem_dataset(path, volume_ids, download) 174 175 for volume_path in volume_paths: 176 have_volumes_inconsistent = _make_volumes_consistent(volume_path, organelle) 177 178 raw_key = f"volumes/raw_{organelle}" if have_volumes_inconsistent else "volumes/raw" 179 dataset = torch_em.default_segmentation_dataset( 180 volume_path, raw_key, 181 volume_path, f"volumes/labels/{organelle}", 182 patch_shape, ndim=ndim, is_seg_dataset=True, 183 **kwargs 184 ) 185 dataset.max_sampling_attempts = 5000 186 all_datasets.append(dataset) 187 188 return ConcatDataset(*all_datasets)
Dataset for the segmentation of organelles in FIB-SEM cells.
This dataset provides access to 3d images of organelles (mitochondria, golgi, endoplasmic reticulum) segmented in cells. If you use this data in your research, please cite: https://doi.org/10.1083/jcb.202208005
def
get_asem_loader( path, patch_shape, batch_size, ndim, download=False, organelles=None, volume_ids=None, **kwargs):
191def get_asem_loader( 192 path, patch_shape, batch_size, ndim, download=False, organelles=None, volume_ids=None, **kwargs 193): 194 """Dataloader for organelle segmentation in FIB-SEM cells. See `get_asem_dataset` for details.""" 195 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 196 ds = get_asem_dataset(path, patch_shape, ndim, download, organelles, volume_ids, **ds_kwargs) 197 loader = torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs) 198 return loader
Dataloader for organelle segmentation in FIB-SEM cells. See get_asem_dataset
for details.