torch_em.data.datasets.light_microscopy.wing_disc
The Wing Disc dataset contains annotations for 3D cell instance segmentation in confocal microscopy images of Drosophila wing discs.
The dataset is located at https://www.ebi.ac.uk/biostudies/BioImages/studies/S-BIAD843. This dataset is from the publication https://www.nature.com/articles/s44303-025-00099-7. Please cite it if you use this dataset in your research.
"""The Wing Disc dataset contains annotations for 3D cell instance segmentation
in confocal microscopy images of Drosophila wing discs.

The dataset is located at https://www.ebi.ac.uk/biostudies/BioImages/studies/S-BIAD843.
This dataset is from the publication https://www.nature.com/articles/s44303-025-00099-7.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from natsort import natsorted
from typing import Union, Tuple, Optional, List

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


BASE_URL = "https://ftp.ebi.ac.uk/biostudies/fire/S-BIAD/843/S-BIAD843/Files"

# Volume name -> imaging modality label. The names double as the file stems of
# the downloaded archives and of the converted HDF5 files.
VOLUMES = {
    "WD1_15-02_WT_confocalonly": "confocal",
    "WD2.1_21-02_WT_confocalonly": "confocal",
    "WD1.1_17-03_WT_MP": "multiphoton",
    "WD3.2_21-03_WT_MP": "multiphoton",
}


def _preprocess_volumes(path, data_dir):
    """Convert OME-Zarr volumes to HDF5 files with raw and labels datasets.

    Args:
        path: Root folder of the dataset; the extracted zarr volumes are read from `<path>/zarr`.
        data_dir: Folder where one `<name>.h5` file per entry of `VOLUMES` is written.
            Volumes whose HDF5 file already exists are skipped.

    Raises:
        AssertionError: If the raw and segmentation volumes differ in shape after squeezing.
    """
    import h5py
    import zarr

    os.makedirs(data_dir, exist_ok=True)

    zarr_dir = os.path.join(path, "zarr")

    for name in VOLUMES:
        h5_path = os.path.join(data_dir, f"{name}.h5")
        if os.path.exists(h5_path):
            continue

        # Read raw volume: shape (1, 1, Z, Y, X) and squeeze to (Z, Y, X).
        raw_zarr = os.path.join(zarr_dir, f"{name}.zarr", "0", "0")
        raw = np.array(zarr.open(store=zarr.storage.LocalStore(raw_zarr)))
        raw = raw.squeeze()

        # Read segmentation: shape (Z, 1, 1, Y, X) and squeeze to (Z, Y, X).
        seg_zarr = os.path.join(zarr_dir, f"{name}_segmented.zarr", "0", "0")
        seg = np.array(zarr.open(store=zarr.storage.LocalStore(seg_zarr)))
        seg = seg.squeeze().astype("uint32")

        assert raw.shape == seg.shape, f"Shape mismatch for {name}: raw={raw.shape}, seg={seg.shape}"

        # Write to a temporary file and move it into place afterwards. An interrupted
        # conversion would otherwise leave a truncated '<name>.h5' behind, which the
        # existence check above and the '*.h5' count in `get_wing_disc_data` would
        # treat as a finished volume. The '.tmp' suffix keeps it out of '*.h5' globs.
        tmp_path = f"{h5_path}.tmp"
        try:
            with h5py.File(tmp_path, "w") as f:
                f.create_dataset("raw", data=raw, compression="gzip")
                f.create_dataset("labels", data=seg, compression="gzip")
            os.replace(tmp_path, h5_path)
        finally:
            # Only present if writing failed before the rename.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)


def get_wing_disc_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the Wing Disc dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed data directory.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir) and len(glob(os.path.join(data_dir, "*.h5"))) == len(VOLUMES):
        return data_dir

    zarr_dir = os.path.join(path, "zarr")
    os.makedirs(zarr_dir, exist_ok=True)

    for name in VOLUMES:
        zarr_path = os.path.join(zarr_dir, f"{name}.zarr")
        if not os.path.exists(zarr_path):
            zip_fname = f"{name}.ome.zarr.zip"
            zip_path = os.path.join(path, zip_fname)
            url = f"{BASE_URL}/{zip_fname}"
            util.download_source(path=zip_path, url=url, download=download, checksum=None)
            util.unzip(zip_path=zip_path, dst=zarr_dir)

        seg_zarr_path = os.path.join(zarr_dir, f"{name}_segmented.zarr")
        if not os.path.exists(seg_zarr_path):
            seg_zip_fname = f"{name}_segmented.ome.zarr.zip"
            seg_zip_path = os.path.join(path, seg_zip_fname)
            seg_url = f"{BASE_URL}/{seg_zip_fname}"
            util.download_source(path=seg_zip_path, url=seg_url, download=download, checksum=None)
            util.unzip(zip_path=seg_zip_path, dst=zarr_dir)

    _preprocess_volumes(path, data_dir)

    return data_dir


def get_wing_disc_paths(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> List[str]:
    """Get paths to the Wing Disc data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    data_dir = get_wing_disc_data(path, download)
    data_paths = natsorted(glob(os.path.join(data_dir, "*.h5")))
    assert len(data_paths) > 0
    return data_paths


def get_wing_disc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the Wing Disc dataset for 3D cell segmentation in Drosophila wing discs.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    data_paths = get_wing_disc_paths(path, download)

    kwargs = util.ensure_transforms(ndim=3, **kwargs)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, offsets=offsets, boundaries=boundaries, binary=binary
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=3,
        **kwargs
    )


def get_wing_disc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the Wing Disc dataloader for 3D cell segmentation in Drosophila wing discs.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_wing_disc_dataset(
        path=path,
        patch_shape=patch_shape,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
BASE_URL =
'https://ftp.ebi.ac.uk/biostudies/fire/S-BIAD/843/S-BIAD843/Files'
VOLUMES =
{'WD1_15-02_WT_confocalonly': 'confocal', 'WD2.1_21-02_WT_confocalonly': 'confocal', 'WD1.1_17-03_WT_MP': 'multiphoton', 'WD3.2_21-03_WT_MP': 'multiphoton'}
def
get_wing_disc_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_wing_disc_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the Wing Disc volumes are downloaded, extracted and converted to HDF5.

    If `<path>/data` already holds one `.h5` file per entry in `VOLUMES`, nothing is
    downloaded or converted. Otherwise every missing raw / segmentation zarr archive
    is fetched and unpacked into `<path>/zarr`, and the volumes are converted.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed data directory.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        converted = glob(os.path.join(data_dir, "*.h5"))
        if len(converted) == len(VOLUMES):
            return data_dir

    zarr_root = os.path.join(path, "zarr")
    os.makedirs(zarr_root, exist_ok=True)

    for name in VOLUMES:
        # Empty suffix -> raw volume, "_segmented" -> its segmentation (raw is handled first).
        for suffix in ("", "_segmented"):
            extracted = os.path.join(zarr_root, f"{name}{suffix}.zarr")
            if os.path.exists(extracted):
                continue

            archive_name = f"{name}{suffix}.ome.zarr.zip"
            archive_path = os.path.join(path, archive_name)
            util.download_source(
                path=archive_path, url=f"{BASE_URL}/{archive_name}", download=download, checksum=None
            )
            util.unzip(zip_path=archive_path, dst=zarr_root)

    _preprocess_volumes(path, data_dir)

    return data_dir
Download the Wing Disc dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the preprocessed data directory.
def
get_wing_disc_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
def get_wing_disc_paths(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> List[str]:
    """Collect the HDF5 files of the Wing Disc dataset in natural sort order.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    h5_pattern = os.path.join(get_wing_disc_data(path, download), "*.h5")
    volume_files = natsorted(glob(h5_pattern))
    assert len(volume_files) > 0
    return volume_files
Get paths to the Wing Disc data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the stored data.
def
get_wing_disc_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_wing_disc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the Wing Disc dataset for 3D cell segmentation in Drosophila wing discs.

    Raw data and labels are read from the same HDF5 files, under the keys
    "raw" and "labels" respectively.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_files = get_wing_disc_paths(path, download)

    # Fill in the transforms for 3D data, then attach the requested label transform.
    ds_kwargs = util.ensure_transforms(ndim=3, **kwargs)
    ds_kwargs, _ = util.add_instance_label_transform(
        ds_kwargs,
        add_binary_target=True,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_files,
        raw_key="raw",
        label_paths=volume_files,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=3,
        **ds_kwargs
    )
    return dataset
Get the Wing Disc dataset for 3D cell segmentation in Drosophila wing discs.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_wing_disc_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int, int], offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_wing_disc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the Wing Disc dataloader for 3D cell segmentation in Drosophila wing discs.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    wing_disc_ds = get_wing_disc_dataset(
        path,
        patch_shape,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        download=download,
        **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(dataset=wing_disc_ds, batch_size=batch_size, **dataloader_kwargs)
    return loader
Get the Wing Disc dataloader for 3D cell segmentation in Drosophila wing discs.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.