torch_em.data.datasets.light_microscopy.oocyteseg
The OocyteSeg dataset contains annotations for binary membrane segmentation in transmitted light microscopy images of oocytes from multiple species.
NOTE: The dataset only has semantic (binary) segmentation.
The dataset is from the publication https://doi.org/10.1242/jcs.260281. Please cite it if you use this dataset in your research.
"""The OocyteSeg dataset contains annotations for binary membrane segmentation
in transmitted light microscopy images of oocytes from multiple species.

NOTE: The dataset only has semantic (binary) segmentation.

The dataset is from the publication https://doi.org/10.1242/jcs.260281.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from typing import Union, Literal, Optional, Tuple, List

import numpy as np
import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://zenodo.org/records/6502830/files/SegmentationCortex.tar.gz"
CHECKSUM = "1da5d4fd102d8e903744db424f6114c6"

SPECIES = ["mouse", "human", "sea_urchin"]

# Sub-folders of the extracted archive that make up each split, per species.
_SUBDIRS = {
    "mouse": {
        "train": ["exp1", "exp2"],
        "test": ["exp1_test", "exp2_test"],
    },
    "human": {
        "train": ["clin1", "clin2"],
        "test": ["clin1_test", "clin2_test"],
    },
    "sea_urchin": {
        "train": ["train"],
        "test": ["test"],
    },
}


def _preprocess_data(data_dir, processed_dir, species, split):
    """Preprocess images and masks to ensure consistent format.

    Some sea urchin images are stored as RGB instead of grayscale.
    Masks are stored as 0/255 and need to be normalized to 0/1.
    Multi-channel images are averaged to a single uint8 channel and masks are binarized to uint8.

    The function is idempotent: image/mask pairs that already exist in `processed_dir` are skipped,
    so it can be called again to complete an interrupted run.

    Args:
        data_dir: The extracted data directory, containing one folder per species.
        processed_dir: The output directory; 'images' and 'masks' sub-folders are created in it.
        species: The species to process. A key of `_SUBDIRS`.
        split: The data split to process. A key of `_SUBDIRS[species]`.
    """
    img_out_dir = os.path.join(processed_dir, "images")
    mask_out_dir = os.path.join(processed_dir, "masks")
    os.makedirs(img_out_dir, exist_ok=True)
    os.makedirs(mask_out_dir, exist_ok=True)

    subdirs = _SUBDIRS[species][split]

    for subdir in subdirs:
        input_dir = os.path.join(data_dir, species, subdir, "input")
        mask_dir = os.path.join(data_dir, species, subdir, "mask")

        # Only keep samples for which both an image and a mask with the same name exist.
        input_names = {os.path.splitext(f)[0] for f in os.listdir(input_dir) if f.endswith(".png")}
        mask_names = {os.path.splitext(f)[0] for f in os.listdir(mask_dir) if f.endswith(".png")}
        matched = sorted(input_names & mask_names)

        for name in matched:
            # The sub-folder name is used as prefix, since all sub-folders of a split share the output folders.
            img_out = os.path.join(img_out_dir, f"{subdir}_{name}.tif")
            mask_out = os.path.join(mask_out_dir, f"{subdir}_{name}.tif")

            # A pair is only skipped if both files exist, so incomplete pairs get rewritten.
            if os.path.exists(img_out) and os.path.exists(mask_out):
                continue

            img = imageio.imread(os.path.join(input_dir, f"{name}.png"))
            if img.ndim == 3:
                # Average the first three channels; a potential alpha channel is ignored.
                img = np.mean(img[..., :3], axis=-1).astype("uint8")
            imageio.imwrite(img_out, img, compression="zlib")

            mask = imageio.imread(os.path.join(mask_dir, f"{name}.png"))
            if mask.ndim == 3:
                mask = mask[..., 0]
            mask = (mask > 0).astype("uint8")
            imageio.imwrite(mask_out, mask, compression="zlib")


def get_oocyteseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the OocyteSeg dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    data_dir = os.path.join(path, "SegmentationCortex")

    # The presence of the extracted folder is taken as "already downloaded".
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        tar_path = os.path.join(path, "SegmentationCortex.tar.gz")
        util.download_source(path=tar_path, url=URL, download=download, checksum=CHECKSUM)
        # NOTE(review): the archive is a .tar.gz but is handed to `util.unzip` -
        # confirm that this helper can unpack tarballs.
        util.unzip(zip_path=tar_path, dst=path)

    return data_dir


def get_oocyteseg_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the OocyteSeg data.

    The raw data is converted to tif files in 'processed/<species>/<split>' on first use.
    The conversion is also re-run if the number of converted images and masks disagrees,
    which repairs the output of a previously interrupted conversion.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    assert split in ("train", "test"), f"'{split}' is not a valid split. Choose from 'train' or 'test'."

    if species is None:
        species_list = SPECIES
    else:
        assert species in SPECIES, f"'{species}' is not a valid species. Choose from {SPECIES}."
        species_list = [species]

    data_dir = get_oocyteseg_data(path, download)

    all_image_paths = []
    all_seg_paths = []

    from natsort import natsorted

    for sp in species_list:
        processed_dir = os.path.join(path, "processed", sp, split)
        image_pattern = os.path.join(processed_dir, "images", "*.tif")
        mask_pattern = os.path.join(processed_dir, "masks", "*.tif")

        image_paths = glob(image_pattern)
        seg_paths = glob(mask_pattern)

        # Convert the data if nothing was converted yet (glob is empty for a missing folder),
        # or if images and masks are out of sync. Existing complete pairs are skipped
        # by the conversion, so re-running it only fills in what is missing.
        if not image_paths or len(image_paths) != len(seg_paths):
            _preprocess_data(data_dir, processed_dir, sp, split)
            image_paths = glob(image_pattern)
            seg_paths = glob(mask_pattern)

        # Images and masks share their file names, so natural sorting aligns both lists.
        image_paths = natsorted(image_paths)
        seg_paths = natsorted(seg_paths)

        assert len(image_paths) == len(seg_paths), \
            f"Mismatch: {len(image_paths)} images vs {len(seg_paths)} masks for {sp}/{split}"
        assert len(image_paths) > 0, f"No images found for {sp}/{split}"

        all_image_paths.extend(image_paths)
        all_seg_paths.extend(seg_paths)

    return all_image_paths, all_seg_paths


def get_oocyteseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the OocyteSeg dataset for binary membrane segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, seg_paths = get_oocyteseg_paths(path, split, species, download)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=seg_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        ndim=2,
        **kwargs
    )


def get_oocyteseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the OocyteSeg dataloader for binary membrane segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Keyword arguments accepted by the dataset factory go to the dataset, the rest to the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_oocyteseg_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        species=species,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
URL =
'https://zenodo.org/records/6502830/files/SegmentationCortex.tar.gz'
CHECKSUM =
'1da5d4fd102d8e903744db424f6114c6'
SPECIES =
['mouse', 'human', 'sea_urchin']
def
get_oocyteseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_oocyteseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the OocyteSeg data is available locally and return its location.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    extracted_dir = os.path.join(path, "SegmentationCortex")

    # The presence of the extracted folder is taken as "already downloaded".
    if not os.path.exists(extracted_dir):
        os.makedirs(path, exist_ok=True)
        archive_path = os.path.join(path, "SegmentationCortex.tar.gz")
        util.download_source(path=archive_path, url=URL, download=download, checksum=CHECKSUM)
        # NOTE(review): the archive is a .tar.gz but is handed to `util.unzip` -
        # confirm that this helper can unpack tarballs.
        util.unzip(zip_path=archive_path, dst=path)

    return extracted_dir
Download the OocyteSeg dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the extracted data directory.
def
get_oocyteseg_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'] = 'train', species: Optional[str] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_oocyteseg_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the OocyteSeg data.

    The raw data is converted to tif files in 'processed/<species>/<split>' on first use.
    The conversion is also re-run if the number of converted images and masks disagrees,
    which repairs the output of a previously interrupted conversion.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    assert split in ("train", "test"), f"'{split}' is not a valid split. Choose from 'train' or 'test'."

    if species is None:
        species_list = SPECIES
    else:
        assert species in SPECIES, f"'{species}' is not a valid species. Choose from {SPECIES}."
        species_list = [species]

    data_dir = get_oocyteseg_data(path, download)

    all_image_paths = []
    all_seg_paths = []

    from natsort import natsorted

    for sp in species_list:
        processed_dir = os.path.join(path, "processed", sp, split)
        image_pattern = os.path.join(processed_dir, "images", "*.tif")
        mask_pattern = os.path.join(processed_dir, "masks", "*.tif")

        image_paths = glob(image_pattern)
        seg_paths = glob(mask_pattern)

        # Convert the data if nothing was converted yet (glob is empty for a missing folder),
        # or if images and masks are out of sync. Existing complete pairs are skipped
        # by the conversion, so re-running it only fills in what is missing.
        if not image_paths or len(image_paths) != len(seg_paths):
            _preprocess_data(data_dir, processed_dir, sp, split)
            image_paths = glob(image_pattern)
            seg_paths = glob(mask_pattern)

        # Images and masks share their file names, so natural sorting aligns both lists.
        image_paths = natsorted(image_paths)
        seg_paths = natsorted(seg_paths)

        assert len(image_paths) == len(seg_paths), \
            f"Mismatch: {len(image_paths)} images vs {len(seg_paths)} masks for {sp}/{split}"
        assert len(image_paths) > 0, f"No images found for {sp}/{split}"

        all_image_paths.extend(image_paths)
        all_seg_paths.extend(seg_paths)

    return all_image_paths, all_seg_paths
Get paths to the OocyteSeg data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. One of 'train' or 'test'.
- species: The species to select. One of 'mouse', 'human' or 'sea_urchin'. If None, data from all species is returned.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_oocyteseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'] = 'train', species: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_oocyteseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the OocyteSeg dataset for binary membrane segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_oocyteseg_paths(path=path, split=split, species=species, download=download)

    # The data consists of 2d images stored as individual files, hence no keys and no seg dataset.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        ndim=2,
        **dataset_kwargs
    )
    return dataset
Get the OocyteSeg dataset for binary membrane segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train' or 'test'.
- species: The species to select. One of 'mouse', 'human' or 'sea_urchin'. If None, data from all species is returned.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_oocyteseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'] = 'train', species: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_oocyteseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the OocyteSeg dataloader for binary membrane segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Keyword arguments accepted by the dataset factory go to the dataset, the rest to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    oocyte_dataset = get_oocyteseg_dataset(
        path, patch_shape, split=split, species=species, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset=oocyte_dataset, batch_size=batch_size, **dataloader_kwargs)
Get the OocyteSeg dataloader for binary membrane segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train' or 'test'.
- species: The species to select. One of 'mouse', 'human' or 'sea_urchin'. If None, data from all species is returned.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.