torch_em.data.datasets.light_microscopy.oocyteseg

The OocyteSeg dataset contains annotations for binary membrane segmentation in transmitted light microscopy images of oocytes from multiple species.

NOTE: The dataset only has semantic (binary) segmentation.

The dataset is from the publication https://doi.org/10.1242/jcs.260281. Please cite it if you use this dataset in your research.

View Source

  1"""The OocyteSeg dataset contains annotations for binary membrane segmentation
  2in transmitted light microscopy images of oocytes from multiple species.
  3
  4NOTE: The dataset only has semantic (binary) segmentation.
  5
  6The dataset is from the publication https://doi.org/10.1242/jcs.260281.
  7Please cite it if you use this dataset in your research.
  8"""
  9
 10import os
 11from glob import glob
 12from typing import Union, Literal, Optional, Tuple, List
 13
 14import numpy as np
 15import imageio.v3 as imageio
 16
 17from torch.utils.data import Dataset, DataLoader
 18
 19import torch_em
 20
 21from .. import util
 22
 23
# Download location of the dataset archive (a gzipped tarball).
URL = "https://zenodo.org/records/6502830/files/SegmentationCortex.tar.gz"
# Checksum handed to `util.download_source` together with the URL.
CHECKSUM = "1da5d4fd102d8e903744db424f6114c6"

# Species names accepted by the `species` argument of the functions below.
SPECIES = ["mouse", "human", "sea_urchin"]

# For each species and split: the sub-folders below `<data_dir>/<species>/`
# that are read during preprocessing. Each sub-folder is accessed through its
# "input" (images) and "mask" (labels) children.
_SUBDIRS = {
    "mouse": {
        "train": ["exp1", "exp2"],
        "test": ["exp1_test", "exp2_test"],
    },
    "human": {
        "train": ["clin1", "clin2"],
        "test": ["clin1_test", "clin2_test"],
    },
    "sea_urchin": {
        "train": ["train"],
        "test": ["test"],
    },
}
 43
 44
 45def _preprocess_data(data_dir, processed_dir, species, split):
 46    """Preprocess images and masks to ensure consistent format.
 47
 48    Some sea urchin images are stored as RGB instead of grayscale.
 49    Masks are stored as 0/255 and need to be normalized to 0/1.
 50    This function converts all data to a consistent single-channel uint8 format.
 51    """
 52    img_out_dir = os.path.join(processed_dir, "images")
 53    mask_out_dir = os.path.join(processed_dir, "masks")
 54    os.makedirs(img_out_dir, exist_ok=True)
 55    os.makedirs(mask_out_dir, exist_ok=True)
 56
 57    subdirs = _SUBDIRS[species][split]
 58
 59    for subdir in subdirs:
 60        input_dir = os.path.join(data_dir, species, subdir, "input")
 61        mask_dir = os.path.join(data_dir, species, subdir, "mask")
 62
 63        input_names = {os.path.splitext(f)[0] for f in os.listdir(input_dir) if f.endswith(".png")}
 64        mask_names = {os.path.splitext(f)[0] for f in os.listdir(mask_dir) if f.endswith(".png")}
 65        matched = sorted(input_names & mask_names)
 66
 67        for name in matched:
 68            img_out = os.path.join(img_out_dir, f"{subdir}_{name}.tif")
 69            mask_out = os.path.join(mask_out_dir, f"{subdir}_{name}.tif")
 70
 71            if os.path.exists(img_out) and os.path.exists(mask_out):
 72                continue
 73
 74            img = imageio.imread(os.path.join(input_dir, f"{name}.png"))
 75            if img.ndim == 3:
 76                img = np.mean(img[..., :3], axis=-1).astype("uint8")
 77            imageio.imwrite(img_out, img, compression="zlib")
 78
 79            mask = imageio.imread(os.path.join(mask_dir, f"{name}.png"))
 80            if mask.ndim == 3:
 81                mask = mask[..., 0]
 82            mask = (mask > 0).astype("uint8")
 83            imageio.imwrite(mask_out, mask, compression="zlib")
 84
 85
 86def get_oocyteseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 87    """Download the OocyteSeg dataset.
 88
 89    Args:
 90        path: Filepath to a folder where the downloaded data will be saved.
 91        download: Whether to download the data if it is not present.
 92
 93    Returns:
 94        The filepath to the extracted data directory.
 95    """
 96    data_dir = os.path.join(path, "SegmentationCortex")
 97    if os.path.exists(data_dir):
 98        return data_dir
 99
100    os.makedirs(path, exist_ok=True)
101    tar_path = os.path.join(path, "SegmentationCortex.tar.gz")
102    util.download_source(path=tar_path, url=URL, download=download, checksum=CHECKSUM)
103    util.unzip(zip_path=tar_path, dst=path)
104
105    return data_dir
106
107
def get_oocyteseg_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the OocyteSeg data.

    The converted tif files are stored in `path/processed/<species>/<split>`.
    They are created on the first call and re-used afterwards.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.

    Returns:
        A tuple of two lists: the filepaths for the image data and
        the filepaths for the label data.

    Raises:
        AssertionError: If `split` or `species` is invalid, or if no data or an
            unequal number of images and masks is found for a species.
    """
    assert split in ("train", "test"), f"'{split}' is not a valid split. Choose from 'train' or 'test'."

    if species is None:
        species_list = SPECIES
    else:
        assert species in SPECIES, f"'{species}' is not a valid species. Choose from {SPECIES}."
        species_list = [species]

    data_dir = get_oocyteseg_data(path, download)

    from natsort import natsorted

    all_image_paths = []
    all_seg_paths = []

    for sp in species_list:
        processed_dir = os.path.join(path, "processed", sp, split)
        image_pattern = os.path.join(processed_dir, "images", "*.tif")
        mask_pattern = os.path.join(processed_dir, "masks", "*.tif")

        image_paths = glob(image_pattern)
        seg_paths = glob(mask_pattern)

        # Run the conversion when nothing is cached yet, and also when the
        # cache holds an unequal number of images and masks (e.g. because an
        # earlier conversion was interrupted). The conversion skips pairs that
        # are already complete, so re-running it only fills in what is missing.
        if not image_paths or len(image_paths) != len(seg_paths):
            _preprocess_data(data_dir, processed_dir, sp, split)
            image_paths = glob(image_pattern)
            seg_paths = glob(mask_pattern)

        # Images and masks share their file names, so sorting both lists the
        # same way keeps them aligned.
        image_paths = natsorted(image_paths)
        seg_paths = natsorted(seg_paths)

        assert len(image_paths) == len(seg_paths), \
            f"Mismatch: {len(image_paths)} images vs {len(seg_paths)} masks for {sp}/{split}"
        assert len(image_paths) > 0, f"No images found for {sp}/{split}"

        all_image_paths.extend(image_paths)
        all_seg_paths.extend(seg_paths)

    return all_image_paths, all_seg_paths
161
162
def get_oocyteseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the OocyteSeg dataset for binary membrane segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_oocyteseg_paths(path=path, split=split, species=species, download=download)

    # The data consists of 2d images, so the keyword arguments are completed for ndim=2.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        ndim=2,
        **dataset_kwargs
    )
    return dataset
199
200
def get_oocyteseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the OocyteSeg dataloader for binary membrane segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset factory from the remaining
    # ones, which are forwarded to the loader.
    dataset_kwargs, remaining_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    oocyte_dataset = get_oocyteseg_dataset(
        path=path, patch_shape=patch_shape, split=split, species=species, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset=oocyte_dataset, batch_size=batch_size, **remaining_kwargs)
    return loader

URL = 'https://zenodo.org/records/6502830/files/SegmentationCortex.tar.gz'

CHECKSUM = '1da5d4fd102d8e903744db424f6114c6'

SPECIES = ['mouse', 'human', 'sea_urchin']

def get_oocyteseg_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

 87def get_oocyteseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 88    """Download the OocyteSeg dataset.
 89
 90    Args:
 91        path: Filepath to a folder where the downloaded data will be saved.
 92        download: Whether to download the data if it is not present.
 93
 94    Returns:
 95        The filepath to the extracted data directory.
 96    """
 97    data_dir = os.path.join(path, "SegmentationCortex")
 98    if os.path.exists(data_dir):
 99        return data_dir
100
101    os.makedirs(path, exist_ok=True)
102    tar_path = os.path.join(path, "SegmentationCortex.tar.gz")
103    util.download_source(path=tar_path, url=URL, download=download, checksum=CHECKSUM)
104    util.unzip(zip_path=tar_path, dst=path)
105
106    return data_dir

Download the OocyteSeg dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The filepath to the extracted data directory.

def get_oocyteseg_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'] = 'train', species: Optional[str] = None, download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_oocyteseg_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the OocyteSeg data.

    The converted tif files are stored in `path/processed/<species>/<split>`.
    They are created on the first call and re-used afterwards.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.

    Returns:
        A tuple of two lists: the filepaths for the image data and
        the filepaths for the label data.

    Raises:
        AssertionError: If `split` or `species` is invalid, or if no data or an
            unequal number of images and masks is found for a species.
    """
    assert split in ("train", "test"), f"'{split}' is not a valid split. Choose from 'train' or 'test'."

    if species is None:
        species_list = SPECIES
    else:
        assert species in SPECIES, f"'{species}' is not a valid species. Choose from {SPECIES}."
        species_list = [species]

    data_dir = get_oocyteseg_data(path, download)

    from natsort import natsorted

    all_image_paths = []
    all_seg_paths = []

    for sp in species_list:
        processed_dir = os.path.join(path, "processed", sp, split)
        image_pattern = os.path.join(processed_dir, "images", "*.tif")
        mask_pattern = os.path.join(processed_dir, "masks", "*.tif")

        image_paths = glob(image_pattern)
        seg_paths = glob(mask_pattern)

        # Run the conversion when nothing is cached yet, and also when the
        # cache holds an unequal number of images and masks (e.g. because an
        # earlier conversion was interrupted). The conversion skips pairs that
        # are already complete, so re-running it only fills in what is missing.
        if not image_paths or len(image_paths) != len(seg_paths):
            _preprocess_data(data_dir, processed_dir, sp, split)
            image_paths = glob(image_pattern)
            seg_paths = glob(mask_pattern)

        # Images and masks share their file names, so sorting both lists the
        # same way keeps them aligned.
        image_paths = natsorted(image_paths)
        seg_paths = natsorted(seg_paths)

        assert len(image_paths) == len(seg_paths), \
            f"Mismatch: {len(image_paths)} images vs {len(seg_paths)} masks for {sp}/{split}"
        assert len(image_paths) > 0, f"No images found for {sp}/{split}"

        all_image_paths.extend(image_paths)
        all_seg_paths.extend(seg_paths)

    return all_image_paths, all_seg_paths

Get paths to the OocyteSeg data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. One of 'train' or 'test'.
species: The species to select. One of 'mouse', 'human' or 'sea_urchin'. If None, data from all species is returned.
download: Whether to download the data if it is not present.

Returns:

A tuple of two lists: the filepaths for the image data and the filepaths for the label data.

def get_oocyteseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'] = 'train', species: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_oocyteseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the OocyteSeg dataset for binary membrane segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_oocyteseg_paths(path=path, split=split, species=species, download=download)

    # The data consists of 2d images, so the keyword arguments are completed for ndim=2.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        ndim=2,
        **dataset_kwargs
    )
    return dataset

Get the OocyteSeg dataset for binary membrane segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The data split to use. One of 'train' or 'test'.
species: The species to select. One of 'mouse', 'human' or 'sea_urchin'. If None, data from all species is returned.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_oocyteseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'] = 'train', species: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_oocyteseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    species: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the OocyteSeg dataloader for binary membrane segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        species: The species to select. One of 'mouse', 'human' or 'sea_urchin'.
            If None, data from all species is returned.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset factory from the remaining
    # ones, which are forwarded to the loader.
    dataset_kwargs, remaining_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    oocyte_dataset = get_oocyteseg_dataset(
        path=path, patch_shape=patch_shape, split=split, species=species, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset=oocyte_dataset, batch_size=batch_size, **remaining_kwargs)
    return loader

Get the OocyteSeg dataloader for binary membrane segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The data split to use. One of 'train' or 'test'.
species: The species to select. One of 'mouse', 'human' or 'sea_urchin'. If None, data from all species is returned.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.