torch_em.data.datasets.light_microscopy.yeastcellseg

The YeastCellSeg dataset contains annotations for yeast cell segmentation in 2D bright field microscopy images.

The dataset provides 15 images of 1024x1024 pixels with binary cell body annotations. Instance segmentation labels are derived via connected components.

The dataset is from the publication https://doi.org/10.1109/ISBI.2014.6868107. Please cite it if you use this dataset in your research.

View Source

  1"""The YeastCellSeg dataset contains annotations for yeast cell segmentation
  2in 2D bright field microscopy images.
  3
  4The dataset provides 15 images of 1024x1024 pixels with binary cell body annotations.
  5Instance segmentation labels are derived via connected components.
  6
  7The dataset is from the publication https://doi.org/10.1109/ISBI.2014.6868107.
  8Please cite it if you use this dataset in your research.
  9"""
 10
 11import os
 12from glob import glob
 13from typing import Union, Literal, Tuple, List
 14
 15import numpy as np
 16import imageio.v3 as imageio
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
# Base URL of the Zenodo record from which the individual tif files are downloaded.
BASE_URL = "https://zenodo.org/records/344879/files"
# Basenames (without extension) of the 15 images of the dataset: DS01_01 ... DS01_15.
_FILENAMES = [f"DS01_{i:02d}" for i in range(1, 16)]
 27
 28
 29def _create_h5_data(path, raw_dir, gt_dir):
 30    """Create h5 files with raw images, semantic masks and instance labels.
 31
 32    Each h5 file contains:
 33        - 'raw': (H, W) uint8 grayscale bright field image.
 34        - 'labels/semantic': (H, W) uint8 binary mask (0=background, 1=cell).
 35        - 'labels/instances': (H, W) int64 connected component labels.
 36    """
 37    import h5py
 38    from skimage.measure import label
 39
 40    h5_dir = os.path.join(path, "h5_data")
 41    os.makedirs(h5_dir, exist_ok=True)
 42
 43    for name in _FILENAMES:
 44        h5_path = os.path.join(h5_dir, f"{name}.h5")
 45        if os.path.exists(h5_path):
 46            continue
 47
 48        img = imageio.imread(os.path.join(raw_dir, f"{name}.tif"))
 49        gt = imageio.imread(os.path.join(gt_dir, f"{name}_gt.tif"))
 50
 51        semantic = (gt > 0).astype("uint8")
 52        instances = label(semantic).astype("int64")
 53
 54        with h5py.File(h5_path, "w") as f:
 55            f.create_dataset("raw", data=img, compression="gzip")
 56            f.create_dataset("labels/semantic", data=semantic, compression="gzip")
 57            f.create_dataset("labels/instances", data=instances, compression="gzip")
 58
 59    return h5_dir
 60
 61
 62def get_yeastcellseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 63    """Download the YeastCellSeg dataset.
 64
 65    Args:
 66        path: Filepath to a folder where the downloaded data will be saved.
 67        download: Whether to download the data if it is not present.
 68
 69    Returns:
 70        The filepath to the directory with the h5 data.
 71    """
 72    h5_dir = os.path.join(path, "h5_data")
 73    if os.path.exists(h5_dir) and len(glob(os.path.join(h5_dir, "*.h5"))) == len(_FILENAMES):
 74        return h5_dir
 75
 76    raw_dir = os.path.join(path, "raw")
 77    gt_dir = os.path.join(path, "gt")
 78    os.makedirs(raw_dir, exist_ok=True)
 79    os.makedirs(gt_dir, exist_ok=True)
 80
 81    for name in _FILENAMES:
 82        raw_path = os.path.join(raw_dir, f"{name}.tif")
 83        if not os.path.exists(raw_path):
 84            util.download_source(path=raw_path, url=f"{BASE_URL}/{name}.tif", download=download, checksum=None)
 85
 86        gt_path = os.path.join(gt_dir, f"{name}_gt.tif")
 87        if not os.path.exists(gt_path):
 88            util.download_source(path=gt_path, url=f"{BASE_URL}/{name}_gt.tif", download=download, checksum=None)
 89
 90    return _create_h5_data(path, raw_dir, gt_dir)
 91
 92
 93def get_yeastcellseg_paths(
 94    path: Union[os.PathLike, str],
 95    download: bool = False,
 96) -> List[str]:
 97    """Get paths to the YeastCellSeg data.
 98
 99    Args:
100        path: Filepath to a folder where the downloaded data will be saved.
101        download: Whether to download the data if it is not present.
102
103    Returns:
104        List of filepaths for the h5 data.
105    """
106    from natsort import natsorted
107
108    h5_dir = get_yeastcellseg_data(path, download)
109    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
110    assert len(h5_paths) == len(_FILENAMES), f"Expected {len(_FILENAMES)} h5 files, found {len(h5_paths)}"
111    return h5_paths
112
113
def get_yeastcellseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the YeastCellSeg segmentation dataset.

    Images are read from the 'raw' dataset of each h5 file and labels from
    'labels/<segmentation_type>' of the same file.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        AssertionError: If `segmentation_type` is neither 'instances' nor 'semantic'.
    """
    # Validate before touching the file system, so that an invalid choice fails early.
    is_valid_type = segmentation_type in ("instances", "semantic")
    assert is_valid_type, f"'{segmentation_type}' is not valid. Choose from 'instances' or 'semantic'."

    volume_paths = get_yeastcellseg_paths(path, download)

    # Pass the user kwargs through the shared torch_em helpers for label and default transforms.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True, label_dtype=np.int64)
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    # Raw data and labels are stored in the same h5 files, so both path lists are identical.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{segmentation_type}",
        patch_shape=patch_shape,
        ndim=2,
        **kwargs
    )
155
156
def get_yeastcellseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build a dataloader for the YeastCellSeg dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_yeastcellseg_dataset(
        path=path, patch_shape=patch_shape, segmentation_type=segmentation_type, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset=ds, batch_size=batch_size, **dataloader_kwargs)

BASE_URL = 'https://zenodo.org/records/344879/files'

def get_yeastcellseg_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_yeastcellseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the YeastCellSeg dataset and convert it to h5 files.

    The raw images are stored in `<path>/raw`, the ground-truth masks in `<path>/gt`
    and the converted files in `<path>/h5_data`. Files that are already present are
    neither downloaded nor converted again.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
            The flag is forwarded to `util.download_source`, which decides what happens
            when a file is missing and `download` is False.

    Returns:
        The filepath to the directory with the h5 data.
    """
    h5_dir = os.path.join(path, "h5_data")

    # Check for the expected files by name instead of counting '*.h5' matches:
    # a stray h5 file in the folder must not hide a missing dataset file.
    expected_h5_paths = [os.path.join(h5_dir, f"{name}.h5") for name in _FILENAMES]
    if all(os.path.exists(h5_path) for h5_path in expected_h5_paths):
        return h5_dir

    raw_dir = os.path.join(path, "raw")
    gt_dir = os.path.join(path, "gt")
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(gt_dir, exist_ok=True)

    for name in _FILENAMES:
        # Each sample consists of the image '<name>.tif' and its mask '<name>_gt.tif'.
        for folder, fname in ((raw_dir, f"{name}.tif"), (gt_dir, f"{name}_gt.tif")):
            target = os.path.join(folder, fname)
            if not os.path.exists(target):
                util.download_source(path=target, url=f"{BASE_URL}/{fname}", download=download, checksum=None)

    return _create_h5_data(path, raw_dir, gt_dir)

Download the YeastCellSeg dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The filepath to the directory with the h5 data.

def get_yeastcellseg_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]: View Source

 94def get_yeastcellseg_paths(
 95    path: Union[os.PathLike, str],
 96    download: bool = False,
 97) -> List[str]:
 98    """Get paths to the YeastCellSeg data.
 99
100    Args:
101        path: Filepath to a folder where the downloaded data will be saved.
102        download: Whether to download the data if it is not present.
103
104    Returns:
105        List of filepaths for the h5 data.
106    """
107    from natsort import natsorted
108
109    h5_dir = get_yeastcellseg_data(path, download)
110    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
111    assert len(h5_paths) == len(_FILENAMES), f"Expected {len(_FILENAMES)} h5 files, found {len(h5_paths)}"
112    return h5_paths

Get paths to the YeastCellSeg data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the h5 data.

def get_yeastcellseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], segmentation_type: Literal['instances', 'semantic'] = 'instances', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_yeastcellseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the YeastCellSeg segmentation dataset.

    Images are read from the 'raw' dataset of each h5 file and labels from
    'labels/<segmentation_type>' of the same file.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        AssertionError: If `segmentation_type` is neither 'instances' nor 'semantic'.
    """
    # Validate before touching the file system, so that an invalid choice fails early.
    is_valid_type = segmentation_type in ("instances", "semantic")
    assert is_valid_type, f"'{segmentation_type}' is not valid. Choose from 'instances' or 'semantic'."

    volume_paths = get_yeastcellseg_paths(path, download)

    # Pass the user kwargs through the shared torch_em helpers for label and default transforms.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True, label_dtype=np.int64)
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    # Raw data and labels are stored in the same h5 files, so both path lists are identical.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{segmentation_type}",
        patch_shape=patch_shape,
        ndim=2,
        **kwargs
    )

Get the YeastCellSeg dataset for yeast cell segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
segmentation_type: The type of segmentation labels to use. One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_yeastcellseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], segmentation_type: Literal['instances', 'semantic'] = 'instances', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_yeastcellseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build a dataloader for the YeastCellSeg dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_yeastcellseg_dataset(
        path=path, patch_shape=patch_shape, segmentation_type=segmentation_type, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset=ds, batch_size=batch_size, **dataloader_kwargs)

Get the YeastCellSeg dataloader for yeast cell segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
segmentation_type: The type of segmentation labels to use. One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.