torch_em.data.datasets.light_microscopy.yeastms

The YeastMS dataset contains annotations for yeast cell instance segmentation in brightfield microscopy images of microfluidic trap structures.

The dataset provides 493 annotated images (256x256) with instance segmentation masks for both cells and trap microstructures across train/val/test splits.

The dataset is located at https://tudatalib.ulb.tu-darmstadt.de/handle/tudatalib/3799. This dataset is from the publication https://doi.org/10.48550/arXiv.2304.07597. Please cite it if you use this dataset in your research.

View Source

  1"""The YeastMS dataset contains annotations for yeast cell instance segmentation
  2in brightfield microscopy images of microfluidic trap structures.
  3
  4The dataset provides 493 annotated images (256x256) with instance segmentation
  5masks for both cells and trap microstructures across train/val/test splits.
  6
  7The dataset is located at https://tudatalib.ulb.tu-darmstadt.de/handle/tudatalib/3799.
  8This dataset is from the publication https://doi.org/10.48550/arXiv.2304.07597.
  9Please cite it if you use this dataset in your research.
 10"""
 11
 12import os
 13from glob import glob
 14from typing import Union, Tuple, List, Literal
 15
 16import numpy as np
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
# Download link of the zip archive; handed to `util.download_source` in `get_yeastms_data`.
URL = "https://tudatalib.ulb.tu-darmstadt.de/bitstream/handle/tudatalib/3799/yeast_cell_in_microstructures_dataset.zip"
# Expected checksum of the archive; passed as `checksum` to `util.download_source`.
CHECKSUM = "80d9e34266895a030b5dfbb81c25f9bd41e7d8c3d57f2c5aaeafd7c7c3a2d6b5"

# Names of the data splits accepted by `get_yeastms_paths`.
VALID_SPLITS = ["train", "val", "test"]
 29
 30
 31def _create_h5_data(path, split):
 32    """Create h5 files with raw images and cell instance labels from .pt tensors."""
 33    import h5py
 34    import torch
 35    from natsort import natsorted
 36    from tqdm import tqdm
 37
 38    h5_dir = os.path.join(path, "h5_data", split)
 39    os.makedirs(h5_dir, exist_ok=True)
 40
 41    input_dir = os.path.join(path, split, "inputs")
 42    instance_dir = os.path.join(path, split, "instances")
 43    class_dir = os.path.join(path, split, "classes")
 44
 45    input_paths = natsorted(glob(os.path.join(input_dir, "*.pt")))
 46
 47    for input_path in tqdm(input_paths, desc=f"Creating h5 files for '{split}'"):
 48        fname = os.path.basename(input_path).replace(".pt", ".h5")
 49        h5_path = os.path.join(h5_dir, fname)
 50
 51        if os.path.exists(h5_path):
 52            continue
 53
 54        sample_id = os.path.basename(input_path)
 55        instance_path = os.path.join(instance_dir, sample_id)
 56        class_path = os.path.join(class_dir, sample_id)
 57
 58        raw = torch.load(input_path, weights_only=False).numpy()
 59        instances = torch.load(instance_path, weights_only=False).numpy()  # (N, H, W)
 60        classes = torch.load(class_path, weights_only=False).numpy()  # (N,)
 61
 62        # Create cell instance labels (class 0 = cell, class 1 = trap).
 63        labels = np.zeros(raw.shape, dtype="int64")
 64        cell_id = 1
 65        for i in range(instances.shape[0]):
 66            if classes[i] == 0:  # cell
 67                labels[instances[i] > 0] = cell_id
 68                cell_id += 1
 69
 70        with h5py.File(h5_path, "w") as f:
 71            f.create_dataset("raw", data=raw, compression="gzip")
 72            f.create_dataset("labels", data=labels, compression="gzip")
 73
 74    return h5_dir
 75
 76
 77def get_yeastms_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 78    """Download the YeastMS dataset.
 79
 80    Args:
 81        path: Filepath to a folder where the downloaded data will be saved.
 82        download: Whether to download the data if it is not present.
 83
 84    Returns:
 85        The filepath to the directory with the data.
 86    """
 87    data_dir = os.path.join(path, "train")
 88    if os.path.exists(data_dir):
 89        return path
 90
 91    os.makedirs(path, exist_ok=True)
 92    zip_path = os.path.join(path, "yeast_cell_in_microstructures_dataset.zip")
 93    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
 94    util.unzip(zip_path=zip_path, dst=path)
 95
 96    return path
 97
 98
 99def get_yeastms_paths(
100    path: Union[os.PathLike, str],
101    split: Literal["train", "val", "test"] = "train",
102    download: bool = False,
103) -> List[str]:
104    """Get paths to the YeastMS data.
105
106    Args:
107        path: Filepath to a folder where the downloaded data will be saved.
108        split: The data split to use. One of 'train', 'val' or 'test'.
109        download: Whether to download the data if it is not present.
110
111    Returns:
112        List of filepaths for the h5 data.
113    """
114    from natsort import natsorted
115
116    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."
117
118    get_yeastms_data(path, download)
119
120    h5_dir = os.path.join(path, "h5_data", split)
121    if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
122        _create_h5_data(path, split)
123
124    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
125    assert len(h5_paths) > 0, f"No data found for split '{split}'"
126
127    return h5_paths
128
129
def get_yeastms_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the YeastMS segmentation dataset for yeast cells in microstructures.

    The h5 files of the requested split provide both the raw data (key 'raw')
    and the cell instance labels (key 'labels').

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    file_paths = get_yeastms_paths(path, split, download)

    # Update the dataset arguments with the instance label transform and the 2d transforms.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    # Raw data and labels are stored in the same h5 files.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=file_paths, raw_key="raw",
        label_paths=file_paths, label_key="labels",
        patch_shape=patch_shape, ndim=2,
        **kwargs
    )
    return dataset
165
166
def get_yeastms_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the YeastMS dataloader for yeast cell segmentation in microstructures.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        dataset=get_yeastms_dataset(
            path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
        ),
        batch_size=batch_size,
        **dataloader_kwargs,
    )

URL = 'https://tudatalib.ulb.tu-darmstadt.de/bitstream/handle/tudatalib/3799/yeast_cell_in_microstructures_dataset.zip'

CHECKSUM = '80d9e34266895a030b5dfbb81c25f9bd41e7d8c3d57f2c5aaeafd7c7c3a2d6b5'

VALID_SPLITS = ['train', 'val', 'test']

def get_yeastms_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_yeastms_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the YeastMS dataset is available in the given folder.

    The data counts as present if the folder already contains a 'train' subfolder.
    Otherwise the zip archive is fetched via `util.download_source` and extracted into the folder.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    # The 'train' folder serves as marker for an already extracted dataset.
    if not os.path.exists(os.path.join(path, "train")):
        os.makedirs(path, exist_ok=True)
        archive = os.path.join(path, "yeast_cell_in_microstructures_dataset.zip")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive, dst=path)

    return path

Download the YeastMS dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The filepath to the directory with the data.

def get_yeastms_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'] = 'train', download: bool = False) -> List[str]: View Source

def get_yeastms_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the YeastMS data.

    Ensures that the dataset is available and that the h5 files for the requested split
    have been created in `<path>/h5_data/<split>`, then returns the naturally sorted h5 file paths.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.
    """
    from natsort import natsorted

    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."

    get_yeastms_data(path, download)

    h5_dir = os.path.join(path, "h5_data", split)
    h5_pattern = os.path.join(h5_dir, "*.h5")

    # Run the conversion if there are no h5 files yet, or fewer h5 files than input tensors.
    # The second case happens when a previous conversion was interrupted; checking only for
    # "no files at all" would silently return an incomplete split.
    # `_create_h5_data` skips samples that were already converted.
    n_h5 = len(glob(h5_pattern))
    n_inputs = len(glob(os.path.join(path, split, "inputs", "*.pt")))
    if n_h5 == 0 or n_h5 < n_inputs:
        _create_h5_data(path, split)

    h5_paths = natsorted(glob(h5_pattern))
    assert len(h5_paths) > 0, f"No data found for split '{split}'"

    return h5_paths

Get paths to the YeastMS data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. One of 'train', 'val' or 'test'.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the h5 data.

def get_yeastms_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'] = 'train', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_yeastms_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the YeastMS segmentation dataset for yeast cells in microstructures.

    The h5 files of the requested split provide both the raw data (key 'raw')
    and the cell instance labels (key 'labels').

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    file_paths = get_yeastms_paths(path, split, download)

    # Update the dataset arguments with the instance label transform and the 2d transforms.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    # Raw data and labels are stored in the same h5 files.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=file_paths, raw_key="raw",
        label_paths=file_paths, label_key="labels",
        patch_shape=patch_shape, ndim=2,
        **kwargs
    )
    return dataset

Get the YeastMS dataset for yeast cell segmentation in microstructures.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The data split to use. One of 'train', 'val' or 'test'.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_yeastms_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'] = 'train', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_yeastms_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the YeastMS dataloader for yeast cell segmentation in microstructures.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        dataset=get_yeastms_dataset(
            path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
        ),
        batch_size=batch_size,
        **dataloader_kwargs,
    )

Get the YeastMS dataloader for yeast cell segmentation in microstructures.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The data split to use. One of 'train', 'val' or 'test'.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.