torch_em.data.datasets.light_microscopy.balf

The BALF dataset contains annotations for cell instance segmentation in bronchoalveolar lavage fluid microscopy images.

The dataset is located at https://zenodo.org/records/14871206. The dataset is from the publication https://doi.org/10.1038/s41597-025-05452-4. Please cite it if you use this dataset in your research.
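
A minimal usage sketch, assuming an arbitrary local storage folder `./data/balf` (any writable path works): on first use the data is downloaded, the .rar archives are extracted, and the YOLO annotations are converted to HDF5 before batches are served. Converting the data needs `h5py`, and extracting the archives needs rar support (e.g. via the `rarfile` package).

from torch_em.data.datasets.light_microscopy.balf import get_balf_loader

# Hypothetical storage folder; the dataset is downloaded and converted here on first use.
data_root = "./data/balf"

# Training loader yielding 512 x 512 RGB patches with instance labels as target.
loader = get_balf_loader(
    path=data_root,
    batch_size=2,
    patch_shape=(512, 512),
    split="train",
    segmentation_type="instances",
    download=True,
)

for raw, labels in loader:
    # raw holds RGB patches in (batch, channel, y, x) layout, labels the segmentation target.
    print(raw.shape, labels.shape)
    break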

  1"""The BALF dataset contains annotations for cell instance segmentation
  2in bronchoalveolar lavage fluid microscopy images.
  3
  4The dataset is located at https://zenodo.org/records/14871206.
  5The dataset is from the publication https://doi.org/10.1038/s41597-025-05452-4.
  6Please cite it if you use this dataset in your research.
  7"""
  8
  9import os
 10from glob import glob
 11from tqdm import tqdm
 12from natsort import natsorted
 13from typing import Union, Literal, Tuple, Optional, List
 14
 15import numpy as np
 16import imageio.v3 as imageio
 17
 18from skimage.draw import polygon as draw_polygon
 19
 20from torch.utils.data import Dataset, DataLoader
 21
 22import torch_em
 23
 24from .. import util
 25
 26
 27URLS = {
 28    "images": "https://zenodo.org/records/14871206/files/Images.rar",
 29    "labels": "https://zenodo.org/records/14871206/files/Labels.rar",
 30}
 31CHECKSUMS = {
 32    "images": None,
 33    "labels": None,
 34}
 35
 36CELL_TYPES = [
 37    "erythrocyte",
 38    "ciliated_columnar_epithelial",
 39    "squamous_epithelial",
 40    "macrophage",
 41    "lymphocyte",
 42    "neutrophil",
 43    "eosinophil",
 44]
 45
 46SPLITS = ["train", "val"]
 47
 48
 49def _create_data_from_yolo(image_dir, label_dir, data_dir):
 50    """Convert YOLO polygon annotations to HDF5 files with image, instance and semantic masks.
 51
 52    Each HDF5 file contains:
 53        - 'raw': RGB image in (C, H, W) format.
 54        - 'labels/instances': Instance segmentation mask with unique IDs per cell.
 55        - 'labels/semantic': Semantic segmentation mask with the following class mapping:
 56            0: background
 57            1: erythrocyte
 58            2: ciliated columnar epithelial
 59            3: squamous epithelial
 60            4: macrophage
 61            5: lymphocyte
 62            6: neutrophil
 63            7: eosinophil
 64    """
 65    import h5py
 66
 67    os.makedirs(data_dir, exist_ok=True)
 68
 69    label_paths = natsorted(glob(os.path.join(label_dir, "*.txt")))
 70    assert len(label_paths) > 0, f"No label files found in {label_dir}"
 71
 72    data_paths = []
 73    for label_path in tqdm(label_paths, desc="Creating BALF data"):
 74        stem = os.path.splitext(os.path.basename(label_path))[0]
 75
 76        image_path = os.path.join(image_dir, f"{stem}.jpg")
 77        assert os.path.exists(image_path), f"Image not found: {image_path}"
 78
 79        data_path = os.path.join(data_dir, f"{stem}.h5")
 80        data_paths.append(data_path)
 81
 82        if os.path.exists(data_path):
 83            continue
 84
 85        image = imageio.imread(image_path)
 86        h, w = image.shape[:2]
 87
 88        with open(label_path) as f:
 89            lines = f.readlines()
 90
 91        # Parse YOLO polygon annotations and compute areas for sorting.
 92        polygons = []
 93        for line in lines:
 94            parts = line.strip().split()
 95            class_id = int(parts[0])
 96            coords = [float(x) for x in parts[1:]]
 97            xs = [coords[i] * w for i in range(0, len(coords), 2)]
 98            ys = [coords[i] * h for i in range(1, len(coords), 2)]
 99            rr, cc = draw_polygon(ys, xs, shape=(h, w))
100            area = len(rr)
101            polygons.append((rr, cc, area, class_id))
102
103        # Sort by area (largest first so smaller objects are not occluded).
104        sorting = np.argsort([p[2] for p in polygons])[::-1]
105
106        instances = np.zeros((h, w), dtype="uint16")
107        semantic = np.zeros((h, w), dtype="uint16")
108        for seg_id, idx in enumerate(sorting, 1):
109            rr, cc, _, class_id = polygons[idx]
110            instances[rr, cc] = seg_id
111            semantic[rr, cc] = class_id + 1  # 0 = background, 1-7 = cell types
112
113        # Store image as channels-first (C, H, W).
114        raw = image.transpose(2, 0, 1)
115
116        with h5py.File(data_path, "w") as f:
117            f.create_dataset("raw", data=raw, compression="gzip")
118            f.create_dataset("labels/instances", data=instances, compression="gzip")
119            f.create_dataset("labels/semantic", data=semantic, compression="gzip")
120
121    return natsorted(data_paths)
122
123
124def get_balf_data(path: Union[os.PathLike, str], download: bool = False) -> str:
125    """Download the BALF dataset.
126
127    Args:
128        path: Filepath to a folder where the downloaded data will be saved.
129        download: Whether to download the data if it is not present.
130
131    Returns:
132        The path where the data is stored.
133    """
134    for key in URLS:
135        fname = URLS[key].rsplit("/", 1)[-1]
136        dirname = os.path.splitext(fname)[0].lower()
137
138        if os.path.exists(os.path.join(path, dirname)):
139            continue
140
141        os.makedirs(path, exist_ok=True)
142        rar_path = os.path.join(path, fname)
143        util.download_source(path=rar_path, url=URLS[key], download=download, checksum=CHECKSUMS[key])
144        util.unzip_rarfile(rar_path=rar_path, dst=path)
145
146    return path
147
148
149def get_balf_paths(
150    path: Union[os.PathLike, str],
151    split: Literal["train", "val"] = "train",
152    download: bool = False,
153) -> List[str]:
154    """Get paths to the BALF data.
155
156    Args:
157        path: Filepath to a folder where the downloaded data will be saved.
158        split: The data split to use. Either 'train' or 'val'.
159        download: Whether to download the data if it is not present.
160
161    Returns:
162        List of filepaths for the stored data.
163    """
164    assert split in SPLITS, f"'{split}' is not a valid split. Choose from {SPLITS}."
165
166    get_balf_data(path, download)
167
168    image_dir = os.path.join(path, "images", split)
169    label_dir = os.path.join(path, "labels", split)
170    data_dir = os.path.join(path, "data", split)
171
172    if not os.path.exists(data_dir) or len(glob(os.path.join(data_dir, "*.h5"))) == 0:
173        data_paths = _create_data_from_yolo(image_dir, label_dir, data_dir)
174    else:
175        data_paths = natsorted(glob(os.path.join(data_dir, "*.h5")))
176
177    assert len(data_paths) > 0
178    return data_paths
179
180
181def get_balf_dataset(
182    path: Union[os.PathLike, str],
183    patch_shape: Tuple[int, int],
184    split: Literal["train", "val"] = "train",
185    segmentation_type: Literal["instances", "semantic"] = "instances",
186    offsets: Optional[List[List[int]]] = None,
187    boundaries: bool = False,
188    binary: bool = False,
189    download: bool = False,
190    **kwargs
191) -> Dataset:
192    """Get the BALF dataset for cell segmentation in bronchoalveolar lavage fluid microscopy images.
193
194    Args:
195        path: Filepath to a folder where the downloaded data will be saved.
196        patch_shape: The patch shape to use for training.
197        split: The data split to use. Either 'train' or 'val'.
198        segmentation_type: The segmentation target. Either 'instances' or 'semantic'.
199        offsets: Offset values for affinity computation used as target.
200        boundaries: Whether to compute boundaries as the target.
201        binary: Whether to use a binary segmentation target.
202        download: Whether to download the data if it is not present.
203        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
204
205    Returns:
206        The segmentation dataset.
207    """
208    data_paths = get_balf_paths(path, split, download)
209
210    kwargs = util.ensure_transforms(ndim=2, **kwargs)
211    kwargs, _ = util.add_instance_label_transform(
212        kwargs, add_binary_target=True, offsets=offsets, boundaries=boundaries, binary=binary
213    )
214
215    return torch_em.default_segmentation_dataset(
216        raw_paths=data_paths,
217        raw_key="raw",
218        label_paths=data_paths,
219        label_key=f"labels/{segmentation_type}",
220        patch_shape=patch_shape,
221        with_channels=True,
222        ndim=2,
223        **kwargs
224    )
225
226
227def get_balf_loader(
228    path: Union[os.PathLike, str],
229    batch_size: int,
230    patch_shape: Tuple[int, int],
231    split: Literal["train", "val"] = "train",
232    segmentation_type: Literal["instances", "semantic"] = "instances",
233    offsets: Optional[List[List[int]]] = None,
234    boundaries: bool = False,
235    binary: bool = False,
236    download: bool = False,
237    **kwargs
238) -> DataLoader:
239    """Get the BALF dataloader for cell segmentation in bronchoalveolar lavage fluid microscopy images.
240
241    Args:
242        path: Filepath to a folder where the downloaded data will be saved.
243        batch_size: The batch size for training.
244        patch_shape: The patch shape to use for training.
245        split: The data split to use. Either 'train' or 'val'.
246        segmentation_type: The segmentation target. Either 'instances' or 'semantic'.
247        offsets: Offset values for affinity computation used as target.
248        boundaries: Whether to compute boundaries as the target.
249        binary: Whether to use a binary segmentation target.
250        download: Whether to download the data if it is not present.
251        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
252
253    Returns:
254        The DataLoader.
255    """
256    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
257    dataset = get_balf_dataset(
258        path=path,
259        patch_shape=patch_shape,
260        split=split,
261        segmentation_type=segmentation_type,
262        offsets=offsets,
263        boundaries=boundaries,
264        binary=binary,
265        download=download,
266        **ds_kwargs,
267    )
268    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
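

The converted HDF5 files can also be accessed directly, for example to check the instance and semantic annotations of a single image. A short sketch, again assuming the hypothetical storage folder `./data/balf` from above:

import h5py
import numpy as np

from torch_em.data.datasets.light_microscopy.balf import get_balf_paths, CELL_TYPES

# Paths to the converted HDF5 files of the validation split (downloads and converts if needed).
data_paths = get_balf_paths("./data/balf", split="val", download=True)

with h5py.File(data_paths[0], "r") as f:
    raw = f["raw"][:]                     # RGB image in (C, H, W) layout.
    instances = f["labels/instances"][:]  # Instance IDs, 0 is background.
    semantic = f["labels/semantic"][:]    # Class IDs, 0 is background, 1-7 follow CELL_TYPES.

print("Image shape:", raw.shape)
print("Number of annotated cells:", int(instances.max()))
for class_id in np.unique(semantic):
    if class_id == 0:
        continue
    print(f"{CELL_TYPES[class_id - 1]}: {int((semantic == class_id).sum())} labeled pixels")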