"""The BALF dataset contains annotations for cell instance segmentation
in bronchoalveolar lavage fluid microscopy images.

The dataset is located at https://zenodo.org/records/14871206.
The dataset is from the publication https://doi.org/10.1038/s41597-025-05452-4.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from tqdm import tqdm
from natsort import natsorted
from typing import Union, Literal, Tuple, Optional, List

import numpy as np
import imageio.v3 as imageio

from skimage.draw import polygon as draw_polygon

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URLS = {
    "images": "https://zenodo.org/records/14871206/files/Images.rar",
    "labels": "https://zenodo.org/records/14871206/files/Labels.rar",
}
CHECKSUMS = {
    "images": None,
    "labels": None,
}

# Cell type order fixes the semantic class ids (index + 1; 0 is background).
CELL_TYPES = [
    "erythrocyte",
    "ciliated_columnar_epithelial",
    "squamous_epithelial",
    "macrophage",
    "lymphocyte",
    "neutrophil",
    "eosinophil",
]

SPLITS = ["train", "val"]


def _create_data_from_yolo(image_dir, label_dir, data_dir):
    """Convert YOLO polygon annotations to HDF5 files with image, instance and semantic masks.

    Each HDF5 file contains:
    - 'raw': RGB image in (C, H, W) format.
    - 'labels/instances': Instance segmentation mask with unique IDs per cell.
    - 'labels/semantic': Semantic segmentation mask with the following class mapping:
        0: background
        1: erythrocyte
        2: ciliated columnar epithelial
        3: squamous epithelial
        4: macrophage
        5: lymphocyte
        6: neutrophil
        7: eosinophil

    Args:
        image_dir: Directory with the '.jpg' input images.
        label_dir: Directory with the YOLO '.txt' polygon annotations.
        data_dir: Output directory where the '.h5' files are written.

    Returns:
        Naturally sorted list of the created HDF5 filepaths.
    """
    import h5py

    os.makedirs(data_dir, exist_ok=True)

    label_paths = natsorted(glob(os.path.join(label_dir, "*.txt")))
    assert len(label_paths) > 0, f"No label files found in {label_dir}"

    data_paths = []
    for label_path in tqdm(label_paths, desc="Creating BALF data"):
        stem = os.path.splitext(os.path.basename(label_path))[0]

        image_path = os.path.join(image_dir, f"{stem}.jpg")
        assert os.path.exists(image_path), f"Image not found: {image_path}"

        data_path = os.path.join(data_dir, f"{stem}.h5")
        data_paths.append(data_path)

        # Skip files that were already converted in a previous run.
        if os.path.exists(data_path):
            continue

        image = imageio.imread(image_path)
        h, w = image.shape[:2]

        with open(label_path) as f:
            lines = f.readlines()

        # Parse YOLO polygon annotations and compute areas for sorting.
        # Each line is "<class_id> x1 y1 x2 y2 ..." with coordinates normalized to [0, 1].
        polygons = []
        for line in lines:
            parts = line.strip().split()
            if not parts:  # skip blank lines to avoid an IndexError below
                continue
            class_id = int(parts[0])
            coords = [float(x) for x in parts[1:]]
            xs = [coords[i] * w for i in range(0, len(coords), 2)]
            ys = [coords[i] * h for i in range(1, len(coords), 2)]
            rr, cc = draw_polygon(ys, xs, shape=(h, w))
            area = len(rr)
            polygons.append((rr, cc, area, class_id))

        # Sort by area (largest first so smaller objects are not occluded).
        sorting = np.argsort([p[2] for p in polygons])[::-1]

        instances = np.zeros((h, w), dtype="uint16")
        semantic = np.zeros((h, w), dtype="uint16")
        for seg_id, idx in enumerate(sorting, 1):
            rr, cc, _, class_id = polygons[idx]
            instances[rr, cc] = seg_id
            semantic[rr, cc] = class_id + 1  # 0 = background, 1-7 = cell types

        # Store image as channels-first (C, H, W).
        # Grayscale images come out of imageio as 2d; add a singleton channel axis.
        if image.ndim == 2:
            raw = image[None]
        else:
            raw = image.transpose(2, 0, 1)

        with h5py.File(data_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels/instances", data=instances, compression="gzip")
            f.create_dataset("labels/semantic", data=semantic, compression="gzip")

    return natsorted(data_paths)


def get_balf_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the BALF dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The path where the data is stored.
    """
    for key in URLS:
        fname = URLS[key].rsplit("/", 1)[-1]
        dirname = os.path.splitext(fname)[0].lower()

        # Skip the download if the extracted folder already exists.
        if os.path.exists(os.path.join(path, dirname)):
            continue

        os.makedirs(path, exist_ok=True)
        rar_path = os.path.join(path, fname)
        util.download_source(path=rar_path, url=URLS[key], download=download, checksum=CHECKSUMS[key])
        util.unzip_rarfile(rar_path=rar_path, dst=path)

    return path


def get_balf_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the BALF data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train' or 'val'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    assert split in SPLITS, f"'{split}' is not a valid split. Choose from {SPLITS}."

    get_balf_data(path, download)

    image_dir = os.path.join(path, "images", split)
    label_dir = os.path.join(path, "labels", split)
    data_dir = os.path.join(path, "data", split)

    # Convert the raw YOLO annotations to HDF5 on first use; afterwards reuse the cached files.
    if not os.path.exists(data_dir) or len(glob(os.path.join(data_dir, "*.h5"))) == 0:
        data_paths = _create_data_from_yolo(image_dir, label_dir, data_dir)
    else:
        data_paths = natsorted(glob(os.path.join(data_dir, "*.h5")))

    assert len(data_paths) > 0
    return data_paths


def get_balf_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BALF dataset for cell segmentation in bronchoalveolar lavage fluid microscopy images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'val'.
        segmentation_type: The segmentation target. Either 'instances' or 'semantic'.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    data_paths = get_balf_paths(path, split, download)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, offsets=offsets, boundaries=boundaries, binary=binary
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key=f"labels/{segmentation_type}",
        patch_shape=patch_shape,
        with_channels=True,  # the raw data is stored channels-first (C, H, W)
        ndim=2,
        **kwargs
    )


def get_balf_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the BALF dataloader for cell segmentation in bronchoalveolar lavage fluid microscopy images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'val'.
        segmentation_type: The segmentation target. Either 'instances' or 'semantic'.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_balf_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        segmentation_type=segmentation_type,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
URLS =
{'images': 'https://zenodo.org/records/14871206/files/Images.rar', 'labels': 'https://zenodo.org/records/14871206/files/Labels.rar'}
CHECKSUMS =
{'images': None, 'labels': None}
CELL_TYPES =
['erythrocyte', 'ciliated_columnar_epithelial', 'squamous_epithelial', 'macrophage', 'lymphocyte', 'neutrophil', 'eosinophil']
SPLITS =
['train', 'val']
def
get_balf_data(path: Union[os.PathLike, str], download: bool = False) -> str:
125def get_balf_data(path: Union[os.PathLike, str], download: bool = False) -> str: 126 """Download the BALF dataset. 127 128 Args: 129 path: Filepath to a folder where the downloaded data will be saved. 130 download: Whether to download the data if it is not present. 131 132 Returns: 133 The path where the data is stored. 134 """ 135 for key in URLS: 136 fname = URLS[key].rsplit("/", 1)[-1] 137 dirname = os.path.splitext(fname)[0].lower() 138 139 if os.path.exists(os.path.join(path, dirname)): 140 continue 141 142 os.makedirs(path, exist_ok=True) 143 rar_path = os.path.join(path, fname) 144 util.download_source(path=rar_path, url=URLS[key], download=download, checksum=CHECKSUMS[key]) 145 util.unzip_rarfile(rar_path=rar_path, dst=path) 146 147 return path
Download the BALF dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The path where the data is stored.
def
get_balf_paths( path: Union[os.PathLike, str], split: Literal['train', 'val'] = 'train', download: bool = False) -> List[str]:
150def get_balf_paths( 151 path: Union[os.PathLike, str], 152 split: Literal["train", "val"] = "train", 153 download: bool = False, 154) -> List[str]: 155 """Get paths to the BALF data. 156 157 Args: 158 path: Filepath to a folder where the downloaded data will be saved. 159 split: The data split to use. Either 'train' or 'val'. 160 download: Whether to download the data if it is not present. 161 162 Returns: 163 List of filepaths for the stored data. 164 """ 165 assert split in SPLITS, f"'{split}' is not a valid split. Choose from {SPLITS}." 166 167 get_balf_data(path, download) 168 169 image_dir = os.path.join(path, "images", split) 170 label_dir = os.path.join(path, "labels", split) 171 data_dir = os.path.join(path, "data", split) 172 173 if not os.path.exists(data_dir) or len(glob(os.path.join(data_dir, "*.h5"))) == 0: 174 data_paths = _create_data_from_yolo(image_dir, label_dir, data_dir) 175 else: 176 data_paths = natsorted(glob(os.path.join(data_dir, "*.h5"))) 177 178 assert len(data_paths) > 0 179 return data_paths
Get paths to the BALF data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train' or 'val'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the stored data.
def
get_balf_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val'] = 'train', segmentation_type: Literal['instances', 'semantic'] = 'instances', offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
182def get_balf_dataset( 183 path: Union[os.PathLike, str], 184 patch_shape: Tuple[int, int], 185 split: Literal["train", "val"] = "train", 186 segmentation_type: Literal["instances", "semantic"] = "instances", 187 offsets: Optional[List[List[int]]] = None, 188 boundaries: bool = False, 189 binary: bool = False, 190 download: bool = False, 191 **kwargs 192) -> Dataset: 193 """Get the BALF dataset for cell segmentation in bronchoalveolar lavage fluid microscopy images. 194 195 Args: 196 path: Filepath to a folder where the downloaded data will be saved. 197 patch_shape: The patch shape to use for training. 198 split: The data split to use. Either 'train' or 'val'. 199 segmentation_type: The segmentation target. Either 'instances' or 'semantic'. 200 offsets: Offset values for affinity computation used as target. 201 boundaries: Whether to compute boundaries as the target. 202 binary: Whether to use a binary segmentation target. 203 download: Whether to download the data if it is not present. 204 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 205 206 Returns: 207 The segmentation dataset. 208 """ 209 data_paths = get_balf_paths(path, split, download) 210 211 kwargs = util.ensure_transforms(ndim=2, **kwargs) 212 kwargs, _ = util.add_instance_label_transform( 213 kwargs, add_binary_target=True, offsets=offsets, boundaries=boundaries, binary=binary 214 ) 215 216 return torch_em.default_segmentation_dataset( 217 raw_paths=data_paths, 218 raw_key="raw", 219 label_paths=data_paths, 220 label_key=f"labels/{segmentation_type}", 221 patch_shape=patch_shape, 222 with_channels=True, 223 ndim=2, 224 **kwargs 225 )
Get the BALF dataset for cell segmentation in bronchoalveolar lavage fluid microscopy images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train' or 'val'.
- segmentation_type: The segmentation target. Either 'instances' or 'semantic'.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_balf_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val'] = 'train', segmentation_type: Literal['instances', 'semantic'] = 'instances', offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
228def get_balf_loader( 229 path: Union[os.PathLike, str], 230 batch_size: int, 231 patch_shape: Tuple[int, int], 232 split: Literal["train", "val"] = "train", 233 segmentation_type: Literal["instances", "semantic"] = "instances", 234 offsets: Optional[List[List[int]]] = None, 235 boundaries: bool = False, 236 binary: bool = False, 237 download: bool = False, 238 **kwargs 239) -> DataLoader: 240 """Get the BALF dataloader for cell segmentation in bronchoalveolar lavage fluid microscopy images. 241 242 Args: 243 path: Filepath to a folder where the downloaded data will be saved. 244 batch_size: The batch size for training. 245 patch_shape: The patch shape to use for training. 246 split: The data split to use. Either 'train' or 'val'. 247 segmentation_type: The segmentation target. Either 'instances' or 'semantic'. 248 offsets: Offset values for affinity computation used as target. 249 boundaries: Whether to compute boundaries as the target. 250 binary: Whether to use a binary segmentation target. 251 download: Whether to download the data if it is not present. 252 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 253 254 Returns: 255 The DataLoader. 256 """ 257 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 258 dataset = get_balf_dataset( 259 path=path, 260 patch_shape=patch_shape, 261 split=split, 262 segmentation_type=segmentation_type, 263 offsets=offsets, 264 boundaries=boundaries, 265 binary=binary, 266 download=download, 267 **ds_kwargs, 268 ) 269 return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
Get the BALF dataloader for cell segmentation in bronchoalveolar lavage fluid microscopy images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train' or 'val'.
- segmentation_type: The segmentation target. Either 'instances' or 'semantic'.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.