torch_em.data.datasets.light_microscopy.bbbc030
The BBBC030 dataset contains 60 Differential Interference Contrast (DIC) images of Chinese Hamster Ovary (CHO) cells acquired during initial cell attachment, with hand-segmented cell contour ground truth annotations.
Raw images are RGB-encoded grayscale (R=G=B). Ground truth files are contour/boundary maps (thin cell outlines), which are converted to instance segmentation labels by finding the enclosed regions and labeling them with connected components.
The dataset is located at https://bbbc.broadinstitute.org/BBBC030. This dataset is from the following publication:
- Koos et al. (2016): https://doi.org/10.1371/journal.pone.0163431
Please cite this publication if you use this dataset in your research.
"""The BBBC030 dataset contains 60 Differential Interference Contrast (DIC) images
of Chinese Hamster Ovary (CHO) cells acquired during initial cell attachment, with
hand-segmented cell contour ground truth annotations.

Raw images are RGB-encoded grayscale (R=G=B). Ground truth files are contour/boundary
maps (thin cell outlines), which are converted to instance segmentation labels by
finding the enclosed regions and labeling them with connected components.

The dataset is located at https://bbbc.broadinstitute.org/BBBC030.
This dataset is from the following publication:
- Koos et al. (2016): https://doi.org/10.1371/journal.pone.0163431
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from natsort import natsorted
from typing import List, Optional, Tuple, Union

import numpy as np
import imageio.v3 as imageio
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


IMAGE_URL = "https://data.broadinstitute.org/bbbc/BBBC030/images.zip"
IMAGE_CHECKSUM = None

GT_URL = "https://data.broadinstitute.org/bbbc/BBBC030/ground_truth.zip"
GT_CHECKSUM = None


def _contours_to_instances(contour_mask: np.ndarray) -> np.ndarray:
    """Convert a contour/boundary map to an instance segmentation label image.

    Cells are identified as enclosed regions surrounded by boundary pixels.
    The largest connected region is treated as background and set to 0; the
    remaining connected components keep the ids assigned by the labeling.

    Args:
        contour_mask: The contour map; every value > 0 is treated as boundary.

    Returns:
        The instance label image as int32. All zeros if no region was found.
    """
    from skimage.morphology import binary_dilation, disk
    from skimage.measure import regionprops
    from bioimage_cpp.segmentation import label

    boundaries = contour_mask > 0

    # Dilate slightly to close small gaps in hand-drawn contours.
    closed = binary_dilation(boundaries, disk(2))

    # Enclosed interior regions are the complement of the closed boundaries.
    interior = ~closed
    labeled = label(interior)

    # The largest connected component is the background - remove it.
    props = regionprops(labeled)
    if not props:
        return np.zeros_like(contour_mask, dtype=np.int32)

    bg_label = max(props, key=lambda p: p.area).label
    labeled[labeled == bg_label] = 0

    return labeled.astype(np.int32)


def _preprocess(data_dir: str) -> str:
    """Convert raw PNGs to preprocessed H5 files (grayscale raw + instance labels).

    Files that already have an H5 counterpart are skipped, so an interrupted
    run is completed by the next call instead of being mistaken for finished.

    Args:
        data_dir: Folder that contains the 'images' and 'ground_truth' sub-folders.

    Returns:
        The filepath to the folder with the preprocessed H5 files.
    """
    import h5py

    h5_dir = os.path.join(data_dir, "h5_data")
    os.makedirs(h5_dir, exist_ok=True)

    # Collect the outstanding work first so that no progress bar is shown
    # when everything has been converted already.
    pending = []
    for raw_path in natsorted(glob(os.path.join(data_dir, "images", "*.png"))):
        fname = os.path.splitext(os.path.basename(raw_path))[0]
        h5_path = os.path.join(h5_dir, fname + ".h5")
        gt_path = os.path.join(data_dir, "ground_truth", os.path.basename(raw_path))
        # Images without a ground truth file are left out of the dataset.
        if os.path.exists(h5_path) or not os.path.exists(gt_path):
            continue
        pending.append((raw_path, gt_path, h5_path))

    if not pending:
        return h5_dir

    for raw_path, gt_path, h5_path in tqdm(pending, desc="Preprocessing BBBC030"):
        raw = imageio.imread(raw_path)
        if raw.ndim == 3:  # grayscale saved as RGB
            raw = raw[..., 0]

        contours = imageio.imread(gt_path)
        instances = _contours_to_instances(contours)

        # Write to a temporary name and rename afterwards: a crash mid-write must
        # not leave a truncated '.h5' file that would be skipped on the next run.
        tmp_path = h5_path + ".tmp"
        with h5py.File(tmp_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels", data=instances, compression="gzip")
        os.replace(tmp_path, h5_path)

    return h5_dir


def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the BBBC030 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed H5 data directory.
    """
    data_dir = os.path.join(path, "BBBC030")
    sources = (
        ("images", "BBBC030_images.zip", IMAGE_URL, IMAGE_CHECKSUM),
        ("ground_truth", "BBBC030_ground_truth.zip", GT_URL, GT_CHECKSUM),
    )

    # Decide per archive whether it still has to be fetched. Checking only for
    # 'data_dir' is not enough: it is created before the download starts, so a
    # failed download would otherwise never be retried.
    # If preprocessed files exist already, the raw PNG folders are not required.
    if not glob(os.path.join(data_dir, "h5_data", "*.h5")):
        os.makedirs(data_dir, exist_ok=True)
        for folder, zip_name, url, checksum in sources:
            if os.path.isdir(os.path.join(data_dir, folder)):
                continue
            zip_path = os.path.join(path, zip_name)
            util.download_source(zip_path, url, download, checksum=checksum)
            util.unzip(zip_path, data_dir)

    return _preprocess(data_dir)


def get_bbbc030_paths(
    path: Union[os.PathLike, str],
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the BBBC030 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data (H5, key 'raw').
        List of filepaths for the label data (H5, key 'labels').

    Raises:
        ValueError: If `split` is neither None nor one of 'train', 'val', 'test'.
        RuntimeError: If no preprocessed H5 files are found.
    """
    # Validate before any download / preprocessing, and with a real exception:
    # an `assert` would be stripped when Python runs with -O.
    valid_splits = ["train", "val", "test"]
    if split is not None and split not in valid_splits:
        raise ValueError(f"'{split}' is not a valid split. Choose from {valid_splits}.")

    h5_dir = get_bbbc030_data(path, download)
    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))

    if not h5_paths:
        raise RuntimeError(f"No preprocessed files found in {h5_dir}.")

    # Raw data and labels are stored in the same H5 file, so both lists are identical.
    if split is None:
        return h5_paths, h5_paths

    # Fixed seeds keep the split reproducible: 20% test, then 15% of the rest for val.
    train_paths, test_paths = train_test_split(h5_paths, test_size=0.2, random_state=42)
    train_paths, val_paths = train_test_split(train_paths, test_size=0.15, random_state=42)

    split_map = {"train": train_paths, "val": val_paths, "test": test_paths}
    selected = split_map[split]
    return selected, selected


def get_bbbc030_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the BBBC030 dataset for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_bbbc030_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key="raw",
        label_paths=label_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )


def get_bbbc030_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the BBBC030 dataloader for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_bbbc030_dataset(path, patch_shape, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the BBBC030 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed H5 data directory.
    """
    data_dir = os.path.join(path, "BBBC030")
    sources = (
        ("images", "BBBC030_images.zip", IMAGE_URL, IMAGE_CHECKSUM),
        ("ground_truth", "BBBC030_ground_truth.zip", GT_URL, GT_CHECKSUM),
    )

    # Decide per archive whether it still has to be fetched. Checking only for
    # 'data_dir' is not enough: it is created before the download starts, so a
    # failed download would otherwise never be retried.
    # If preprocessed files exist already, the raw PNG folders are not required.
    if not glob(os.path.join(data_dir, "h5_data", "*.h5")):
        os.makedirs(data_dir, exist_ok=True)
        for folder, zip_name, url, checksum in sources:
            if os.path.isdir(os.path.join(data_dir, folder)):
                continue
            zip_path = os.path.join(path, zip_name)
            util.download_source(zip_path, url, download, checksum=checksum)
            util.unzip(zip_path, data_dir)

    return _preprocess(data_dir)
Download and preprocess the BBBC030 dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the preprocessed H5 data directory.
def get_bbbc030_paths(
    path: Union[os.PathLike, str],
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the BBBC030 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data (H5, key 'raw').
        List of filepaths for the label data (H5, key 'labels').

    Raises:
        ValueError: If `split` is neither None nor one of 'train', 'val', 'test'.
        RuntimeError: If no preprocessed H5 files are found.
    """
    # Validate before any download / preprocessing, and with a real exception:
    # an `assert` would be stripped when Python runs with -O.
    valid_splits = ["train", "val", "test"]
    if split is not None and split not in valid_splits:
        raise ValueError(f"'{split}' is not a valid split. Choose from {valid_splits}.")

    h5_dir = get_bbbc030_data(path, download)
    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))

    if not h5_paths:
        raise RuntimeError(f"No preprocessed files found in {h5_dir}.")

    # Raw data and labels are stored in the same H5 file, so both lists are identical.
    if split is None:
        return h5_paths, h5_paths

    # Fixed seeds keep the split reproducible: 20% test, then 15% of the rest for val.
    train_paths, test_paths = train_test_split(h5_paths, test_size=0.2, random_state=42)
    train_paths, val_paths = train_test_split(train_paths, test_size=0.15, random_state=42)

    split_map = {"train": train_paths, "val": val_paths, "test": test_paths}
    selected = split_map[split]
    return selected, selected
Get paths to the BBBC030 data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. One of 'train', 'val', 'test', or None (use all).
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data (H5, key 'raw'). List of filepaths for the label data (H5, key 'labels').
def get_bbbc030_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the BBBC030 segmentation dataset (DIC cell instance segmentation).

    The file lists are obtained from `get_bbbc030_paths` and handed to
    `torch_em.default_segmentation_dataset`, reading the image data from the
    key 'raw' and the labels from the key 'labels'.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, label_files = get_bbbc030_paths(path=path, split=split, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key="raw",
        label_paths=label_files,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
Get the BBBC030 dataset for DIC cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train', 'val', 'test', or None (use all).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_bbbc030_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the BBBC030 dataloader (DIC cell instance segmentation).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the remaining ones,
    # which are forwarded to the loader construction.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    ds = get_bbbc030_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
    return loader
Get the BBBC030 dataloader for DIC cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train', 'val', 'test', or None (use all).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.