torch_em.data.datasets.light_microscopy.bbbc030
The BBBC030 dataset contains 60 Differential Interference Contrast (DIC) images of Chinese Hamster Ovary (CHO) cells acquired during initial cell attachment, with hand-segmented cell contour ground truth annotations.
Raw images are RGB-encoded grayscale (R=G=B). Ground truth files are contour/boundary maps (thin cell outlines), which are converted to instance segmentation labels by finding the enclosed regions and labeling them with connected components.
The dataset is located at https://bbbc.broadinstitute.org/BBBC030. This dataset is from the following publication:
- Koos et al. (2016): https://doi.org/10.1371/journal.pone.0163431 Please cite it if you use this dataset in your research.
"""The BBBC030 dataset contains 60 Differential Interference Contrast (DIC) images
of Chinese Hamster Ovary (CHO) cells acquired during initial cell attachment, with
hand-segmented cell contour ground truth annotations.

Raw images are RGB-encoded grayscale (R=G=B). Ground truth files are contour/boundary
maps (thin cell outlines), which are converted to instance segmentation labels by
finding the enclosed regions and labeling them with connected components.

The dataset is located at https://bbbc.broadinstitute.org/BBBC030.
This dataset is from the following publication:
- Koos et al. (2016): https://doi.org/10.1371/journal.pone.0163431
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from natsort import natsorted
from typing import List, Optional, Tuple, Union

import numpy as np
import imageio.v3 as imageio
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


IMAGE_URL = "https://data.broadinstitute.org/bbbc/BBBC030/images.zip"
IMAGE_CHECKSUM = None

GT_URL = "https://data.broadinstitute.org/bbbc/BBBC030/ground_truth.zip"
GT_CHECKSUM = None


def _contours_to_instances(contour_mask: np.ndarray) -> np.ndarray:
    """Convert a contour/boundary map to an instance segmentation label image.

    Cells are identified as enclosed regions surrounded by boundary pixels.
    The large background region is removed; remaining connected components are
    each assigned a unique integer label.

    Args:
        contour_mask: The contour map; any value > 0 is treated as boundary.

    Returns:
        An int32 label image of the same shape, with background 0 and one
        integer id per enclosed cell region.
    """
    from skimage.morphology import binary_dilation, disk
    from skimage.measure import label, regionprops

    boundaries = contour_mask > 0

    # Dilate slightly to close small gaps in hand-drawn contours,
    # so that enclosed regions do not leak into the background.
    closed = binary_dilation(boundaries, disk(2))

    # Enclosed interior regions are the complement of the closed boundaries.
    interior = ~closed
    labeled = label(interior)

    # The largest connected component is the background - remove it.
    props = regionprops(labeled)
    if not props:
        # Degenerate case: the whole image is boundary, nothing to label.
        return np.zeros_like(contour_mask, dtype=np.int32)

    bg_label = max(props, key=lambda p: p.area).label
    labeled[labeled == bg_label] = 0

    return labeled.astype(np.int32)


def _preprocess(data_dir: str) -> str:
    """Convert raw PNGs to preprocessed H5 files (grayscale raw + instance labels).

    Args:
        data_dir: Directory containing the extracted 'images' and 'ground_truth' folders.

    Returns:
        The directory containing the preprocessed H5 files.
    """
    import h5py

    h5_dir = os.path.join(data_dir, "h5_data")
    # If the output directory already exists we assume preprocessing is done.
    if os.path.exists(h5_dir):
        return h5_dir
    os.makedirs(h5_dir, exist_ok=True)

    raw_paths = natsorted(glob(os.path.join(data_dir, "images", "*.png")))
    for raw_path in tqdm(raw_paths, desc="Preprocessing BBBC030"):
        fname = os.path.splitext(os.path.basename(raw_path))[0]
        h5_path = os.path.join(h5_dir, fname + ".h5")

        # Only keep images that have a matching ground-truth contour file.
        gt_path = os.path.join(data_dir, "ground_truth", os.path.basename(raw_path))
        if not os.path.exists(gt_path):
            continue

        raw = imageio.imread(raw_path)
        if raw.ndim == 3:  # grayscale saved as RGB: all channels identical, keep one.
            raw = raw[..., 0]

        contours = imageio.imread(gt_path)
        instances = _contours_to_instances(contours)

        with h5py.File(h5_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels", data=instances, compression="gzip")

    return h5_dir


def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the BBBC030 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed H5 data directory.
    """
    data_dir = os.path.join(path, "BBBC030")

    if not os.path.exists(data_dir):
        os.makedirs(data_dir, exist_ok=True)
        img_zip = os.path.join(path, "BBBC030_images.zip")
        gt_zip = os.path.join(path, "BBBC030_ground_truth.zip")
        util.download_source(img_zip, IMAGE_URL, download, checksum=IMAGE_CHECKSUM)
        util.download_source(gt_zip, GT_URL, download, checksum=GT_CHECKSUM)
        util.unzip(img_zip, data_dir)
        util.unzip(gt_zip, data_dir)

    return _preprocess(data_dir)


def get_bbbc030_paths(
    path: Union[os.PathLike, str],
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the BBBC030 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data (H5, key 'raw').
        List of filepaths for the label data (H5, key 'labels').
    """
    h5_dir = get_bbbc030_data(path, download)
    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))

    if len(h5_paths) == 0:
        raise RuntimeError(f"No preprocessed files found in {h5_dir}.")

    if split is None:
        return h5_paths, h5_paths

    # Validate before computing the splits; a ValueError (unlike the previous
    # assert) survives 'python -O' and is the conventional error for bad arguments.
    valid_splits = ("train", "val", "test")
    if split not in valid_splits:
        raise ValueError(f"'{split}' is not a valid split. Choose from {list(valid_splits)}.")

    # Fixed random_state so the train/val/test partition is reproducible across calls.
    train_paths, test_paths = train_test_split(h5_paths, test_size=0.2, random_state=42)
    train_paths, val_paths = train_test_split(train_paths, test_size=0.15, random_state=42)

    split_map = {"train": train_paths, "val": val_paths, "test": test_paths}
    selected = split_map[split]
    # Raw and labels live in the same H5 file, so both lists are identical.
    return selected, selected


def get_bbbc030_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the BBBC030 dataset for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_bbbc030_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key="raw",
        label_paths=label_paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )


def get_bbbc030_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the BBBC030 dataloader for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_bbbc030_dataset(path, patch_shape, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
102def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str: 103 """Download and preprocess the BBBC030 dataset. 104 105 Args: 106 path: Filepath to a folder where the downloaded data will be saved. 107 download: Whether to download the data if it is not present. 108 109 Returns: 110 The filepath to the preprocessed H5 data directory. 111 """ 112 data_dir = os.path.join(path, "BBBC030") 113 114 if not os.path.exists(data_dir): 115 os.makedirs(data_dir, exist_ok=True) 116 img_zip = os.path.join(path, "BBBC030_images.zip") 117 gt_zip = os.path.join(path, "BBBC030_ground_truth.zip") 118 util.download_source(img_zip, IMAGE_URL, download, checksum=IMAGE_CHECKSUM) 119 util.download_source(gt_zip, GT_URL, download, checksum=GT_CHECKSUM) 120 util.unzip(img_zip, data_dir) 121 util.unzip(gt_zip, data_dir) 122 123 return _preprocess(data_dir)
Download and preprocess the BBBC030 dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the preprocessed H5 data directory.
126def get_bbbc030_paths( 127 path: Union[os.PathLike, str], 128 split: Optional[str] = None, 129 download: bool = False, 130) -> Tuple[List[str], List[str]]: 131 """Get paths to the BBBC030 data. 132 133 Args: 134 path: Filepath to a folder where the downloaded data will be saved. 135 split: The data split to use. One of 'train', 'val', 'test', or None (use all). 136 download: Whether to download the data if it is not present. 137 138 Returns: 139 List of filepaths for the image data (H5, key 'raw'). 140 List of filepaths for the label data (H5, key 'labels'). 141 """ 142 h5_dir = get_bbbc030_data(path, download) 143 h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5"))) 144 145 if len(h5_paths) == 0: 146 raise RuntimeError(f"No preprocessed files found in {h5_dir}.") 147 148 if split is None: 149 return h5_paths, h5_paths 150 151 train_paths, test_paths = train_test_split(h5_paths, test_size=0.2, random_state=42) 152 train_paths, val_paths = train_test_split(train_paths, test_size=0.15, random_state=42) 153 154 split_map = {"train": train_paths, "val": val_paths, "test": test_paths} 155 assert split in split_map, f"'{split}' is not a valid split. Choose from {list(split_map)}." 156 selected = split_map[split] 157 return selected, selected
Get paths to the BBBC030 data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. One of 'train', 'val', 'test', or None (use all).
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data (H5, key 'raw'). List of filepaths for the label data (H5, key 'labels').
160def get_bbbc030_dataset( 161 path: Union[os.PathLike, str], 162 patch_shape: Tuple[int, int], 163 split: Optional[str] = None, 164 download: bool = False, 165 **kwargs, 166) -> Dataset: 167 """Get the BBBC030 dataset for DIC cell instance segmentation. 168 169 Args: 170 path: Filepath to a folder where the downloaded data will be saved. 171 patch_shape: The patch shape to use for training. 172 split: The data split to use. One of 'train', 'val', 'test', or None (use all). 173 download: Whether to download the data if it is not present. 174 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 175 176 Returns: 177 The segmentation dataset. 178 """ 179 raw_paths, label_paths = get_bbbc030_paths(path, split, download) 180 181 return torch_em.default_segmentation_dataset( 182 raw_paths=raw_paths, 183 raw_key="raw", 184 label_paths=label_paths, 185 label_key="labels", 186 patch_shape=patch_shape, 187 **kwargs, 188 )
Get the BBBC030 dataset for DIC cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train', 'val', 'test', or None (use all).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
191def get_bbbc030_loader( 192 path: Union[os.PathLike, str], 193 batch_size: int, 194 patch_shape: Tuple[int, int], 195 split: Optional[str] = None, 196 download: bool = False, 197 **kwargs, 198) -> DataLoader: 199 """Get the BBBC030 dataloader for DIC cell instance segmentation. 200 201 Args: 202 path: Filepath to a folder where the downloaded data will be saved. 203 batch_size: The batch size for training. 204 patch_shape: The patch shape to use for training. 205 split: The data split to use. One of 'train', 'val', 'test', or None (use all). 206 download: Whether to download the data if it is not present. 207 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 208 209 Returns: 210 The DataLoader. 211 """ 212 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 213 dataset = get_bbbc030_dataset(path, patch_shape, split, download, **ds_kwargs) 214 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the BBBC030 dataloader for DIC cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train', 'val', 'test', or None (use all).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
`torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.