torch_em.data.datasets.light_microscopy.bbbc030

The BBBC030 dataset contains 60 Differential Interference Contrast (DIC) images of Chinese Hamster Ovary (CHO) cells acquired during initial cell attachment, with hand-segmented cell contour ground truth annotations.

Raw images are RGB-encoded grayscale (R=G=B). Ground truth files are contour/boundary maps (thin cell outlines), which are converted to instance segmentation labels by finding the enclosed regions and labeling them with connected components.

The dataset is located at https://bbbc.broadinstitute.org/BBBC030. This dataset is from the following publication:

Koos et al. (2016): https://doi.org/10.1371/journal.pone.0163431 Please cite it if you use this dataset in your research.

View Source

  1"""The BBBC030 dataset contains 60 Differential Interference Contrast (DIC) images
  2of Chinese Hamster Ovary (CHO) cells acquired during initial cell attachment, with
  3hand-segmented cell contour ground truth annotations.
  4
  5Raw images are RGB-encoded grayscale (R=G=B). Ground truth files are contour/boundary
  6maps (thin cell outlines), which are converted to instance segmentation labels by
  7finding the enclosed regions and labeling them with connected components.
  8
  9The dataset is located at https://bbbc.broadinstitute.org/BBBC030.
 10This dataset is from the following publication:
 11- Koos et al. (2016): https://doi.org/10.1371/journal.pone.0163431
 12Please cite it if you use this dataset in your research.
 13"""
 14
 15import os
 16from glob import glob
 17from natsort import natsorted
 18from typing import List, Optional, Tuple, Union
 19
 20import numpy as np
 21import imageio.v3 as imageio
 22from tqdm import tqdm
 23from sklearn.model_selection import train_test_split
 24
 25from torch.utils.data import Dataset, DataLoader
 26
 27import torch_em
 28
 29from .. import util
 30
 31
# Download location of the zipped raw images.
IMAGE_URL = "https://data.broadinstitute.org/bbbc/BBBC030/images.zip"
# No reference checksum is recorded for the image archive; None is forwarded to the downloader.
IMAGE_CHECKSUM = None

# Download location of the zipped contour ground truth.
GT_URL = "https://data.broadinstitute.org/bbbc/BBBC030/ground_truth.zip"
# No reference checksum is recorded for the ground-truth archive; None is forwarded to the downloader.
GT_CHECKSUM = None
 37
 38
 39def _contours_to_instances(contour_mask: np.ndarray) -> np.ndarray:
 40    """Convert a contour/boundary map to an instance segmentation label image.
 41
 42    Cells are identified as enclosed regions surrounded by boundary pixels.
 43    The large background region is removed; remaining connected components are
 44    each assigned a unique integer label.
 45    """
 46    from skimage.morphology import binary_dilation, disk
 47    from skimage.measure import regionprops
 48    from bioimage_cpp.segmentation import label
 49
 50    boundaries = contour_mask > 0
 51
 52    # Dilate slightly to close small gaps in hand-drawn contours.
 53    closed = binary_dilation(boundaries, disk(2))
 54
 55    # Enclosed interior regions are the complement of the closed boundaries.
 56    interior = ~closed
 57    labeled = label(interior)
 58
 59    # The largest connected component is the background - remove it.
 60    props = regionprops(labeled)
 61    if not props:
 62        return np.zeros_like(contour_mask, dtype=np.int32)
 63
 64    bg_label = max(props, key=lambda p: p.area).label
 65    labeled[labeled == bg_label] = 0
 66
 67    return labeled.astype(np.int32)
 68
 69
 70def _preprocess(data_dir: str) -> str:
 71    """Convert raw PNGs to preprocessed H5 files (grayscale raw + instance labels)."""
 72    import h5py
 73
 74    h5_dir = os.path.join(data_dir, "h5_data")
 75    if os.path.exists(h5_dir):
 76        return h5_dir
 77    os.makedirs(h5_dir, exist_ok=True)
 78
 79    raw_paths = natsorted(glob(os.path.join(data_dir, "images", "*.png")))
 80    for raw_path in tqdm(raw_paths, desc="Preprocessing BBBC030"):
 81        fname = os.path.splitext(os.path.basename(raw_path))[0]
 82        h5_path = os.path.join(h5_dir, fname + ".h5")
 83
 84        gt_path = os.path.join(data_dir, "ground_truth", os.path.basename(raw_path))
 85        if not os.path.exists(gt_path):
 86            continue
 87
 88        raw = imageio.imread(raw_path)
 89        if raw.ndim == 3:  # grayscale saved as RGB
 90            raw = raw[..., 0]
 91
 92        contours = imageio.imread(gt_path)
 93        instances = _contours_to_instances(contours)
 94
 95        with h5py.File(h5_path, "w") as f:
 96            f.create_dataset("raw", data=raw, compression="gzip")
 97            f.create_dataset("labels", data=instances, compression="gzip")
 98
 99    return h5_dir
100
101
def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the BBBC030 dataset.

    The archives are fetched and extracted into `<path>/BBBC030` unless the extracted
    'images' and 'ground_truth' folders are both present, or preprocessed H5 files
    already exist. The decision is deliberately not based on the mere existence of
    `<path>/BBBC030`: that folder is created before the download starts, so a failed
    download would otherwise leave an empty folder that disables all later downloads.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed H5 data directory.
    """
    data_dir = os.path.join(path, "BBBC030")

    have_sources = all(os.path.isdir(os.path.join(data_dir, name)) for name in ("images", "ground_truth"))
    have_cache = bool(glob(os.path.join(data_dir, "h5_data", "*.h5")))

    if not (have_sources or have_cache):
        os.makedirs(data_dir, exist_ok=True)
        img_zip = os.path.join(path, "BBBC030_images.zip")
        gt_zip = os.path.join(path, "BBBC030_ground_truth.zip")
        # NOTE(review): presumably this raises if `download` is False and the archive is
        # missing - confirm against `util.download_source`.
        util.download_source(img_zip, IMAGE_URL, download, checksum=IMAGE_CHECKSUM)
        util.download_source(gt_zip, GT_URL, download, checksum=GT_CHECKSUM)
        util.unzip(img_zip, data_dir)
        util.unzip(gt_zip, data_dir)

    return _preprocess(data_dir)
124
125
def get_bbbc030_paths(
    path: Union[os.PathLike, str],
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the BBBC030 data.

    The split is derived deterministically (fixed random state): 20% of the files form the
    test split, and 15% of the remaining files form the validation split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data (H5, key 'raw').
        List of filepaths for the label data (H5, key 'labels').
        Both lists contain the same files, since each H5 file stores image and labels.

    Raises:
        ValueError: If `split` is neither None nor one of 'train', 'val', 'test'.
        RuntimeError: If no preprocessed H5 files are found.
    """
    # Validate before any download / preprocessing is triggered. An explicit exception is
    # used instead of `assert`, which is removed when Python runs with -O.
    valid_splits = ("train", "val", "test")
    if split is not None and split not in valid_splits:
        raise ValueError(f"'{split}' is not a valid split. Choose from {list(valid_splits)}.")

    h5_dir = get_bbbc030_data(path, download)
    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))

    if not h5_paths:
        raise RuntimeError(f"No preprocessed files found in {h5_dir}.")

    if split is None:
        return h5_paths, h5_paths

    train_paths, test_paths = train_test_split(h5_paths, test_size=0.2, random_state=42)
    train_paths, val_paths = train_test_split(train_paths, test_size=0.15, random_state=42)

    selected = {"train": train_paths, "val": val_paths, "test": test_paths}[split]
    return selected, selected
158
159
def get_bbbc030_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the BBBC030 dataset for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Images and labels live in the same H5 files and are told apart by their dataset key.
    image_files, label_files = get_bbbc030_paths(path=path, split=split, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key="raw",
        label_paths=label_files,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
189
190
def get_bbbc030_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the BBBC030 dataloader for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_bbbc030_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
    return loader

IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC030/images.zip'

IMAGE_CHECKSUM = None

GT_URL = 'https://data.broadinstitute.org/bbbc/BBBC030/ground_truth.zip'

GT_CHECKSUM = None

def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the BBBC030 dataset.

    The archives are fetched and extracted into `<path>/BBBC030` unless the extracted
    'images' and 'ground_truth' folders are both present, or preprocessed H5 files
    already exist. The decision is deliberately not based on the mere existence of
    `<path>/BBBC030`: that folder is created before the download starts, so a failed
    download would otherwise leave an empty folder that disables all later downloads.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed H5 data directory.
    """
    data_dir = os.path.join(path, "BBBC030")

    have_sources = all(os.path.isdir(os.path.join(data_dir, name)) for name in ("images", "ground_truth"))
    have_cache = bool(glob(os.path.join(data_dir, "h5_data", "*.h5")))

    if not (have_sources or have_cache):
        os.makedirs(data_dir, exist_ok=True)
        img_zip = os.path.join(path, "BBBC030_images.zip")
        gt_zip = os.path.join(path, "BBBC030_ground_truth.zip")
        # NOTE(review): presumably this raises if `download` is False and the archive is
        # missing - confirm against `util.download_source`.
        util.download_source(img_zip, IMAGE_URL, download, checksum=IMAGE_CHECKSUM)
        util.download_source(gt_zip, GT_URL, download, checksum=GT_CHECKSUM)
        util.unzip(img_zip, data_dir)
        util.unzip(gt_zip, data_dir)

    return _preprocess(data_dir)

Download and preprocess the BBBC030 dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The filepath to the preprocessed H5 data directory.

def get_bbbc030_paths( path: Union[os.PathLike, str], split: Optional[str] = None, download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_bbbc030_paths(
    path: Union[os.PathLike, str],
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the BBBC030 data.

    The split is derived deterministically (fixed random state): 20% of the files form the
    test split, and 15% of the remaining files form the validation split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data (H5, key 'raw').
        List of filepaths for the label data (H5, key 'labels').
        Both lists contain the same files, since each H5 file stores image and labels.

    Raises:
        ValueError: If `split` is neither None nor one of 'train', 'val', 'test'.
        RuntimeError: If no preprocessed H5 files are found.
    """
    # Validate before any download / preprocessing is triggered. An explicit exception is
    # used instead of `assert`, which is removed when Python runs with -O.
    valid_splits = ("train", "val", "test")
    if split is not None and split not in valid_splits:
        raise ValueError(f"'{split}' is not a valid split. Choose from {list(valid_splits)}.")

    h5_dir = get_bbbc030_data(path, download)
    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))

    if not h5_paths:
        raise RuntimeError(f"No preprocessed files found in {h5_dir}.")

    if split is None:
        return h5_paths, h5_paths

    train_paths, test_paths = train_test_split(h5_paths, test_size=0.2, random_state=42)
    train_paths, val_paths = train_test_split(train_paths, test_size=0.15, random_state=42)

    selected = {"train": train_paths, "val": val_paths, "test": test_paths}[split]
    return selected, selected

Get paths to the BBBC030 data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. One of 'train', 'val', 'test', or None (use all).
download: Whether to download the data if it is not present.

Returns:

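A tuple of two lists: the filepaths for the image data (H5, key 'raw') and the filepaths for the label data (H5, key 'labels'). Both lists contain the same H5 files, because each file stores the image and its labels under different keys.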

def get_bbbc030_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_bbbc030_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the BBBC030 dataset for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Images and labels live in the same H5 files and are told apart by their dataset key.
    image_files, label_files = get_bbbc030_paths(path=path, split=split, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key="raw",
        label_paths=label_files,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset

Get the BBBC030 dataset for DIC cell instance segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The data split to use. One of 'train', 'val', 'test', or None (use all).
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_bbbc030_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_bbbc030_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the BBBC030 dataloader for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_bbbc030_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
    return loader

Get the BBBC030 dataloader for DIC cell instance segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The data split to use. One of 'train', 'val', 'test', or None (use all).
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.