torch_em.data.datasets.light_microscopy.bbbc030

The BBBC030 dataset contains 60 Differential Interference Contrast (DIC) images of Chinese Hamster Ovary (CHO) cells acquired during initial cell attachment, with hand-segmented cell contour ground truth annotations.

Raw images are RGB-encoded grayscale (R=G=B). Ground truth files are contour/boundary maps (thin cell outlines), which are converted to instance segmentation labels by finding the enclosed regions and labeling them with connected components.

The dataset is located at https://bbbc.broadinstitute.org/BBBC030. This dataset is from the following publication: Koos et al. (2016), https://doi.org/10.1371/journal.pone.0163431. Please cite it if you use this dataset in your research.

  1"""The BBBC030 dataset contains 60 Differential Interference Contrast (DIC) images
  2of Chinese Hamster Ovary (CHO) cells acquired during initial cell attachment, with
  3hand-segmented cell contour ground truth annotations.
  4
  5Raw images are RGB-encoded grayscale (R=G=B). Ground truth files are contour/boundary
  6maps (thin cell outlines), which are converted to instance segmentation labels by
  7finding the enclosed regions and labeling them with connected components.
  8
  9The dataset is located at https://bbbc.broadinstitute.org/BBBC030.
 10This dataset is from the following publication:
 11- Koos et al. (2016): https://doi.org/10.1371/journal.pone.0163431
 12Please cite it if you use this dataset in your research.
 13"""
 14
 15import os
 16from glob import glob
 17from natsort import natsorted
 18from typing import List, Optional, Tuple, Union
 19
 20import numpy as np
 21import imageio.v3 as imageio
 22from tqdm import tqdm
 23from sklearn.model_selection import train_test_split
 24
 25from torch.utils.data import Dataset, DataLoader
 26
 27import torch_em
 28
 29from .. import util
 30
 31
# Download locations for the BBBC030 archives (raw DIC images and contour ground truth).
IMAGE_URL = "https://data.broadinstitute.org/bbbc/BBBC030/images.zip"
# Checksum deliberately unset (None): the download is not verified against a hash.
IMAGE_CHECKSUM = None

GT_URL = "https://data.broadinstitute.org/bbbc/BBBC030/ground_truth.zip"
# Checksum deliberately unset (None): the download is not verified against a hash.
GT_CHECKSUM = None
 37
 38
 39def _contours_to_instances(contour_mask: np.ndarray) -> np.ndarray:
 40    """Convert a contour/boundary map to an instance segmentation label image.
 41
 42    Cells are identified as enclosed regions surrounded by boundary pixels.
 43    The large background region is removed; remaining connected components are
 44    each assigned a unique integer label.
 45    """
 46    from skimage.morphology import binary_dilation, disk
 47    from skimage.measure import label, regionprops
 48
 49    boundaries = contour_mask > 0
 50
 51    # Dilate slightly to close small gaps in hand-drawn contours.
 52    closed = binary_dilation(boundaries, disk(2))
 53
 54    # Enclosed interior regions are the complement of the closed boundaries.
 55    interior = ~closed
 56    labeled = label(interior)
 57
 58    # The largest connected component is the background - remove it.
 59    props = regionprops(labeled)
 60    if not props:
 61        return np.zeros_like(contour_mask, dtype=np.int32)
 62
 63    bg_label = max(props, key=lambda p: p.area).label
 64    labeled[labeled == bg_label] = 0
 65
 66    return labeled.astype(np.int32)
 67
 68
 69def _preprocess(data_dir: str) -> str:
 70    """Convert raw PNGs to preprocessed H5 files (grayscale raw + instance labels)."""
 71    import h5py
 72
 73    h5_dir = os.path.join(data_dir, "h5_data")
 74    if os.path.exists(h5_dir):
 75        return h5_dir
 76    os.makedirs(h5_dir, exist_ok=True)
 77
 78    raw_paths = natsorted(glob(os.path.join(data_dir, "images", "*.png")))
 79    for raw_path in tqdm(raw_paths, desc="Preprocessing BBBC030"):
 80        fname = os.path.splitext(os.path.basename(raw_path))[0]
 81        h5_path = os.path.join(h5_dir, fname + ".h5")
 82
 83        gt_path = os.path.join(data_dir, "ground_truth", os.path.basename(raw_path))
 84        if not os.path.exists(gt_path):
 85            continue
 86
 87        raw = imageio.imread(raw_path)
 88        if raw.ndim == 3:  # grayscale saved as RGB
 89            raw = raw[..., 0]
 90
 91        contours = imageio.imread(gt_path)
 92        instances = _contours_to_instances(contours)
 93
 94        with h5py.File(h5_path, "w") as f:
 95            f.create_dataset("raw", data=raw, compression="gzip")
 96            f.create_dataset("labels", data=instances, compression="gzip")
 97
 98    return h5_dir
 99
100
def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the BBBC030 dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed H5 data directory.
    """
    data_dir = os.path.join(path, "BBBC030")

    # Key the download check on the extracted 'images' folder (the folder the
    # preprocessing reads from) rather than on 'data_dir' itself: a failed or
    # interrupted download would otherwise leave behind an empty 'data_dir'
    # that permanently skips the download on retry.
    if not os.path.exists(os.path.join(data_dir, "images")):
        os.makedirs(data_dir, exist_ok=True)
        img_zip = os.path.join(path, "BBBC030_images.zip")
        gt_zip = os.path.join(path, "BBBC030_ground_truth.zip")
        util.download_source(img_zip, IMAGE_URL, download, checksum=IMAGE_CHECKSUM)
        util.download_source(gt_zip, GT_URL, download, checksum=GT_CHECKSUM)
        util.unzip(img_zip, data_dir)
        util.unzip(gt_zip, data_dir)

    return _preprocess(data_dir)
123
124
def get_bbbc030_paths(
    path: Union[os.PathLike, str],
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the BBBC030 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data (H5, key 'raw').
        List of filepaths for the label data (H5, key 'labels').

    Raises:
        ValueError: If 'split' is not one of the supported values.
        RuntimeError: If no preprocessed files are found.
    """
    # Validate 'split' up-front with a real exception: 'assert' is stripped
    # under 'python -O' and must not be used for input validation. Checking
    # before downloading also fails fast on a bad argument.
    valid_splits = ("train", "val", "test")
    if split is not None and split not in valid_splits:
        raise ValueError(f"'{split}' is not a valid split. Choose from {list(valid_splits)}.")

    h5_dir = get_bbbc030_data(path, download)
    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))

    if len(h5_paths) == 0:
        raise RuntimeError(f"No preprocessed files found in {h5_dir}.")

    if split is None:
        return h5_paths, h5_paths

    # Fixed seeds keep the partition reproducible across calls:
    # 80/20 train/test, then 15% of the train share held out for val.
    train_paths, test_paths = train_test_split(h5_paths, test_size=0.2, random_state=42)
    train_paths, val_paths = train_test_split(train_paths, test_size=0.15, random_state=42)

    split_map = {"train": train_paths, "val": val_paths, "test": test_paths}
    selected = split_map[split]
    return selected, selected
157
158
def get_bbbc030_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the BBBC030 dataset for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_bbbc030_paths(path, split, download)

    # Raw data and labels live in the same H5 files under separate keys.
    dataset_kwargs = dict(
        raw_paths=image_paths,
        raw_key="raw",
        label_paths=gt_paths,
        label_key="labels",
        patch_shape=patch_shape,
    )
    return torch_em.default_segmentation_dataset(**dataset_kwargs, **kwargs)
188
189
def get_bbbc030_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the BBBC030 dataloader for DIC cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route the mixed kwargs to the dataset factory vs. the DataLoader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_bbbc030_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(ds, batch_size, **loader_kwargs)
    return loader
IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC030/images.zip'
IMAGE_CHECKSUM = None
GT_URL = 'https://data.broadinstitute.org/bbbc/BBBC030/ground_truth.zip'
GT_CHECKSUM = None
def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str:
102def get_bbbc030_data(path: Union[os.PathLike, str], download: bool = False) -> str:
103    """Download and preprocess the BBBC030 dataset.
104
105    Args:
106        path: Filepath to a folder where the downloaded data will be saved.
107        download: Whether to download the data if it is not present.
108
109    Returns:
110        The filepath to the preprocessed H5 data directory.
111    """
112    data_dir = os.path.join(path, "BBBC030")
113
114    if not os.path.exists(data_dir):
115        os.makedirs(data_dir, exist_ok=True)
116        img_zip = os.path.join(path, "BBBC030_images.zip")
117        gt_zip = os.path.join(path, "BBBC030_ground_truth.zip")
118        util.download_source(img_zip, IMAGE_URL, download, checksum=IMAGE_CHECKSUM)
119        util.download_source(gt_zip, GT_URL, download, checksum=GT_CHECKSUM)
120        util.unzip(img_zip, data_dir)
121        util.unzip(gt_zip, data_dir)
122
123    return _preprocess(data_dir)

Download and preprocess the BBBC030 dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the preprocessed H5 data directory.

def get_bbbc030_paths( path: Union[os.PathLike, str], split: Optional[str] = None, download: bool = False) -> Tuple[List[str], List[str]]:
126def get_bbbc030_paths(
127    path: Union[os.PathLike, str],
128    split: Optional[str] = None,
129    download: bool = False,
130) -> Tuple[List[str], List[str]]:
131    """Get paths to the BBBC030 data.
132
133    Args:
134        path: Filepath to a folder where the downloaded data will be saved.
135        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
136        download: Whether to download the data if it is not present.
137
138    Returns:
139        List of filepaths for the image data (H5, key 'raw').
140        List of filepaths for the label data (H5, key 'labels').
141    """
142    h5_dir = get_bbbc030_data(path, download)
143    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
144
145    if len(h5_paths) == 0:
146        raise RuntimeError(f"No preprocessed files found in {h5_dir}.")
147
148    if split is None:
149        return h5_paths, h5_paths
150
151    train_paths, test_paths = train_test_split(h5_paths, test_size=0.2, random_state=42)
152    train_paths, val_paths = train_test_split(train_paths, test_size=0.15, random_state=42)
153
154    split_map = {"train": train_paths, "val": val_paths, "test": test_paths}
155    assert split in split_map, f"'{split}' is not a valid split. Choose from {list(split_map)}."
156    selected = split_map[split]
157    return selected, selected

Get paths to the BBBC030 data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. One of 'train', 'val', 'test', or None (use all).
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data (H5, key 'raw'). List of filepaths for the label data (H5, key 'labels').

def get_bbbc030_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
160def get_bbbc030_dataset(
161    path: Union[os.PathLike, str],
162    patch_shape: Tuple[int, int],
163    split: Optional[str] = None,
164    download: bool = False,
165    **kwargs,
166) -> Dataset:
167    """Get the BBBC030 dataset for DIC cell instance segmentation.
168
169    Args:
170        path: Filepath to a folder where the downloaded data will be saved.
171        patch_shape: The patch shape to use for training.
172        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
173        download: Whether to download the data if it is not present.
174        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
175
176    Returns:
177        The segmentation dataset.
178    """
179    raw_paths, label_paths = get_bbbc030_paths(path, split, download)
180
181    return torch_em.default_segmentation_dataset(
182        raw_paths=raw_paths,
183        raw_key="raw",
184        label_paths=label_paths,
185        label_key="labels",
186        patch_shape=patch_shape,
187        **kwargs,
188    )

Get the BBBC030 dataset for DIC cell instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. One of 'train', 'val', 'test', or None (use all).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_bbbc030_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
191def get_bbbc030_loader(
192    path: Union[os.PathLike, str],
193    batch_size: int,
194    patch_shape: Tuple[int, int],
195    split: Optional[str] = None,
196    download: bool = False,
197    **kwargs,
198) -> DataLoader:
199    """Get the BBBC030 dataloader for DIC cell instance segmentation.
200
201    Args:
202        path: Filepath to a folder where the downloaded data will be saved.
203        batch_size: The batch size for training.
204        patch_shape: The patch shape to use for training.
205        split: The data split to use. One of 'train', 'val', 'test', or None (use all).
206        download: Whether to download the data if it is not present.
207        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
208
209    Returns:
210        The DataLoader.
211    """
212    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
213    dataset = get_bbbc030_dataset(path, patch_shape, split, download, **ds_kwargs)
214    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the BBBC030 dataloader for DIC cell instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. One of 'train', 'val', 'test', or None (use all).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.