torch_em.data.datasets.light_microscopy.bbbc034

The BBBC034v1 dataset contains 3D fluorescence microscopy images of human induced pluripotent stem cell (iPSC) nuclei, with manually annotated instance segmentation ground truth from the Allen Institute for Cell Science.

The dataset contains 3 imaging conditions (1024 x 1024 x 52 voxels):

  • Single cell (AICS_12_134, 4 channels): has ground truth segmentation
  • Colony center (3 channels, no ground truth)
  • Colony edge (3 channels, no ground truth)

NOTE: The ground truth is sparsely annotated — only a subset of the visible nuclei in the annotated volume are labeled. Consider using a sampler to reject patches that contain no annotations when training.

Channels for the annotated single-cell image (AICS_12_134): C=0 (CellMask plasma membrane), C=1 (GFP), C=2 (Hoechst/DNA nuclei), C=3 (Brightfield).

The dataset is located at https://bbbc.broadinstitute.org/BBBC034. Generated by the AICS Microscopy & Image Analysis Team at the Allen Institute for Cell Science. Please cite this dataset if you use it in your research.

  1"""The BBBC034v1 dataset contains 3D fluorescence microscopy images of human
  2induced pluripotent stem cell (iPSC) nuclei, with manually annotated instance
  3segmentation ground truth from the Allen Institute for Cell Science.
  4
  5The dataset contains 3 imaging conditions (1024 x 1024 x 52 voxels):
  6- Single cell (AICS_12_134, 4 channels): has ground truth segmentation
  7- Colony center (3 channels, no ground truth)
  8- Colony edge (3 channels, no ground truth)
  9
 10NOTE: The ground truth is sparsely annotated — only a subset of the visible
 11nuclei in the annotated volume are labeled. Consider using a sampler to reject
 12patches that contain no annotations when training.
 13
 14Channels for the annotated single-cell image (AICS_12_134):
 15C=0 (CellMask plasma membrane), C=1 (GFP), C=2 (Hoechst/DNA nuclei), C=3 (Brightfield).
 16
 17The dataset is located at https://bbbc.broadinstitute.org/BBBC034.
 18Generated by the AICS Microscopy & Image Analysis Team at the Allen Institute for Cell Science.
 19Please cite this dataset if you use it in your research.
 20"""
 21
 22import os
 23from glob import glob
 24from natsort import natsorted
 25from typing import List, Tuple, Union
 26
 27from torch.utils.data import Dataset, DataLoader
 28
 29import torch_em
 30
 31from .. import util
 32
 33
# Archive with the raw image volumes of the BBBC034v1 dataset.
IMAGE_URL = "https://data.broadinstitute.org/bbbc/BBBC034/BBBC034_v1_dataset.zip"
# No expected checksum is recorded; the value is forwarded as `checksum` to `util.download_source`.
# NOTE(review): presumably `None` disables checksum verification - confirm against `util.download_source`.
IMAGE_CHECKSUM = None

# Archive with the ground-truth segmentation of the BBBC034v1 dataset.
GT_URL = "https://data.broadinstitute.org/bbbc/BBBC034/BBBC034_v1_DatasetGroundTruth.zip"
# No expected checksum is recorded for the ground-truth archive either.
GT_CHECKSUM = None
 39
 40
 41def get_bbbc034_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 42    """Download the BBBC034v1 dataset.
 43
 44    Args:
 45        path: Filepath to a folder where the downloaded data will be saved.
 46        download: Whether to download the data if it is not present.
 47
 48    Returns:
 49        The filepath to the extracted data directory.
 50    """
 51    data_dir = os.path.join(path, "BBBC034")
 52    if os.path.exists(data_dir):
 53        return data_dir
 54
 55    os.makedirs(data_dir, exist_ok=True)
 56
 57    img_zip = os.path.join(path, "BBBC034_v1_dataset.zip")
 58    gt_zip = os.path.join(path, "BBBC034_v1_DatasetGroundTruth.zip")
 59
 60    util.download_source(img_zip, IMAGE_URL, download, checksum=IMAGE_CHECKSUM)
 61    util.download_source(gt_zip, GT_URL, download, checksum=GT_CHECKSUM)
 62
 63    util.unzip(img_zip, data_dir)
 64    util.unzip(gt_zip, data_dir)
 65
 66    return data_dir
 67
 68
 69def get_bbbc034_paths(
 70    path: Union[os.PathLike, str], channel: int = 2, download: bool = False,
 71) -> Tuple[List[str], List[str]]:
 72    """Get paths to the BBBC034v1 data.
 73
 74    The ground truth segmentation covers a subset of the images. Raw images are
 75    single-channel TIFFs extracted from the multi-channel dataset.
 76
 77    Args:
 78        path: Filepath to a folder where the downloaded data will be saved.
 79        channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei).
 80            Available channels: 0=CellMask, 1=GFP, 2=Hoechst/DNA, 3=Brightfield.
 81        download: Whether to download the data if it is not present.
 82
 83    Returns:
 84        List of filepaths for the image data.
 85        List of filepaths for the label data.
 86    """
 87    data_dir = get_bbbc034_data(path, download)
 88
 89    raw_paths = natsorted(glob(os.path.join(data_dir, f"*_C={channel}.tif")))
 90    label_path = os.path.join(data_dir, "ground_truth_segmented.tif")
 91
 92    if len(raw_paths) == 0:
 93        raise RuntimeError(
 94            f"No image files found for channel {channel} in {data_dir}. "
 95            "Please check the dataset structure after downloading."
 96        )
 97    if not os.path.exists(label_path):
 98        raise RuntimeError(
 99            f"Ground truth file not found: {label_path}. Please check the dataset structure after downloading."
100        )
101
102    return raw_paths, [label_path]
103
104
def get_bbbc034_dataset(
    path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], channel: int = 2, download: bool = False, **kwargs,
) -> Dataset:
    """Build the BBBC034v1 dataset for 3D nucleus instance segmentation.

    Resolves the image and label filepaths with `get_bbbc034_paths` and hands
    them to `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_bbbc034_paths(path, channel, download)

    # The files are passed without an internal dataset key (raw_key / label_key are None).
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths, raw_key=None, label_paths=gt_paths, label_key=None, patch_shape=patch_shape, **kwargs,
    )
    return dataset
130
131
def get_bbbc034_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    channel: int = 2,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the BBBC034v1 dataloader for 3D nucleus instance segmentation.

    The extra keyword arguments are split via `util.split_kwargs` into those for
    `torch_em.default_segmentation_dataset` and the remaining ones, which are
    forwarded to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    segmentation_dataset = get_bbbc034_dataset(
        path, patch_shape, channel=channel, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(segmentation_dataset, batch_size, **dataloader_kwargs)
    return loader
# Archive with the raw image volumes of the BBBC034v1 dataset.
IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC034/BBBC034_v1_dataset.zip'
# No expected checksum is recorded; the value is forwarded as `checksum` to `util.download_source`.
# NOTE(review): presumably `None` disables checksum verification - confirm against `util.download_source`.
IMAGE_CHECKSUM = None
# Archive with the ground-truth segmentation of the BBBC034v1 dataset.
GT_URL = 'https://data.broadinstitute.org/bbbc/BBBC034/BBBC034_v1_DatasetGroundTruth.zip'
# No expected checksum is recorded for the ground-truth archive either.
GT_CHECKSUM = None
def get_bbbc034_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_bbbc034_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and extract the BBBC034v1 dataset.

    The image and ground-truth archives are stored in `path` and extracted into
    `<path>/BBBC034`. The extraction directory is only created once both archives
    are available, so a failed or skipped download does not leave behind an empty
    folder that later calls would mistake for a completed download.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    data_dir = os.path.join(path, "BBBC034")
    # An existing but empty directory is the leftover of an aborted run, not valid data.
    if os.path.isdir(data_dir) and os.listdir(data_dir):
        return data_dir

    # The archives are written next to the data directory, so that folder has to exist.
    # The guard covers a relative `path` of "" where there is no parent to create.
    parent_dir = os.path.dirname(data_dir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    img_zip = os.path.join(path, "BBBC034_v1_dataset.zip")
    gt_zip = os.path.join(path, "BBBC034_v1_DatasetGroundTruth.zip")

    # Fetch both archives before touching `data_dir`: if either call fails, no
    # directory is created and the next call starts from a clean state.
    # NOTE(review): presumably `download_source` raises when the file is missing
    # and `download` is False - confirm against `util.download_source`.
    util.download_source(img_zip, IMAGE_URL, download, checksum=IMAGE_CHECKSUM)
    util.download_source(gt_zip, GT_URL, download, checksum=GT_CHECKSUM)

    os.makedirs(data_dir, exist_ok=True)
    util.unzip(img_zip, data_dir)
    util.unzip(gt_zip, data_dir)

    return data_dir

Download the BBBC034v1 dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the extracted data directory.

def get_bbbc034_paths( path: Union[os.PathLike, str], channel: int = 2, download: bool = False) -> Tuple[List[str], List[str]]:
 70def get_bbbc034_paths(
 71    path: Union[os.PathLike, str], channel: int = 2, download: bool = False,
 72) -> Tuple[List[str], List[str]]:
 73    """Get paths to the BBBC034v1 data.
 74
 75    The ground truth segmentation covers a subset of the images. Raw images are
 76    single-channel TIFFs extracted from the multi-channel dataset.
 77
 78    Args:
 79        path: Filepath to a folder where the downloaded data will be saved.
 80        channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei).
 81            Available channels: 0=CellMask, 1=GFP, 2=Hoechst/DNA, 3=Brightfield.
 82        download: Whether to download the data if it is not present.
 83
 84    Returns:
 85        List of filepaths for the image data.
 86        List of filepaths for the label data.
 87    """
 88    data_dir = get_bbbc034_data(path, download)
 89
 90    raw_paths = natsorted(glob(os.path.join(data_dir, f"*_C={channel}.tif")))
 91    label_path = os.path.join(data_dir, "ground_truth_segmented.tif")
 92
 93    if len(raw_paths) == 0:
 94        raise RuntimeError(
 95            f"No image files found for channel {channel} in {data_dir}. "
 96            "Please check the dataset structure after downloading."
 97        )
 98    if not os.path.exists(label_path):
 99        raise RuntimeError(
100            f"Ground truth file not found: {label_path}. Please check the dataset structure after downloading."
101        )
102
103    return raw_paths, [label_path]

Get paths to the BBBC034v1 data.

The ground truth segmentation covers a subset of the images. Raw images are single-channel TIFFs extracted from the multi-channel dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei). Available channels: 0=CellMask, 1=GFP, 2=Hoechst/DNA, 3=Brightfield.
  • download: Whether to download the data if it is not present.
Returns:

A pair of lists: the filepaths for the image data and the filepaths for the label data.

def get_bbbc034_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], channel: int = 2, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_bbbc034_dataset(
    path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], channel: int = 2, download: bool = False, **kwargs,
) -> Dataset:
    """Build the BBBC034v1 dataset for 3D nucleus instance segmentation.

    Resolves the image and label filepaths with `get_bbbc034_paths` and hands
    them to `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_bbbc034_paths(path, channel, download)

    # The files are passed without an internal dataset key (raw_key / label_key are None).
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths, raw_key=None, label_paths=gt_paths, label_key=None, patch_shape=patch_shape, **kwargs,
    )
    return dataset

Get the BBBC034v1 dataset for 3D nucleus instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_bbbc034_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], channel: int = 2, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_bbbc034_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    channel: int = 2,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the BBBC034v1 dataloader for 3D nucleus instance segmentation.

    The extra keyword arguments are split via `util.split_kwargs` into those for
    `torch_em.default_segmentation_dataset` and the remaining ones, which are
    forwarded to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    segmentation_dataset = get_bbbc034_dataset(
        path, patch_shape, channel=channel, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(segmentation_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the BBBC034v1 dataloader for 3D nucleus instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • channel: The imaging channel to use as raw input. Default: 2 (Hoechst/DNA nuclei).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.