torch_em.data.datasets.light_microscopy.bccd

The BCCD dataset contains annotations for blood cell segmentation in microscopy images of blood smears.

The dataset provides 1,328 images with corresponding segmentation masks. Instance segmentation labels are derived via connected components from the semantic masks.

The dataset is located at https://www.kaggle.com/datasets/jeetblahiri/bccd-dataset-with-mask (https://doi.org/10.34740/kaggle/dsv/6107556). Please cite it (using the DOI above) if you use this dataset in your research.

  1"""The BCCD dataset contains annotations for blood cell segmentation
  2in microscopy images of blood smears.
  3
  4The dataset provides 1,328 images with corresponding segmentation masks.
  5Instance segmentation labels are derived via connected components from the semantic masks.
  6
  7The dataset is located at https://www.kaggle.com/datasets/jeetblahiri/bccd-dataset-with-mask
  8(https://doi.org/10.34740/kaggle/dsv/6107556)
  9Please cite it (the respective doi above) if you use this dataset in your research.
 10"""
 11
 12import os
 13from glob import glob
 14from typing import Union, Literal, Tuple, List
 15
 16import numpy as np
 17import imageio.v3 as imageio
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21import torch_em
 22
 23from .. import util
 24
 25
 26def _create_h5_data(path, split):
 27    """Create h5 files with raw images, semantic masks and instance labels."""
 28    import h5py
 29    from skimage.measure import label
 30    from tqdm import tqdm
 31
 32    data_dir = os.path.join(path, "data", "BCCD Dataset with mask")
 33    h5_dir = os.path.join(path, "h5_data", split)
 34    os.makedirs(h5_dir, exist_ok=True)
 35
 36    raw_dir = os.path.join(data_dir, split, "original")
 37    mask_dir = os.path.join(data_dir, split, "mask")
 38
 39    raw_paths = sorted(glob(os.path.join(raw_dir, "*.png")))
 40
 41    for raw_path in tqdm(raw_paths, desc=f"Creating h5 files for {split}"):
 42        fname = os.path.basename(raw_path)
 43        h5_path = os.path.join(h5_dir, fname.replace(".png", ".h5"))
 44
 45        if os.path.exists(h5_path):
 46            continue
 47
 48        mask_path = os.path.join(mask_dir, fname)
 49        if not os.path.exists(mask_path):
 50            continue
 51
 52        raw = imageio.imread(raw_path)
 53        mask = imageio.imread(mask_path)
 54
 55        # Convert mask to binary semantic segmentation
 56        if mask.ndim == 3:
 57            mask = mask[..., 0]  # Take first channel if RGB
 58        semantic = (mask > 0).astype("uint8")
 59
 60        # Create instance labels via connected components
 61        instances = label(semantic).astype("int64")
 62
 63        with h5py.File(h5_path, "w") as f:
 64            # Store raw as (C, H, W) if RGB
 65            if raw.ndim == 3:
 66                raw = raw.transpose(2, 0, 1)
 67            f.create_dataset("raw", data=raw, compression="gzip")
 68            f.create_dataset("labels/semantic", data=semantic, compression="gzip")
 69            f.create_dataset("labels/instances", data=instances, compression="gzip")
 70
 71    return h5_dir
 72
 73
 74def get_bccd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 75    """Download the BCCD dataset.
 76
 77    Args:
 78        path: Filepath to a folder where the downloaded data will be saved.
 79        download: Whether to download the data if it is not present.
 80
 81    Returns:
 82        The filepath to the directory with the data.
 83    """
 84    data_dir = os.path.join(path, "data", r"BCCD Dataset with mask")
 85    if os.path.exists(data_dir):
 86        return data_dir
 87
 88    os.makedirs(path, exist_ok=True)
 89
 90    util.download_source_kaggle(path=path, dataset_name="jeetblahiri/bccd-dataset-with-mask", download=download)
 91    util.unzip(zip_path=os.path.join(path, "bccd-dataset-with-mask.zip"), dst=os.path.join(path, "data"))
 92
 93    return data_dir
 94
 95
 96def get_bccd_paths(
 97    path: Union[os.PathLike, str],
 98    split: Literal["train", "test"] = "train",
 99    download: bool = False,
100) -> List[str]:
101    """Get paths to the BCCD data.
102
103    Args:
104        path: Filepath to a folder where the downloaded data will be saved.
105        split: The data split to use. One of 'train' or 'test'.
106        download: Whether to download the data if it is not present.
107
108    Returns:
109        List of filepaths for the h5 data.
110    """
111    from natsort import natsorted
112
113    assert split in ("train", "test"), f"'{split}' is not a valid split."
114
115    get_bccd_data(path, download)
116
117    h5_dir = os.path.join(path, "h5_data", split)
118    if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
119        _create_h5_data(path, split)
120
121    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
122    assert len(h5_paths) > 0, f"No data found for split '{split}'"
123
124    return h5_paths
125
126
def get_bccd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the BCCD segmentation dataset for blood cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    supported_types = ("instances", "semantic")
    assert segmentation_type in supported_types, \
        f"'{segmentation_type}' is not valid. Choose from 'instances' or 'semantic'."

    # Raw data and labels live in the same h5 files, so one list serves both.
    volume_paths = get_bccd_paths(path, split, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=np.int64,
    )
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{segmentation_type}",
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )
171
172
def get_bccd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the BCCD dataloader for blood cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    return torch_em.get_data_loader(
        dataset=get_bccd_dataset(
            path=path,
            patch_shape=patch_shape,
            split=split,
            segmentation_type=segmentation_type,
            download=download,
            **dataset_kwargs,
        ),
        batch_size=batch_size,
        **dataloader_kwargs,
    )
def get_bccd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_bccd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the BCCD data is available locally.

    If the extracted data folder is missing, the archive is obtained via
    `util.download_source_kaggle` and unpacked into `<path>/data`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    extract_root = os.path.join(path, "data")
    data_dir = os.path.join(extract_root, "BCCD Dataset with mask")

    # Only fetch and unpack the archive when the extracted folder is not there yet.
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)

        util.download_source_kaggle(
            path=path, dataset_name="jeetblahiri/bccd-dataset-with-mask", download=download
        )
        archive_path = os.path.join(path, "bccd-dataset-with-mask.zip")
        util.unzip(zip_path=archive_path, dst=extract_root)

    return data_dir

Download the BCCD dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the directory with the data.

def get_bccd_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'] = 'train', download: bool = False) -> List[str]:
 97def get_bccd_paths(
 98    path: Union[os.PathLike, str],
 99    split: Literal["train", "test"] = "train",
100    download: bool = False,
101) -> List[str]:
102    """Get paths to the BCCD data.
103
104    Args:
105        path: Filepath to a folder where the downloaded data will be saved.
106        split: The data split to use. One of 'train' or 'test'.
107        download: Whether to download the data if it is not present.
108
109    Returns:
110        List of filepaths for the h5 data.
111    """
112    from natsort import natsorted
113
114    assert split in ("train", "test"), f"'{split}' is not a valid split."
115
116    get_bccd_data(path, download)
117
118    h5_dir = os.path.join(path, "h5_data", split)
119    if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
120        _create_h5_data(path, split)
121
122    h5_paths = natsorted(glob(os.path.join(h5_dir, "*.h5")))
123    assert len(h5_paths) > 0, f"No data found for split '{split}'"
124
125    return h5_paths

Get paths to the BCCD data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. One of 'train' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the h5 data.

def get_bccd_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'] = 'train', segmentation_type: Literal['instances', 'semantic'] = 'instances', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_bccd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the BCCD segmentation dataset for blood cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    supported_types = ("instances", "semantic")
    assert segmentation_type in supported_types, \
        f"'{segmentation_type}' is not valid. Choose from 'instances' or 'semantic'."

    # Raw data and labels live in the same h5 files, so one list serves both.
    volume_paths = get_bccd_paths(path, split, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=np.int64,
    )
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{segmentation_type}",
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )

Get the BCCD dataset for blood cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. One of 'train' or 'test'.
  • segmentation_type: The type of segmentation labels to use. One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_bccd_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'] = 'train', segmentation_type: Literal['instances', 'semantic'] = 'instances', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_bccd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the BCCD dataloader for blood cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    return torch_em.get_data_loader(
        dataset=get_bccd_dataset(
            path=path,
            patch_shape=patch_shape,
            split=split,
            segmentation_type=segmentation_type,
            download=download,
            **dataset_kwargs,
        ),
        batch_size=batch_size,
        **dataloader_kwargs,
    )

Get the BCCD dataloader for blood cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. One of 'train' or 'test'.
  • segmentation_type: The type of segmentation labels to use. One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.