torch_em.data.datasets.light_microscopy.bccd
The BCCD dataset contains annotations for blood cell segmentation in microscopy images of blood smears.
The dataset provides 1,328 images with corresponding segmentation masks. Instance segmentation labels are derived via connected components from the semantic masks.
The dataset is located at https://www.kaggle.com/datasets/jeetblahiri/bccd-dataset-with-mask (https://doi.org/10.34740/kaggle/dsv/6107556). Please cite it (the respective DOI above) if you use this dataset in your research.
"""The BCCD dataset contains annotations for blood cell segmentation
in microscopy images of blood smears.

The dataset provides 1,328 images with corresponding segmentation masks.
Instance segmentation labels are derived via connected components from the semantic masks.

The dataset is located at https://www.kaggle.com/datasets/jeetblahiri/bccd-dataset-with-mask
(https://doi.org/10.34740/kaggle/dsv/6107556)
Please cite it (the respective doi above) if you use this dataset in your research.
"""

import os
from glob import glob
from typing import Union, Literal, Tuple, List

import numpy as np
import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


def _create_h5_data(path, split):
    """Create h5 files with raw images, semantic masks and instance labels.

    For every `*.png` in `<path>/data/BCCD Dataset with mask/<split>/original` that has a mask of
    the same file name in the sibling `mask` folder, one h5 file is written to `<path>/h5_data/<split>`
    with the datasets `raw`, `labels/semantic` and `labels/instances`.
    Images whose h5 file already exists, or whose mask is missing, are skipped.

    Args:
        path: The root folder that contains the extracted data.
        split: The name of the data split (sub-folder) to convert.

    Returns:
        The filepath to the directory with the h5 files for the split.
    """
    import h5py
    from skimage.measure import label
    from tqdm import tqdm

    data_dir = os.path.join(path, "data", "BCCD Dataset with mask")
    h5_dir = os.path.join(path, "h5_data", split)
    os.makedirs(h5_dir, exist_ok=True)

    raw_dir = os.path.join(data_dir, split, "original")
    mask_dir = os.path.join(data_dir, split, "mask")

    raw_paths = sorted(glob(os.path.join(raw_dir, "*.png")))

    for raw_path in tqdm(raw_paths, desc=f"Creating h5 files for {split}"):
        fname = os.path.basename(raw_path)
        # Swap only the file extension; `str.replace` would also rewrite a ".png" inside the stem.
        h5_path = os.path.join(h5_dir, os.path.splitext(fname)[0] + ".h5")

        if os.path.exists(h5_path):
            continue

        mask_path = os.path.join(mask_dir, fname)
        if not os.path.exists(mask_path):
            continue

        raw = imageio.imread(raw_path)
        mask = imageio.imread(mask_path)

        # Convert mask to binary semantic segmentation
        if mask.ndim == 3:
            mask = mask[..., 0]  # Take first channel if RGB
        semantic = (mask > 0).astype("uint8")

        # Create instance labels via connected components
        instances = label(semantic).astype("int64")

        # Store raw as (C, H, W) if RGB
        if raw.ndim == 3:
            raw = raw.transpose(2, 0, 1)

        # Write to a temporary file first and move it into place afterwards. Otherwise an
        # interrupted write would leave an incomplete file at `h5_path`, which the existence
        # check above would then skip forever. The ".tmp" suffix keeps it out of "*.h5" globs.
        tmp_path = h5_path + ".tmp"
        try:
            with h5py.File(tmp_path, "w") as f:
                f.create_dataset("raw", data=raw, compression="gzip")
                f.create_dataset("labels/semantic", data=semantic, compression="gzip")
                f.create_dataset("labels/instances", data=instances, compression="gzip")
            os.replace(tmp_path, h5_path)
        finally:
            # Only present if writing or moving failed; after a successful move this is a no-op.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    return h5_dir


def get_bccd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the BCCD dataset.

    Nothing is downloaded if the extracted data folder already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    extracted_root = os.path.join(path, "data")
    data_dir = os.path.join(extracted_root, r"BCCD Dataset with mask")

    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        util.download_source_kaggle(path=path, dataset_name="jeetblahiri/bccd-dataset-with-mask", download=download)
        archive = os.path.join(path, "bccd-dataset-with-mask.zip")
        util.unzip(zip_path=archive, dst=extracted_root)

    return data_dir


def get_bccd_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the BCCD data.

    The h5 files for the split are created on first use, i.e. when the h5 folder
    for the split is missing or does not contain any h5 file.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.
    """
    from natsort import natsorted

    assert split in ("train", "test"), f"'{split}' is not a valid split."

    get_bccd_data(path, download)

    h5_dir = os.path.join(path, "h5_data", split)
    h5_pattern = os.path.join(h5_dir, "*.h5")

    already_converted = os.path.exists(h5_dir) and len(glob(h5_pattern)) > 0
    if not already_converted:
        _create_h5_data(path, split)

    h5_paths = natsorted(glob(h5_pattern))
    assert len(h5_paths) > 0, f"No data found for split '{split}'"

    return h5_paths


def get_bccd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BCCD dataset for blood cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert segmentation_type in ("instances", "semantic"), \
        f"'{segmentation_type}' is not valid. Choose from 'instances' or 'semantic'."

    # Raw data and labels live in the same h5 files, so one list serves both.
    volume_paths = get_bccd_paths(path, split, download)

    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True, label_dtype=np.int64)
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{segmentation_type}",
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )


def get_bccd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the BCCD dataloader for blood cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_bccd_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        segmentation_type=segmentation_type,
        download=download,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **dataloader_kwargs)
def get_bccd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the BCCD dataset.

    Nothing is downloaded if the extracted data folder already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    extracted_root = os.path.join(path, "data")
    data_dir = os.path.join(extracted_root, r"BCCD Dataset with mask")

    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        util.download_source_kaggle(path=path, dataset_name="jeetblahiri/bccd-dataset-with-mask", download=download)
        archive = os.path.join(path, "bccd-dataset-with-mask.zip")
        util.unzip(zip_path=archive, dst=extracted_root)

    return data_dir
Download the BCCD dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the directory with the data.
def get_bccd_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the BCCD data.

    The h5 files for the split are created on first use, i.e. when the h5 folder
    for the split is missing or does not contain any h5 file.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.
    """
    from natsort import natsorted

    assert split in ("train", "test"), f"'{split}' is not a valid split."

    get_bccd_data(path, download)

    h5_dir = os.path.join(path, "h5_data", split)
    h5_pattern = os.path.join(h5_dir, "*.h5")

    already_converted = os.path.exists(h5_dir) and len(glob(h5_pattern)) > 0
    if not already_converted:
        _create_h5_data(path, split)

    h5_paths = natsorted(glob(h5_pattern))
    assert len(h5_paths) > 0, f"No data found for split '{split}'"

    return h5_paths
Get paths to the BCCD data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. One of 'train' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the h5 data.
def get_bccd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BCCD dataset for blood cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert segmentation_type in ("instances", "semantic"), \
        f"'{segmentation_type}' is not valid. Choose from 'instances' or 'semantic'."

    # Raw data and labels live in the same h5 files, so one list serves both.
    volume_paths = get_bccd_paths(path, split, download)

    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True, label_dtype=np.int64)
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{segmentation_type}",
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )
Get the BCCD dataset for blood cell segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train' or 'test'.
- segmentation_type: The type of segmentation labels to use. One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_bccd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"] = "train",
    segmentation_type: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the BCCD dataloader for blood cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train' or 'test'.
        segmentation_type: The type of segmentation labels to use.
            One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_bccd_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        segmentation_type=segmentation_type,
        download=download,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **dataloader_kwargs)
Get the BCCD dataloader for blood cell segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train' or 'test'.
- segmentation_type: The type of segmentation labels to use. One of 'instances' (connected component instance labels) or 'semantic' (binary cell mask).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.