torch_em.data.datasets.light_microscopy.cisd
The CISD dataset contains 3,911 samples of touching or overlapping urothelial cells from digital cytology, with manually annotated instance segmentation masks.
The data comes from 30 cytology slides prepared from healthy patient urine samples and digitized with 21 focal planes. Two 2D image modes are supported:
- center_slice: Single best-focus 2D plane (JPG)
- edf: Extended Depth of Field — 21 planes merged into one focused 2D image (JPG)
NOTE: The raw dataset also provides a "stack" mode (all 21 focal planes per sample), but it is not supported here because the annotations are always 2D instance masks. A 3D stack with 2D-only labels cannot form a valid segmentation dataset.
Annotations are 2D instance masks stored in RLE format in CISD.json. Cell categories: RED_BLOOD_CELL, NEUTROPHIL, SUPERFICIAL, UROTHELIAL.
The dataset is located at https://zenodo.org/records/5938893. This dataset is from the publication https://doi.org/10.1109/ISBI52829.2022.9761495. Please cite it if you use this dataset in your research.
"""The CISD dataset contains 3,911 samples of touching or overlapping urothelial cells
from digital cytology, with manually annotated instance segmentation masks.

The data comes from 30 cytology slides prepared from healthy patient urine samples
and digitized with 21 focal planes. Two 2D image modes are supported:
- center_slice: Single best-focus 2D plane (JPG)
- edf: Extended Depth of Field — 21 planes merged into one focused 2D image (JPG)

NOTE: The raw dataset also provides a "stack" mode (all 21 focal planes per sample),
but it is not supported here because the annotations are always 2D instance masks.
A 3D stack with 2D-only labels cannot form a valid segmentation dataset.

Annotations are 2D instance masks stored in RLE format in CISD.json.
Cell categories: RED_BLOOD_CELL, NEUTROPHIL, SUPERFICIAL, UROTHELIAL.

The dataset is located at https://zenodo.org/records/5938893.
This dataset is from the publication https://doi.org/10.1109/ISBI52829.2022.9761495.
Please cite it if you use this dataset in your research.
"""

import os
import json
from glob import glob
from natsort import natsorted
from typing import List, Literal, Tuple, Union

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://zenodo.org/records/5938893/files/CISD.zip"
CHECKSUM = None


def get_cisd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CISD dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    data_dir = os.path.join(path, "CISD")
    # An existing "CISD" folder is taken as proof that the data was already extracted.
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, "CISD.zip")
    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
    util.unzip(zip_path, path)

    return data_dir


def _decode_rle(rle_counts, height, width):
    """Decode an uncompressed RLE mask (row-major order) to a 2D array.

    Args:
        rle_counts: The run lengths. Runs at even positions (0, 2, ...) stay 0,
            runs at odd positions (1, 3, ...) are set to 1.
        height: The number of rows of the decoded mask.
        width: The number of columns of the decoded mask.

    Returns:
        A uint8 array of shape (height, width) containing only 0 and 1.
    """
    flat = np.zeros(height * width, dtype=np.uint8)
    pos = 0
    for i, count in enumerate(rle_counts):
        # Odd-indexed runs are foreground; even-indexed runs only advance the cursor.
        if i % 2 == 1:
            flat[pos:pos + count] = 1
        pos += count
    return flat.reshape((height, width), order="C")


def _convert_annotations(data_dir: str, mode: str) -> str:
    """Convert CISD.json RLE masks to per-sample 2D TIFF label images.

    Reads image dimensions from the mask 'size' field — no raw images are loaded.
    Runs once; subsequent calls return the cached label directory immediately.

    Args:
        data_dir: The root CISD data directory (contains CISD.json).
        mode: One of "center_slice" or "edf".

    Returns:
        Path to the directory containing the generated label TIFFs.

    Raises:
        RuntimeError: If CISD.json is not found in `data_dir`.
    """
    import imageio.v3 as imageio
    from tqdm import tqdm

    label_dir = os.path.join(data_dir, f"{mode}_labels")
    # The conversion is treated as done as soon as at least one label TIFF exists.
    if os.path.exists(label_dir) and len(glob(os.path.join(label_dir, "*.tif"))) > 0:
        return label_dir

    # Validate the annotation file first, so that a missing file does not leave an empty folder behind.
    json_path = os.path.join(data_dir, "CISD.json")
    if not os.path.exists(json_path):
        raise RuntimeError(f"Annotation file not found: {json_path}")

    os.makedirs(label_dir, exist_ok=True)

    with open(json_path, "r") as f:
        data = json.load(f)

    assets = data["assets"]  # list of {"asset_id", "file_name", "annotations": [...]}

    for asset in tqdm(assets, desc=f"Converting CISD {mode} labels"):
        file_name = asset["file_name"]  # e.g. "0241_BB_01471.jpg"
        base_name = os.path.splitext(file_name)[0]  # e.g. "0241_BB_01471"
        anns = asset.get("annotations", [])

        # Get (H, W) from the first RLE size field — no image loading needed
        h, w = None, None
        for ann in anns:
            for item in ann.get("data", []):
                mask_info = item.get("mask", {})
                if "size" in mask_info:
                    h, w = mask_info["size"]
                    break
            if h is not None:
                break

        # Assets without any sized mask cannot be converted and get no label file.
        if h is None or w is None:
            continue

        label = np.zeros((h, w), dtype=np.int32)
        inst_id = 1
        for ann in anns:
            for item in ann.get("data", []):
                mask_info = item.get("mask", {})
                counts = mask_info.get("counts", [])
                size = mask_info.get("size", [h, w])
                if not counts:
                    continue
                mask = _decode_rle(counts, size[0], size[1])
                # Later instances overwrite earlier ones where masks overlap.
                label[mask > 0] = inst_id
                inst_id += 1

        out_path = os.path.join(label_dir, f"{base_name}.tif")
        imageio.imwrite(out_path, label)

    return label_dir


def _convert_raw_to_grayscale(data_dir: str, mode: str) -> str:
    """Convert RGB JPG images to grayscale TIFFs so shapes match the 2D label masks.

    Runs once; subsequent calls return the cached directory immediately.

    Args:
        data_dir: The root CISD data directory.
        mode: One of "center_slice" or "edf".

    Returns:
        Path to the directory containing the grayscale TIFFs.
    """
    import imageio.v3 as imageio
    from tqdm import tqdm

    gray_dir = os.path.join(data_dir, f"{mode}_gray")
    # The conversion is treated as done as soon as at least one grayscale TIFF exists.
    if os.path.exists(gray_dir) and len(glob(os.path.join(gray_dir, "*.tif"))) > 0:
        return gray_dir

    os.makedirs(gray_dir, exist_ok=True)

    # Weights for the R, G and B channels; built once instead of once per image.
    rgb_weights = np.array([0.2989, 0.5870, 0.1140])

    src_dir = os.path.join(data_dir, mode)
    jpg_paths = natsorted(glob(os.path.join(src_dir, "*.jpg")))
    for jpg_path in tqdm(jpg_paths, desc=f"Converting CISD {mode} to grayscale"):
        img = imageio.imread(jpg_path)
        # Only images with a channel axis are converted; other images are written unchanged.
        if img.ndim == 3:
            img = (img[..., :3] @ rgb_weights).astype(np.uint8)
        stem = os.path.splitext(os.path.basename(jpg_path))[0]
        imageio.imwrite(os.path.join(gray_dir, f"{stem}.tif"), img)

    return gray_dir


def get_cisd_paths(
    path: Union[os.PathLike, str],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the CISD data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        mode: The image format to use. One of "center_slice" (single best-focus 2D plane)
            or "edf" (Extended Depth of Field 2D composite).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `mode` is not one of the supported modes.
        RuntimeError: If the image directory is missing, if no image or label files
            are found, or if no image has a label with the same file stem.
    """
    if mode not in ("center_slice", "edf"):
        raise ValueError(f"Invalid mode '{mode}'. Choose 'center_slice' or 'edf'.")

    data_dir = get_cisd_data(path, download)

    img_dir = os.path.join(data_dir, mode)
    if not os.path.exists(img_dir):
        raise RuntimeError(
            f"Image directory for mode '{mode}' not found: {img_dir}. "
            "Expected modes: 'center_slice', 'edf'."
        )

    label_dir = _convert_annotations(data_dir, mode)
    raw_dir = _convert_raw_to_grayscale(data_dir, mode)
    raw_paths = natsorted(glob(os.path.join(raw_dir, "*.tif")))
    label_paths = natsorted(glob(os.path.join(label_dir, "*.tif")))

    if not raw_paths:
        raise RuntimeError(f"No image files found in {img_dir}.")
    if not label_paths:
        raise RuntimeError(f"No label files found in {label_dir}.")

    # Match by stem name: only samples that have both an image and a label are kept.
    raw_stems = {os.path.splitext(os.path.basename(p))[0]: p for p in raw_paths}
    label_stems = {os.path.splitext(os.path.basename(p))[0]: p for p in label_paths}
    common = natsorted(set(raw_stems) & set(label_stems))

    # Fail here with a clear message instead of handing empty lists to the dataset.
    if not common:
        raise RuntimeError(f"No matching image and label files found in {raw_dir} and {label_dir}.")

    raw_paths = [raw_stems[s] for s in common]
    label_paths = [label_stems[s] for s in common]

    return raw_paths, label_paths


def get_cisd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the CISD dataset for urothelial cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        mode: The image format to use. One of "center_slice" or "edf".
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_cisd_paths(path, mode, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )


def get_cisd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the CISD dataloader for urothelial cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        mode: The image format to use. One of "center_slice" or "edf".
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cisd_dataset(path, patch_shape, mode, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_cisd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the CISD data is available locally and return its location.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    extracted_dir = os.path.join(path, "CISD")

    # Only fetch and unpack the archive when the extracted folder is not there yet.
    if not os.path.exists(extracted_dir):
        os.makedirs(path, exist_ok=True)
        archive_path = os.path.join(path, "CISD.zip")
        util.download_source(archive_path, URL, download, checksum=CHECKSUM)
        util.unzip(archive_path, path)

    return extracted_dir
Download the CISD dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the extracted data directory.
def get_cisd_paths(
    path: Union[os.PathLike, str],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the CISD data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        mode: The image format to use. One of "center_slice" (single best-focus 2D plane)
            or "edf" (Extended Depth of Field 2D composite).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `mode` is not one of the supported modes.
        RuntimeError: If the image directory is missing, if no image or label files
            are found, or if no image has a label with the same file stem.
    """
    if mode not in ("center_slice", "edf"):
        raise ValueError(f"Invalid mode '{mode}'. Choose 'center_slice' or 'edf'.")

    data_dir = get_cisd_data(path, download)

    img_dir = os.path.join(data_dir, mode)
    if not os.path.exists(img_dir):
        raise RuntimeError(
            f"Image directory for mode '{mode}' not found: {img_dir}. "
            "Expected modes: 'center_slice', 'edf'."
        )

    label_dir = _convert_annotations(data_dir, mode)
    raw_dir = _convert_raw_to_grayscale(data_dir, mode)
    raw_paths = natsorted(glob(os.path.join(raw_dir, "*.tif")))
    label_paths = natsorted(glob(os.path.join(label_dir, "*.tif")))

    if not raw_paths:
        raise RuntimeError(f"No image files found in {img_dir}.")
    if not label_paths:
        raise RuntimeError(f"No label files found in {label_dir}.")

    # Match by stem name: only samples that have both an image and a label are kept.
    raw_stems = {os.path.splitext(os.path.basename(p))[0]: p for p in raw_paths}
    label_stems = {os.path.splitext(os.path.basename(p))[0]: p for p in label_paths}
    common = natsorted(set(raw_stems) & set(label_stems))

    # Fail here with a clear message instead of handing empty lists to the dataset.
    if not common:
        raise RuntimeError(f"No matching image and label files found in {raw_dir} and {label_dir}.")

    raw_paths = [raw_stems[s] for s in common]
    label_paths = [label_stems[s] for s in common]

    return raw_paths, label_paths
Get paths to the CISD data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- mode: The image format to use. One of "center_slice" (single best-focus 2D plane) or "edf" (Extended Depth of Field 2D composite).
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def get_cisd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the CISD segmentation dataset for urothelial cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        mode: The image format to use. One of "center_slice" or "edf".
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, label_files = get_cisd_paths(path=path, mode=mode, download=download)

    # Both keys are None: images and labels are passed as plain file paths.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=label_files,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
Get the CISD dataset for urothelial cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- mode: The image format to use. One of "center_slice" or "edf".
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_cisd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the CISD dataloader for urothelial cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        mode: The image format to use. One of "center_slice" or "edf".
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the extra keyword arguments into those for the dataset and those for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    cisd_dataset = get_cisd_dataset(
        path=path, patch_shape=patch_shape, mode=mode, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(cisd_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the CISD dataloader for urothelial cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- mode: The image format to use. One of "center_slice" or "edf".
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.