torch_em.data.datasets.light_microscopy.cisd

The CISD dataset contains 3,911 samples of touching or overlapping urothelial cells from digital cytology, with manually annotated instance segmentation masks.

The data comes from 30 cytology slides prepared from healthy patient urine samples and digitized with 21 focal planes. Two 2D image modes are supported:

center_slice: Single best-focus 2D plane (JPG)
edf: Extended Depth of Field - 21 planes merged into one focused 2D image (JPG)

NOTE: The raw dataset also provides a "stack" mode (all 21 focal planes per sample), but it is not supported here because the annotations are always 2D instance masks. A 3D stack with 2D-only labels cannot form a valid segmentation dataset.

Annotations are 2D instance masks stored in RLE format in CISD.json. Cell categories: RED_BLOOD_CELL, NEUTROPHIL, SUPERFICIAL, UROTHELIAL.

The dataset is located at https://zenodo.org/records/5938893. This dataset is from the publication https://doi.org/10.1109/ISBI52829.2022.9761495. Please cite it if you use this dataset in your research.

View Source

  1"""The CISD dataset contains 3,911 samples of touching or overlapping urothelial cells
  2from digital cytology, with manually annotated instance segmentation masks.
  3
  4The data comes from 30 cytology slides prepared from healthy patient urine samples
  5and digitized with 21 focal planes. Two 2D image modes are supported:
  6- center_slice: Single best-focus 2D plane (JPG)
  7- edf: Extended Depth of Field - 21 planes merged into one focused 2D image (JPG)
  8
  9NOTE: The raw dataset also provides a "stack" mode (all 21 focal planes per sample),
 10but it is not supported here because the annotations are always 2D instance masks.
 11A 3D stack with 2D-only labels cannot form a valid segmentation dataset.
 12
 13Annotations are 2D instance masks stored in RLE format in CISD.json.
 14Cell categories: RED_BLOOD_CELL, NEUTROPHIL, SUPERFICIAL, UROTHELIAL.
 15
 16The dataset is located at https://zenodo.org/records/5938893.
 17This dataset is from the publication https://doi.org/10.1109/ISBI52829.2022.9761495.
 18Please cite it if you use this dataset in your research.
 19"""
 20
 21import os
 22import json
 23from glob import glob
 24from natsort import natsorted
 25from typing import List, Literal, Tuple, Union
 26
 27import numpy as np
 28
 29from torch.utils.data import Dataset, DataLoader
 30
 31import torch_em
 32
 33from .. import util
 34
 35
# Direct link to the zipped CISD archive hosted on Zenodo.
URL = "https://zenodo.org/records/5938893/files/CISD.zip"
# No reference checksum is recorded for the archive.
# NOTE(review): presumably `util.download_source` skips verification when the
# checksum is None - confirm, and consider pinning a hash of the archive.
CHECKSUM = None
 38
 39
 40def get_cisd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 41    """Download the CISD dataset.
 42
 43    Args:
 44        path: Filepath to a folder where the downloaded data will be saved.
 45        download: Whether to download the data if it is not present.
 46
 47    Returns:
 48        The filepath to the extracted data directory.
 49    """
 50    data_dir = os.path.join(path, "CISD")
 51    if os.path.exists(data_dir):
 52        return data_dir
 53
 54    os.makedirs(path, exist_ok=True)
 55    zip_path = os.path.join(path, "CISD.zip")
 56    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
 57    util.unzip(zip_path, path)
 58
 59    return data_dir
 60
 61
 62def _decode_rle(rle_counts, height, width):
 63    """Decode an uncompressed RLE mask (row-major order) to a 2D array."""
 64    flat = np.zeros(height * width, dtype=np.uint8)
 65    pos = 0
 66    for i, count in enumerate(rle_counts):
 67        if i % 2 == 1:
 68            flat[pos:pos + count] = 1
 69        pos += count
 70    return flat.reshape((height, width), order="C")
 71
 72
 73def _convert_annotations(data_dir: str, mode: str) -> str:
 74    """Convert CISD.json RLE masks to per-sample 2D TIFF label images.
 75
 76    Reads image dimensions from the mask 'size' field - no raw images are loaded.
 77    Runs once; subsequent calls return the cached label directory immediately.
 78
 79    Args:
 80        data_dir: The root CISD data directory (contains CISD.json).
 81        mode: One of "center_slice" or "edf".
 82
 83    Returns:
 84        Path to the directory containing the generated label TIFFs.
 85    """
 86    import imageio.v3 as imageio
 87    from tqdm import tqdm
 88
 89    label_dir = os.path.join(data_dir, f"{mode}_labels")
 90    if os.path.exists(label_dir) and len(glob(os.path.join(label_dir, "*.tif"))) > 0:
 91        return label_dir
 92
 93    os.makedirs(label_dir, exist_ok=True)
 94
 95    json_path = os.path.join(data_dir, "CISD.json")
 96    if not os.path.exists(json_path):
 97        raise RuntimeError(f"Annotation file not found: {json_path}")
 98
 99    with open(json_path, "r") as f:
100        data = json.load(f)
101
102    assets = data["assets"]  # list of {"asset_id", "file_name", "annotations": [...]}
103
104    for asset in tqdm(assets, desc=f"Converting CISD {mode} labels"):
105        file_name = asset["file_name"]               # e.g. "0241_BB_01471.jpg"
106        base_name = os.path.splitext(file_name)[0]   # e.g. "0241_BB_01471"
107        anns = asset.get("annotations", [])
108
109        # Get (H, W) from the first RLE size field - no image loading needed
110        h, w = None, None
111        for ann in anns:
112            for item in ann.get("data", []):
113                mask_info = item.get("mask", {})
114                if "size" in mask_info:
115                    h, w = mask_info["size"]
116                    break
117            if h is not None:
118                break
119
120        if h is None or w is None:
121            continue
122
123        label = np.zeros((h, w), dtype=np.int32)
124        inst_id = 1
125        for ann in anns:
126            for item in ann.get("data", []):
127                mask_info = item.get("mask", {})
128                counts = mask_info.get("counts", [])
129                size = mask_info.get("size", [h, w])
130                if not counts:
131                    continue
132                mask = _decode_rle(counts, size[0], size[1])
133                label[mask > 0] = inst_id
134                inst_id += 1
135
136        out_path = os.path.join(label_dir, f"{base_name}.tif")
137        imageio.imwrite(out_path, label)
138
139    return label_dir
140
141
def _convert_raw_to_grayscale(data_dir: str, mode: str) -> str:
    """Write grayscale TIFF copies of the JPG images so they are 2D like the label masks.

    Images with a channel axis are reduced to a weighted sum of their first three
    channels (weights 0.2989, 0.5870, 0.1140) and cast to uint8; images that are
    already 2D are written unchanged.

    Runs once; subsequent calls return the cached directory immediately.

    Args:
        data_dir: The root CISD data directory.
        mode: One of "center_slice" or "edf".

    Returns:
        Path to the directory containing the grayscale TIFFs.
    """
    import imageio.v3 as imageio
    from tqdm import tqdm

    gray_dir = os.path.join(data_dir, f"{mode}_gray")
    has_cached_tiffs = os.path.exists(gray_dir) and len(glob(os.path.join(gray_dir, "*.tif"))) > 0
    if has_cached_tiffs:
        return gray_dir

    os.makedirs(gray_dir, exist_ok=True)

    channel_weights = np.array([0.2989, 0.5870, 0.1140])
    jpg_paths = natsorted(glob(os.path.join(data_dir, mode, "*.jpg")))

    for jpg_path in tqdm(jpg_paths, desc=f"Converting CISD {mode} to grayscale"):
        image = imageio.imread(jpg_path)
        if image.ndim == 3:
            rgb = image[..., :3]  # ignore any channel beyond the first three
            image = (rgb @ channel_weights).astype(np.uint8)
        stem, _ = os.path.splitext(os.path.basename(jpg_path))
        imageio.imwrite(os.path.join(gray_dir, f"{stem}.tif"), image)

    return gray_dir
172
173
def get_cisd_paths(
    path: Union[os.PathLike, str],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the CISD data.

    The returned images are the grayscale TIFFs produced by `_convert_raw_to_grayscale`
    and the labels are the TIFFs produced by `_convert_annotations`. Only samples for
    which both files exist are returned, paired by file stem and naturally sorted.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        mode: The image format to use. One of "center_slice" (single best-focus 2D plane)
            or "edf" (Extended Depth of Field 2D composite).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `mode` is not "center_slice" or "edf".
        RuntimeError: If the image directory for `mode` is missing, if no images or
            labels are found, or if no image has a label with the same file stem.
    """
    if mode not in ("center_slice", "edf"):
        raise ValueError(f"Invalid mode '{mode}'. Choose 'center_slice' or 'edf'.")

    data_dir = get_cisd_data(path, download)

    img_dir = os.path.join(data_dir, mode)
    if not os.path.exists(img_dir):
        raise RuntimeError(
            f"Image directory for mode '{mode}' not found: {img_dir}. "
            "Expected modes: 'center_slice', 'edf'."
        )

    label_dir = _convert_annotations(data_dir, mode)
    raw_dir = _convert_raw_to_grayscale(data_dir, mode)
    raw_paths = natsorted(glob(os.path.join(raw_dir, "*.tif")))
    label_paths = natsorted(glob(os.path.join(label_dir, "*.tif")))

    if len(raw_paths) == 0:
        raise RuntimeError(f"No image files found in {img_dir}.")
    if len(label_paths) == 0:
        raise RuntimeError(f"No label files found in {label_dir}.")

    # Match by stem name
    raw_stems = {os.path.splitext(os.path.basename(p))[0]: p for p in raw_paths}
    label_stems = {os.path.splitext(os.path.basename(p))[0]: p for p in label_paths}
    common = natsorted(set(raw_stems) & set(label_stems))

    # Fail loudly instead of handing two empty lists to the dataset constructor.
    if len(common) == 0:
        raise RuntimeError(
            f"No matching image/label pairs found between {raw_dir} and {label_dir}."
        )

    raw_paths = [raw_stems[s] for s in common]
    label_paths = [label_stems[s] for s in common]

    return raw_paths, label_paths
222
223
def get_cisd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the CISD segmentation dataset for urothelial cell instance segmentation.

    Resolves the image and label files with `get_cisd_paths` and forwards them to
    `torch_em.default_segmentation_dataset` with no internal dataset keys.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        mode: The image format to use. One of "center_slice" or "edf".
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_cisd_paths(path=path, mode=mode, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
253
254
def get_cisd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the CISD dataloader for urothelial cell instance segmentation.

    The extra keyword arguments are divided by `util.split_kwargs` into those for
    the dataset and those for the loader.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        mode: The image format to use. One of "center_slice" or "edf".
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_cisd_dataset(path, patch_shape, mode, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs,
    )

URL = 'https://zenodo.org/records/5938893/files/CISD.zip'

CHECKSUM = None

def get_cisd_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_cisd_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the CISD data is available locally and return its location.

    When the extracted "CISD" folder is missing under `path`, the archive is fetched
    with `util.download_source` and unpacked with `util.unzip`. An existing folder is
    returned as-is without any download.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    data_dir = os.path.join(path, "CISD")

    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        archive_path = os.path.join(path, "CISD.zip")
        util.download_source(archive_path, URL, download, checksum=CHECKSUM)
        util.unzip(archive_path, path)

    return data_dir

Download the CISD dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The filepath to the extracted data directory.

def get_cisd_paths( path: Union[os.PathLike, str], mode: Literal['center_slice', 'edf'] = 'center_slice', download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_cisd_paths(
    path: Union[os.PathLike, str],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the CISD data.

    The returned images are the grayscale TIFFs produced by `_convert_raw_to_grayscale`
    and the labels are the TIFFs produced by `_convert_annotations`. Only samples for
    which both files exist are returned, paired by file stem and naturally sorted.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        mode: The image format to use. One of "center_slice" (single best-focus 2D plane)
            or "edf" (Extended Depth of Field 2D composite).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `mode` is not "center_slice" or "edf".
        RuntimeError: If the image directory for `mode` is missing, if no images or
            labels are found, or if no image has a label with the same file stem.
    """
    if mode not in ("center_slice", "edf"):
        raise ValueError(f"Invalid mode '{mode}'. Choose 'center_slice' or 'edf'.")

    data_dir = get_cisd_data(path, download)

    img_dir = os.path.join(data_dir, mode)
    if not os.path.exists(img_dir):
        raise RuntimeError(
            f"Image directory for mode '{mode}' not found: {img_dir}. "
            "Expected modes: 'center_slice', 'edf'."
        )

    label_dir = _convert_annotations(data_dir, mode)
    raw_dir = _convert_raw_to_grayscale(data_dir, mode)
    raw_paths = natsorted(glob(os.path.join(raw_dir, "*.tif")))
    label_paths = natsorted(glob(os.path.join(label_dir, "*.tif")))

    if len(raw_paths) == 0:
        raise RuntimeError(f"No image files found in {img_dir}.")
    if len(label_paths) == 0:
        raise RuntimeError(f"No label files found in {label_dir}.")

    # Match by stem name
    raw_stems = {os.path.splitext(os.path.basename(p))[0]: p for p in raw_paths}
    label_stems = {os.path.splitext(os.path.basename(p))[0]: p for p in label_paths}
    common = natsorted(set(raw_stems) & set(label_stems))

    # Fail loudly instead of handing two empty lists to the dataset constructor.
    if len(common) == 0:
        raise RuntimeError(
            f"No matching image/label pairs found between {raw_dir} and {label_dir}."
        )

    raw_paths = [raw_stems[s] for s in common]
    label_paths = [label_stems[s] for s in common]

    return raw_paths, label_paths

Get paths to the CISD data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
mode: The image format to use. One of "center_slice" (single best-focus 2D plane) or "edf" (Extended Depth of Field 2D composite).
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_cisd_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], mode: Literal['center_slice', 'edf'] = 'center_slice', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_cisd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the CISD segmentation dataset for urothelial cell instance segmentation.

    Resolves the image and label files with `get_cisd_paths` and forwards them to
    `torch_em.default_segmentation_dataset` with no internal dataset keys.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        mode: The image format to use. One of "center_slice" or "edf".
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_cisd_paths(path=path, mode=mode, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset

Get the CISD dataset for urothelial cell instance segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
mode: The image format to use. One of "center_slice" or "edf".
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_cisd_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], mode: Literal['center_slice', 'edf'] = 'center_slice', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_cisd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    mode: Literal["center_slice", "edf"] = "center_slice",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the CISD dataloader for urothelial cell instance segmentation.

    The extra keyword arguments are divided by `util.split_kwargs` into those for
    the dataset and those for the loader.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        mode: The image format to use. One of "center_slice" or "edf".
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_cisd_dataset(path, patch_shape, mode, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs,
    )

Get the CISD dataloader for urothelial cell instance segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
mode: The image format to use. One of "center_slice" or "edf".
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.