torch_em.data.datasets.medical.cholecseg8k

The CholecSeg8k dataset contains annotations for organ and instrument segmentation in endoscopy.

This dataset is located at https://www.kaggle.com/datasets/newslab/cholecseg8k/data. This dataset is from the publication https://doi.org/10.48550/arXiv.1602.03012. Please cite it if you use this data in a publication.

  1"""The CholecSeg8k dataset contains annotations for organs and instrument segmentation in endoscopy.
  2
  3This dataset is located at https://www.kaggle.com/datasets/newslab/cholecseg8k/data.
  4This dataset is from the publication https://doi.org/10.48550/arXiv.1602.03012.
  5Please cite it if you use this data in a publication.
  6"""
  7
  8import os
  9import shutil
 10from glob import glob
 11from tqdm import tqdm
 12from pathlib import Path
 13from natsort import natsorted
 14from typing import Tuple, Union, Literal, List
 15
 16import numpy as np
 17import imageio.v3 as imageio
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21import torch_em
 22
 23from .. import util
 24
 25
 26LABEL_MAPS = {
 27    (255, 255, 255): 0,  # small white frame around the image
 28    (50, 50, 50): 0,  # background
 29    (11, 11, 11): 1,  # abdominal wall
 30    (21, 21, 21): 2,  # liver
 31    (13, 13, 13): 3,  # gastrointestinal tract
 32    (12, 12, 12): 4,  # fat
 33    (31, 31, 31): 5,  # grasper
 34    (23, 23, 23): 6,  # connective tissue
 35    (24, 24, 24): 7,  # blood
 36    (25, 25, 25): 8,  # cystic dust
 37    (32, 32, 32): 9,  # l-hook electrocautery
 38    (22, 22, 22): 10,  # gallbladder
 39    (33, 33, 33): 11,  # hepatic vein
 40    (5, 5, 5): 12  # liver ligament
 41}
 42
 43
 44def get_cholecseg8k_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 45    """Get the CholecSeg8k data.
 46
 47    Args:
 48        path: Filepath to a folder where the data is downloaded for further processing.
 49        download: Whether to download the data if it is not present.
 50
 51    Returns:
 52        Filepath where the data is downloaded.
 53    """
 54    data_dir = os.path.join(path, "data")
 55    if os.path.exists(data_dir):
 56        return data_dir
 57
 58    os.makedirs(path, exist_ok=True)
 59
 60    zip_path = os.path.join(path, "cholecseg8k.zip")
 61    util.download_source_kaggle(path=zip_path, dataset_name="newslab/cholecseg8k", download=download)
 62    util.unzip(zip_path=zip_path, dst=data_dir)
 63
 64    return data_dir
 65
 66
 67def get_cholecseg8k_paths(
 68    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
 69) -> Tuple[List[str], List[str]]:
 70    """Get paths for the CholecSeg8k dataset.
 71
 72    Args:
 73        path: Filepath to a folder where the data is downloaded for further processing.
 74        split: The choice of data split.
 75        download: Whether to download the data if it is not present.
 76
 77    Returns:
 78        List of filepaths for the image data.
 79        List of filepaths for the label data.
 80    """
 81    data_dir = get_cholecseg8k_data(path, download)
 82
 83    video_dirs = natsorted(glob(os.path.join(data_dir, "video*")))
 84    if split == "train":
 85        video_dirs = video_dirs[2:-2]
 86    elif split == "val":
 87        video_dirs = [video_dirs[1], video_dirs[-2]]
 88    elif split == "test":
 89        video_dirs = [video_dirs[0], video_dirs[-1]]
 90    else:
 91        raise ValueError(f"'{split}' is not a valid split.")
 92
 93    ppdir = os.path.join(data_dir, "preprocessed", split)
 94    if os.path.exists(ppdir):
 95        _image_paths = natsorted(glob(os.path.join(ppdir, "images", "*")))
 96        _gt_paths = natsorted(glob(os.path.join(ppdir, "masks", "*")))
 97        return _image_paths, _gt_paths
 98
 99    os.makedirs(os.path.join(ppdir, "images"), exist_ok=True)
100    os.makedirs(os.path.join(ppdir, "masks"), exist_ok=True)
101
102    image_paths, gt_paths = [], []
103    for video_dir in tqdm(video_dirs):
104        org_image_paths = natsorted(glob(os.path.join(video_dir, "video*", "*_endo.png")))
105        org_gt_paths = natsorted(glob(os.path.join(video_dir, "video*", "*_endo_watershed_mask.png")))
106
107        for org_image_path, org_gt_path in zip(org_image_paths, org_gt_paths):
108            image_id = os.path.split(org_image_path)[-1]
109
110            image_path = os.path.join(ppdir, "images", image_id)
111            gt_path = os.path.join(ppdir, "masks", Path(image_id).with_suffix(".tif"))
112
113            image_paths.append(image_path)
114            gt_paths.append(gt_path)
115
116            if os.path.exists(image_path) and os.path.exists(gt_path):
117                continue
118
119            gt = imageio.imread(org_gt_path)
120            assert gt.ndim == 3
121            if gt.shape[-1] != 3:  # some labels have a 4th channel which has all values as 255
122                print("Found a label with inconsistent format.")
123                # let's verify the case
124                assert np.unique(gt[..., -1]) == 255
125                gt = gt[..., :3]
126
127            instances = np.zeros(gt.shape[:2])
128            for lmap in LABEL_MAPS:
129                binary_map = (gt == lmap).all(axis=2)
130                instances[binary_map > 0] = LABEL_MAPS[lmap]
131
132            shutil.copy(src=org_image_path, dst=image_path)
133            imageio.imwrite(gt_path, instances, compression="zlib")
134
135    return image_paths, gt_paths
136
137
def get_cholecseg8k_dataset(
    path: Union[str, os.PathLike],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the CholecSeg8k dataset for organ and instrument segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_cholecseg8k_paths(path, split, download)

    if resize_inputs:
        # The helper returns the updated keyword arguments together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_files,
        raw_key=None,
        label_paths=label_files,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
176
177
def get_cholecseg8k_loader(
    path: Union[str, os.PathLike],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the CholecSeg8k dataloader for organ and instrument segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    segmentation_dataset = get_cholecseg8k_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    loader = torch_em.get_data_loader(segmentation_dataset, batch_size, **dataloader_kwargs)
    return loader
# RGB color in the watershed masks -> semantic class id.
LABEL_MAPS = {
    (255, 255, 255): 0,  # small white frame around the image
    (50, 50, 50): 0,  # background
    (11, 11, 11): 1,  # abdominal wall
    (21, 21, 21): 2,  # liver
    (13, 13, 13): 3,  # gastrointestinal tract
    (12, 12, 12): 4,  # fat
    (31, 31, 31): 5,  # grasper
    (23, 23, 23): 6,  # connective tissue
    (24, 24, 24): 7,  # blood
    (25, 25, 25): 8,  # cystic duct
    (32, 32, 32): 9,  # l-hook electrocautery
    (22, 22, 22): 10,  # gallbladder
    (33, 33, 33): 11,  # hepatic vein
    (5, 5, 5): 12,  # liver ligament
}
def get_cholecseg8k_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_cholecseg8k_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the CholecSeg8k data is available and return its location.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath of the `data` folder inside `path` that holds the extracted data.
    """
    extracted_dir = os.path.join(path, "data")

    # Only fetch and unpack the archive when the extracted folder is missing.
    if not os.path.exists(extracted_dir):
        os.makedirs(path, exist_ok=True)

        archive_path = os.path.join(path, "cholecseg8k.zip")
        # NOTE(review): the helper receives the path of the zip file itself - confirm that
        # `util.download_source_kaggle` expects the archive path rather than the target folder.
        util.download_source_kaggle(path=archive_path, dataset_name="newslab/cholecseg8k", download=download)
        util.unzip(zip_path=archive_path, dst=extracted_dir)

    return extracted_dir

Get the CholecSeg8k data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is downloaded.

def get_cholecseg8k_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[str], List[str]]:
 68def get_cholecseg8k_paths(
 69    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
 70) -> Tuple[List[str], List[str]]:
 71    """Get paths for the CholecSeg8k dataset.
 72
 73    Args:
 74        path: Filepath to a folder where the data is downloaded for further processing.
 75        split: The choice of data split.
 76        download: Whether to download the data if it is not present.
 77
 78    Returns:
 79        List of filepaths for the image data.
 80        List of filepaths for the label data.
 81    """
 82    data_dir = get_cholecseg8k_data(path, download)
 83
 84    video_dirs = natsorted(glob(os.path.join(data_dir, "video*")))
 85    if split == "train":
 86        video_dirs = video_dirs[2:-2]
 87    elif split == "val":
 88        video_dirs = [video_dirs[1], video_dirs[-2]]
 89    elif split == "test":
 90        video_dirs = [video_dirs[0], video_dirs[-1]]
 91    else:
 92        raise ValueError(f"'{split}' is not a valid split.")
 93
 94    ppdir = os.path.join(data_dir, "preprocessed", split)
 95    if os.path.exists(ppdir):
 96        _image_paths = natsorted(glob(os.path.join(ppdir, "images", "*")))
 97        _gt_paths = natsorted(glob(os.path.join(ppdir, "masks", "*")))
 98        return _image_paths, _gt_paths
 99
100    os.makedirs(os.path.join(ppdir, "images"), exist_ok=True)
101    os.makedirs(os.path.join(ppdir, "masks"), exist_ok=True)
102
103    image_paths, gt_paths = [], []
104    for video_dir in tqdm(video_dirs):
105        org_image_paths = natsorted(glob(os.path.join(video_dir, "video*", "*_endo.png")))
106        org_gt_paths = natsorted(glob(os.path.join(video_dir, "video*", "*_endo_watershed_mask.png")))
107
108        for org_image_path, org_gt_path in zip(org_image_paths, org_gt_paths):
109            image_id = os.path.split(org_image_path)[-1]
110
111            image_path = os.path.join(ppdir, "images", image_id)
112            gt_path = os.path.join(ppdir, "masks", Path(image_id).with_suffix(".tif"))
113
114            image_paths.append(image_path)
115            gt_paths.append(gt_path)
116
117            if os.path.exists(image_path) and os.path.exists(gt_path):
118                continue
119
120            gt = imageio.imread(org_gt_path)
121            assert gt.ndim == 3
122            if gt.shape[-1] != 3:  # some labels have a 4th channel which has all values as 255
123                print("Found a label with inconsistent format.")
124                # let's verify the case
125                assert np.unique(gt[..., -1]) == 255
126                gt = gt[..., :3]
127
128            instances = np.zeros(gt.shape[:2])
129            for lmap in LABEL_MAPS:
130                binary_map = (gt == lmap).all(axis=2)
131                instances[binary_map > 0] = LABEL_MAPS[lmap]
132
133            shutil.copy(src=org_image_path, dst=image_path)
134            imageio.imwrite(gt_path, instances, compression="zlib")
135
136    return image_paths, gt_paths

Get paths for the CholecSeg8k dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_cholecseg8k_dataset( path: Union[str, os.PathLike], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cholecseg8k_dataset(
    path: Union[str, os.PathLike],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the CholecSeg8k dataset for organ and instrument segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_cholecseg8k_paths(path, split, download)

    if resize_inputs:
        # The helper returns the updated keyword arguments together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_files,
        raw_key=None,
        label_paths=label_files,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset

Get the CholecSeg8k dataset for organ and instrument segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cholecseg8k_loader( path: Union[str, os.PathLike], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cholecseg8k_loader(
    path: Union[str, os.PathLike],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the CholecSeg8k dataloader for organ and instrument segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    segmentation_dataset = get_cholecseg8k_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    loader = torch_em.get_data_loader(segmentation_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the CholecSeg8k dataloader for organ and instrument segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.