torch_em.data.datasets.medical.cholecseg8k
The CholecSeg8k dataset contains annotations for organs and instrument segmentation in endoscopy.
This dataset is located at https://www.kaggle.com/datasets/newslab/cholecseg8k/data. This dataset is from the publication https://doi.org/10.48550/arXiv.1602.03012. Please cite it if you use this data in a publication.
"""The CholecSeg8k dataset contains annotations for organs and instrument segmentation in endoscopy.

This dataset is located at https://www.kaggle.com/datasets/newslab/cholecseg8k/data.
This dataset is from the publication https://doi.org/10.48550/arXiv.1602.03012.
Please cite it if you use this data in a publication.
"""

import os
import shutil
from glob import glob
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
from typing import Tuple, Union, Literal, List

import numpy as np
import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# Maps the (R, G, B) value used in the watershed masks to the class id written to the label images.
LABEL_MAPS = {
    (255, 255, 255): 0,  # small white frame around the image
    (50, 50, 50): 0,  # background
    (11, 11, 11): 1,  # abdominal wall
    (21, 21, 21): 2,  # liver
    (13, 13, 13): 3,  # gastrointestinal tract
    (12, 12, 12): 4,  # fat
    (31, 31, 31): 5,  # grasper
    (23, 23, 23): 6,  # connective tissue
    (24, 24, 24): 7,  # blood
    (25, 25, 25): 8,  # cystic duct
    (32, 32, 32): 9,  # l-hook electrocautery
    (22, 22, 22): 10,  # gallbladder
    (33, 33, 33): 11,  # hepatic vein
    (5, 5, 5): 12,  # liver ligament
}


def get_cholecseg8k_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Get the CholecSeg8k data.

    If `<path>/data` already exists it is returned as is. Otherwise the archive is fetched with
    `util.download_source_kaggle` and extracted to `<path>/data` with `util.unzip`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, "data")

    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)

        archive = os.path.join(path, "cholecseg8k.zip")
        util.download_source_kaggle(path=archive, dataset_name="newslab/cholecseg8k", download=download)
        util.unzip(zip_path=archive, dst=data_dir)

    return data_dir


def get_cholecseg8k_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths for the CholecSeg8k dataset.

    The first call for a split copies the frames and converts the color-coded masks to
    single-channel label images (ids taken from `LABEL_MAPS`) in `<path>/data/preprocessed/<split>`.
    If that folder already exists, its content is returned without further checks.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not valid or a mask has an unexpected channel layout.
        RuntimeError: If too few video folders are found or frames and masks do not pair up.
    """
    # Fail on a bad split before any download or file system work is triggered.
    if split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split.")

    data_dir = get_cholecseg8k_data(path, download)

    ppdir = os.path.join(data_dir, "preprocessed", split)
    if os.path.exists(ppdir):
        _image_paths = natsorted(glob(os.path.join(ppdir, "images", "*")))
        _gt_paths = natsorted(glob(os.path.join(ppdir, "masks", "*")))
        return _image_paths, _gt_paths

    video_dirs = natsorted(glob(os.path.join(data_dir, "video*")))
    if len(video_dirs) < 5:
        raise RuntimeError(
            f"Expected at least 5 video folders in '{data_dir}' to build disjoint splits, found {len(video_dirs)}."
        )

    # First and last video are used for testing, the second and second-to-last for validation,
    # everything in between for training.
    if split == "train":
        video_dirs = video_dirs[2:-2]
    elif split == "val":
        video_dirs = [video_dirs[1], video_dirs[-2]]
    else:
        video_dirs = [video_dirs[0], video_dirs[-1]]

    # The outputs are built in a staging folder which is renamed to `ppdir` only once everything is
    # converted. An interrupted run then never leaves a half-filled `ppdir` that would be mistaken
    # for a complete cache; the next call resumes from the staging folder instead.
    staging_dir = ppdir + ".partial"
    os.makedirs(os.path.join(staging_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(staging_dir, "masks"), exist_ok=True)

    image_paths, gt_paths = [], []
    for video_dir in tqdm(video_dirs):
        org_image_paths = natsorted(glob(os.path.join(video_dir, "video*", "*_endo.png")))
        org_gt_paths = natsorted(glob(os.path.join(video_dir, "video*", "*_endo_watershed_mask.png")))
        if len(org_image_paths) != len(org_gt_paths):
            # `zip` would silently drop the surplus files and could pair frames with the wrong mask.
            raise RuntimeError(
                f"Found {len(org_image_paths)} frames but {len(org_gt_paths)} masks in '{video_dir}'."
            )

        for org_image_path, org_gt_path in zip(org_image_paths, org_gt_paths):
            # NOTE(review): only the file name is used as identifier. If frame names repeat across
            # videos, the outputs of different videos would collide - confirm against the data.
            image_id = os.path.basename(org_image_path)
            mask_id = str(Path(image_id).with_suffix(".tif"))

            # The returned paths point to the final location, the files are written to the staging one.
            image_paths.append(os.path.join(ppdir, "images", image_id))
            gt_paths.append(os.path.join(ppdir, "masks", mask_id))

            staged_image_path = os.path.join(staging_dir, "images", image_id)
            staged_gt_path = os.path.join(staging_dir, "masks", mask_id)
            if os.path.exists(staged_image_path) and os.path.exists(staged_gt_path):
                continue  # already converted by a previous (interrupted) run

            gt = imageio.imread(org_gt_path)
            if gt.ndim != 3 or gt.shape[-1] not in (3, 4):
                raise ValueError(f"Expected an RGB(A) mask, got shape {gt.shape} for '{org_gt_path}'.")

            if gt.shape[-1] == 4:  # some labels have a 4th channel which has all values as 255
                print("Found a label with inconsistent format.")
                # The extra channel is only dropped if it is indeed constant.
                if not (gt[..., -1] == 255).all():
                    raise ValueError(f"The 4th channel of '{org_gt_path}' is not constantly 255.")
                gt = gt[..., :3]

            # Pixels whose color is not listed in LABEL_MAPS stay 0.
            instances = np.zeros(gt.shape[:2])
            for color, label_id in LABEL_MAPS.items():
                instances[(gt == color).all(axis=2)] = label_id

            shutil.copy(src=org_image_path, dst=staged_image_path)
            imageio.imwrite(staged_gt_path, instances, compression="zlib")

    os.replace(staging_dir, ppdir)

    return image_paths, gt_paths


def get_cholecseg8k_dataset(
    path: Union[str, os.PathLike],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CholecSeg8k dataset for organ and instrument segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_cholecseg8k_paths(path, split, download)

    if resize_inputs:
        # The helper returns the updated dataset kwargs and the patch shape to use with them.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset


def get_cholecseg8k_loader(
    path: Union[str, os.PathLike],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CholecSeg8k dataloader for organ and instrument segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cholecseg8k_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
# Maps the (R, G, B) value used in the watershed masks to the class id written to the label images.
LABEL_MAPS = {
    (255, 255, 255): 0,  # small white frame around the image
    (50, 50, 50): 0,  # background
    (11, 11, 11): 1,  # abdominal wall
    (21, 21, 21): 2,  # liver
    (13, 13, 13): 3,  # gastrointestinal tract
    (12, 12, 12): 4,  # fat
    (31, 31, 31): 5,  # grasper
    (23, 23, 23): 6,  # connective tissue
    (24, 24, 24): 7,  # blood
    (25, 25, 25): 8,  # cystic duct
    (32, 32, 32): 9,  # l-hook electrocautery
    (22, 22, 22): 10,  # gallbladder
    (33, 33, 33): 11,  # hepatic vein
    (5, 5, 5): 12,  # liver ligament
}
def
get_cholecseg8k_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_cholecseg8k_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Get the CholecSeg8k data.

    If `<path>/data` already exists it is returned as is. Otherwise the archive is fetched with
    `util.download_source_kaggle` and extracted to `<path>/data` with `util.unzip`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, "data")

    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)

        archive = os.path.join(path, "cholecseg8k.zip")
        util.download_source_kaggle(path=archive, dataset_name="newslab/cholecseg8k", download=download)
        util.unzip(zip_path=archive, dst=data_dir)

    return data_dir
Get the CholecSeg8k data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is downloaded.
def
get_cholecseg8k_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[str], List[str]]:
def get_cholecseg8k_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths for the CholecSeg8k dataset.

    The first call for a split copies the frames and converts the color-coded masks to
    single-channel label images (ids taken from `LABEL_MAPS`) in `<path>/data/preprocessed/<split>`.
    If that folder already exists, its content is returned without further checks.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not valid or a mask has an unexpected channel layout.
        RuntimeError: If too few video folders are found or frames and masks do not pair up.
    """
    # Fail on a bad split before any download or file system work is triggered.
    if split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split.")

    data_dir = get_cholecseg8k_data(path, download)

    ppdir = os.path.join(data_dir, "preprocessed", split)
    if os.path.exists(ppdir):
        _image_paths = natsorted(glob(os.path.join(ppdir, "images", "*")))
        _gt_paths = natsorted(glob(os.path.join(ppdir, "masks", "*")))
        return _image_paths, _gt_paths

    video_dirs = natsorted(glob(os.path.join(data_dir, "video*")))
    if len(video_dirs) < 5:
        raise RuntimeError(
            f"Expected at least 5 video folders in '{data_dir}' to build disjoint splits, found {len(video_dirs)}."
        )

    # First and last video are used for testing, the second and second-to-last for validation,
    # everything in between for training.
    if split == "train":
        video_dirs = video_dirs[2:-2]
    elif split == "val":
        video_dirs = [video_dirs[1], video_dirs[-2]]
    else:
        video_dirs = [video_dirs[0], video_dirs[-1]]

    # The outputs are built in a staging folder which is renamed to `ppdir` only once everything is
    # converted. An interrupted run then never leaves a half-filled `ppdir` that would be mistaken
    # for a complete cache; the next call resumes from the staging folder instead.
    staging_dir = ppdir + ".partial"
    os.makedirs(os.path.join(staging_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(staging_dir, "masks"), exist_ok=True)

    image_paths, gt_paths = [], []
    for video_dir in tqdm(video_dirs):
        org_image_paths = natsorted(glob(os.path.join(video_dir, "video*", "*_endo.png")))
        org_gt_paths = natsorted(glob(os.path.join(video_dir, "video*", "*_endo_watershed_mask.png")))
        if len(org_image_paths) != len(org_gt_paths):
            # `zip` would silently drop the surplus files and could pair frames with the wrong mask.
            raise RuntimeError(
                f"Found {len(org_image_paths)} frames but {len(org_gt_paths)} masks in '{video_dir}'."
            )

        for org_image_path, org_gt_path in zip(org_image_paths, org_gt_paths):
            # NOTE(review): only the file name is used as identifier. If frame names repeat across
            # videos, the outputs of different videos would collide - confirm against the data.
            image_id = os.path.basename(org_image_path)
            mask_id = str(Path(image_id).with_suffix(".tif"))

            # The returned paths point to the final location, the files are written to the staging one.
            image_paths.append(os.path.join(ppdir, "images", image_id))
            gt_paths.append(os.path.join(ppdir, "masks", mask_id))

            staged_image_path = os.path.join(staging_dir, "images", image_id)
            staged_gt_path = os.path.join(staging_dir, "masks", mask_id)
            if os.path.exists(staged_image_path) and os.path.exists(staged_gt_path):
                continue  # already converted by a previous (interrupted) run

            gt = imageio.imread(org_gt_path)
            if gt.ndim != 3 or gt.shape[-1] not in (3, 4):
                raise ValueError(f"Expected an RGB(A) mask, got shape {gt.shape} for '{org_gt_path}'.")

            if gt.shape[-1] == 4:  # some labels have a 4th channel which has all values as 255
                print("Found a label with inconsistent format.")
                # The extra channel is only dropped if it is indeed constant.
                if not (gt[..., -1] == 255).all():
                    raise ValueError(f"The 4th channel of '{org_gt_path}' is not constantly 255.")
                gt = gt[..., :3]

            # Pixels whose color is not listed in LABEL_MAPS stay 0.
            instances = np.zeros(gt.shape[:2])
            for color, label_id in LABEL_MAPS.items():
                instances[(gt == color).all(axis=2)] = label_id

            shutil.copy(src=org_image_path, dst=staged_image_path)
            imageio.imwrite(staged_gt_path, instances, compression="zlib")

    os.replace(staging_dir, ppdir)

    return image_paths, gt_paths
Get paths for the CholecSeg8k dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_cholecseg8k_dataset( path: Union[str, os.PathLike], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cholecseg8k_dataset(
    path: Union[str, os.PathLike],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CholecSeg8k dataset for organ and instrument segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_cholecseg8k_paths(path, split, download)

    if resize_inputs:
        # The helper returns the updated dataset kwargs and the patch shape to use with them.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
Get the CholecSeg8k dataset for organ and instrument segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize inputs to the desired patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_cholecseg8k_loader( path: Union[str, os.PathLike], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cholecseg8k_loader(
    path: Union[str, os.PathLike],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CholecSeg8k dataloader for organ and instrument segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cholecseg8k_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the CholecSeg8k dataloader for organ and instrument segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize inputs to the desired patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.