torch_em.data.datasets.medical.curvas
The CURVAS dataset contains annotations for pancreas, kidney and liver in abdominal CT scans.
This dataset is from the challenge: https://curvas.grand-challenge.org. The dataset is located at: https://zenodo.org/records/12687192. Please cite them if you use this dataset for your research.
"""The CURVAS dataset contains annotations for pancreas, kidney and liver
in abdominal CT scans.

This dataset is from the challenge: https://curvas.grand-challenge.org.
The dataset is located at: https://zenodo.org/records/12687192.
Please cite them if you use this dataset for your research.
"""

import os
import subprocess
from glob import glob
from natsort import natsorted
from typing import Tuple, Union, Literal, List

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://zenodo.org/records/12687192/files/training_set.zip"
CHECKSUM = "1126a2205553ae1d4fe5fbaee7ea732aacc4f5a92b96504ed521c23e5a0e3f89"


def get_curvas_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CURVAS dataset.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.

    Raises:
        RuntimeError: If the extracted 'training_set' folder is missing after
            repairing and unpacking the archive.
    """
    data_dir = os.path.join(path, "training_set")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "training_set.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    # HACK: The zip file is broken. We fix it using the following script.
    fixed_zip_path = os.path.join(path, "training_set_fixed.zip")
    repair = subprocess.run(["zip", "-FF", zip_path, "--out", fixed_zip_path])
    extract = subprocess.run(["unzip", fixed_zip_path, "-d", path])

    # The exit codes are reported but not enforced; success is judged by whether
    # the expected folder exists, so a failed extraction is not passed on silently.
    if not os.path.exists(data_dir):
        raise RuntimeError(
            f"Extracting the CURVAS data to '{data_dir}' failed "
            f"('zip -FF' exit code: {repair.returncode}, 'unzip' exit code: {extract.returncode})."
        )

    return data_dir


def get_curvas_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CURVAS data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    data_dir = get_curvas_data(path, download)

    if not isinstance(rater, list):
        rater = [rater]

    assert len(rater) == 1, "The segmentations for multiple raters is not supported at the moment."

    image_paths = natsorted(glob(os.path.join(data_dir, "*", "image.nii.gz")))
    gt_paths = []
    for _rater in rater:
        gt_paths.extend(natsorted(glob(os.path.join(data_dir, "*", f"annotation_{_rater}.nii.gz"))))

    assert len(image_paths) == len(gt_paths)

    # The splits are fixed slices of the naturally sorted cases:
    # the first 10 for training, the next 3 for validation, the rest for testing.
    if split == "train":
        image_paths, gt_paths = image_paths[:10], gt_paths[:10]
    elif split == "val":
        image_paths, gt_paths = image_paths[10:13], gt_paths[10:13]
    elif split == "test":
        image_paths, gt_paths = image_paths[13:], gt_paths[13:]
    else:
        raise ValueError(f"'{split}' is not a valid split.")

    return image_paths, gt_paths


def get_curvas_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CURVAS dataset for pancreas, kidney and liver segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_curvas_paths(path, split, rater, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key="data",
        label_paths=gt_paths,
        label_key="data",
        patch_shape=patch_shape,
        **kwargs
    )


def get_curvas_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CURVAS dataloader for pancreas, kidney and liver segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_curvas_dataset(path, patch_shape, split, rater, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
'https://zenodo.org/records/12687192/files/training_set.zip'
CHECKSUM =
'1126a2205553ae1d4fe5fbaee7ea732aacc4f5a92b96504ed521c23e5a0e3f89'
def
get_curvas_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_curvas_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CURVAS dataset and extract it.

    If the 'training_set' folder already exists under `path`, it is returned
    directly and nothing is downloaded or extracted.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.

    Raises:
        RuntimeError: If the extracted 'training_set' folder is missing after
            repairing and unpacking the archive.
    """
    data_dir = os.path.join(path, "training_set")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "training_set.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    # HACK: The zip file is broken. We fix it using the following script.
    fixed_zip_path = os.path.join(path, "training_set_fixed.zip")
    repair = subprocess.run(["zip", "-FF", zip_path, "--out", fixed_zip_path])
    extract = subprocess.run(["unzip", fixed_zip_path, "-d", path])

    # The exit codes are reported but not enforced; success is judged by whether
    # the expected folder exists, so a failed extraction is not passed on silently.
    if not os.path.exists(data_dir):
        raise RuntimeError(
            f"Extracting the CURVAS data to '{data_dir}' failed "
            f"('zip -FF' exit code: {repair.returncode}, 'unzip' exit code: {extract.returncode})."
        )

    return data_dir
Download the CURVAS dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is downloaded.
def
get_curvas_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], rater: Literal['1'] = '1', download: bool = False) -> Tuple[List[str], List[str]]:
def get_curvas_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Collect the image and annotation filepaths of one CURVAS split.

    The cases are naturally sorted and divided into fixed slices: the first 10
    for 'train', the next 3 for 'val' and the remaining ones for 'test'.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    root = get_curvas_data(path, download)

    # A single rater identifier is wrapped so that it can be iterated below.
    raters = rater if isinstance(rater, list) else [rater]
    assert len(raters) == 1, "The segmentations for multiple raters is not supported at the moment."

    volumes = natsorted(glob(os.path.join(root, "*", "image.nii.gz")))
    annotations = [
        label_path
        for rater_id in raters
        for label_path in natsorted(glob(os.path.join(root, "*", f"annotation_{rater_id}.nii.gz")))
    ]
    assert len(volumes) == len(annotations)

    # Select the slice of cases that belongs to the requested split.
    if split == "train":
        chosen = slice(None, 10)
    elif split == "val":
        chosen = slice(10, 13)
    elif split == "test":
        chosen = slice(13, None)
    else:
        raise ValueError(f"'{split}' is not a valid split.")

    return volumes[chosen], annotations[chosen]
Get paths to the CURVAS data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- split: The choice of data split.
- rater: The choice of rater providing the annotations.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_curvas_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], rater: Literal['1'] = '1', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_curvas_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the CURVAS segmentation dataset (pancreas, kidney and liver).

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_curvas_paths(path, split, rater, download)

    if resize_inputs:
        # The helper returns both the updated keyword arguments and the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key="data",
        label_paths=label_paths,
        label_key="data",
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
Get the CURVAS dataset for pancreas, kidney and liver segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- rater: The choice of rater providing the annotations.
- resize_inputs: Whether to resize inputs to the desired patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
.
Returns:
The segmentation dataset.
def
get_curvas_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], rater: Literal['1'] = '1', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_curvas_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the CURVAS dataloader (pancreas, kidney and liver segmentation).

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    curvas_dataset = get_curvas_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        rater=rater,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(curvas_dataset, batch_size, **loader_kwargs)
Get the CURVAS dataloader for pancreas, kidney and liver segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- rater: The choice of rater providing the annotations.
- resize_inputs: Whether to resize inputs to the desired patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
or for the PyTorch DataLoader.
Returns:
The DataLoader.