torch_em.data.datasets.medical.curvas

The CURVAS dataset contains annotations for pancreas, kidney and liver in abdominal CT scans.

This dataset is from the challenge: https://curvas.grand-challenge.org. The dataset is located at: https://zenodo.org/records/12687192. Please cite it if you use this dataset for your research.

  1"""The CURVAS dataset contains annotations for pancreas, kidney and liver
  2in abdominal CT scans.
  3
  4This dataset is from the challenge: https://curvas.grand-challenge.org.
  5The dataset is located at: https://zenodo.org/records/12687192.
  6Please cite it if you use this dataset for your research.
  7"""
  8
  9import os
 10import subprocess
 11from glob import glob
 12from natsort import natsorted
 13from typing import Tuple, Union, Literal, List
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20
 21
 22URL = "https://zenodo.org/records/12687192/files/training_set.zip"  # Download link of the training set archive.
 23CHECKSUM = "1126a2205553ae1d4fe5fbaee7ea732aacc4f5a92b96504ed521c23e5a0e3f89"  # Passed to `util.download_source`.
 24
 25
 26def get_curvas_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 27    """Download the CURVAS dataset.
 28
 29    Args:
 30        path: Filepath to a folder where the data is downloaded for further processing.
 31        download: Whether to download the data if it is not present.
 32
 33    Returns:
 34        Filepath where the data is downloaded.
 35    """
 36    data_dir = os.path.join(path, "training_set")
 37    if os.path.exists(data_dir):
 38        return data_dir
 39
 40    os.makedirs(path, exist_ok=True)
 41
 42    zip_path = os.path.join(path, "training_set.zip")
 43    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
 44
 45    # HACK: The zip file is broken. We fix it using the following script.
 46    fixed_zip_path = os.path.join(path, "training_set_fixed.zip")
 47    subprocess.run(["zip", "-FF", zip_path, "--out", fixed_zip_path])
 48    subprocess.run(["unzip", fixed_zip_path, "-d", path])
 49
 50    return data_dir
 51
 52
 53def get_curvas_paths(
 54    path: Union[os.PathLike, str],
 55    split: Literal['train', 'val', 'test'],
 56    rater: Literal["1"] = "1",
 57    download: bool = False
 58) -> Tuple[List[str], List[str]]:
 59    """Get paths to the CURVAS data.
 60
 61    Args:
 62        path: Filepath to a folder where the data is downloaded for further processing.
 63        split: The choice of data split.
 64        rater: The choice of rater providing the annotations.
 65        download: Whether to download the data if it is not present.
 66
 67    Returns:
 68        List of filepaths for the image data.
 69        List of filepaths for the label data.
 70    """
 71    data_dir = get_curvas_data(path, download)
 72
 73    if not isinstance(rater, list):
 74        rater = [rater]
 75
 76    assert len(rater) == 1, "The segmentations for multiple raters is not supported at the moment."
 77
 78    image_paths = natsorted(glob(os.path.join(data_dir, "*", "image.nii.gz")))
 79    gt_paths = []
 80    for _rater in rater:
 81        gt_paths.extend(natsorted(glob(os.path.join(data_dir, "*", f"annotation_{_rater}.nii.gz"))))
 82
 83    assert len(image_paths) == len(gt_paths)
 84
 85    if split == "train":
 86        image_paths, gt_paths = image_paths[:10], gt_paths[:10]
 87    elif split == "val":
 88        image_paths, gt_paths = image_paths[10:13], gt_paths[10:13]
 89    elif split == "test":
 90        image_paths, gt_paths = image_paths[13:], gt_paths[13:]
 91    else:
 92        raise ValueError(f"'{split}' is not a valid split.")
 93
 94    return image_paths, gt_paths
 95
 96
 97def get_curvas_dataset(
 98    path: Union[os.PathLike, str],
 99    patch_shape: Tuple[int, ...],
100    split: Literal['train', 'val', 'test'],
101    rater: Literal["1"] = "1",
102    resize_inputs: bool = False,
103    download: bool = False,
104    **kwargs
105) -> Dataset:
106    """Get the CURVAS dataset for pancreas, kidney and liver segmentation.
107
108    Args:
109        path: Filepath to a folder where the data is downloaded for further processing.
110        patch_shape: The patch shape to use for training.
111        split: The choice of data split.
112        rater: The choice of rater providing the annotations.
113        resize_inputs: Whether to resize inputs to the desired patch shape.
114        download: Whether to download the data if it is not present.
115        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
116
117    Returns:
118        The segmentation dataset.
119    """
120    image_paths, gt_paths = get_curvas_paths(path, split, rater, download)
121
122    if resize_inputs:
123        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False}
124        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
125            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
126        )
127
128    return torch_em.default_segmentation_dataset(
129        raw_paths=image_paths,
130        raw_key="data",
131        label_paths=gt_paths,
132        label_key="data",
133        patch_shape=patch_shape,
134        **kwargs
135    )
136
137
def get_curvas_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the CURVAS dataloader (pancreas, kidney and liver segmentation).

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    curvas_dataset = get_curvas_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        rater=rater,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(curvas_dataset, batch_size, **dataloader_kwargs)
# Download link of the training set archive.
URL = 'https://zenodo.org/records/12687192/files/training_set.zip'
# Checksum handed to `util.download_source` together with the link above.
CHECKSUM = '1126a2205553ae1d4fe5fbaee7ea732aacc4f5a92b96504ed521c23e5a0e3f89'
def get_curvas_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_curvas_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and extract the CURVAS dataset.

    If the folder `<path>/training_set` already exists, it is returned right away
    and nothing is downloaded or extracted.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.

    Raises:
        RuntimeError: If the data folder does not exist after repairing and extracting the archive.
    """
    data_dir = os.path.join(path, "training_set")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "training_set.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    # HACK: The zip file is broken. We repair it with `zip -FF` and extract the repaired copy.
    # The exit codes of both tools are not inspected here (a repair of a damaged archive may
    # finish with warnings); the outcome is verified below by checking for the data folder.
    fixed_zip_path = os.path.join(path, "training_set_fixed.zip")
    subprocess.run(["zip", "-FF", zip_path, "--out", fixed_zip_path])
    subprocess.run(["unzip", fixed_zip_path, "-d", path])

    # Fail loudly instead of handing a non-existent folder to the callers.
    if not os.path.exists(data_dir):
        raise RuntimeError(
            f"Extracting the CURVAS data failed: '{data_dir}' does not exist after running "
            f"'zip -FF' and 'unzip' on '{zip_path}'."
        )

    return data_dir

Download the CURVAS dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is downloaded.

def get_curvas_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], rater: Literal['1'] = '1', download: bool = False) -> Tuple[List[str], List[str]]:
def get_curvas_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CURVAS data.

    The naturally sorted cases are divided by position: the first 10 cases form the
    'train' split, the next 3 the 'val' split and all remaining cases the 'test' split.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test', or if more than one rater is given.
        RuntimeError: If the number of images and annotations found on disk differs.
    """
    # Validate the cheap arguments first, so that a typo does not trigger the download.
    if split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split.")

    raters = rater if isinstance(rater, list) else [rater]
    if len(raters) != 1:
        raise ValueError("The segmentations for multiple raters is not supported at the moment.")

    data_dir = get_curvas_data(path, download)

    image_paths = natsorted(glob(os.path.join(data_dir, "*", "image.nii.gz")))
    gt_paths = natsorted(glob(os.path.join(data_dir, "*", f"annotation_{raters[0]}.nii.gz")))

    # Images and annotations are paired by position, hence the counts have to agree.
    if len(image_paths) != len(gt_paths):
        raise RuntimeError(
            f"Found {len(image_paths)} images but {len(gt_paths)} annotations in '{data_dir}'."
        )

    split_ranges = {"train": slice(None, 10), "val": slice(10, 13), "test": slice(13, None)}
    selection = split_ranges[split]
    return image_paths[selection], gt_paths[selection]

Get paths to the CURVAS data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • split: The choice of data split.
  • rater: The choice of rater providing the annotations.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_curvas_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], rater: Literal['1'] = '1', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 98def get_curvas_dataset(
 99    path: Union[os.PathLike, str],
100    patch_shape: Tuple[int, ...],
101    split: Literal['train', 'val', 'test'],
102    rater: Literal["1"] = "1",
103    resize_inputs: bool = False,
104    download: bool = False,
105    **kwargs
106) -> Dataset:
107    """Get the CURVAS dataset for pancreas, kidney and liver segmentation.
108
109    Args:
110        path: Filepath to a folder where the data is downloaded for further processing.
111        patch_shape: The patch shape to use for training.
112        split: The choice of data split.
113        rater: The choice of rater providing the annotations.
114        resize_inputs: Whether to resize inputs to the desired patch shape.
115        download: Whether to download the data if it is not present.
116        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
117
118    Returns:
119        The segmentation dataset.
120    """
121    image_paths, gt_paths = get_curvas_paths(path, split, rater, download)
122
123    if resize_inputs:
124        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False}
125        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
126            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
127        )
128
129    return torch_em.default_segmentation_dataset(
130        raw_paths=image_paths,
131        raw_key="data",
132        label_paths=gt_paths,
133        label_key="data",
134        patch_shape=patch_shape,
135        **kwargs
136    )

Get the CURVAS dataset for pancreas, kidney and liver segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • rater: The choice of rater providing the annotations.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_curvas_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], rater: Literal['1'] = '1', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_curvas_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    rater: Literal["1"] = "1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the CURVAS dataloader (pancreas, kidney and liver segmentation).

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of rater providing the annotations.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    curvas_dataset = get_curvas_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        rater=rater,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(curvas_dataset, batch_size, **dataloader_kwargs)

Get the CURVAS dataloader for pancreas, kidney and liver segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • rater: The choice of rater providing the annotations.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.