torch_em.data.datasets.medical.pengwin

The PENGWIN dataset contains annotations for pelvic bone fractures and fragments in CT and X-Ray images.

This dataset is from the challenge: https://pengwin.grand-challenge.org/pengwin/. This dataset is related to the publication: https://doi.org/10.1007/978-3-031-43996-4_30. Please cite them if you use this dataset for your publication.

  1"""The PENGWIN dataset contains annotation for pelvic bone fracture and
  2fragments in CT and X-Ray images.
  3
  4This dataset is from the challenge: https://pengwin.grand-challenge.org/pengwin/.
  5This dataset is related to the publication: https://doi.org/10.1007/978-3-031-43996-4_30.
  6Please cite them if you use this dataset for your publication.
  7"""
  8
  9import os
 10from glob import glob
 11from natsort import natsorted
 12from typing import Union, Tuple, Literal, List
 13
 14from torch.utils.data import Dataset, DataLoader
 15
 16import torch_em
 17
 18from .. import util
 19
 20
 21URLS = {
 22    "CT": [
 23        "https://zenodo.org/records/10927452/files/PENGWIN_CT_train_images_part1.zip",  # inputs part 1
 24        "https://zenodo.org/records/10927452/files/PENGWIN_CT_train_images_part2.zip",  # inputs part 2
 25        "https://zenodo.org/records/10927452/files/PENGWIN_CT_train_labels.zip",  # labels
 26    ],
 27    "X-Ray": ["https://zenodo.org/records/10913196/files/train.zip"]
 28}
 29
 30CHECKSUMS = {
 31    "CT": [
 32        "e2e9f99798960607ffced1fbdeee75a626c41bf859eaf4125029a38fac6b7609",  # inputs part 1
 33        "19f3cdc5edd1daf9324c70f8ba683eed054f6ed8f2b1cc59dbd80724f8f0bbb2",  # inputs part 2
 34        "c4d3857e02d3ee5d0df6c8c918dd3cf5a7c9419135f1ec089b78215f37c6665c"  # labels
 35    ],
 36    "X-Ray": ["48d107979eb929a3c61da4e75566306a066408954cf132907bda570f2a7de725"]
 37}
 38
 39TARGET_DIRS = {
 40    "CT": ["CT/images", "CT/images", "CT/labels"],
 41    "X-Ray": ["X-Ray"]
 42}
 43
 44MODALITIES = ["CT", "X-Ray"]
 45
 46
 47def get_pengwin_data(
 48    path: Union[os.PathLike, str], modality: Literal["CT", "X-Ray"], download: bool = False
 49) -> str:
 50    """Download the PENGWIN dataset.
 51
 52    Args:
 53        path: Filepath to a folder where the data is downloaded for further processing.
 54        modality: The choice of modality for inputs.
 55        download: Whether to download the data if it is not present.
 56
 57    Returns:
 58        Filepath where the data is downlaoded.
 59    """
 60    if not isinstance(modality, str) and modality in MODALITIES:
 61        raise ValueError(f"'{modality}' is not a valid modality. Please choose from {MODALITIES}.")
 62
 63    data_dir = os.path.join(path, "data")
 64    if os.path.exists(os.path.join(data_dir, modality)):
 65        return data_dir
 66
 67    os.makedirs(path, exist_ok=True)
 68
 69    for url, checksum, dst_dir in zip(URLS[modality], CHECKSUMS[modality], TARGET_DIRS[modality]):
 70        zip_path = os.path.join(path, os.path.split(url)[-1])
 71        util.download_source(path=zip_path, url=url, download=download, checksum=checksum)
 72        util.unzip(zip_path=zip_path, dst=os.path.join(data_dir, dst_dir))
 73
 74    return data_dir
 75
 76
 77def get_pengwin_paths(
 78    path: Union[os.PathLike, str], modality: Literal["CT", "X-Ray"], download: bool = False
 79) -> Tuple[List[str], List[str]]:
 80    """Get paths to the PENGWIN data.
 81
 82    Args:
 83        path: Filepath to a folder where the data is downloaded for further processing.
 84        modality: The choice of modality for inputs.
 85        download: Whether to download the data if it is not present.
 86
 87    Returns:
 88        List of filepaths for the image data.
 89        List of filepaths for the label data.
 90    """
 91    data_dir = get_pengwin_data(path=path, modality=modality, download=download)
 92
 93    if modality == "CT":
 94        image_paths = natsorted(glob(os.path.join(data_dir, modality, "images", "*.mha")))
 95        gt_paths = natsorted(glob(os.path.join(data_dir, modality, "labels", "*.mha")))
 96    else:  # X-Ray
 97        base_dir = os.path.join(data_dir, modality, "train")
 98        image_paths = natsorted(glob(os.path.join(base_dir, "input", "images", "*.tif")))
 99        gt_paths = natsorted(glob(os.path.join(base_dir, "output", "images", "*.tif")))
100
101    return image_paths, gt_paths
102
103
def get_pengwin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    modality: Literal["CT", "X-Ray"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the PENGWIN dataset for pelvic fracture segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        modality: The choice of modality for inputs.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_pengwin_paths(path=path, modality=modality, download=download)

    if resize_inputs:
        # The helper returns the updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
141
142
def get_pengwin_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    modality: Literal["CT", "X-Ray"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the PENGWIN dataloader for pelvic fracture segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        modality: The choice of modality for inputs.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from those for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    pengwin_dataset = get_pengwin_dataset(
        path=path,
        patch_shape=patch_shape,
        modality=modality,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(pengwin_dataset, batch_size, **dataloader_kwargs)
# Download URLs per modality: two image archives plus one label archive for CT, a single archive for X-Ray.
URLS = {'CT': ['https://zenodo.org/records/10927452/files/PENGWIN_CT_train_images_part1.zip', 'https://zenodo.org/records/10927452/files/PENGWIN_CT_train_images_part2.zip', 'https://zenodo.org/records/10927452/files/PENGWIN_CT_train_labels.zip'], 'X-Ray': ['https://zenodo.org/records/10913196/files/train.zip']}
# Expected checksums, index-aligned with URLS.
# NOTE(review): presumably SHA-256 digests (64 hex characters) - confirm against the download helper.
CHECKSUMS = {'CT': ['e2e9f99798960607ffced1fbdeee75a626c41bf859eaf4125029a38fac6b7609', '19f3cdc5edd1daf9324c70f8ba683eed054f6ed8f2b1cc59dbd80724f8f0bbb2', 'c4d3857e02d3ee5d0df6c8c918dd3cf5a7c9419135f1ec089b78215f37c6665c'], 'X-Ray': ['48d107979eb929a3c61da4e75566306a066408954cf132907bda570f2a7de725']}
# Extraction folders, index-aligned with URLS; both CT image archives share 'CT/images'.
TARGET_DIRS = {'CT': ['CT/images', 'CT/images', 'CT/labels'], 'X-Ray': ['X-Ray']}
# Supported modalities.
MODALITIES = ['CT', 'X-Ray']
def get_pengwin_data( path: Union[os.PathLike, str], modality: Literal['CT', 'X-Ray'], download: bool = False) -> str:
def get_pengwin_data(
    path: Union[os.PathLike, str], modality: Literal["CT", "X-Ray"], download: bool = False
) -> str:
    """Download the PENGWIN dataset.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        modality: The choice of modality for inputs. Either 'CT' or 'X-Ray'.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.

    Raises:
        ValueError: If `modality` is not one of the supported modalities.
    """
    # The whole conjunction has to be negated. `not isinstance(...) and x in MODALITIES`
    # can never be true, so invalid modalities used to slip through and fail later
    # with an unrelated KeyError / TypeError.
    if not (isinstance(modality, str) and modality in MODALITIES):
        raise ValueError(f"'{modality}' is not a valid modality. Please choose from {MODALITIES}.")

    data_dir = os.path.join(path, "data")
    # An existing modality folder is treated as "already downloaded and extracted".
    if os.path.exists(os.path.join(data_dir, modality)):
        return data_dir

    os.makedirs(path, exist_ok=True)

    # URLs, checksums and target folders are index-aligned per modality.
    for url, checksum, dst_dir in zip(URLS[modality], CHECKSUMS[modality], TARGET_DIRS[modality]):
        # The archive keeps the file name it has in the URL.
        zip_path = os.path.join(path, os.path.split(url)[-1])
        util.download_source(path=zip_path, url=url, download=download, checksum=checksum)
        util.unzip(zip_path=zip_path, dst=os.path.join(data_dir, dst_dir))

    return data_dir

Download the PENGWIN dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • modality: The choice of modality for inputs.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is downloaded.

def get_pengwin_paths( path: Union[os.PathLike, str], modality: Literal['CT', 'X-Ray'], download: bool = False) -> Tuple[List[str], List[str]]:
 78def get_pengwin_paths(
 79    path: Union[os.PathLike, str], modality: Literal["CT", "X-Ray"], download: bool = False
 80) -> Tuple[List[str], List[str]]:
 81    """Get paths to the PENGWIN data.
 82
 83    Args:
 84        path: Filepath to a folder where the data is downloaded for further processing.
 85        modality: The choice of modality for inputs.
 86        download: Whether to download the data if it is not present.
 87
 88    Returns:
 89        List of filepaths for the image data.
 90        List of filepaths for the label data.
 91    """
 92    data_dir = get_pengwin_data(path=path, modality=modality, download=download)
 93
 94    if modality == "CT":
 95        image_paths = natsorted(glob(os.path.join(data_dir, modality, "images", "*.mha")))
 96        gt_paths = natsorted(glob(os.path.join(data_dir, modality, "labels", "*.mha")))
 97    else:  # X-Ray
 98        base_dir = os.path.join(data_dir, modality, "train")
 99        image_paths = natsorted(glob(os.path.join(base_dir, "input", "images", "*.tif")))
100        gt_paths = natsorted(glob(os.path.join(base_dir, "output", "images", "*.tif")))
101
102    return image_paths, gt_paths

Get paths to the PENGWIN data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • modality: The choice of modality for inputs.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_pengwin_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], modality: Literal['CT', 'X-Ray'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_pengwin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    modality: Literal["CT", "X-Ray"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the PENGWIN dataset for pelvic fracture segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        modality: The choice of modality for inputs.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_pengwin_paths(path=path, modality=modality, download=download)

    if resize_inputs:
        # The helper returns the updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset

Get the PENGWIN dataset for pelvic fracture segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • modality: The choice of modality for inputs.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_pengwin_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], modality: Literal['CT', 'X-Ray'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_pengwin_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    modality: Literal["CT", "X-Ray"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the PENGWIN dataloader for pelvic fracture segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        modality: The choice of modality for inputs.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from those for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    pengwin_dataset = get_pengwin_dataset(
        path=path,
        patch_shape=patch_shape,
        modality=modality,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(pengwin_dataset, batch_size, **dataloader_kwargs)

Get the PENGWIN dataloader for pelvic fracture segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • modality: The choice of modality for inputs.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.