torch_em.data.datasets.medical.osic_pulmofib

The OSIC PulmoFib dataset contains annotations for lung, heart and trachea in CT scans.

This dataset is from the OSIC Pulmonary Fibrosis Progression Challenge:

https://www.kaggle.com/c/osic-pulmonary-fibrosis-progression/data (dataset source)
https://www.kaggle.com/datasets/sandorkonya/ct-lung-heart-trachea-segmentation (segmentation source)

Please cite them if you use this dataset for your research.

View Source

  1"""The OSIC PulmoFib dataset contains annotations for lung, heart and trachea in CT scans.
  2
  3This dataset is from OSIC Pulmonary Fibrosis Progression Challenge:
  4- https://www.kaggle.com/c/osic-pulmonary-fibrosis-progression/data (dataset source)
  5- https://www.kaggle.com/datasets/sandorkonya/ct-lung-heart-trachea-segmentation (segmentation source)
  6Please cite them if you use this dataset for your research.
  7"""
  8
  9import os
 10from glob import glob
 11from tqdm import tqdm
 12from pathlib import Path
 13from natsort import natsorted
 14from typing import Union, Tuple, List, Literal
 15
 16import json
 17import numpy as np
 18
 19import torch_em
 20
 21from .. import util
 22
 23
# Semantic label id written into the ground-truth volume for each annotated organ.
# Voxels without any organ annotation keep the value 0.
ORGAN_IDS = {"heart": 1, "lung": 2, "trachea": 3}
 25
 26
 27def get_osic_pulmofib_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 28    """Download the OSIC PulmoFib dataset.
 29
 30    Args:
 31        path: Filepath to a folder where the data is downloaded for further processing.
 32        download: Whether to download the data if it is not present.
 33
 34    Returns:
 35        Filepath where the data is downloaded.
 36    """
 37    data_dir = os.path.join(path, "data")
 38    if os.path.exists(data_dir):
 39        return data_dir
 40
 41    os.makedirs(path, exist_ok=True)
 42
 43    # download the inputs
 44    zip_path = os.path.join(path, "osic-pulmonary-fibrosis-progression.zip")
 45    util.download_source_kaggle(
 46        path=path, dataset_name="osic-pulmonary-fibrosis-progression", download=download, competition=True
 47    )
 48    util.unzip(zip_path=zip_path, dst=data_dir, remove=False)
 49
 50    # download the labels
 51    zip_path = os.path.join(path, "ct-lung-heart-trachea-segmentation.zip")
 52    util.download_source_kaggle(
 53        path=path, dataset_name="sandorkonya/ct-lung-heart-trachea-segmentation", download=download
 54    )
 55    util.unzip(zip_path=zip_path, dst=data_dir)
 56
 57    return data_dir
 58
 59
 60def _preprocess_inputs(data_dir, split):
 61    image_dir = os.path.join(data_dir, "preprocessed", "images")
 62    gt_dir = os.path.join(data_dir, "preprocessed", "ground_truth")
 63
 64    os.makedirs(image_dir, exist_ok=True)
 65    os.makedirs(gt_dir, exist_ok=True)
 66
 67    cpath = os.path.join(data_dir, "preprocessed", "confirmer.json")
 68    _completed_preproc = os.path.exists(cpath)
 69
 70    image_paths, gt_paths = [], []
 71    uid_paths = natsorted(glob(os.path.join(data_dir, "train", "*")))
 72    for uid_path in tqdm(uid_paths, desc="Preprocessing inputs", disable=_completed_preproc):
 73        uid = uid_path.split("/")[-1]
 74
 75        image_path = os.path.join(image_dir, f"{uid}.nii.gz")
 76        gt_path = os.path.join(gt_dir, f"{uid}.nii.gz")
 77
 78        if _completed_preproc:
 79            if os.path.exists(image_path) and os.path.exists(gt_path):
 80                image_paths.append(image_path)
 81                gt_paths.append(gt_path)
 82
 83            continue
 84
 85        import nrrd
 86        import nibabel as nib
 87        import pydicom as dicom
 88
 89        # creating the volume out of individual dicom slices
 90        all_slices = []
 91        for slice_path in natsorted(glob(os.path.join(uid_path, "*.dcm"))):
 92            per_slice = dicom.dcmread(slice_path)
 93            per_slice = per_slice.pixel_array
 94            all_slices.append(per_slice)
 95        all_slices = np.stack(all_slices).transpose(1, 2, 0)
 96
 97        # next, combining the semantic organ annotations into one ground-truth volume with specific semantic labels
 98        all_gt = np.zeros(all_slices.shape, dtype="uint8")
 99        for ann_path in glob(os.path.join(data_dir, "*", "*", f"{uid}_*.nrrd")):
100            ann_organ = Path(ann_path).stem.split("_")[-1]
101            if ann_organ == "noisy":
102                continue
103
104            per_gt, _ = nrrd.read(ann_path)
105            per_gt = per_gt.transpose(1, 0, 2)
106
107            # some organ anns have weird dimension mismatch, we don't consider them for simplicity
108            if per_gt.shape == all_slices.shape:
109                all_gt[per_gt > 0] = ORGAN_IDS[ann_organ]
110
111        # only if the volume has any labels (some volumes do not have segmentations), we save those raw and gt volumes
112        if len(np.unique(all_gt)) > 1:
113            all_gt = np.flip(all_gt, axis=2)
114
115            image_nifti = nib.Nifti2Image(all_slices, np.eye(4))
116            gt_nifti = nib.Nifti2Image(all_gt, np.eye(4))
117
118            nib.save(image_nifti, image_path)
119            nib.save(gt_nifti, gt_path)
120
121            image_paths.append(image_path)
122            gt_paths.append(gt_path)
123
124    if not _completed_preproc:
125        # since we do not have segmentation for all volumes, we store a file which reflects aggrement of created dataset
126        confirm_msg = "The dataset has been preprocessed. "
127        confirm_msg += f"It has {len(image_paths)} volume and {len(gt_paths)} respective ground-truth."
128        print(confirm_msg)
129
130        with open(cpath, "w") as f:
131            json.dump(confirm_msg, f)
132
133    if split == "train":
134        image_paths, gt_paths = image_paths[:75], gt_paths[:75]
135    elif split == "val":
136        image_paths, gt_paths = image_paths[75:90], gt_paths[75:90]
137    elif split == "test":
138        image_paths, gt_paths = image_paths[90:], gt_paths[90:]
139    else:
140        raise ValueError(f"'{split}' is not a valid split.")
141
142    return image_paths, gt_paths
143
144
def get_osic_pulmofib_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get the image and label filepaths of one split of the OSIC PulmoFib data.

    The data is fetched via `get_osic_pulmofib_data` and then handed to `_preprocess_inputs`,
    which returns the filepaths that belong to the requested split.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    return _preprocess_inputs(get_osic_pulmofib_data(path, download), split)
162
163
def get_osic_pulmofib_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
):
    """Build the OSIC PulmoFib segmentation dataset (lung, heart and trachea in CT scans).

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_osic_pulmofib_paths(path, split, download)

    if resize_inputs:
        # the helper returns the updated dataset kwargs together with the patch shape to use
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    # images and labels are both read with the key "data" from the preprocessed volumes
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key="data",
        label_paths=label_paths,
        label_key="data",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )

    # set the sampling attempt limit on each of the per-volume datasets
    for per_volume_dataset in dataset.datasets:
        per_volume_dataset.max_sampling_attempts = 1000

    return dataset
207
208
def get_osic_pulmofib_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
):
    """Build the OSIC PulmoFib dataloader (segmentation of lung, heart and trachea in CT scans).

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # separate the arguments for the dataset from the remaining ones, which are passed on to the loader
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_osic_pulmofib_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

ORGAN_IDS = {'heart': 1, 'lung': 2, 'trachea': 3}

def get_osic_pulmofib_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_osic_pulmofib_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Fetch and unpack the OSIC PulmoFib scans together with their organ annotations.

    Both Kaggle archives (the challenge data and the segmentation labels) are extracted
    into the same `data` folder below `path`. If that folder already exists, it is
    returned right away and nothing is downloaded or extracted.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, "data")
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)

        # the inputs: they are part of a Kaggle competition and are unzipped with `remove=False`
        util.download_source_kaggle(
            path=path, dataset_name="osic-pulmonary-fibrosis-progression", download=download, competition=True
        )
        inputs_zip = os.path.join(path, "osic-pulmonary-fibrosis-progression.zip")
        util.unzip(zip_path=inputs_zip, dst=data_dir, remove=False)

        # the labels: a regular Kaggle dataset, extracted into the same folder as the inputs
        util.download_source_kaggle(
            path=path, dataset_name="sandorkonya/ct-lung-heart-trachea-segmentation", download=download
        )
        labels_zip = os.path.join(path, "ct-lung-heart-trachea-segmentation.zip")
        util.unzip(zip_path=labels_zip, dst=data_dir)

    return data_dir

Download the OSIC PulmoFib dataset.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
download: Whether to download the data if it is not present.

Returns:

Filepath where the data is downloaded.

def get_osic_pulmofib_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_osic_pulmofib_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get the image and label filepaths of one split of the OSIC PulmoFib data.

    The data is fetched via `get_osic_pulmofib_data` and then handed to `_preprocess_inputs`,
    which returns the filepaths that belong to the requested split.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    return _preprocess_inputs(get_osic_pulmofib_data(path, download), split)

Get paths to the OSIC PulmoFib data.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
split: The choice of data split.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_osic_pulmofib_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs): View Source

def get_osic_pulmofib_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
):
    """Build the OSIC PulmoFib segmentation dataset (lung, heart and trachea in CT scans).

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_osic_pulmofib_paths(path, split, download)

    if resize_inputs:
        # the helper returns the updated dataset kwargs together with the patch shape to use
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    # images and labels are both read with the key "data" from the preprocessed volumes
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key="data",
        label_paths=label_paths,
        label_key="data",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )

    # set the sampling attempt limit on each of the per-volume datasets
    for per_volume_dataset in dataset.datasets:
        per_volume_dataset.max_sampling_attempts = 1000

    return dataset

Get the OSIC PulmoFib dataset for segmentation of lung, heart and trachea.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
patch_shape: The patch shape to use for training.
split: The choice of data split.
resize_inputs: Whether to resize the inputs to the patch shape.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_osic_pulmofib_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs): View Source

def get_osic_pulmofib_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
):
    """Build the OSIC PulmoFib dataloader (segmentation of lung, heart and trachea in CT scans).

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # separate the arguments for the dataset from the remaining ones, which are passed on to the loader
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_osic_pulmofib_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

Get the OSIC PulmoFib dataloader for segmentation of lung, heart and trachea.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The choice of data split.
resize_inputs: Whether to resize the inputs to the patch shape.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.