torch_em.data.datasets.medical.osic_pulmofib
The OSIC PulmoFib dataset contains annotations for lung, heart and trachea in CT scans.
This dataset is from the OSIC Pulmonary Fibrosis Progression Challenge:
- https://www.kaggle.com/c/osic-pulmonary-fibrosis-progression/data (dataset source)
- https://www.kaggle.com/datasets/sandorkonya/ct-lung-heart-trachea-segmentation (segmentation source)
Please cite them if you use this dataset for your research.
1"""The OSIC PulmoFib dataset contains annotations for lung, heart and trachea in CT scans. 2 3This dataset is from OSIC Pulmonary Fibrosis Progression Challenge: 4- https://www.kaggle.com/c/osic-pulmonary-fibrosis-progression/data (dataset source) 5- https://www.kaggle.com/datasets/sandorkonya/ct-lung-heart-trachea-segmentation (segmentation source) 6Please cite them if you use this dataset for your research. 7""" 8 9import os 10from glob import glob 11from tqdm import tqdm 12from pathlib import Path 13from natsort import natsorted 14from typing import Union, Tuple, List, Literal 15 16import json 17import numpy as np 18 19import torch_em 20 21from .. import util 22 23 24ORGAN_IDS = {"heart": 1, "lung": 2, "trachea": 3} 25 26 27def get_osic_pulmofib_data(path: Union[os.PathLike, str], download: bool = False) -> str: 28 """Download the OSIC PulmoFib dataset. 29 30 Args: 31 path: Filepath to a folder where the data is downloaded for further processing. 32 download: Whether to download the data if it is not present. 33 34 Returns: 35 Filepath where the data is downloaded. 36 """ 37 data_dir = os.path.join(path, "data") 38 if os.path.exists(data_dir): 39 return data_dir 40 41 os.makedirs(path, exist_ok=True) 42 43 # download the inputs 44 zip_path = os.path.join(path, "osic-pulmonary-fibrosis-progression.zip") 45 util.download_source_kaggle( 46 path=path, dataset_name="osic-pulmonary-fibrosis-progression", download=download, competition=True 47 ) 48 util.unzip(zip_path=zip_path, dst=data_dir, remove=False) 49 50 # download the labels 51 zip_path = os.path.join(path, "ct-lung-heart-trachea-segmentation.zip") 52 util.download_source_kaggle( 53 path=path, dataset_name="sandorkonya/ct-lung-heart-trachea-segmentation", download=download 54 ) 55 util.unzip(zip_path=zip_path, dst=data_dir) 56 57 return data_dir 58 59 60def _preprocess_inputs(data_dir, split): 61 image_dir = os.path.join(data_dir, "preprocessed", "images") 62 gt_dir = os.path.join(data_dir, "preprocessed", "ground_truth") 63 64 os.makedirs(image_dir, exist_ok=True) 65 os.makedirs(gt_dir, exist_ok=True) 66 67 cpath = os.path.join(data_dir, "preprocessed", "confirmer.json") 68 _completed_preproc = os.path.exists(cpath) 69 70 image_paths, gt_paths = [], [] 71 uid_paths = natsorted(glob(os.path.join(data_dir, "train", "*"))) 72 for uid_path in tqdm(uid_paths, desc="Preprocessing inputs", disable=_completed_preproc): 73 uid = uid_path.split("/")[-1] 74 75 image_path = os.path.join(image_dir, f"{uid}.nii.gz") 76 gt_path = os.path.join(gt_dir, f"{uid}.nii.gz") 77 78 if _completed_preproc: 79 if os.path.exists(image_path) and os.path.exists(gt_path): 80 image_paths.append(image_path) 81 gt_paths.append(gt_path) 82 83 continue 84 85 import nrrd 86 import nibabel as nib 87 import pydicom as dicom 88 89 # creating the volume out of individual dicom slices 90 all_slices = [] 91 for slice_path in natsorted(glob(os.path.join(uid_path, "*.dcm"))): 92 per_slice = dicom.dcmread(slice_path) 93 per_slice = per_slice.pixel_array 94 all_slices.append(per_slice) 95 all_slices = np.stack(all_slices).transpose(1, 2, 0) 96 97 # next, combining the semantic organ annotations into one ground-truth volume with specific semantic labels 98 all_gt = np.zeros(all_slices.shape, dtype="uint8") 99 for ann_path in glob(os.path.join(data_dir, "*", "*", f"{uid}_*.nrrd")): 100 ann_organ = Path(ann_path).stem.split("_")[-1] 101 if ann_organ == "noisy": 102 continue 103 104 per_gt, _ = nrrd.read(ann_path) 105 per_gt = per_gt.transpose(1, 0, 2) 106 107 # some organ anns have 
weird dimension mismatch, we don't consider them for simplicity 108 if per_gt.shape == all_slices.shape: 109 all_gt[per_gt > 0] = ORGAN_IDS[ann_organ] 110 111 # only if the volume has any labels (some volumes do not have segmentations), we save those raw and gt volumes 112 if len(np.unique(all_gt)) > 1: 113 all_gt = np.flip(all_gt, axis=2) 114 115 image_nifti = nib.Nifti2Image(all_slices, np.eye(4)) 116 gt_nifti = nib.Nifti2Image(all_gt, np.eye(4)) 117 118 nib.save(image_nifti, image_path) 119 nib.save(gt_nifti, gt_path) 120 121 image_paths.append(image_path) 122 gt_paths.append(gt_path) 123 124 if not _completed_preproc: 125 # since we do not have segmentation for all volumes, we store a file which reflects aggrement of created dataset 126 confirm_msg = "The dataset has been preprocessed. " 127 confirm_msg += f"It has {len(image_paths)} volume and {len(gt_paths)} respective ground-truth." 128 print(confirm_msg) 129 130 with open(cpath, "w") as f: 131 json.dump(confirm_msg, f) 132 133 if split == "train": 134 image_paths, gt_paths = image_paths[:75], gt_paths[:75] 135 elif split == "val": 136 image_paths, gt_paths = image_paths[75:90], gt_paths[75:90] 137 elif split == "test": 138 image_paths, gt_paths = image_paths[90:], gt_paths[90:] 139 else: 140 raise ValueError(f"'{split}' is not a valid split.") 141 142 return image_paths, gt_paths 143 144 145def get_osic_pulmofib_paths( 146 path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False 147) -> Tuple[List[str], List[str]]: 148 """Get paths to the OSIC PulmoFib data. 149 150 Args: 151 path: Filepath to a folder where the data is downloaded for further processing. 152 split: The choice of data split. 153 download: Whether to download the data if it is not present. 154 155 Returns: 156 List of filepaths for the image data. 157 List of filepaths for the label data. 158 """ 159 data_dir = get_osic_pulmofib_data(path, download) 160 image_paths, gt_paths = _preprocess_inputs(data_dir, split) 161 return image_paths, gt_paths 162 163 164def get_osic_pulmofib_dataset( 165 path: Union[os.PathLike, str], 166 patch_shape: Tuple[int, ...], 167 split: Literal['train', 'val', 'test'], 168 resize_inputs: bool = False, 169 download: bool = False, 170 **kwargs 171): 172 """Get the OSIC PulmoFib dataset for segmentation of lung, heart and trachea. 173 174 Args: 175 path: Filepath to a folder where the data is downloaded for further processing. 176 patch_shape: The patch shape to use for training. 177 split: The choice of data split. 178 resize_inputs: Whether to resize the inputs to the patch shape. 179 download: Whether to download the data if it is not present. 180 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 181 182 Returns: 183 The segmentation dataset. 
184 """ 185 image_paths, gt_paths = get_osic_pulmofib_paths(path, split, download) 186 187 if resize_inputs: 188 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False} 189 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 190 kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs 191 ) 192 193 dataset = torch_em.default_segmentation_dataset( 194 raw_paths=image_paths, 195 raw_key="data", 196 label_paths=gt_paths, 197 label_key="data", 198 patch_shape=patch_shape, 199 is_seg_dataset=True, 200 **kwargs 201 ) 202 203 for d in dataset.datasets: 204 d.max_sampling_attempts = 1000 205 206 return dataset 207 208 209def get_osic_pulmofib_loader( 210 path: Union[os.PathLike, str], 211 batch_size: int, 212 patch_shape: Tuple[int, ...], 213 split: Literal['train', 'val', 'test'], 214 resize_inputs: bool = False, 215 download: bool = False, 216 **kwargs 217): 218 """Get the OSIC PulmoFib dataloader for segmentation of lung, heart and trachea. 219 220 Args: 221 path: Filepath to a folder where the data is downloaded for further processing. 222 batch_size: The batch size for training. 223 patch_shape: The patch shape to use for training. 224 split: The choice of data split. 225 resize_inputs: Whether to resize the inputs to the patch shape. 226 download: Whether to download the data if it is not present. 227 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 228 229 Returns: 230 The DataLoader. 231 """ 232 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 233 dataset = get_osic_pulmofib_dataset(path, patch_shape, split, resize_inputs, download, **ds_kwargs) 234 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
ORGAN_IDS = {"heart": 1, "lung": 2, "trachea": 3}
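A small sketch of how this mapping can be used to split a ground-truth volume into per-organ binary masks; the random array is a dummy stand-in for a real label volume:

import numpy as np

from torch_em.data.datasets.medical.osic_pulmofib import ORGAN_IDS

labels = np.random.randint(0, 4, size=(16, 16, 8)).astype("uint8")  # dummy label volume

organ_masks = {organ: labels == organ_id for organ, organ_id in ORGAN_IDS.items()}
print({organ: int(mask.sum()) for organ, mask in organ_masks.items()})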
def get_osic_pulmofib_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_osic_pulmofib_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the OSIC PulmoFib dataset.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    # Download the inputs.
    zip_path = os.path.join(path, "osic-pulmonary-fibrosis-progression.zip")
    util.download_source_kaggle(
        path=path, dataset_name="osic-pulmonary-fibrosis-progression", download=download, competition=True
    )
    util.unzip(zip_path=zip_path, dst=data_dir, remove=False)

    # Download the labels.
    zip_path = os.path.join(path, "ct-lung-heart-trachea-segmentation.zip")
    util.download_source_kaggle(
        path=path, dataset_name="sandorkonya/ct-lung-heart-trachea-segmentation", download=download
    )
    util.unzip(zip_path=zip_path, dst=data_dir)

    return data_dir
Download the OSIC PulmoFib dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is downloaded.
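A minimal usage sketch; the target folder is a placeholder, and the download requires Kaggle API credentials since the inputs come from a Kaggle competition:

from torch_em.data.datasets.medical.osic_pulmofib import get_osic_pulmofib_data

data_dir = get_osic_pulmofib_data("./osic_pulmofib", download=True)
print(data_dir)  # ./osic_pulmofib/data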
def get_osic_pulmofib_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[str], List[str]]:
def get_osic_pulmofib_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the OSIC PulmoFib data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    data_dir = get_osic_pulmofib_data(path, download)
    image_paths, gt_paths = _preprocess_inputs(data_dir, split)
    return image_paths, gt_paths
Get paths to the OSIC PulmoFib data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data.
List of filepaths for the label data.
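A usage sketch; the folder is a placeholder, and the first call triggers download and preprocessing:

from torch_em.data.datasets.medical.osic_pulmofib import get_osic_pulmofib_paths

image_paths, gt_paths = get_osic_pulmofib_paths("./osic_pulmofib", split="val", download=True)
print(len(image_paths), len(gt_paths))  # "val" covers volumes 75-90 of the preprocessed data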
def get_osic_pulmofib_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs):
def get_osic_pulmofib_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
):
    """Get the OSIC PulmoFib dataset for segmentation of lung, heart and trachea.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_osic_pulmofib_paths(path, split, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key="data",
        label_paths=gt_paths,
        label_key="data",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )

    for d in dataset.datasets:
        d.max_sampling_attempts = 1000

    return dataset
Get the OSIC PulmoFib dataset for segmentation of lung, heart and trachea.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize the inputs to the patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
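A usage sketch; the path and patch shape are placeholders, with a 3D patch chosen here since the inputs are CT volumes:

from torch_em.data.datasets.medical.osic_pulmofib import get_osic_pulmofib_dataset

dataset = get_osic_pulmofib_dataset(
    path="./osic_pulmofib", patch_shape=(32, 256, 256), split="train", download=True
)
print(len(dataset))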
def get_osic_pulmofib_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs):
def get_osic_pulmofib_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
):
    """Get the OSIC PulmoFib dataloader for segmentation of lung, heart and trachea.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_osic_pulmofib_dataset(path, patch_shape, split, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the OSIC PulmoFib dataloader for segmentation of lung, heart and trachea.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize the inputs to the patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
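A usage sketch; the values are placeholders. Extra keyword arguments such as num_workers or shuffle are routed to the PyTorch DataLoader via util.split_kwargs:

from torch_em.data.datasets.medical.osic_pulmofib import get_osic_pulmofib_loader

loader = get_osic_pulmofib_loader(
    path="./osic_pulmofib", batch_size=2, patch_shape=(32, 256, 256), split="train",
    download=True, num_workers=4, shuffle=True,
)
x, y = next(iter(loader))
print(x.shape, y.shape)  # e.g. torch.Size([2, 1, 32, 256, 256]) for both raw and labels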