torch_em.data.datasets.medical.plethora
The PLETHORA dataset contains annotations for thoracic organs and pleural effusion in CT.
This dataset is from the publication https://doi.org/10.1002/mp.14424/. Please cite it if you use this dataset for your research.
1"""The PLETHORA dataset contains annotations for thoracic organs and pleural effusion in CT. 2 3This dataset is from the publication https://doi.org/10.1002/mp.14424/. 4Please cite it if you use this dataset for your research. 5""" 6 7import os 8from glob import glob 9from tqdm import tqdm 10from pathlib import Path 11from natsort import natsorted 12from urllib.parse import urljoin 13from typing import Union, Tuple, Literal, List 14 15import numpy as np 16import pandas as pd 17 18from torch.utils.data import Dataset, DataLoader 19 20import torch_em 21 22from .. import util 23 24 25BASE_URL = "https://wiki.cancerimagingarchive.net/download/attachments/68551327/" 26 27 28URL = { 29 "image": urljoin(BASE_URL, "NSCLC-Radiomics-OriginalCTs.tcia"), 30 "gt": { 31 "thoracic": urljoin( 32 BASE_URL, "PleThora%20Thoracic_Cavities%20June%202020.zip?version=1&modificationDate=1593202695428&api=v2" 33 ), 34 "pleural_effusion": urljoin( 35 BASE_URL, "PleThora%20Effusions%20June%202020.zip?version=1&modificationDate=1593202778373&api=v2" 36 ) 37 } 38} 39 40 41CHECKSUMS = { 42 "image": None, 43 "gt": { 44 "thoracic": "6dfcb60e46c7b0ccf240bc5d13acb1c45c8d2f4922223f7b2fbd5e37acff2be0", 45 "pleural_effusion": "5dd07c327fb5723c5bbb48f2a02d7f365513d3ad136811fbe4def330ef2d7f6a" 46 } 47} 48 49 50ZIPFILES = { 51 "thoracic": "thoracic.zip", 52 "pleural_effusion": "pleural_effusion.zip" 53} 54 55 56def get_plethora_data( 57 path: Union[os.PathLike, str], task: Literal["thoracic", "pleural_effusion"], download: bool = False 58) -> Tuple[str, str, str]: 59 """Get the PLETHORA dataset. 60 61 Args: 62 path: Filepath to a folder where the data is downloaded for further processing. 63 task: The choice of task. 64 download: Whether to download the data if it is not present. 65 66 Returns: 67 Filepath where the data is downloaded. 
68 """ 69 image_dir = os.path.join(path, "data", "images") 70 gt_dir = os.path.join(path, "data", "gt", "Thoracic_Cavities" if task == "thoracic" else "Effusions") 71 csv_path = os.path.join(path, "plethora_images") 72 if os.path.exists(image_dir) and os.path.exists(gt_dir): 73 return image_dir, gt_dir, Path(csv_path).with_suffix(".csv") 74 75 os.makedirs(path, exist_ok=True) 76 77 # let's download dicom files from the tcia manifest 78 tcia_path = os.path.join(path, "NSCLC-Radiomics-OriginalCTs.tcia") 79 util.download_source_tcia(path=tcia_path, url=URL["image"], dst=image_dir, csv_filename=csv_path, download=download) 80 81 # let's download the segmentations from zipfiles 82 zip_path = os.path.join(path, ZIPFILES[task]) 83 util.download_source( 84 path=zip_path, url=URL["gt"][task], download=download, checksum=CHECKSUMS["gt"][task] 85 ) 86 util.unzip(zip_path=zip_path, dst=os.path.join(path, "data", "gt")) 87 88 return image_dir, gt_dir, Path(csv_path).with_suffix(".csv") 89 90 91def _assort_plethora_inputs(image_dir, gt_dir, task, csv_path): 92 import nibabel as nib 93 import pydicom as dicom 94 95 df = pd.read_csv(csv_path) 96 97 task_gt_dir = os.path.join(gt_dir, ) 98 99 os.makedirs(os.path.join(image_dir, "preprocessed"), exist_ok=True) 100 os.makedirs(os.path.join(task_gt_dir, "preprocessed"), exist_ok=True) 101 102 # let's get all the series uid of the volumes downloaded and spot their allocated subject id 103 all_series_uid_dirs = glob(os.path.join(image_dir, "1.3*")) 104 image_paths, gt_paths = [], [] 105 for series_uid_dir in tqdm(all_series_uid_dirs): 106 series_uid = os.path.split(series_uid_dir)[-1] 107 subject_id = pd.Series.to_string(df.loc[df["Series UID"] == series_uid]["Subject ID"])[-9:] 108 109 try: 110 gt_path = glob(os.path.join(task_gt_dir, subject_id, "*.nii.gz"))[0] 111 except IndexError: 112 # - some patients do not have "Thoracic_Cavities" segmentation 113 print(f"The ground truth is missing for subject '{subject_id}'") 114 continue 115 116 assert os.path.exists(gt_path) 117 118 vol_path = os.path.join(image_dir, "preprocessed", f"{subject_id}.nii.gz") 119 neu_gt_path = os.path.join(task_gt_dir, "preprocessed", os.path.split(gt_path)[-1]) 120 121 image_paths.append(vol_path) 122 gt_paths.append(neu_gt_path) 123 if os.path.exists(vol_path) and os.path.exists(neu_gt_path): 124 continue 125 126 # the individual slices for the inputs need to be merged into one volume. 127 if not os.path.exists(vol_path): 128 all_dcm_slices = natsorted(glob(os.path.join(series_uid_dir, "*.dcm"))) 129 all_slices = [] 130 for dcm_path in all_dcm_slices: 131 dcmfile = dicom.dcmread(dcm_path) 132 img = dcmfile.pixel_array 133 all_slices.append(img) 134 135 volume = np.stack(all_slices) 136 volume = volume.transpose(1, 2, 0) 137 nii_vol = nib.Nifti1Image(volume, np.eye(4)) 138 nii_vol.header.get_xyzt_units() 139 nii_vol.to_filename(vol_path) 140 141 # the ground truth needs to be aligned as the inputs, let's take care of that. 142 gt = nib.load(gt_path) 143 gt = gt.get_fdata() 144 gt = gt.transpose(2, 1, 0) # aligning w.r.t the inputs 145 gt = np.flip(gt, axis=(0, 1)) 146 147 gt = gt.transpose(1, 2, 0) 148 gt_nii_vol = nib.Nifti1Image(gt, np.eye(4)) 149 gt_nii_vol.header.get_xyzt_units() 150 gt_nii_vol.to_filename(neu_gt_path) 151 152 return image_paths, gt_paths 153 154 155def get_plethora_paths( 156 path: Union[os.PathLike, str], task: Literal["thoracic", "pleural_effusion"], download: bool = False 157) -> Tuple[List[str], List[str]]: 158 """Get paths to the PLETHORA data. 
159 160 Args: 161 path: Filepath to a folder where the data is downloaded for further processing. 162 task: The choice of task. 163 download: Whether to download the data if it is not present. 164 165 Returns: 166 List of filepaths for the image data. 167 List of filepaths for the label data. 168 """ 169 image_dir, gt_dir, csv_path = get_plethora_data(path, task, download) 170 image_paths, gt_paths = _assort_plethora_inputs(image_dir=image_dir, gt_dir=gt_dir, task=task, csv_path=csv_path) 171 return image_paths, gt_paths 172 173 174def get_plethora_dataset( 175 path: Union[os.PathLike, str], 176 patch_shape: Tuple[int, ...], 177 task: Literal["thoracic", "pleural_effusion"], 178 resize_inputs: bool = False, 179 download: bool = False, 180 **kwargs 181) -> Dataset: 182 """Get the PLETHORA dataset. 183 184 Args: 185 path: Filepath to a folder where the data is downloaded for further processing. 186 patch_shape: The patch shape to use for training. 187 task: The choice of task. 188 resize_inputs: Whether to resize inputs to the desired patch shape. 189 download: Whether to download the data if it is not present. 190 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 191 192 Returns: 193 The segmentation dataset. 194 """ 195 image_paths, gt_paths = get_plethora_paths(path, task, download) 196 197 if resize_inputs: 198 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False} 199 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 200 kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs 201 ) 202 203 return torch_em.default_segmentation_dataset( 204 raw_paths=image_paths, 205 raw_key="data", 206 label_paths=gt_paths, 207 label_key="data", 208 patch_shape=patch_shape, 209 **kwargs 210 ) 211 212 213def get_plethora_loader( 214 path: Union[os.PathLike, str], 215 batch_size: int, 216 patch_shape: Tuple[int, ...], 217 task: Literal["thoracic", "pleural_effusion"], 218 resize_inputs: bool = False, 219 download: bool = False, 220 **kwargs 221) -> DataLoader: 222 """Get the PLETHORA dataloader. 223 224 Args: 225 path: Filepath to a folder where the data is downloaded for further processing. 226 batch_size: The batch size for training. 227 patch_shape: The patch shape to use for training. 228 task: The choice of task. 229 resize_inputs: Whether to resize inputs to the desired patch shape. 230 download: Whether to download the data if it is not present. 231 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 232 233 Returns: 234 The DataLoader. 235 """ 236 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 237 dataset = get_plethora_dataset(path, patch_shape, task, resize_inputs, download, **ds_kwargs) 238 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
BASE_URL = 'https://wiki.cancerimagingarchive.net/download/attachments/68551327/'

URL = {
    'image': 'https://wiki.cancerimagingarchive.net/download/attachments/68551327/NSCLC-Radiomics-OriginalCTs.tcia',
    'gt': {
        'thoracic': 'https://wiki.cancerimagingarchive.net/download/attachments/68551327/PleThora%20Thoracic_Cavities%20June%202020.zip?version=1&modificationDate=1593202695428&api=v2',
        'pleural_effusion': 'https://wiki.cancerimagingarchive.net/download/attachments/68551327/PleThora%20Effusions%20June%202020.zip?version=1&modificationDate=1593202778373&api=v2'
    }
}

CHECKSUMS = {
    'image': None,
    'gt': {
        'thoracic': '6dfcb60e46c7b0ccf240bc5d13acb1c45c8d2f4922223f7b2fbd5e37acff2be0',
        'pleural_effusion': '5dd07c327fb5723c5bbb48f2a02d7f365513d3ad136811fbe4def330ef2d7f6a'
    }
}

ZIPFILES = {'thoracic': 'thoracic.zip', 'pleural_effusion': 'pleural_effusion.zip'}
def get_plethora_data(path: Union[os.PathLike, str], task: Literal['thoracic', 'pleural_effusion'], download: bool = False) -> Tuple[str, str, str]:
Get the PLETHORA dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- task: The choice of task.
- download: Whether to download the data if it is not present.
Returns:
Filepath of the directory where the image data is downloaded, filepath of the directory where the ground-truth data is downloaded, and filepath of the CSV file that maps series UIDs to subject IDs.
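A minimal usage sketch (the target folder below is a hypothetical choice; downloading the CT series additionally requires access to the TCIA archive):

# Hypothetical example: fetch the thoracic-cavity annotations and the CT manifest.
from torch_em.data.datasets.medical.plethora import get_plethora_data

image_dir, gt_dir, csv_path = get_plethora_data(
    path="./data/plethora",  # assumed local folder, any writable path works
    task="thoracic",
    download=True,
)
print(image_dir, gt_dir, csv_path)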
def get_plethora_paths(path: Union[os.PathLike, str], task: Literal['thoracic', 'pleural_effusion'], download: bool = False) -> Tuple[List[str], List[str]]:
Get paths to the PLETHORA data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- task: The choice of task.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data.
List of filepaths for the label data.
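A short sketch of how this could be used; the data folder and task choice are illustrative:

# Hypothetical example: collect the preprocessed image/label NIfTI pairs for pleural effusion.
from torch_em.data.datasets.medical.plethora import get_plethora_paths

image_paths, gt_paths = get_plethora_paths(
    path="./data/plethora",  # assumed local folder
    task="pleural_effusion",
    download=True,
)
print(len(image_paths), len(gt_paths))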
def get_plethora_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], task: Literal['thoracic', 'pleural_effusion'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
Get the PLETHORA dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- task: The choice of task.
- resize_inputs: Whether to resize inputs to the desired patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
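A minimal sketch of constructing the dataset; the patch shape below is an arbitrary illustrative value, not a recommendation:

# Hypothetical example: a 3D patch dataset for the thoracic task.
from torch_em.data.datasets.medical.plethora import get_plethora_dataset

dataset = get_plethora_dataset(
    path="./data/plethora",      # assumed local folder
    patch_shape=(16, 256, 256),  # illustrative patch shape
    task="thoracic",
    resize_inputs=False,
    download=True,
)
print(len(dataset))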
def get_plethora_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], task: Literal['thoracic', 'pleural_effusion'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
Get the PLETHORA dataloader.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- task: The choice of task.
- resize_inputs: Whether to resize inputs to the desired patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.
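A minimal sketch of building a training loader; batch size, patch shape, and num_workers are illustrative values (num_workers is one of the keyword arguments forwarded to the PyTorch DataLoader):

# Hypothetical example: a dataloader for training on the thoracic task.
from torch_em.data.datasets.medical.plethora import get_plethora_loader

loader = get_plethora_loader(
    path="./data/plethora",      # assumed local folder
    batch_size=2,
    patch_shape=(16, 256, 256),  # illustrative patch shape
    task="thoracic",
    resize_inputs=False,
    download=True,
    num_workers=4,               # passed through to the DataLoader
)
x, y = next(iter(loader))
print(x.shape, y.shape)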