torch_em.data.datasets.medical.plethora

The PLETHORA dataset contains annotations for thoracic organs and pleural effusion in CT.

This dataset is from the publication https://doi.org/10.1002/mp.14424/. Please cite it if you use this dataset for your research.

  1"""The PLETHORA dataset contains annotations for thoracic organs and pleural effusion in CT.
  2
  3This dataset is from the publication https://doi.org/10.1002/mp.14424/.
  4Please cite it if you use this dataset for your research.
  5"""
  6
  7import os
  8from glob import glob
  9from tqdm import tqdm
 10from pathlib import Path
 11from natsort import natsorted
 12from urllib.parse import urljoin
 13from typing import Union, Tuple, Literal, List
 14
 15import numpy as np
 16import pandas as pd
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
 25BASE_URL = "https://wiki.cancerimagingarchive.net/download/attachments/68551327/"
 26
 27
 28URL = {
 29    "image": urljoin(BASE_URL, "NSCLC-Radiomics-OriginalCTs.tcia"),
 30    "gt": {
 31        "thoracic": urljoin(
 32            BASE_URL, "PleThora%20Thoracic_Cavities%20June%202020.zip?version=1&modificationDate=1593202695428&api=v2"
 33        ),
 34        "pleural_effusion": urljoin(
 35            BASE_URL, "PleThora%20Effusions%20June%202020.zip?version=1&modificationDate=1593202778373&api=v2"
 36        )
 37    }
 38}
 39
 40
 41CHECKSUMS = {
 42    "image": None,
 43    "gt": {
 44        "thoracic": "6dfcb60e46c7b0ccf240bc5d13acb1c45c8d2f4922223f7b2fbd5e37acff2be0",
 45        "pleural_effusion": "5dd07c327fb5723c5bbb48f2a02d7f365513d3ad136811fbe4def330ef2d7f6a"
 46    }
 47}
 48
 49
 50ZIPFILES = {
 51    "thoracic": "thoracic.zip",
 52    "pleural_effusion": "pleural_effusion.zip"
 53}
 54
 55
 56def get_plethora_data(
 57    path: Union[os.PathLike, str], task: Literal["thoracic", "pleural_effusion"], download: bool = False
 58) -> Tuple[str, str, str]:
 59    """Get the PLETHORA dataset.
 60
 61    Args:
 62        path: Filepath to a folder where the data is downloaded for further processing.
 63        task: The choice of task.
 64        download: Whether to download the data if it is not present.
 65
 66    Returns:
 67        Filepath where the data is downloaded.
 68    """
 69    image_dir = os.path.join(path, "data", "images")
 70    gt_dir = os.path.join(path, "data", "gt", "Thoracic_Cavities" if task == "thoracic" else "Effusions")
 71    csv_path = os.path.join(path, "plethora_images")
 72    if os.path.exists(image_dir) and os.path.exists(gt_dir):
 73        return image_dir, gt_dir, Path(csv_path).with_suffix(".csv")
 74
 75    os.makedirs(path, exist_ok=True)
 76
 77    # let's download dicom files from the tcia manifest
 78    tcia_path = os.path.join(path, "NSCLC-Radiomics-OriginalCTs.tcia")
 79    util.download_source_tcia(path=tcia_path, url=URL["image"], dst=image_dir, csv_filename=csv_path, download=download)
 80
 81    # let's download the segmentations from zipfiles
 82    zip_path = os.path.join(path, ZIPFILES[task])
 83    util.download_source(
 84        path=zip_path, url=URL["gt"][task], download=download, checksum=CHECKSUMS["gt"][task]
 85    )
 86    util.unzip(zip_path=zip_path, dst=os.path.join(path, "data", "gt"))
 87
 88    return image_dir, gt_dir, Path(csv_path).with_suffix(".csv")
 89
 90
 91def _assort_plethora_inputs(image_dir, gt_dir, task, csv_path):
 92    import nibabel as nib
 93    import pydicom as dicom
 94
 95    df = pd.read_csv(csv_path)
 96
 97    task_gt_dir = os.path.join(gt_dir, )
 98
 99    os.makedirs(os.path.join(image_dir, "preprocessed"), exist_ok=True)
100    os.makedirs(os.path.join(task_gt_dir, "preprocessed"), exist_ok=True)
101
102    # let's get all the series uid of the volumes downloaded and spot their allocated subject id
103    all_series_uid_dirs = glob(os.path.join(image_dir, "1.3*"))
104    image_paths, gt_paths = [], []
105    for series_uid_dir in tqdm(all_series_uid_dirs):
106        series_uid = os.path.split(series_uid_dir)[-1]
107        subject_id = pd.Series.to_string(df.loc[df["Series UID"] == series_uid]["Subject ID"])[-9:]
108
109        try:
110            gt_path = glob(os.path.join(task_gt_dir, subject_id, "*.nii.gz"))[0]
111        except IndexError:
112            # - some patients do not have "Thoracic_Cavities" segmentation
113            print(f"The ground truth is missing for subject '{subject_id}'")
114            continue
115
116        assert os.path.exists(gt_path)
117
118        vol_path = os.path.join(image_dir, "preprocessed", f"{subject_id}.nii.gz")
119        neu_gt_path = os.path.join(task_gt_dir, "preprocessed", os.path.split(gt_path)[-1])
120
121        image_paths.append(vol_path)
122        gt_paths.append(neu_gt_path)
123        if os.path.exists(vol_path) and os.path.exists(neu_gt_path):
124            continue
125
126        # the individual slices for the inputs need to be merged into one volume.
127        if not os.path.exists(vol_path):
128            all_dcm_slices = natsorted(glob(os.path.join(series_uid_dir, "*.dcm")))
129            all_slices = []
130            for dcm_path in all_dcm_slices:
131                dcmfile = dicom.dcmread(dcm_path)
132                img = dcmfile.pixel_array
133                all_slices.append(img)
134
135            volume = np.stack(all_slices)
136            volume = volume.transpose(1, 2, 0)
137            nii_vol = nib.Nifti1Image(volume, np.eye(4))
138            nii_vol.header.get_xyzt_units()
139            nii_vol.to_filename(vol_path)
140
141        # the ground truth needs to be aligned as the inputs, let's take care of that.
142        gt = nib.load(gt_path)
143        gt = gt.get_fdata()
144        gt = gt.transpose(2, 1, 0)  # aligning w.r.t the inputs
145        gt = np.flip(gt, axis=(0, 1))
146
147        gt = gt.transpose(1, 2, 0)
148        gt_nii_vol = nib.Nifti1Image(gt, np.eye(4))
149        gt_nii_vol.header.get_xyzt_units()
150        gt_nii_vol.to_filename(neu_gt_path)
151
152    return image_paths, gt_paths
153
154
def get_plethora_paths(
    path: Union[os.PathLike, str], task: Literal["thoracic", "pleural_effusion"], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the PLETHORA data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        task: The choice of task.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    # Make sure the raw data is available, then convert it to volumes and collect their paths.
    image_folder, label_folder, metadata_csv = get_plethora_data(path, task, download)
    return _assort_plethora_inputs(image_dir=image_folder, gt_dir=label_folder, task=task, csv_path=metadata_csv)
172
173
def get_plethora_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    task: Literal["thoracic", "pleural_effusion"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the PLETHORA dataset.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        task: The choice of task.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_volumes, label_volumes = get_plethora_paths(path, task, download)

    if resize_inputs:
        # The resize helper returns the updated dataset arguments and the patch shape to use with them.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_volumes, raw_key="data", label_paths=label_volumes, label_key="data",
        patch_shape=patch_shape, **kwargs
    )
    return dataset
211
212
def get_plethora_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    task: Literal["thoracic", "pleural_effusion"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PLETHORA dataloader.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        task: The choice of task.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    plethora_dataset = get_plethora_dataset(
        path=path,
        patch_shape=patch_shape,
        task=task,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(plethora_dataset, batch_size, **dataloader_kwargs)
# Root of the TCIA wiki attachment store that hosts the PLETHORA downloads.
BASE_URL = "https://wiki.cancerimagingarchive.net/download/attachments/68551327/"
# Download locations: the TCIA manifest for the CT images and one segmentation archive per task.
URL = {
    "image": BASE_URL + "NSCLC-Radiomics-OriginalCTs.tcia",
    "gt": {
        "thoracic": (
            BASE_URL
            + "PleThora%20Thoracic_Cavities%20June%202020.zip?version=1&modificationDate=1593202695428&api=v2"
        ),
        "pleural_effusion": (
            BASE_URL + "PleThora%20Effusions%20June%202020.zip?version=1&modificationDate=1593202778373&api=v2"
        ),
    },
}
# Checksums for the downloads; none is available for the image manifest.
CHECKSUMS = {
    "image": None,
    "gt": {
        "thoracic": "6dfcb60e46c7b0ccf240bc5d13acb1c45c8d2f4922223f7b2fbd5e37acff2be0",
        "pleural_effusion": "5dd07c327fb5723c5bbb48f2a02d7f365513d3ad136811fbe4def330ef2d7f6a",
    },
}
# Local file names of the segmentation archives, per task.
ZIPFILES = {
    "thoracic": "thoracic.zip",
    "pleural_effusion": "pleural_effusion.zip",
}
def get_plethora_data( path: Union[os.PathLike, str], task: Literal['thoracic', 'pleural_effusion'], download: bool = False) -> Tuple[str, str, str]:
def get_plethora_data(
    path: Union[os.PathLike, str], task: Literal["thoracic", "pleural_effusion"], download: bool = False
) -> Tuple[str, str, Path]:
    """Get the PLETHORA dataset.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        task: The choice of task, either 'thoracic' or 'pleural_effusion'.
        download: Whether to download the data if it is not present.

    Returns:
        The folder where the image data is stored.
        The folder where the ground truth for the chosen task is stored.
        The path to the csv file (`plethora_images.csv`) that belongs to the image download.

    Raises:
        ValueError: If `task` is not one of the supported choices.
    """
    # Reject unknown tasks up front: they would otherwise silently resolve to the "Effusions" folder
    # and only fail at the archive lookup, i.e. after the image download has already been triggered.
    if task not in ("thoracic", "pleural_effusion"):
        raise ValueError(f"'{task}' is not a valid task. Choose from 'thoracic' or 'pleural_effusion'.")

    image_dir = os.path.join(path, "data", "images")
    gt_dir = os.path.join(path, "data", "gt", "Thoracic_Cavities" if task == "thoracic" else "Effusions")

    # The TCIA helper receives the csv name without extension, the caller gets it back with ".csv".
    csv_basename = os.path.join(path, "plethora_images")
    csv_path = Path(csv_basename).with_suffix(".csv")

    # Nothing to do if both the images and the ground truth for this task are already in place.
    if os.path.exists(image_dir) and os.path.exists(gt_dir):
        return image_dir, gt_dir, csv_path

    os.makedirs(path, exist_ok=True)

    # let's download dicom files from the tcia manifest
    tcia_path = os.path.join(path, "NSCLC-Radiomics-OriginalCTs.tcia")
    util.download_source_tcia(
        path=tcia_path, url=URL["image"], dst=image_dir, csv_filename=csv_basename, download=download
    )

    # let's download the segmentations from zipfiles
    zip_path = os.path.join(path, ZIPFILES[task])
    util.download_source(
        path=zip_path, url=URL["gt"][task], download=download, checksum=CHECKSUMS["gt"][task]
    )
    util.unzip(zip_path=zip_path, dst=os.path.join(path, "data", "gt"))

    return image_dir, gt_dir, csv_path

Get the PLETHORA dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • task: The choice of task.
  • download: Whether to download the data if it is not present.
Returns:

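The folder with the image data, the folder with the ground truth for the chosen task, and the path to the `plethora_images.csv` file that belongs to the image download.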

def get_plethora_paths( path: Union[os.PathLike, str], task: Literal['thoracic', 'pleural_effusion'], download: bool = False) -> Tuple[List[str], List[str]]:
def get_plethora_paths(
    path: Union[os.PathLike, str], task: Literal["thoracic", "pleural_effusion"], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the PLETHORA data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        task: The choice of task.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    # Make sure the raw data is available, then convert it to volumes and collect their paths.
    image_folder, label_folder, metadata_csv = get_plethora_data(path, task, download)
    return _assort_plethora_inputs(image_dir=image_folder, gt_dir=label_folder, task=task, csv_path=metadata_csv)

Get paths to the PLETHORA data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • task: The choice of task.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_plethora_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], task: Literal['thoracic', 'pleural_effusion'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_plethora_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    task: Literal["thoracic", "pleural_effusion"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the PLETHORA dataset.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        task: The choice of task.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_volumes, label_volumes = get_plethora_paths(path, task, download)

    if resize_inputs:
        # The resize helper returns the updated dataset arguments and the patch shape to use with them.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_volumes, raw_key="data", label_paths=label_volumes, label_key="data",
        patch_shape=patch_shape, **kwargs
    )
    return dataset

Get the PLETHORA dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • task: The choice of task.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_plethora_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], task: Literal['thoracic', 'pleural_effusion'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_plethora_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    task: Literal["thoracic", "pleural_effusion"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PLETHORA dataloader.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        task: The choice of task.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    plethora_dataset = get_plethora_dataset(
        path=path,
        patch_shape=patch_shape,
        task=task,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(plethora_dataset, batch_size, **dataloader_kwargs)

Get the PLETHORA dataloader.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • task: The choice of task.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.