torch_em.data.datasets.medical.duke_liver

The Duke Liver dataset contains annotations for liver segmentation in MRI scans.

NOTE: This dataset is located at https://doi.org/10.5281/zenodo.7774566. Please see 'get_duke_liver_data' for instructions on downloading the dataset.

The dataset is from the publication https://doi.org/10.1148/ryai.220275. Please cite it if you use this dataset for your research.

View Source

  1"""The Duke Liver dataset contains annotations for liver segmentation in MRI scans.
  2
  3NOTE: This dataset is located at https://doi.org/10.5281/zenodo.7774566.
  4Please see 'get_duke_liver_data' for instructions on downloading the dataset.
  5
  6The dataset is from the publication https://doi.org/10.1148/ryai.220275.
  7Please cite it if you use this dataset for your research.
  8"""
  9
 10import os
 11from glob import glob
 12from tqdm import tqdm
 13from natsort import natsorted
 14from typing import Union, Tuple, Literal, List
 15
 16import numpy as np
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
 25def get_duke_liver_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 26    """Get the Duke Liver dataset.
 27
 28    The dataset is located at https://doi.org/10.5281/zenodo.7774566.
 29
 30    Follow the instructions below to get access to the dataset.
 31    - Visit the zenodo site attached above.
 32    - Send a request message alongwith some details to get access to the dataset.
 33    - The authors would accept the request, then you can access the dataset.
 34    - Next, download the `Segmentation.zip` file and provide the path where the zip file is stored.
 35
 36    Args:
 37        path: Filepath to a folder where the data needs to be downloaded for further processing.
 38        download: Whether to download the data if it is not present.
 39
 40    Returns:
 41        Filepath where the data is preprocessed.
 42    """
 43    data_dir = os.path.join(path, "data", "Segmentation")
 44    if os.path.exists(data_dir):
 45        return data_dir
 46
 47    if download:
 48        raise NotImplementedError(
 49            "Automatic download for Duke Liver dataset is not possible. See `get_duke_liver_data` for details."
 50        )
 51
 52    zip_path = os.path.join(path, "Segmentation.zip")
 53    util.unzip(zip_path=zip_path, dst=os.path.join(path, "data"), remove=False)
 54
 55    return data_dir
 56
 57
 58def _preprocess_data(path, data_dir):
 59    preprocess_dir = os.path.join(path, "data", "preprocessed")
 60
 61    if os.path.exists(preprocess_dir):
 62        _image_paths = natsorted(glob(os.path.join(preprocess_dir, "images", "*.nii.gz")))
 63        _gt_paths = natsorted(glob(os.path.join(preprocess_dir, "masks", "*.nii.gz")))
 64        return _image_paths, _gt_paths
 65
 66    os.makedirs(os.path.join(preprocess_dir, "images"), exist_ok=True)
 67    os.makedirs(os.path.join(preprocess_dir, "masks"), exist_ok=True)
 68
 69    image_paths, gt_paths = [], []
 70    for patient_dir in tqdm(glob(os.path.join(data_dir, "00*"))):
 71        patient_id = os.path.split(patient_dir)[-1]
 72
 73        for sub_id_dir in glob(os.path.join(patient_dir, "*")):
 74            sub_id = os.path.split(sub_id_dir)[-1]
 75
 76            image_path = os.path.join(preprocess_dir, "images", f"{patient_id}_{sub_id}.nii.gz")
 77            gt_path = os.path.join(preprocess_dir, "masks", f"{patient_id}_{sub_id}.nii.gz")
 78
 79            image_paths.append(image_path)
 80            gt_paths.append(gt_path)
 81
 82            if os.path.exists(image_path) and os.path.exists(gt_path):
 83                continue
 84
 85            image_slice_paths = natsorted(glob(os.path.join(sub_id_dir, "images", "*.dicom")))
 86            gt_slice_paths = natsorted(glob(os.path.join(sub_id_dir, "masks", "*.dicom")))
 87
 88            import pydicom as dicom
 89            import nibabel as nib
 90
 91            images, gts = [], []
 92            for image_slice_path, gt_slice_path in zip(image_slice_paths, gt_slice_paths):
 93                image_slice = dicom.dcmread(image_slice_path).pixel_array
 94                gt_slice = dicom.dcmread(gt_slice_path).pixel_array
 95
 96                images.append(image_slice)
 97                gts.append(gt_slice)
 98
 99            image = np.stack(images).transpose(1, 2, 0)
100            gt = np.stack(gts).transpose(1, 2, 0)
101
102            assert image.shape == gt.shape
103
104            image = nib.Nifti2Image(image, np.eye(4))
105            gt = nib.Nifti2Image(gt, np.eye(4))
106
107            nib.save(image, image_path)
108            nib.save(gt, gt_path)
109
110    return natsorted(image_paths), natsorted(gt_paths)
111
112
def get_duke_liver_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the Duke Liver dataset.

    The split is taken by position from the naturally sorted list of preprocessed volumes:
    the first 250 volumes are used for 'train', the next 10 for 'val' and the remaining ones for 'test'.

    Args:
        path: Filepath to a folder where the data needs to be downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Validate the split first, so that an invalid choice fails before any extraction or preprocessing is run.
    if split == "train":
        selection = slice(None, 250)
    elif split == "val":
        selection = slice(250, 260)
    elif split == "test":
        selection = slice(260, None)
    else:
        raise ValueError(f"'{split}' is not a valid split.")

    data_dir = get_duke_liver_data(path=path, download=download)

    image_paths, gt_paths = _preprocess_data(path=path, data_dir=data_dir)

    return image_paths[selection], gt_paths[selection]
141
142
def get_duke_liver_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the Duke Liver dataset for liver segmentation in MRI scans.

    Args:
        path: Filepath to a folder where the data needs to be downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_duke_liver_paths(path, split, download)

    if resize_inputs:
        # The helper returns the updated dataset arguments together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs=dict(patch_shape=patch_shape, is_rgb=False),
        )

    # Images and masks are both read with the key "data".
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key="data",
        label_paths=label_paths,
        label_key="data",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
181
182
def get_duke_liver_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the Duke Liver dataloader for liver segmentation in MRI scans.

    Args:
        path: Filepath to a folder where the data needs to be downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones that are passed on to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    duke_liver_dataset = get_duke_liver_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(duke_liver_dataset, batch_size, **dataloader_kwargs)

def get_duke_liver_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_duke_liver_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Get the Duke Liver dataset.

    The dataset is located at https://doi.org/10.5281/zenodo.7774566.

    The data cannot be downloaded automatically. Follow the instructions below to get access to the dataset.
    - Visit the zenodo site linked above.
    - Send a request message along with some details to get access to the dataset.
    - Once the authors accept the request, you can access the dataset.
    - Next, download the `Segmentation.zip` file and place it in the folder that is passed as `path`.

    Args:
        path: Filepath to the folder that contains the manually downloaded `Segmentation.zip`.
            The archive is extracted to `<path>/data`.
        download: Whether to download the data if it is not present.
            Automatic download is not supported, so passing `True` raises an error
            unless the extracted data is already available.

    Returns:
        Filepath to the folder where the extracted data is expected (`<path>/data/Segmentation`).

    Raises:
        NotImplementedError: If `download` is True and the extracted data is not present.
        FileNotFoundError: If the extracted data is not present and `Segmentation.zip` is missing in `path`.
    """
    data_dir = os.path.join(path, "data", "Segmentation")
    if os.path.exists(data_dir):
        # The archive has been extracted before, nothing left to do.
        return data_dir

    if download:
        raise NotImplementedError(
            "Automatic download for Duke Liver dataset is not possible. See `get_duke_liver_data` for details."
        )

    zip_path = os.path.join(path, "Segmentation.zip")
    if not os.path.isfile(zip_path):
        # Fail early with instructions instead of an opaque error from the unzip helper.
        raise FileNotFoundError(
            f"Could not find '{zip_path}'. The Duke Liver dataset cannot be downloaded automatically: "
            "request access at https://doi.org/10.5281/zenodo.7774566, download `Segmentation.zip` "
            f"and place it in '{path}'."
        )

    # Keep the archive after extraction (remove=False), it was provided manually by the user.
    util.unzip(zip_path=zip_path, dst=os.path.join(path, "data"), remove=False)

    return data_dir

Get the Duke Liver dataset.

The dataset is located at https://doi.org/10.5281/zenodo.7774566.

Follow the instructions below to get access to the dataset.

Visit the zenodo site attached above.
Send a request message along with some details to get access to the dataset.
Once the authors accept the request, you can access the dataset.
Next, download the Segmentation.zip file and provide the path where the zip file is stored.

Arguments:

path: Filepath to a folder where the data needs to be downloaded for further processing.
download: Whether to download the data if it is not present.

Returns:

Filepath to the folder where the extracted data is stored.

def get_duke_liver_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_duke_liver_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the Duke Liver dataset.

    The split is taken by position from the naturally sorted list of preprocessed volumes:
    the first 250 volumes are used for 'train', the next 10 for 'val' and the remaining ones for 'test'.

    Args:
        path: Filepath to a folder where the data needs to be downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Validate the split first, so that an invalid choice fails before any extraction or preprocessing is run.
    if split == "train":
        selection = slice(None, 250)
    elif split == "val":
        selection = slice(250, 260)
    elif split == "test":
        selection = slice(260, None)
    else:
        raise ValueError(f"'{split}' is not a valid split.")

    data_dir = get_duke_liver_data(path=path, download=download)

    image_paths, gt_paths = _preprocess_data(path=path, data_dir=data_dir)

    return image_paths[selection], gt_paths[selection]

Get paths to the Duke Liver dataset.

Arguments:

path: Filepath to a folder where the data needs to be downloaded for further processing.
split: The choice of data split.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_duke_liver_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_duke_liver_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the Duke Liver dataset for liver segmentation in MRI scans.

    Args:
        path: Filepath to a folder where the data needs to be downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_duke_liver_paths(path, split, download)

    if resize_inputs:
        # The helper returns the updated dataset arguments together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs=dict(patch_shape=patch_shape, is_rgb=False),
        )

    # Images and masks are both read with the key "data".
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key="data",
        label_paths=label_paths,
        label_key="data",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset

Get the Duke Liver dataset for segmentation of liver in MRI.

Arguments:

path: Filepath to a folder where the data needs to be downloaded for further processing.
patch_shape: The patch shape to use for training.
split: The choice of data split.
resize_inputs: Whether to resize the inputs to the patch shape.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_duke_liver_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_duke_liver_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val', 'test'],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the Duke Liver dataloader for liver segmentation in MRI scans.

    Args:
        path: Filepath to a folder where the data needs to be downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones that are passed on to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    duke_liver_dataset = get_duke_liver_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(duke_liver_dataset, batch_size, **dataloader_kwargs)

Get the Duke Liver dataloader for segmentation of liver in MRI.

Arguments:

path: Filepath to a folder where the data needs to be downloaded for further processing.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The choice of data split.
resize_inputs: Whether to resize the inputs to the patch shape.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.