torch_em.data.datasets.medical.cbis_ddsm

The CBIS DDSM contains annotations for lesion segmentation in mammography images.

This dataset is a preprocessed version of https://www.cancerimagingarchive.net/collection/cbis-ddsm/ available at https://www.kaggle.com/datasets/mohamedbenticha/cbis-ddsm/data. The dataset is related to the publication https://doi.org/10.1038/sdata.2017.177. Please cite them if you use this dataset for your research.

  1"""The CBIS DDSM contains annotations for lesion segmentation in
  2mammography images.
  3
  4This dataset is a preprocessed version of https://www.cancerimagingarchive.net/collection/cbis-ddsm/ available
  5at https://www.kaggle.com/datasets/mohamedbenticha/cbis-ddsm/data.
  6The dataset is related to the publication https://doi.org/10.1038/sdata.2017.177.
  7Please cite them if you use this dataset for your research.
  8"""
  9
 10import os
 11from glob import glob
 12from tqdm import tqdm
 13from natsort import natsorted
 14from typing import Union, Tuple, Literal, Optional
 15
 16from torch.utils.data import Dataset, DataLoader
 17
 18import torch_em
 19
 20from .. import util
 21
 22
 23def get_cbis_ddsm_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 24    """Download the CBIS DDSM dataset.
 25     Args:
 26        path: Filepath to a folder where the data is downloaded for further processing.
 27        download: Whether to download the data if it is not present.
 28
 29    Returns:
 30        Filepath where the data is downloaded for the selected task.
 31    """
 32    data_dir = os.path.join(path, "DATA")
 33    if os.path.exists(data_dir):
 34        return data_dir
 35
 36    os.makedirs(path, exist_ok=True)
 37
 38    zip_path = os.path.join(path, "cbis-ddsm.zip")
 39    util.download_source_kaggle(path=path, dataset_name="mohamedbenticha/cbis-ddsm/", download=download)
 40    util.unzip(zip_path=zip_path, dst=path)
 41
 42    return data_dir
 43
 44
 45def _check_if_size_matches(image_path, gt_path):
 46    from PIL import Image
 47    return Image.open(image_path).size == Image.open(gt_path).size
 48
 49
 50def get_cbis_ddsm_paths(
 51    path: Union[os.PathLike, str],
 52    split: Literal['Train', 'Val', 'Test'],
 53    task: Literal['Calc', 'Mass'],
 54    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
 55    download: bool = False,
 56    ignore_mismatching_pairs: bool = False,
 57):
 58    """Get paths to the CBIS DDSM data.
 59
 60    Args:
 61        path: Filepath to a folder where the data is downloaded for further processing.
 62        split: The choice of data split.
 63        task: The choice of labels for the specified task.
 64        tumour_type: The choice of tumour type.
 65        download: Whether to download the data if it is not present.
 66        ignore_mismatching_pairs: Whether to avoid returning paths to image-label pairs of mismatching shape.
 67
 68    Returns:
 69        List of filepaths for the image data.
 70        List of filepaths for the label data.
 71    """
 72    data_dir = get_cbis_ddsm_data(path, download)
 73
 74    if split not in ["Train", "Val", "Test"]:
 75        raise ValueError(f"'{split}' is not a valid split.")
 76
 77    if task is None:
 78        task = "*"
 79    else:
 80        assert task in ["Calc", "Mass"], f"'{task}' is not a valid task."
 81
 82    if tumour_type is None:
 83        tumour_type = "*"
 84    else:
 85        assert tumour_type in ["MALIGNANT", "BENIGN"], f"'{tumour_type}' is not a tumor type."
 86
 87    def _remove_mismatching_image_label_pairs(image_paths, gt_paths):
 88        input_paths = [
 89            (ip, gp) for ip, gp in tqdm(zip(image_paths, gt_paths), total=len(image_paths), desc="Validate inputs")
 90            if _check_if_size_matches(ip, gp)
 91        ]
 92        image_paths = [p[0] for p in input_paths]
 93        gt_paths = [p[1] for p in input_paths]
 94        return image_paths, gt_paths
 95
 96    if split == "Test":
 97        target_dir = os.path.join(data_dir, task, split, tumour_type)
 98        image_paths = natsorted(glob(os.path.join(target_dir, "*_FULL_*.png")))
 99        gt_paths = natsorted(glob(os.path.join(target_dir, "*_MASK_*.png")))
100
101        if ignore_mismatching_pairs:
102            image_paths, gt_paths = _remove_mismatching_image_label_pairs(image_paths, gt_paths)
103
104    else:
105        target_dir = os.path.join(data_dir, task, "Train", tumour_type)
106        image_paths = natsorted(glob(os.path.join(target_dir, "*_FULL_*.png")))
107        gt_paths = natsorted(glob(os.path.join(target_dir, "*_MASK_*.png")))
108
109        if ignore_mismatching_pairs:
110            image_paths, gt_paths = _remove_mismatching_image_label_pairs(image_paths, gt_paths)
111
112        if split == "Train":
113            image_paths, gt_paths = image_paths[125:], gt_paths[125:]
114        else:  # validation split (take the first 125 samples for validation)
115            image_paths, gt_paths = image_paths[:125], gt_paths[:125]
116
117    assert len(image_paths) == len(gt_paths)
118
119    return image_paths, gt_paths
120
121
def get_cbis_ddsm_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal["Calc", "Mass"]] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the CBIS DDSM dataset for lesion segmentation in mammograms.

    The image and label paths are collected with `get_cbis_ddsm_paths` and handed to
    `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        task: The choice of labels for the specified task.
        tumour_type: The choice of tumour type.
        resize_inputs: Whether to resize the inputs to the expected patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_cbis_ddsm_paths(
        path=path, split=split, task=task, tumour_type=tumour_type, download=download
    )

    if resize_inputs:
        # The helper returns updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_files,
        raw_key=None,
        label_paths=label_files,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset
164
165
def get_cbis_ddsm_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal["Calc", "Mass"]] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the CBIS DDSM dataloader for lesion segmentation in mammograms.

    The keyword arguments are split with `util.split_kwargs` into those for the dataset and
    those for the loader; the dataset is created with `get_cbis_ddsm_dataset`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        task: The choice of labels for the specified task.
        tumour_type: The choice of tumour type.
        resize_inputs: Whether to resize the inputs to the expected patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cbis_ddsm_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        task=task,
        tumour_type=tumour_type,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
def get_cbis_ddsm_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_cbis_ddsm_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the CBIS DDSM data is available and return its location.

    If the `DATA` folder already exists inside `path`, it is returned right away.
    Otherwise `path` is created (if needed) and the data is requested via
    `util.download_source_kaggle` and unpacked with `util.unzip`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the `DATA` folder inside `path`.
    """
    extracted_root = os.path.join(path, "DATA")

    if not os.path.exists(extracted_root):
        os.makedirs(path, exist_ok=True)
        util.download_source_kaggle(path=path, dataset_name="mohamedbenticha/cbis-ddsm/", download=download)
        # NOTE(review): assumes the Kaggle helper stores the archive under this name in `path` - confirm in `util`.
        util.unzip(zip_path=os.path.join(path, "cbis-ddsm.zip"), dst=path)

    return extracted_root

Download the CBIS DDSM dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.

Returns:

Filepath to the `DATA` folder inside `path`, where the downloaded data is extracted.

def get_cbis_ddsm_paths( path: Union[os.PathLike, str], split: Literal['Train', 'Val', 'Test'], task: Literal['Calc', 'Mass'], tumour_type: Optional[Literal['MALIGNANT', 'BENIGN']] = None, download: bool = False, ignore_mismatching_pairs: bool = False):
 51def get_cbis_ddsm_paths(
 52    path: Union[os.PathLike, str],
 53    split: Literal['Train', 'Val', 'Test'],
 54    task: Literal['Calc', 'Mass'],
 55    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
 56    download: bool = False,
 57    ignore_mismatching_pairs: bool = False,
 58):
 59    """Get paths to the CBIS DDSM data.
 60
 61    Args:
 62        path: Filepath to a folder where the data is downloaded for further processing.
 63        split: The choice of data split.
 64        task: The choice of labels for the specified task.
 65        tumour_type: The choice of tumour type.
 66        download: Whether to download the data if it is not present.
 67        ignore_mismatching_pairs: Whether to avoid returning paths to image-label pairs of mismatching shape.
 68
 69    Returns:
 70        List of filepaths for the image data.
 71        List of filepaths for the label data.
 72    """
 73    data_dir = get_cbis_ddsm_data(path, download)
 74
 75    if split not in ["Train", "Val", "Test"]:
 76        raise ValueError(f"'{split}' is not a valid split.")
 77
 78    if task is None:
 79        task = "*"
 80    else:
 81        assert task in ["Calc", "Mass"], f"'{task}' is not a valid task."
 82
 83    if tumour_type is None:
 84        tumour_type = "*"
 85    else:
 86        assert tumour_type in ["MALIGNANT", "BENIGN"], f"'{tumour_type}' is not a tumor type."
 87
 88    def _remove_mismatching_image_label_pairs(image_paths, gt_paths):
 89        input_paths = [
 90            (ip, gp) for ip, gp in tqdm(zip(image_paths, gt_paths), total=len(image_paths), desc="Validate inputs")
 91            if _check_if_size_matches(ip, gp)
 92        ]
 93        image_paths = [p[0] for p in input_paths]
 94        gt_paths = [p[1] for p in input_paths]
 95        return image_paths, gt_paths
 96
 97    if split == "Test":
 98        target_dir = os.path.join(data_dir, task, split, tumour_type)
 99        image_paths = natsorted(glob(os.path.join(target_dir, "*_FULL_*.png")))
100        gt_paths = natsorted(glob(os.path.join(target_dir, "*_MASK_*.png")))
101
102        if ignore_mismatching_pairs:
103            image_paths, gt_paths = _remove_mismatching_image_label_pairs(image_paths, gt_paths)
104
105    else:
106        target_dir = os.path.join(data_dir, task, "Train", tumour_type)
107        image_paths = natsorted(glob(os.path.join(target_dir, "*_FULL_*.png")))
108        gt_paths = natsorted(glob(os.path.join(target_dir, "*_MASK_*.png")))
109
110        if ignore_mismatching_pairs:
111            image_paths, gt_paths = _remove_mismatching_image_label_pairs(image_paths, gt_paths)
112
113        if split == "Train":
114            image_paths, gt_paths = image_paths[125:], gt_paths[125:]
115        else:  # validation split (take the first 125 samples for validation)
116            image_paths, gt_paths = image_paths[:125], gt_paths[:125]
117
118    assert len(image_paths) == len(gt_paths)
119
120    return image_paths, gt_paths

Get paths to the CBIS DDSM data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • split: The choice of data split.
  • task: The choice of labels for the specified task.
  • tumour_type: The choice of tumour type.
  • download: Whether to download the data if it is not present.
  • ignore_mismatching_pairs: Whether to avoid returning paths to image-label pairs of mismatching shape.
Returns:

A pair of lists, in this order: the list of filepaths for the image data and the list of filepaths for the label data.

def get_cbis_ddsm_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['Train', 'Val', 'Test'], task: Optional[Literal['Calc', 'Mass']] = None, tumour_type: Optional[Literal['MALIGNANT', 'BENIGN']] = None, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cbis_ddsm_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal["Calc", "Mass"]] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the CBIS DDSM dataset for lesion segmentation in mammograms.

    The image and label paths are collected with `get_cbis_ddsm_paths` and handed to
    `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        task: The choice of labels for the specified task.
        tumour_type: The choice of tumour type.
        resize_inputs: Whether to resize the inputs to the expected patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_cbis_ddsm_paths(
        path=path, split=split, task=task, tumour_type=tumour_type, download=download
    )

    if resize_inputs:
        # The helper returns updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_files,
        raw_key=None,
        label_paths=label_files,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset

Get the CBIS DDSM dataset for lesion segmentation in mammograms.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • task: The choice of labels for the specified task.
  • tumour_type: The choice of tumour type.
  • resize_inputs: Whether to resize the inputs to the expected patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cbis_ddsm_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['Train', 'Val', 'Test'], task: Optional[Literal['Calc', 'Mass']] = None, tumour_type: Optional[Literal['MALIGNANT', 'BENIGN']] = None, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cbis_ddsm_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal["Calc", "Mass"]] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the CBIS DDSM dataloader for lesion segmentation in mammograms.

    The keyword arguments are split with `util.split_kwargs` into those for the dataset and
    those for the loader; the dataset is created with `get_cbis_ddsm_dataset`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        task: The choice of labels for the specified task.
        tumour_type: The choice of tumour type.
        resize_inputs: Whether to resize the inputs to the expected patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cbis_ddsm_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        task=task,
        tumour_type=tumour_type,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

Get the CBIS DDSM dataloader for lesion segmentation in mammograms.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • task: The choice of labels for the specified task.
  • tumour_type: The choice of tumour type.
  • resize_inputs: Whether to resize the inputs to the expected patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.