torch_em.data.datasets.medical.cbis_ddsm
The CBIS DDSM contains annotations for lesion segmentation in mammography images.
This dataset is a preprocessed version of https://www.cancerimagingarchive.net/collection/cbis-ddsm/ available at https://www.kaggle.com/datasets/mohamedbenticha/cbis-ddsm/data. The dataset is related to the publication https://doi.org/10.1038/sdata.2017.177. Please cite them if you use this dataset for your research.
"""The CBIS DDSM contains annotations for lesion segmentation in
mammography images.

This dataset is a preprocessed version of https://www.cancerimagingarchive.net/collection/cbis-ddsm/ available
at https://www.kaggle.com/datasets/mohamedbenticha/cbis-ddsm/data.
The dataset is related to the publication https://doi.org/10.1038/sdata.2017.177.
Please cite them if you use this dataset for your research.
"""

import os
from glob import glob
from tqdm import tqdm
from natsort import natsorted
from typing import Union, Tuple, Literal, Optional

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


def get_cbis_ddsm_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CBIS DDSM dataset.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the `DATA` folder inside `path` that holds the extracted dataset.
    """
    data_dir = os.path.join(path, "DATA")
    if os.path.exists(data_dir):
        # The archive was already extracted, nothing left to do.
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "cbis-ddsm.zip")
    util.download_source_kaggle(path=path, dataset_name="mohamedbenticha/cbis-ddsm/", download=download)
    util.unzip(zip_path=zip_path, dst=path)

    return data_dir


def _check_if_size_matches(image_path, gt_path):
    """Return whether the image and its label file have the same size."""
    from PIL import Image

    # Use context managers so that the file handles are released right away;
    # this helper is called once per sample when filtering mismatching pairs.
    with Image.open(image_path) as image, Image.open(gt_path) as gt:
        return image.size == gt.size


def get_cbis_ddsm_paths(
    path: Union[os.PathLike, str],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal['Calc', 'Mass']] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    download: bool = False,
    ignore_mismatching_pairs: bool = False,
):
    """Get paths to the CBIS DDSM data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        task: The choice of labels for the specified task. If None, all task folders are matched.
        tumour_type: The choice of tumour type. If None, all tumour type folders are matched.
        download: Whether to download the data if it is not present.
        ignore_mismatching_pairs: Whether to avoid returning paths to image-label pairs of mismatching shape.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split`, `task` or `tumour_type` is not one of the supported values.
    """
    # Validate the arguments first, so that an invalid choice does not trigger the download.
    # Explicit exceptions are used because `assert` statements are removed under `python -O`.
    if split not in ("Train", "Val", "Test"):
        raise ValueError(f"'{split}' is not a valid split.")

    if task is not None and task not in ("Calc", "Mass"):
        raise ValueError(f"'{task}' is not a valid task.")

    if tumour_type is not None and tumour_type not in ("MALIGNANT", "BENIGN"):
        raise ValueError(f"'{tumour_type}' is not a tumor type.")

    data_dir = get_cbis_ddsm_data(path, download)

    # An unspecified choice becomes a glob wildcard that matches every sub-folder.
    task_pattern = "*" if task is None else task
    tumour_pattern = "*" if tumour_type is None else tumour_type

    # The validation samples are taken from the 'Train' folder, only 'Test' has its own folder.
    source_split = "Test" if split == "Test" else "Train"
    target_dir = os.path.join(data_dir, task_pattern, source_split, tumour_pattern)
    image_paths = natsorted(glob(os.path.join(target_dir, "*_FULL_*.png")))
    gt_paths = natsorted(glob(os.path.join(target_dir, "*_MASK_*.png")))

    if ignore_mismatching_pairs:
        matching_pairs = [
            (ip, gp) for ip, gp in tqdm(zip(image_paths, gt_paths), total=len(image_paths), desc="Validate inputs")
            if _check_if_size_matches(ip, gp)
        ]
        image_paths = [ip for ip, _ in matching_pairs]
        gt_paths = [gp for _, gp in matching_pairs]

    # The first 125 samples of the 'Train' folder are reserved for validation.
    n_val = 125
    if split == "Train":
        image_paths, gt_paths = image_paths[n_val:], gt_paths[n_val:]
    elif split == "Val":
        image_paths, gt_paths = image_paths[:n_val], gt_paths[:n_val]

    assert len(image_paths) == len(gt_paths)

    return image_paths, gt_paths


def get_cbis_ddsm_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal["Calc", "Mass"]] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CBIS DDSM dataset for lesion segmentation in mammograms.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        task: The choice of labels for the specified task.
        tumour_type: The choice of tumour type.
        resize_inputs: Whether to resize the inputs to the expected patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_cbis_ddsm_paths(path, split, task, tumour_type, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=gt_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )


def get_cbis_ddsm_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal["Calc", "Mass"]] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CBIS DDSM dataloader for lesion segmentation in mammograms.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        task: The choice of labels for the specified task.
        tumour_type: The choice of tumour type.
        resize_inputs: Whether to resize the inputs to the expected patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cbis_ddsm_dataset(path, patch_shape, split, task, tumour_type, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_cbis_ddsm_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the CBIS DDSM data is available locally and return its location.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the `DATA` folder inside `path`.
    """
    data_dir = os.path.join(path, "DATA")

    # Only fetch and extract the archive when the extracted folder is missing.
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        util.download_source_kaggle(path=path, dataset_name="mohamedbenticha/cbis-ddsm/", download=download)
        util.unzip(zip_path=os.path.join(path, "cbis-ddsm.zip"), dst=path)

    return data_dir
Download the CBIS DDSM dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
Filepath to the folder where the downloaded data is stored.
def get_cbis_ddsm_paths(
    path: Union[os.PathLike, str],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal['Calc', 'Mass']] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    download: bool = False,
    ignore_mismatching_pairs: bool = False,
):
    """Get paths to the CBIS DDSM data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        task: The choice of labels for the specified task. If None, all task folders are matched.
        tumour_type: The choice of tumour type. If None, all tumour type folders are matched.
        download: Whether to download the data if it is not present.
        ignore_mismatching_pairs: Whether to avoid returning paths to image-label pairs of mismatching shape.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split`, `task` or `tumour_type` is not one of the supported values.
    """
    # Validate the arguments first, so that an invalid choice does not trigger the download.
    # Explicit exceptions are used because `assert` statements are removed under `python -O`.
    if split not in ("Train", "Val", "Test"):
        raise ValueError(f"'{split}' is not a valid split.")

    if task is not None and task not in ("Calc", "Mass"):
        raise ValueError(f"'{task}' is not a valid task.")

    if tumour_type is not None and tumour_type not in ("MALIGNANT", "BENIGN"):
        raise ValueError(f"'{tumour_type}' is not a tumor type.")

    data_dir = get_cbis_ddsm_data(path, download)

    # An unspecified choice becomes a glob wildcard that matches every sub-folder.
    task_pattern = "*" if task is None else task
    tumour_pattern = "*" if tumour_type is None else tumour_type

    # The validation samples are taken from the 'Train' folder, only 'Test' has its own folder.
    source_split = "Test" if split == "Test" else "Train"
    target_dir = os.path.join(data_dir, task_pattern, source_split, tumour_pattern)
    image_paths = natsorted(glob(os.path.join(target_dir, "*_FULL_*.png")))
    gt_paths = natsorted(glob(os.path.join(target_dir, "*_MASK_*.png")))

    if ignore_mismatching_pairs:
        # Keep only the pairs where image and label have the same size.
        matching_pairs = [
            (ip, gp) for ip, gp in tqdm(zip(image_paths, gt_paths), total=len(image_paths), desc="Validate inputs")
            if _check_if_size_matches(ip, gp)
        ]
        image_paths = [ip for ip, _ in matching_pairs]
        gt_paths = [gp for _, gp in matching_pairs]

    # The first 125 samples of the 'Train' folder are reserved for validation.
    n_val = 125
    if split == "Train":
        image_paths, gt_paths = image_paths[n_val:], gt_paths[n_val:]
    elif split == "Val":
        image_paths, gt_paths = image_paths[:n_val], gt_paths[:n_val]

    assert len(image_paths) == len(gt_paths)

    return image_paths, gt_paths
Get paths to the CBIS DDSM data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- split: The choice of data split.
- task: The choice of labels for the specified task.
- tumour_type: The choice of tumour type.
- download: Whether to download the data if it is not present.
- ignore_mismatching_pairs: Whether to avoid returning paths to image-label pairs of mismatching shape.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def get_cbis_ddsm_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal["Calc", "Mass"]] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the CBIS DDSM segmentation dataset for lesions in mammograms.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        task: The choice of labels for the specified task.
        tumour_type: The choice of tumour type.
        resize_inputs: Whether to resize the inputs to the expected patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_cbis_ddsm_paths(
        path=path, split=split, task=task, tumour_type=tumour_type, download=download
    )

    if resize_inputs:
        # The resize settings are built from the requested patch shape; the helper
        # returns the updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": False},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset
Get the CBIS DDSM dataset for lesion segmentation in mammograms.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- task: The choice of labels for the specified task.
- tumour_type: The choice of tumour type.
- resize_inputs: Whether to resize the inputs to the expected patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_cbis_ddsm_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['Train', 'Val', 'Test'],
    task: Optional[Literal["Calc", "Mass"]] = None,
    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the CBIS DDSM dataloader for lesion segmentation in mammograms.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        task: The choice of labels for the specified task.
        tumour_type: The choice of tumour type.
        resize_inputs: Whether to resize the inputs to the expected patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the remaining ones,
    # which are forwarded to the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_cbis_ddsm_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        task=task,
        tumour_type=tumour_type,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
Get the CBIS DDSM dataloader for lesion segmentation in mammograms.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- task: The choice of labels for the specified task.
- tumour_type: The choice of tumour type.
- resize_inputs: Whether to resize the inputs to the expected patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.