torch_em.data.datasets.medical.uwaterloo_skin
The UWaterloo Skin dataset contains annotations for skin lesion segmentation in dermoscopy images.
The database is located at https://uwaterloo.ca/vision-image-processing-lab/research-demos/skin-cancer-detection.
Please cite it if you use this dataset for a publication.
"""The UWaterloo Skin dataset contains annotations for skin lesion segmentation in dermoscopy images.

The database is located at
https://uwaterloo.ca/vision-image-processing-lab/research-demos/skin-cancer-detection.

Please cite it if you use this dataset for a publication.
"""

import os
import shutil
from glob import glob
from urllib.parse import urljoin
from urllib3.exceptions import ProtocolError
from typing import Tuple, Union, Literal, List

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


BASE_URL = "https://uwaterloo.ca/vision-image-processing-lab/sites/ca.vision-image-processing-lab/files/uploads/files/"


ZIPFILES = {
    "set1": "skin_image_data_set-1.zip",  # patients with melanoma
    "set2": "skin_image_data_set-2.zip"  # patients without melanoma
}

CHECKSUMS = {
    "set1": "1788cd3eb7a4744012aad9a154e514fc5b82b9f3b19e31cc1b6ded5fc6bed297",
    "set2": "108a818baf20b36ef4544ebda10a8075dad99e335f0535c9533bb14cb02b5c53"
}


def get_uwaterloo_skin_data(
    path: Union[os.PathLike, str], chosen_set: Literal["set1", "set2"], download: bool = False
) -> str:
    """Download the UWaterloo Skin dataset.

    If the folder `<path>/<chosen_set>_Data` already exists, it is returned right away
    and nothing is downloaded or unpacked.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        chosen_set: The choice of data subset.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.

    Raises:
        RuntimeError: If the connection to the server breaks during the download.
    """
    assert chosen_set in ZIPFILES.keys(), f"'{chosen_set}' is not a valid set."

    data_dir = os.path.join(path, f"{chosen_set}_Data")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, ZIPFILES[chosen_set])
    url = urljoin(BASE_URL, ZIPFILES[chosen_set])

    try:
        util.download_source(path=zip_path, url=url, download=download, checksum=CHECKSUMS[chosen_set])
    except ProtocolError as error:  # 'uwaterloo.ca' drops connections at random.
        msg = "The server seems to be unreachable at the moment. "
        msg += f"We recommend downloading the data manually, from '{url}' at '{path}'. "
        # Raise instead of calling 'quit()': a library must not terminate the caller's
        # interpreter, and 'quit' is only available when the 'site' module is loaded.
        raise RuntimeError(msg) from error

    util.unzip(zip_path=zip_path, dst=path)

    # The archive is expected to unpack into 'Skin Image Data Set-<n>';
    # rename that folder to the '<set>_Data' name checked at the top.
    setnum = chosen_set[-1]
    tmp_dir = os.path.join(path, f"Skin Image Data Set-{setnum}")
    shutil.move(src=tmp_dir, dst=data_dir)

    return data_dir


def get_uwaterloo_skin_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
    """Get paths to the UWaterloo Skin data.

    The paths of 'set1' (melanoma) come first, followed by those of 'set2' (not melanoma).
    Each part is sorted on its own.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    data_dir = get_uwaterloo_skin_data(path, "set1", download)
    image_paths = sorted(glob(os.path.join(data_dir, "skin_data", "melanoma", "*", "*_orig.jpg")))
    gt_paths = sorted(glob(os.path.join(data_dir, "skin_data", "melanoma", "*", "*_contour.png")))

    data_dir = get_uwaterloo_skin_data(path, "set2", download)
    image_paths.extend(sorted(glob(os.path.join(data_dir, "skin_data", "notmelanoma", "*", "*_orig.jpg"))))
    gt_paths.extend(sorted(glob(os.path.join(data_dir, "skin_data", "notmelanoma", "*", "*_contour.png"))))

    return image_paths, gt_paths


def get_uwaterloo_skin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the UWaterloo Skin dataset for skin lesion segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_uwaterloo_skin_paths(path, download)

    if resize_inputs:
        # The images are handled as RGB by the resize transformation.
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=gt_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )


def get_uwaterloo_skin_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the UWaterloo Skin dataloader for skin lesion segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_uwaterloo_skin_dataset(path, patch_shape, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
# Base URL that the archive file names in ZIPFILES are joined onto to build the download URL.
BASE_URL =
'https://uwaterloo.ca/vision-image-processing-lab/sites/ca.vision-image-processing-lab/files/uploads/files/'
# Archive file name for each data subset ('set1' / 'set2').
ZIPFILES =
{'set1': 'skin_image_data_set-1.zip', 'set2': 'skin_image_data_set-2.zip'}
# Expected checksum of each subset's archive, handed to the download helper.
# NOTE(review): presumably SHA-256 (64 hex characters) - confirm against util.download_source.
CHECKSUMS =
{'set1': '1788cd3eb7a4744012aad9a154e514fc5b82b9f3b19e31cc1b6ded5fc6bed297', 'set2': '108a818baf20b36ef4544ebda10a8075dad99e335f0535c9533bb14cb02b5c53'}
def
get_uwaterloo_skin_data( path: Union[os.PathLike, str], chosen_set: Literal['set1', 'set2'], download: bool = False) -> str:
def get_uwaterloo_skin_data(
    path: Union[os.PathLike, str], chosen_set: Literal["set1", "set2"], download: bool = False
) -> str:
    """Download and unpack one subset of the UWaterloo Skin dataset.

    If the folder `<path>/<chosen_set>_Data` already exists, it is returned right away
    and nothing is downloaded or unpacked.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        chosen_set: The choice of data subset.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.

    Raises:
        RuntimeError: If the connection to the server breaks during the download.
    """
    assert chosen_set in ZIPFILES.keys(), f"'{chosen_set}' is not a valid set."

    data_dir = os.path.join(path, f"{chosen_set}_Data")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, ZIPFILES[chosen_set])
    url = urljoin(BASE_URL, ZIPFILES[chosen_set])

    try:
        util.download_source(path=zip_path, url=url, download=download, checksum=CHECKSUMS[chosen_set])
    except ProtocolError as error:  # 'uwaterloo.ca' drops connections at random.
        msg = "The server seems to be unreachable at the moment. "
        msg += f"We recommend downloading the data manually, from '{url}' at '{path}'. "
        # Raise instead of calling 'quit()': a library must not terminate the caller's
        # interpreter, and 'quit' is only available when the 'site' module is loaded.
        raise RuntimeError(msg) from error

    util.unzip(zip_path=zip_path, dst=path)

    # The archive is expected to unpack into 'Skin Image Data Set-<n>';
    # rename that folder to the '<set>_Data' name checked at the top.
    setnum = chosen_set[-1]
    tmp_dir = os.path.join(path, f"Skin Image Data Set-{setnum}")
    shutil.move(src=tmp_dir, dst=data_dir)

    return data_dir
Download the UWaterloo Skin dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- chosen_set: The choice of data subset.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is downloaded.
def
get_uwaterloo_skin_paths( path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
def get_uwaterloo_skin_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
    """Collect the image and label filepaths of the UWaterloo Skin data.

    Both subsets are fetched via `get_uwaterloo_skin_data`. The files of 'set1'
    (folder 'melanoma') are listed first, followed by those of 'set2' (folder
    'notmelanoma'); each part is sorted on its own.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    image_paths, gt_paths = [], []

    for chosen_set, category in (("set1", "melanoma"), ("set2", "notmelanoma")):
        set_dir = get_uwaterloo_skin_data(path, chosen_set, download)
        # One sub-folder per case; it holds both the '*_orig.jpg' image and the '*_contour.png' label.
        case_pattern = os.path.join(set_dir, "skin_data", category, "*")
        image_paths += sorted(glob(os.path.join(case_pattern, "*_orig.jpg")))
        gt_paths += sorted(glob(os.path.join(case_pattern, "*_contour.png")))

    return image_paths, gt_paths
Get paths to the UWaterloo Skin data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_uwaterloo_skin_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_uwaterloo_skin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the UWaterloo Skin dataset for skin lesion segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_uwaterloo_skin_paths(path, download)

    if resize_inputs:
        # The helper returns updated dataset kwargs and the patch shape to use;
        # the images are flagged as RGB for the resize transformation.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
Get the UWaterloo Skin dataset for skin lesion segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- resize_inputs: Whether to resize inputs to the desired patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
.
Returns:
The segmentation dataset.
def
get_uwaterloo_skin_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_uwaterloo_skin_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the UWaterloo Skin dataloader for skin lesion segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    skin_dataset = get_uwaterloo_skin_dataset(
        path=path,
        patch_shape=patch_shape,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(skin_dataset, batch_size, **dataloader_kwargs)
Get the UWaterloo Skin dataloader for skin lesion segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- resize_inputs: Whether to resize inputs to the desired patch shape.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
or for the PyTorch DataLoader.
Returns:
The DataLoader.