torch_em.data.datasets.medical.uwaterloo_skin

The UWaterloo Skin dataset contains annotations for skin lesion segmentation in dermoscopy images.

The database is located at https://uwaterloo.ca/vision-image-processing-lab/research-demos/skin-cancer-detection.

Please cite it if you use this dataset for a publication.

  1"""The UWaterloo Skin dataset contains annotations for skin lesion segmentation in dermoscopy images.
  2
  3The database is located at
  4https://uwaterloo.ca/vision-image-processing-lab/research-demos/skin-cancer-detection.
  5
  6Please cite it if you use this dataset for a publication.
  7"""
  8
  9import os
 10import shutil
 11from glob import glob
 12from urllib.parse import urljoin
 13from urllib3.exceptions import ProtocolError
 14from typing import Tuple, Union, Literal, List
 15
 16from torch.utils.data import Dataset, DataLoader
 17
 18import torch_em
 19
 20from .. import util
 21
 22
 23BASE_URL = "https://uwaterloo.ca/vision-image-processing-lab/sites/ca.vision-image-processing-lab/files/uploads/files/"
 24
 25
 26ZIPFILES = {
 27    "set1": "skin_image_data_set-1.zip",  # patients with melanoma
 28    "set2": "skin_image_data_set-2.zip"  # patients without melanoma
 29}
 30
 31CHECKSUMS = {
 32    "set1": "1788cd3eb7a4744012aad9a154e514fc5b82b9f3b19e31cc1b6ded5fc6bed297",
 33    "set2": "108a818baf20b36ef4544ebda10a8075dad99e335f0535c9533bb14cb02b5c53"
 34}
 35
 36
 37def get_uwaterloo_skin_data(
 38    path: Union[os.PathLike, str], chosen_set: Literal["set1", "set2"], download: bool = False
 39) -> str:
 40    """Download the UWaterloo Skin dataset.
 41
 42    Args:
 43        path: Filepath to a folder where the data is downloaded for further processing.
 44        chosen_set: The choice of data subset.
 45        download: Whether to download the data if it is not present.
 46
 47    Returns:
 48        Filepath where the data is downloaded.
 49    """
 50    assert chosen_set in ZIPFILES.keys(), f"'{chosen_set}' is not a valid set."
 51
 52    data_dir = os.path.join(path, f"{chosen_set}_Data")
 53    if os.path.exists(data_dir):
 54        return data_dir
 55
 56    os.makedirs(path, exist_ok=True)
 57
 58    zip_path = os.path.join(path, ZIPFILES[chosen_set])
 59    url = urljoin(BASE_URL, ZIPFILES[chosen_set])
 60
 61    try:
 62        util.download_source(path=zip_path, url=url, download=download, checksum=CHECKSUMS[chosen_set])
 63    except ProtocolError:  # the 'uwaterloo.ca' quite randomly times out of connections, pretty weird.
 64        msg = "The server seems to be unreachable at the moment. "
 65        msg += f"We recommend downloading the data manually, from '{url}' at '{path}'. "
 66        print(msg)
 67        quit()
 68
 69    util.unzip(zip_path=zip_path, dst=path)
 70
 71    setnum = chosen_set[-1]
 72    tmp_dir = os.path.join(path, fr"Skin Image Data Set-{setnum}")
 73    shutil.move(src=tmp_dir, dst=data_dir)
 74
 75    return data_dir
 76
 77
 78def get_uwaterloo_skin_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
 79    """Get paths to the UWaterloo Skin data.
 80
 81    Args:
 82        path: Filepath to a folder where the data is downloaded for further processing.
 83        download: Whether to download the data if it is not present.
 84
 85    Returns:
 86        List of filepaths for the image data.
 87        List of filepaths for the label data.
 88    """
 89    data_dir = get_uwaterloo_skin_data(path, "set1", download)
 90    image_paths = sorted(glob(os.path.join(data_dir, "skin_data", "melanoma", "*", "*_orig.jpg")))
 91    gt_paths = sorted(glob(os.path.join(data_dir, "skin_data", "melanoma", "*", "*_contour.png")))
 92
 93    data_dir = get_uwaterloo_skin_data(path, "set2", download)
 94    image_paths.extend(sorted(glob(os.path.join(data_dir, "skin_data", "notmelanoma", "*", "*_orig.jpg"))))
 95    gt_paths.extend(sorted(glob(os.path.join(data_dir, "skin_data", "notmelanoma", "*", "*_contour.png"))))
 96
 97    return image_paths, gt_paths
 98
 99
def get_uwaterloo_skin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the UWaterloo Skin dataset for skin lesion segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_uwaterloo_skin_paths(path, download)

    if resize_inputs:
        # The helper returns updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # Images and labels are passed as plain file lists, hence no internal keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
136
137
def get_uwaterloo_skin_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the UWaterloo Skin dataloader for skin lesion segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_uwaterloo_skin_dataset(
        path=path, patch_shape=patch_shape, resize_inputs=resize_inputs, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
BASE_URL = 'https://uwaterloo.ca/vision-image-processing-lab/sites/ca.vision-image-processing-lab/files/uploads/files/'
ZIPFILES = {'set1': 'skin_image_data_set-1.zip', 'set2': 'skin_image_data_set-2.zip'}
CHECKSUMS = {'set1': '1788cd3eb7a4744012aad9a154e514fc5b82b9f3b19e31cc1b6ded5fc6bed297', 'set2': '108a818baf20b36ef4544ebda10a8075dad99e335f0535c9533bb14cb02b5c53'}
def get_uwaterloo_skin_data( path: Union[os.PathLike, str], chosen_set: Literal['set1', 'set2'], download: bool = False) -> str:
def get_uwaterloo_skin_data(
    path: Union[os.PathLike, str], chosen_set: Literal["set1", "set2"], download: bool = False
) -> str:
    """Download the UWaterloo Skin dataset.

    The archive of the chosen subset is fetched into `path`, extracted there, and the
    extracted folder is renamed to `<chosen_set>_Data`. If that folder already exists,
    it is returned right away and nothing is downloaded.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        chosen_set: The choice of data subset.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.

    Raises:
        AssertionError: If `chosen_set` is not one of the known subsets.
        SystemExit: If the download is interrupted by a `ProtocolError`. The message
            recommends a manual download and the exit status is non-zero.
    """
    assert chosen_set in ZIPFILES, f"'{chosen_set}' is not a valid set."

    data_dir = os.path.join(path, f"{chosen_set}_Data")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, ZIPFILES[chosen_set])
    url = urljoin(BASE_URL, ZIPFILES[chosen_set])

    try:
        util.download_source(path=zip_path, url=url, download=download, checksum=CHECKSUMS[chosen_set])
    except ProtocolError as error:  # connections to 'uwaterloo.ca' are dropped seemingly at random.
        msg = "The server seems to be unreachable at the moment. "
        msg += f"We recommend downloading the data manually, from '{url}' at '{path}'. "
        # Raise `SystemExit` directly instead of calling `quit()`: `quit` is only injected by the
        # `site` module for interactive use and would end the process with a success status.
        # With a message argument the interpreter reports it on stderr and exits with status 1.
        raise SystemExit(msg) from error

    util.unzip(zip_path=zip_path, dst=path)

    # The archive is expected to unpack into 'Skin Image Data Set-<N>'; move it to the folder
    # name that the existence check above looks for.
    setnum = chosen_set[-1]
    tmp_dir = os.path.join(path, f"Skin Image Data Set-{setnum}")
    shutil.move(src=tmp_dir, dst=data_dir)

    return data_dir

Download the UWaterloo Skin dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • chosen_set: The choice of data subset.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is downloaded.

def get_uwaterloo_skin_paths( path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
def get_uwaterloo_skin_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
    """Collect the image and label filepaths of the UWaterloo Skin data.

    Both subsets are fetched via `get_uwaterloo_skin_data`. The melanoma cases ("set1")
    come first in the returned lists, followed by the non-melanoma cases ("set2");
    within each subset the paths are sorted.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    image_paths: List[str] = []
    gt_paths: List[str] = []

    for chosen_set, category in (("set1", "melanoma"), ("set2", "notmelanoma")):
        subset_dir = get_uwaterloo_skin_data(path, chosen_set, download)
        # One sub-folder per case, each matched for its image and its contour file.
        case_pattern = os.path.join(subset_dir, "skin_data", category, "*")
        image_paths += sorted(glob(os.path.join(case_pattern, "*_orig.jpg")))
        gt_paths += sorted(glob(os.path.join(case_pattern, "*_contour.png")))

    return image_paths, gt_paths

Get paths to the UWaterloo Skin data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_uwaterloo_skin_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_uwaterloo_skin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the UWaterloo Skin dataset for skin lesion segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_uwaterloo_skin_paths(path, download)

    if resize_inputs:
        # The helper returns updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # Images and labels are passed as plain file lists, hence no internal keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset

Get the UWaterloo Skin dataset for skin lesion segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_uwaterloo_skin_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_uwaterloo_skin_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the UWaterloo Skin dataloader for skin lesion segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        resize_inputs: Whether to resize inputs to the desired patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_uwaterloo_skin_dataset(
        path=path, patch_shape=patch_shape, resize_inputs=resize_inputs, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

Get the UWaterloo Skin dataloader for skin lesion segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • resize_inputs: Whether to resize inputs to the desired patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.