torch_em.data.datasets.medical.kvasir

The KVASIR dataset contains annotations for polyp segmentation in colonoscopy images.

The dataset is located at: https://datasets.simula.no/kvasir-seg/. This dataset is from the publication https://doi.org/10.1007/978-3-030-37734-2_37. Please cite it if you use this dataset for your research.

  1"""The KVASIR dataset contains annotations for polyp segmentation
  2in colonoscopy images.
  3
  4The dataset is located at: https://datasets.simula.no/kvasir-seg/.
  5This dataset is from the publication https://doi.org/10.1007/978-3-030-37734-2_37.
  6Please cite it if you use this dataset for your research.
  7"""
  8
  9import os
 10from glob import glob
 11from tqdm import tqdm
 12from pathlib import Path
 13from typing import Union, Tuple, List
 14
 15import numpy as np
 16import imageio.v3 as imageio
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
# Location of the zip archive that `get_kvasir_data` downloads.
URL = "https://datasets.simula.no/downloads/kvasir-seg.zip"
# Passed as `checksum` to `util.download_source` for the archive above.
# NOTE(review): 64 hex characters, presumably a SHA256 digest - confirm against `util.download_source`.
CHECKSUM = "03b30e21d584e04facf49397a2576738fd626815771afbbf788f74a7153478f7"
 27
 28
 29def get_kvasir_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 30    """Download the KVASIR dataset.
 31
 32    Args:
 33        path: Filepath to a folder where the data is downloaded for further processing.
 34        download: Whether to download the data if it is not present.
 35
 36    Returns:
 37        Filepath where the data is downloaded.
 38    """
 39    data_dir = os.path.join(path, "Kvasir-SEG")
 40    if os.path.exists(data_dir):
 41        return data_dir
 42
 43    os.makedirs(path, exist_ok=True)
 44
 45    zip_path = os.path.join(path, "kvasir-seg.zip")
 46    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
 47    util.unzip(zip_path=zip_path, dst=path)
 48
 49    return data_dir
 50
 51
 52def get_kvasir_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
 53    """Get paths to the KVASIR data.
 54
 55    Args:
 56        path: Filepath to a folder where the data is downloaded for further processing.
 57        download: Whether to download the data if it is not present.
 58
 59    Returns:
 60        List of filepaths for the image data.
 61        List of filepaths for the label data.
 62    """
 63    data_dir = get_kvasir_data(path=path, download=download)
 64
 65    image_paths = sorted(glob(os.path.join(data_dir, "images", "*.jpg")))
 66    gt_paths = sorted(glob(os.path.join(data_dir, "masks", "*.jpg")))
 67
 68    neu_gt_dir = os.path.join(data_dir, "masks", "preprocessed")
 69    os.makedirs(neu_gt_dir, exist_ok=True)
 70
 71    neu_gt_paths = []
 72    for gt_path in tqdm(gt_paths):
 73        neu_gt_path = os.path.join(neu_gt_dir, f"{Path(gt_path).stem}.tif")
 74        neu_gt_paths.append(neu_gt_path)
 75        if os.path.exists(neu_gt_path):
 76            continue
 77
 78        gt = imageio.imread(gt_path)
 79        gt = np.mean(gt, axis=-1)
 80        gt = (gt >= 240).astype("uint8")
 81        imageio.imwrite(neu_gt_path, gt, compression="zlib")
 82
 83    return image_paths, neu_gt_paths
 84
 85
 86def get_kvasir_dataset(
 87    path: Union[os.PathLike, str],
 88    patch_shape: Tuple[int, int],
 89    resize_inputs: bool = False,
 90    download: bool = False,
 91    **kwargs
 92) -> Dataset:
 93    """Get the KVASIR dataset for polyp segmentation.
 94
 95    Args:
 96        path: Filepath to a folder where the data is downloaded for further processing.
 97        patch_shape: The patch shape to use for training.
 98        resize_inputs: Whether to resize the inputs to the patch shape.
 99        download: Whether to download the data if it is not present.
100        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
101
102    Returns:
103        The segmentation dataset.
104    """
105    image_paths, gt_paths = get_kvasir_paths(path, download)
106
107    if resize_inputs:
108        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
109        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
110            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
111        )
112
113    return torch_em.default_segmentation_dataset(
114        raw_paths=image_paths,
115        raw_key=None,
116        label_paths=gt_paths,
117        label_key=None,
118        patch_shape=patch_shape,
119        is_seg_dataset=False,
120        **kwargs
121    )
122
123
def get_kvasir_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the KVASIR dataloader for polyp segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the keyword arguments with respect to `torch_em.default_segmentation_dataset`:
    # the first part goes to the dataset, the second part to the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    kvasir_dataset = get_kvasir_dataset(
        path=path,
        patch_shape=patch_shape,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(dataset=kvasir_dataset, batch_size=batch_size, **dataloader_kwargs)
URL = 'https://datasets.simula.no/downloads/kvasir-seg.zip'
CHECKSUM = '03b30e21d584e04facf49397a2576738fd626815771afbbf788f74a7153478f7'
def get_kvasir_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_kvasir_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the KVASIR data is available and return its location.

    If the folder `Kvasir-SEG` already exists inside `path`, nothing is downloaded.
    Otherwise `path` is created if necessary, the archive is requested via
    `util.download_source` and then extracted into `path` via `util.unzip`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the `Kvasir-SEG` folder inside `path`.
    """
    target_dir = os.path.join(path, "Kvasir-SEG")

    # Only fetch and extract the archive when the extracted folder is missing.
    if not os.path.exists(target_dir):
        os.makedirs(path, exist_ok=True)

        archive = os.path.join(path, "kvasir-seg.zip")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive, dst=path)

    return target_dir

Download the KVASIR dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is downloaded.

def get_kvasir_paths( path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
def get_kvasir_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
    """Get paths to the KVASIR data.

    The label masks are stored as JPEG files. Each mask is converted to a binary `uint8`
    mask and written as a zlib-compressed TIFF into `masks/preprocessed`. Masks for which
    the converted file already exists are not processed again.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the (preprocessed) label data.
    """
    data_dir = get_kvasir_data(path=path, download=download)

    image_paths = sorted(glob(os.path.join(data_dir, "images", "*.jpg")))
    gt_paths = sorted(glob(os.path.join(data_dir, "masks", "*.jpg")))

    # The converted masks live in a subfolder, so the "*.jpg" glob above never picks them up.
    neu_gt_dir = os.path.join(data_dir, "masks", "preprocessed")
    os.makedirs(neu_gt_dir, exist_ok=True)

    neu_gt_paths = []
    for gt_path in tqdm(gt_paths):
        neu_gt_path = os.path.join(neu_gt_dir, f"{Path(gt_path).stem}.tif")
        neu_gt_paths.append(neu_gt_path)
        if os.path.exists(neu_gt_path):
            continue

        gt = imageio.imread(gt_path)
        # Average over the channel axis only if there is one. A mask that is decoded as a
        # single-channel (2D) image has no channel axis; reducing its last axis would
        # collapse a spatial dimension and store a corrupt 1D label.
        if gt.ndim == 3:
            gt = np.mean(gt, axis=-1)
        # NOTE(review): the threshold of 240 (instead of 255) presumably tolerates
        # JPEG compression noise in the foreground - confirm.
        gt = (gt >= 240).astype("uint8")
        imageio.imwrite(neu_gt_path, gt, compression="zlib")

    return image_paths, neu_gt_paths

Get paths to the KVASIR data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

A pair of lists: the filepaths for the image data, and the filepaths for the label data.

def get_kvasir_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 87def get_kvasir_dataset(
 88    path: Union[os.PathLike, str],
 89    patch_shape: Tuple[int, int],
 90    resize_inputs: bool = False,
 91    download: bool = False,
 92    **kwargs
 93) -> Dataset:
 94    """Get the KVASIR dataset for polyp segmentation.
 95
 96    Args:
 97        path: Filepath to a folder where the data is downloaded for further processing.
 98        patch_shape: The patch shape to use for training.
 99        resize_inputs: Whether to resize the inputs to the patch shape.
100        download: Whether to download the data if it is not present.
101        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
102
103    Returns:
104        The segmentation dataset.
105    """
106    image_paths, gt_paths = get_kvasir_paths(path, download)
107
108    if resize_inputs:
109        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
110        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
111            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
112        )
113
114    return torch_em.default_segmentation_dataset(
115        raw_paths=image_paths,
116        raw_key=None,
117        label_paths=gt_paths,
118        label_key=None,
119        patch_shape=patch_shape,
120        is_seg_dataset=False,
121        **kwargs
122    )

Get the KVASIR dataset for polyp segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • resize_inputs: Whether to resize the inputs to the patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_kvasir_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], batch_size: int, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_kvasir_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the KVASIR dataloader for polyp segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the keyword arguments with respect to `torch_em.default_segmentation_dataset`:
    # the first part goes to the dataset, the second part to the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    kvasir_dataset = get_kvasir_dataset(
        path=path,
        patch_shape=patch_shape,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(dataset=kvasir_dataset, batch_size=batch_size, **dataloader_kwargs)

Get the KVASIR dataloader for polyp segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • resize_inputs: Whether to resize the inputs to the patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.