torch_em.data.datasets.histopathology.glas

The GlaS dataset contains annotations for gland segmentation in colon histology images.

This dataset is located at https://www.kaggle.com/datasets/sani84/glasmiccai2015-gland-segmentation. The dataset is from the publication http://arxiv.org/abs/1603.00275. Please cite it if you use this dataset for your research.

View Source

  1"""The GlaS dataset contains annotations for gland segmentation in colon histology images.
  2
  3This dataset is located at https://www.kaggle.com/datasets/sani84/glasmiccai2015-gland-segmentation.
  4The dataset is from the publication http://arxiv.org/abs/1603.00275.
  5Please cite it if you use this dataset for your research.
  6"""
  7
  8import os
  9import shutil
 10from glob import glob
 11from tqdm import tqdm
 12from natsort import natsorted
 13from typing import Union, Tuple, List, Literal
 14
 15import imageio.v3 as imageio
 16
 17import torch_em
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21from .. import util
 22
 23
 24def _extract_images(split, path):
 25    import h5py
 26
 27    data_folder = os.path.join(path, "Warwick_QU_Dataset")
 28
 29    label_paths = natsorted(glob(os.path.join(data_folder, f"{split}*anno.bmp")))
 30    image_paths = [
 31        image_path for image_path in natsorted(glob(os.path.join(data_folder, f"{split}*.bmp")))
 32        if image_path not in label_paths
 33    ]
 34    assert image_paths and len(image_paths) == len(label_paths)
 35
 36    os.makedirs(os.path.join(path, split), exist_ok=True)
 37
 38    for image_path, label_path in tqdm(
 39        zip(image_paths, label_paths), total=len(image_paths),
 40        desc=f"Extract images from {os.path.abspath(data_folder)}"
 41    ):
 42        fname = os.path.basename(image_path).split(".")[0]
 43
 44        image = imageio.imread(image_path)
 45        segmentation = imageio.imread(label_path)
 46        image = image.transpose(2, 0, 1)
 47
 48        with h5py.File(os.path.join(path, split, f"{fname}.h5"), "a") as f:
 49            f.create_dataset("raw", data=image, compression="gzip")
 50            f.create_dataset("labels", data=segmentation, compression="gzip")
 51
 52
 53def get_glas_data(
 54    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
 55) -> str:
 56    """Download the GlaS dataset.
 57
 58    Args:
 59        path: Filepath to a folder where the data is downloaded for further processing.
 60        split: The choice of data split.
 61        download: Whether to download the data if it is not present.
 62
 63    Returns:
 64        Filepath where the data is downloaded and preprocessed.
 65    """
 66    data_dir = os.path.join(path, split)
 67    if os.path.exists(data_dir):
 68        return data_dir
 69
 70    os.makedirs(path, exist_ok=True)
 71
 72    # Download the files.
 73    util.download_source_kaggle(path=path, dataset_name="sani84/glasmiccai2015-gland-segmentation", download=download)
 74    util.unzip(zip_path=os.path.join(path, "glasmiccai2015-gland-segmentation.zip"), dst=path)
 75
 76    # Preprocess inputs per split.
 77    splits = ["train", "test"]
 78    if split not in splits:
 79        raise ValueError(f"'{split}' is not a valid split choice.")
 80
 81    for _split in splits:
 82        _extract_images(_split, path)
 83
 84    # Remove original data
 85    shutil.rmtree(os.path.join(path, "Warwick_QU_Dataset"))
 86
 87    return data_dir
 88
 89
 90def get_glas_paths(path: Union[os.PathLike], split: Literal["train", "test"], download: bool = False) -> List[str]:
 91    """Get paths to the GlaS data.
 92
 93    Args:
 94        path: Filepath to a folder where the downloaded data will be saved.
 95        split: The choice of data split.
 96        download: Whether to download the data if it is not present.
 97
 98    Returns:
 99        List of filepaths for the stored data.
100    """
101    data_dir = get_glas_data(path, split, download)
102    data_paths = natsorted(glob(os.path.join(data_dir, "*.h5")))
103    return data_paths
104
105
def get_glas_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the GlaS segmentation dataset.

    Args:
        path: Folder in which the downloaded data is stored.
        patch_shape: Shape of the patches used for training.
        split: Which data split to load.
        resize_inputs: If set, the keyword arguments and the patch shape are
            adapted via `util.update_kwargs_for_resize_trafo` (with `is_rgb=True`).
        download: Whether to download the data if it is not present.
        kwargs: Extra keyword arguments forwarded to `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Raw data and labels live in the same HDF5 files, under different keys.
    volume_paths = get_glas_paths(path, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
    return dataset
145
146
def get_glas_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the GlaS dataloader for gland segmentation.

    Args:
        path: Folder in which the downloaded data is stored.
        batch_size: Number of samples per batch.
        patch_shape: Shape of the patches used for training.
        split: Which data split to load.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Extra keyword arguments, split between `torch_em.default_segmentation_dataset`
            and the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the loader arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    glas_dataset = get_glas_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(glas_dataset, batch_size, **dataloader_kwargs)

def get_glas_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str: View Source

def get_glas_data(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Download and preprocess the GlaS dataset.

    If the folder for the requested split already exists it is returned as is.
    Otherwise the archive is downloaded from Kaggle, unzipped, converted to HDF5
    files for both the "train" and the "test" split, and the unzipped
    `Warwick_QU_Dataset` folder is removed afterwards.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split. Only "train" and "test" can be prepared;
            any other value (including "val") is rejected.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded and preprocessed.

    Raises:
        ValueError: If the split folder does not exist yet and `split` is neither "train" nor "test".
    """
    data_dir = os.path.join(path, split)
    if os.path.exists(data_dir):
        return data_dir

    # Validate the split before touching the filesystem or the network, so an
    # unsupported choice fails immediately instead of after fetching the archive.
    splits = ["train", "test"]
    if split not in splits:
        raise ValueError(f"'{split}' is not a valid split choice.")

    os.makedirs(path, exist_ok=True)

    # Download the files.
    util.download_source_kaggle(path=path, dataset_name="sani84/glasmiccai2015-gland-segmentation", download=download)
    util.unzip(zip_path=os.path.join(path, "glasmiccai2015-gland-segmentation.zip"), dst=path)

    # Preprocess inputs per split. Both splits are converted in one go because
    # the original data is deleted below.
    for _split in splits:
        _extract_images(_split, path)

    # Remove original data
    shutil.rmtree(os.path.join(path, "Warwick_QU_Dataset"))

    return data_dir

Download the GlaS dataset.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
split: The choice of data split.
download: Whether to download the data if it is not present.

Returns:

Filepath where the data is downloaded and preprocessed.

def get_glas_paths( path: os.PathLike, split: Literal['train', 'test'], download: bool = False) -> List[str]: View Source

 91def get_glas_paths(path: Union[os.PathLike], split: Literal["train", "test"], download: bool = False) -> List[str]:
 92    """Get paths to the GlaS data.
 93
 94    Args:
 95        path: Filepath to a folder where the downloaded data will be saved.
 96        split: The choice of data split.
 97        download: Whether to download the data if it is not present.
 98
 99    Returns:
100        List of filepaths for the stored data.
101    """
102    data_dir = get_glas_data(path, split, download)
103    data_paths = natsorted(glob(os.path.join(data_dir, "*.h5")))
104    return data_paths

Get paths to the GlaS data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The choice of data split.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the stored data.

def get_glas_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_glas_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the GlaS segmentation dataset.

    Args:
        path: Folder in which the downloaded data is stored.
        patch_shape: Shape of the patches used for training.
        split: Which data split to load.
        resize_inputs: If set, the keyword arguments and the patch shape are
            adapted via `util.update_kwargs_for_resize_trafo` (with `is_rgb=True`).
        download: Whether to download the data if it is not present.
        kwargs: Extra keyword arguments forwarded to `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Raw data and labels live in the same HDF5 files, under different keys.
    volume_paths = get_glas_paths(path, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
    return dataset

Get the GlaS dataset for gland segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The choice of data split.
resize_inputs: Whether to resize the input images.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_glas_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_glas_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the GlaS dataloader for gland segmentation.

    Args:
        path: Folder in which the downloaded data is stored.
        batch_size: Number of samples per batch.
        patch_shape: Shape of the patches used for training.
        split: Which data split to load.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Extra keyword arguments, split between `torch_em.default_segmentation_dataset`
            and the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the loader arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    glas_dataset = get_glas_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(glas_dataset, batch_size, **dataloader_kwargs)

Get the GlaS dataloader for gland segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The choice of data split.
resize_inputs: Whether to resize the inputs.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.