torch_em.data.datasets.histopathology.glas
The GlaS dataset contains annotations for gland segmentation in colon histology images.
This dataset is located at https://www.kaggle.com/datasets/sani84/glasmiccai2015-gland-segmentation. The dataset is from the publication http://arxiv.org/abs/1603.00275. Please cite it if you use this dataset for your research.
"""The GlaS dataset contains annotations for gland segmentation in colon histology images.

This dataset is located at https://www.kaggle.com/datasets/sani84/glasmiccai2015-gland-segmentation.
The dataset is from the publication http://arxiv.org/abs/1603.00275.
Please cite it if you use this dataset for your research.
"""

import os
import shutil
from glob import glob
from tqdm import tqdm
from natsort import natsorted
from typing import Union, Tuple, List, Literal

import imageio.v3 as imageio

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util


def _extract_images(split, path):
    """Convert the BMP images and annotations of one split into HDF5 files.

    Image / annotation pairs are collected from `<path>/Warwick_QU_Dataset` by matching
    `{split}*.bmp` and `{split}*anno.bmp`. One file `<path>/<split>/<name>.h5` is written per image,
    holding the image under the key "raw" and the annotation under the key "labels".

    Args:
        split: Filename prefix that selects the files of the split.
        path: Folder that contains `Warwick_QU_Dataset` and receives the `<split>` output folder.
    """
    import h5py

    data_folder = os.path.join(path, "Warwick_QU_Dataset")

    label_paths = natsorted(glob(os.path.join(data_folder, f"{split}*anno.bmp")))
    # The image pattern also matches the annotation files, so they are filtered out again.
    known_labels = set(label_paths)
    image_paths = [
        image_path for image_path in natsorted(glob(os.path.join(data_folder, f"{split}*.bmp")))
        if image_path not in known_labels
    ]
    assert image_paths and len(image_paths) == len(label_paths), \
        f"Expected matching, non-empty image and annotation lists for '{split}' in {data_folder}."

    os.makedirs(os.path.join(path, split), exist_ok=True)

    for image_path, label_path in tqdm(
        zip(image_paths, label_paths), total=len(image_paths),
        desc=f"Extract images from {os.path.abspath(data_folder)}"
    ):
        fname = os.path.basename(image_path).split(".")[0]

        image = imageio.imread(image_path)
        segmentation = imageio.imread(label_path)
        # Move the last axis to the front (channels first, assuming an (H, W, C) image).
        image = image.transpose(2, 0, 1)

        # Open with "w" so that a file left over from an interrupted run is overwritten;
        # with "a", `create_dataset` fails when the dataset name already exists.
        with h5py.File(os.path.join(path, split, f"{fname}.h5"), "w") as f:
            f.create_dataset("raw", data=image, compression="gzip")
            f.create_dataset("labels", data=segmentation, compression="gzip")


def get_glas_data(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Download the GlaS dataset.

    Both the "train" and the "test" split are extracted in a single pass,
    and the original `Warwick_QU_Dataset` folder is removed afterwards.

    NOTE: The annotation of `split` also lists "val", but only "train" and "test" are available.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded and preprocessed.

    Raises:
        ValueError: If `split` is neither "train" nor "test".
    """
    # Validate first so that an invalid split fails before anything is downloaded or created.
    splits = ["train", "test"]
    if split not in splits:
        raise ValueError(f"'{split}' is not a valid split choice.")

    data_dir = os.path.join(path, split)
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    # Download the files.
    util.download_source_kaggle(path=path, dataset_name="sani84/glasmiccai2015-gland-segmentation", download=download)
    util.unzip(zip_path=os.path.join(path, "glasmiccai2015-gland-segmentation.zip"), dst=path)

    # Preprocess inputs per split.
    for _split in splits:
        _extract_images(_split, path)

    # Remove original data
    shutil.rmtree(os.path.join(path, "Warwick_QU_Dataset"))

    return data_dir


def get_glas_paths(
    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False
) -> List[str]:
    """Get paths to the GlaS data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    data_dir = get_glas_data(path, split, download)
    data_paths = natsorted(glob(os.path.join(data_dir, "*.h5")))
    return data_paths


def get_glas_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the GlaS dataset for gland segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the input images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    data_paths = get_glas_paths(path, split, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    # Image and annotation live in the same HDF5 file, hence the same paths for raw and labels.
    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )


def get_glas_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the GlaS dataloader for gland segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_glas_dataset(path, patch_shape, split, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def
get_glas_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str:
def get_glas_data(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Download the GlaS dataset.

    Both the "train" and the "test" split are extracted in a single pass,
    and the original `Warwick_QU_Dataset` folder is removed afterwards.

    NOTE: The annotation of `split` also lists "val", but only "train" and "test" are available.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded and preprocessed.

    Raises:
        ValueError: If `split` is neither "train" nor "test".
    """
    # Validate first so that an invalid split fails before anything is downloaded or created.
    splits = ["train", "test"]
    if split not in splits:
        raise ValueError(f"'{split}' is not a valid split choice.")

    data_dir = os.path.join(path, split)
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    # Download the files.
    util.download_source_kaggle(path=path, dataset_name="sani84/glasmiccai2015-gland-segmentation", download=download)
    util.unzip(zip_path=os.path.join(path, "glasmiccai2015-gland-segmentation.zip"), dst=path)

    # Preprocess inputs per split.
    for _split in splits:
        _extract_images(_split, path)

    # Remove original data
    shutil.rmtree(os.path.join(path, "Warwick_QU_Dataset"))

    return data_dir
Download the GlaS dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
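- split: The choice of data split. Only 'train' and 'test' are provided by the dataset; 'val' is listed in the signature but is rejected with a ValueError when the data is prepared.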
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is downloaded and preprocessed.
def
get_glas_paths( path: os.PathLike, split: Literal['train', 'test'], download: bool = False) -> List[str]:
def get_glas_paths(
    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False
) -> List[str]:
    """Get paths to the GlaS data.

    The data is prepared via `get_glas_data` and all `*.h5` files in the
    resulting split folder are returned in natural sort order.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    data_dir = get_glas_data(path, split, download)
    return natsorted(glob(os.path.join(data_dir, "*.h5")))
Get paths to the GlaS data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the stored data.
def
get_glas_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_glas_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the GlaS segmentation dataset for one split.

    Args:
        path: Folder in which the downloaded data is (or will be) stored.
        patch_shape: Shape of the patches used for training.
        split: Which data split to load.
        resize_inputs: If set, the keyword arguments and the patch shape are passed through
            `util.update_kwargs_for_resize_trafo` (with `is_rgb=True`) before the dataset is built.
        download: Whether the data should be downloaded when it is missing.
        kwargs: Further keyword arguments forwarded to `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_glas_paths(path, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # Raw data and labels are read from the same files, under different keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths, raw_key="raw",
        label_paths=volume_paths, label_key="labels",
        patch_shape=patch_shape, ndim=2, with_channels=True,
        **kwargs
    )
    return dataset
Get the GlaS dataset for gland segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize the input images.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_glas_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_glas_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build a DataLoader over the GlaS segmentation dataset.

    Args:
        path: Folder in which the downloaded data is (or will be) stored.
        batch_size: Batch size used for training.
        patch_shape: Shape of the patches used for training.
        split: Which data split to load.
        resize_inputs: Whether the inputs should be resized.
        download: Whether the data should be downloaded when it is missing.
        kwargs: Further keyword arguments, either for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    glas_dataset = get_glas_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(glas_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the GlaS dataloader for gland segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.