torch_em.data.datasets.histopathology.glysac
The GLySAC dataset contains annotations for nuclei instance segmentation and classification in H&E stained gastric cancer histopathology images.
The dataset contains 59 image tiles of size 1000x1000 pixels with instance segmentation masks and cell type annotations. Three cell classes are provided: lymphocytes, epithelial cells (normal and tumor), and other cells.
NOTE: The dataset is hosted on Google Drive and requires gdown to download. Install it with: conda install -c conda-forge gdown==4.6.3
The dataset is located at https://drive.google.com/file/d/1g1_xYFWgp3cRLKrlSwD2U5JDjooC0yHp/view. This dataset is from the publication https://doi.org/10.1109/jbhi.2022.3149936. Please cite it if you use this dataset in your research.
"""The GLySAC dataset contains annotations for nuclei instance segmentation and
classification in H&E stained gastric cancer histopathology images.

The dataset contains 59 image tiles of size 1000x1000 pixels with instance
segmentation masks and cell type annotations. Three cell classes are provided:
lymphocytes, epithelial cells (normal and tumor), and other cells.

NOTE: The dataset is hosted on Google Drive and requires gdown to download.
Install it with: conda install -c conda-forge gdown==4.6.3

The dataset is located at https://drive.google.com/file/d/1g1_xYFWgp3cRLKrlSwD2U5JDjooC0yHp/view
This dataset is from the publication https://doi.org/10.1109/jbhi.2022.3149936.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from tqdm import tqdm
from natsort import natsorted
from typing import List, Literal, Tuple, Union

import h5py
import imageio.v3 as imageio
from scipy.io import loadmat
from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


GDRIVE_ID = "1g1_xYFWgp3cRLKrlSwD2U5JDjooC0yHp"
URL = f"https://drive.google.com/uc?id={GDRIVE_ID}"
CHECKSUM = None


def _create_h5_files(data_dir: str, split: str) -> None:
    """Convert the images and annotations of one split into one HDF5 file per image.

    PNG images are read from `<data_dir>/<Train|Test>/Images` and the matching `.mat`
    annotations from `<data_dir>/<Train|Test>/Labels`. Each result is stored as
    `<data_dir>/h5/<split>/<name>.h5` with the datasets `raw`, `labels/instances`
    (from `inst_map`) and `labels/semantic` (from `type_map`), all gzip-compressed.
    Images that already have an output file are skipped.

    Args:
        data_dir: The directory that contains the extracted dataset.
        split: The data split to convert. 'train' selects the 'Train' folder,
            any other value selects the 'Test' folder.
    """
    folder = "Train" if split == "train" else "Test"
    image_dir = os.path.join(data_dir, folder, "Images")
    label_dir = os.path.join(data_dir, folder, "Labels")
    h5_dir = os.path.join(data_dir, "h5", split)
    os.makedirs(h5_dir, exist_ok=True)

    image_paths = natsorted(glob(os.path.join(image_dir, "*.png")))
    for image_path in tqdm(image_paths, desc=f"Preprocessing {split}"):
        fname = os.path.splitext(os.path.basename(image_path))[0]
        h5_path = os.path.join(h5_dir, f"{fname}.h5")
        if os.path.exists(h5_path):
            continue

        label_path = os.path.join(label_dir, f"{fname}.mat")
        # Keep at most the first three entries of the last axis (drops e.g. an alpha channel).
        raw = imageio.imread(image_path)[..., :3]
        mat = loadmat(label_path)
        inst_map = mat["inst_map"].astype("int32")
        type_map = mat["type_map"].astype("int32")

        # Write to a temporary file and move it into place afterwards. Otherwise an
        # interrupted run would leave a truncated file at `h5_path`, which the existence
        # check above would treat as finished on the next call. The temporary name does
        # not match the '*.h5' pattern used to collect the converted files.
        tmp_path = f"{h5_path}.tmp"
        with h5py.File(tmp_path, "w") as f:
            # The last axis of the image is moved to the front before saving.
            f.create_dataset("raw", data=raw.transpose(2, 0, 1), compression="gzip")
            f.create_dataset("labels/instances", data=inst_map, compression="gzip")
            f.create_dataset("labels/semantic", data=type_map, compression="gzip")
        os.replace(tmp_path, h5_path)


def get_glysac_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the GLySAC dataset.

    Nothing is downloaded if the folder `glysac_dataset` already exists inside `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data directory.
    """
    data_dir = os.path.join(path, "glysac_dataset")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, "glysac_dataset.zip")
    util.download_source_gdrive(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
    util.unzip(zip_path, path)

    return data_dir


def get_glysac_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    download: bool = False,
) -> List[str]:
    """Get paths to the GLySAC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.

    Raises:
        ValueError: If `split` is neither 'train' nor 'test'.
        RuntimeError: If no h5 files are found for the split after preprocessing.
    """
    if split not in ("train", "test"):
        raise ValueError(f"'{split}' is not a valid split. Choose from 'train' or 'test'.")

    data_dir = get_glysac_data(path, download)
    _create_h5_files(data_dir, split)

    h5_paths = natsorted(glob(os.path.join(data_dir, "h5", split, "*.h5")))
    if not h5_paths:
        raise RuntimeError(f"No data found for split '{split}'. Check the dataset at {data_dir}.")

    return h5_paths


def get_glysac_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    label_choice: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the GLySAC dataset for gastric nuclei segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        label_choice: The type of labels to load. Either 'instances' for instance segmentation
            or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If `label_choice` is neither 'instances' nor 'semantic'.
    """
    if label_choice not in ("instances", "semantic"):
        raise ValueError(f"'{label_choice}' is not a valid label choice. Use 'instances' or 'semantic'.")

    h5_paths = get_glysac_paths(path, split, download)

    if label_choice == "instances":
        kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    # NOTE(review): assumed to run for both label choices - confirm against the upstream file.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs,
    )


def get_glysac_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    label_choice: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the GLySAC dataloader for gastric nuclei segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        label_choice: The type of labels to load. Either 'instances' for instance segmentation
            or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Keyword arguments accepted by the dataset factory go to the dataset, the rest to the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_glysac_dataset(path, patch_shape, split, label_choice, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_glysac_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the GLySAC data is available locally and return its location.

    If the folder `glysac_dataset` is already present inside `path`, it is returned as is.
    Otherwise the archive `glysac_dataset.zip` is requested via `util.download_source_gdrive`
    and extracted into `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data directory.
    """
    data_dir = os.path.join(path, "glysac_dataset")

    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        archive = os.path.join(path, "glysac_dataset.zip")
        util.download_source_gdrive(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(archive, path)

    return data_dir
Download the GLySAC dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the data directory.
def get_glysac_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    download: bool = False,
) -> List[str]:
    """Collect the preprocessed h5 files of one GLySAC split.

    The data is fetched through `get_glysac_data` and converted with `_create_h5_files`
    before the files are listed in natural sort order.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.

    Raises:
        ValueError: If `split` is neither 'train' nor 'test'.
        RuntimeError: If no h5 files exist for the requested split.
    """
    valid_splits = ("train", "test")
    if split not in valid_splits:
        raise ValueError(f"'{split}' is not a valid split. Choose from 'train' or 'test'.")

    root = get_glysac_data(path, download)
    _create_h5_files(root, split)

    pattern = os.path.join(root, "h5", split, "*.h5")
    volumes = natsorted(glob(pattern))
    if not volumes:
        raise RuntimeError(f"No data found for split '{split}'. Check the dataset at {root}.")

    return volumes
Get paths to the GLySAC data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the h5 data.
def get_glysac_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    label_choice: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build a segmentation dataset from the GLySAC gastric nuclei data.

    Raw data and labels are read from the same h5 files: the raw data from the key `raw`
    and the labels from `labels/<label_choice>`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        label_choice: The type of labels to load. Either 'instances' for instance segmentation
            or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If `label_choice` is neither 'instances' nor 'semantic'.
    """
    allowed_labels = ("instances", "semantic")
    if label_choice not in allowed_labels:
        raise ValueError(f"'{label_choice}' is not a valid label choice. Use 'instances' or 'semantic'.")

    volume_paths = get_glysac_paths(path, split, download)

    # Only the instance labels get the additional instance label transform.
    wants_instances = label_choice == "instances"
    if wants_instances:
        kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    # NOTE(review): assumed to run for both label choices - confirm against the upstream file.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    fixed_args = dict(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
    )
    return torch_em.default_segmentation_dataset(**fixed_args, **kwargs)
Get the GLySAC dataset for gastric nuclei segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train' or 'test'.
- label_choice: The type of labels to load. Either 'instances' for instance segmentation or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_glysac_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    label_choice: Literal["instances", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build a dataloader on top of the GLySAC gastric nuclei dataset.

    The extra keyword arguments are divided with `util.split_kwargs`: one part is passed
    on to `get_glysac_dataset`, the other part to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        label_choice: The type of labels to load. Either 'instances' for instance segmentation
            or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    glysac_dataset = get_glysac_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        label_choice=label_choice,
        download=download,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(glysac_dataset, batch_size, **dataloader_kwargs)
Get the GLySAC dataloader for gastric nuclei segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train' or 'test'.
- label_choice: The type of labels to load. Either 'instances' for instance segmentation or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.