torch_em.data.datasets.histopathology.lizard
The Lizard dataset contains annotations for nucleus segmentation in histopathology images in H&E stained colon tissue.
This dataset is from the publication https://doi.org/10.48550/arXiv.2108.11195. Please cite it if you use this dataset for your research.
"""The Lizard dataset contains annotations for nucleus segmentation
in histopathology images in H&E stained colon tissue.

This dataset is from the publication https://doi.org/10.48550/arXiv.2108.11195.
Please cite it if you use this dataset for your research.
"""

import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from shutil import rmtree
from natsort import natsorted
from typing import Tuple, Union, List, Literal

import pandas as pd
import imageio.v3 as imageio
from scipy.io import loadmat

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# Maps the split name to the integer code stored in the 'Split' column of 'info.csv'.
SPLIT_MAP = {"train": 1, "val": 2, "test": 3}


def _create_split_list(path, split):
    """Return the 'Filename' entries of 'info.csv' that belong to the given split.

    Args:
        path: The folder that contains the unzipped 'lizard_labels' folder.
        split: The name of the split; must be a key of `SPLIT_MAP`.

    Returns:
        List of the 'Filename' values whose 'Split' value matches the split.
    """
    df = pd.read_csv(os.path.join(path, 'lizard_labels', 'Lizard_Labels', 'info.csv'))
    # A boolean mask selects the same rows as an index loop, without relying on a default RangeIndex.
    return df.loc[df["Split"] == SPLIT_MAP[split], "Filename"].tolist()


def _extract_images(split, image_folder, label_folder, output_dir):
    """Convert the png images of one split and their mat labels to HDF5 files.

    For each png in `image_folder` whose stem is listed for the split, one file is written to
    '<output_dir>/<split>/<stem>.h5' with the datasets 'image' (channel-first),
    'labels/segmentation' and 'labels/classes'.

    Args:
        split: The name of the split to extract.
        image_folder: The folder with the png images.
        label_folder: The folder with the mat label files, named like the images.
        output_dir: The root folder of the dataset; it must contain the unzipped 'lizard_labels' folder.
    """
    import h5py

    image_files = glob(os.path.join(image_folder, "*.png"))
    # Build the lookup once; a set makes the per-image membership test constant time.
    split_names = set(_create_split_list(output_dir, split))
    os.makedirs(os.path.join(output_dir, split), exist_ok=True)

    for image_file in tqdm(image_files, desc=f"Extract images from {os.path.abspath(image_folder)}"):
        fname = Path(os.path.basename(image_file))
        if fname.stem not in split_names:
            continue

        label_file = os.path.join(label_folder, fname.with_suffix(".mat"))
        assert os.path.exists(label_file), label_file

        image = imageio.imread(image_file)
        assert image.ndim == 3 and image.shape[-1] == 3

        labels = loadmat(label_file)
        segmentation = labels["inst_map"]
        assert image.shape[:-1] == segmentation.shape
        classes = labels["class"]

        # Store the image channel-first.
        image = image.transpose((2, 0, 1))
        assert image.shape[1:] == segmentation.shape

        output_file = os.path.join(output_dir, split, fname.with_suffix(".h5"))
        # Open with "w" so that a file left over from an interrupted run is overwritten;
        # with "a" the `create_dataset` calls fail if the datasets already exist.
        with h5py.File(output_file, "w") as f:
            f.create_dataset("image", data=image, compression="gzip")
            f.create_dataset("labels/segmentation", data=segmentation, compression="gzip")
            f.create_dataset("labels/classes", data=classes, compression="gzip")


def get_lizard_data(path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False):
    """Download the Lizard dataset for nucleus segmentation.

    The data is converted to HDF5 files for all splits at once; the unzipped source folders
    are removed afterwards. Nothing is done if HDF5 files for the requested split already exist.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Raises:
        ValueError: If the split is not one of 'train', 'val' or 'test'.
    """
    if split not in SPLIT_MAP:
        raise ValueError(f"'{split}' is not a valid split.")

    # The requested split has already been converted.
    if glob(os.path.join(path, split, "*.h5")):
        return

    os.makedirs(path, exist_ok=True)
    util.download_source_kaggle(path=path, dataset_name="aadimator/lizard-dataset", download=download)
    util.unzip(zip_path=os.path.join(path, "lizard-dataset.zip"), dst=path)

    image_folders = [
        os.path.join(path, "lizard_images1", "Lizard_Images1"),
        os.path.join(path, "lizard_images2", "Lizard_Images2"),
    ]
    label_folder = os.path.join(path, "lizard_labels", "Lizard_Labels")

    for folder in (*image_folders, label_folder):
        assert os.path.exists(folder), folder

    # Extract and preprocess images for all splits
    annotation_folder = os.path.join(label_folder, "Labels")
    for _split in SPLIT_MAP:
        for image_folder in image_folders:
            _extract_images(_split, image_folder, annotation_folder, path)

    # Remove the unzipped source folders, only the HDF5 files are kept.
    for leftover in ("lizard_images1", "lizard_images2", "lizard_labels", "overlay"):
        rmtree(os.path.join(path, leftover))


def get_lizard_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the Lizard data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    get_lizard_data(path, split, download)
    data_paths = natsorted(glob(os.path.join(path, split, "*.h5")))
    return data_paths


def get_lizard_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the Lizard dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the input images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_lizard_paths(path, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # The same files are passed for raw data and labels; they are told apart by the keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="image",
        label_paths=h5_paths,
        label_key="labels/segmentation",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )


# TODO implement loading the classification labels
# TODO implement selecting different tissue types
def get_lizard_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the Lizard dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_lizard_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
SPLIT_MAP =
{'train': 1, 'val': 2, 'test': 3}
def
get_lizard_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False):
def get_lizard_data(path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False):
    """Download the Lizard dataset for nucleus segmentation.

    The data is converted to HDF5 files for all splits at once; the unzipped source folders
    are removed afterwards. Nothing is done if HDF5 files for the requested split already exist.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Raises:
        ValueError: If the split is not one of 'train', 'val' or 'test'.
    """
    if split not in SPLIT_MAP:
        raise ValueError(f"'{split}' is not a valid split.")

    # The requested split has already been converted.
    if glob(os.path.join(path, split, "*.h5")):
        return

    os.makedirs(path, exist_ok=True)
    util.download_source_kaggle(path=path, dataset_name="aadimator/lizard-dataset", download=download)
    util.unzip(zip_path=os.path.join(path, "lizard-dataset.zip"), dst=path)

    image_folders = [
        os.path.join(path, "lizard_images1", "Lizard_Images1"),
        os.path.join(path, "lizard_images2", "Lizard_Images2"),
    ]
    label_folder = os.path.join(path, "lizard_labels", "Lizard_Labels")

    for folder in (*image_folders, label_folder):
        assert os.path.exists(folder), folder

    # Extract and preprocess images for all splits
    annotation_folder = os.path.join(label_folder, "Labels")
    for _split in SPLIT_MAP:
        for image_folder in image_folders:
            _extract_images(_split, image_folder, annotation_folder, path)

    # Remove the unzipped source folders, only the HDF5 files are kept.
    for leftover in ("lizard_images1", "lizard_images2", "lizard_labels", "overlay"):
        rmtree(os.path.join(path, leftover))
Download the Lizard dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
def
get_lizard_paths( path: os.PathLike, split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:
def get_lizard_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the Lizard data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    # Make sure the HDF5 files for the split exist before collecting them.
    get_lizard_data(path, split, download)
    data_paths = natsorted(glob(os.path.join(path, split, "*.h5")))
    return data_paths
Get paths to the Lizard data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the stored data.
def
get_lizard_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_lizard_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the Lizard dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the input images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_lizard_paths(path, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # The same files are passed for raw data and labels; they are told apart by the keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="image",
        label_paths=h5_paths,
        label_key="labels/segmentation",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
Get the Lizard dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize the input images.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_lizard_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_lizard_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the Lizard dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the DataLoader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_lizard_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
Get the Lizard dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.