torch_em.data.datasets.histopathology.lizard

The Lizard dataset contains annotations for nucleus segmentation in histopathology images of H&E-stained colon tissue.

This dataset is from the publication https://doi.org/10.48550/arXiv.2108.11195. Please cite it if you use this dataset for your research.

  1"""The Lizard dataset contains annotations for nucleus segmentation
  2in histopathology images in H&E stained colon tissue.
  3
  4This dataset is from the publication https://doi.org/10.48550/arXiv.2108.11195.
  5Please cite it if you use this dataset for your research.
  6"""
  7
  8import os
  9from glob import glob
 10from tqdm import tqdm
 11from pathlib import Path
 12from shutil import rmtree
 13from natsort import natsorted
 14from typing import Tuple, Union, List, Literal
 15
 16import pandas as pd
 17import imageio.v3 as imageio
 18from scipy.io import loadmat
 19
 20from torch.utils.data import Dataset, DataLoader
 21
 22import torch_em
 23
 24from .. import util
 25
 26
# Maps each split name to the integer code it is compared against in the `Split` column of `info.csv`.
SPLIT_MAP = {"train": 1, "val": 2, "test": 3}
 28
 29
 30def _create_split_list(path, split):
 31    df = pd.read_csv(os.path.join(path, 'lizard_labels', 'Lizard_Labels', 'info.csv'))
 32    split_list = [df['Filename'].iloc[i] for i in df.index if df['Split'].iloc[i] == SPLIT_MAP[split]]
 33    return split_list
 34
 35
 36def _extract_images(split, image_folder, label_folder, output_dir):
 37    import h5py
 38
 39    image_files = glob(os.path.join(image_folder, "*.png"))
 40    split_list = _create_split_list(output_dir, split)
 41    os.makedirs(os.path.join(output_dir, split), exist_ok=True)
 42
 43    for image_file in tqdm(image_files, desc=f"Extract images from {os.path.abspath(image_folder)}"):
 44        fname = Path(os.path.basename(image_file))
 45        if fname.stem not in split_list:
 46            continue
 47
 48        label_file = os.path.join(label_folder, fname.with_suffix(".mat"))
 49        assert os.path.exists(label_file), label_file
 50
 51        image = imageio.imread(image_file)
 52        assert image.ndim == 3 and image.shape[-1] == 3
 53
 54        labels = loadmat(label_file)
 55        segmentation = labels["inst_map"]
 56        assert image.shape[:-1] == segmentation.shape
 57        classes = labels["class"]
 58
 59        image = image.transpose((2, 0, 1))
 60        assert image.shape[1:] == segmentation.shape
 61
 62        output_file = os.path.join(output_dir, split, fname.with_suffix(".h5"))
 63        with h5py.File(output_file, "a") as f:
 64            f.create_dataset("image", data=image, compression="gzip")
 65            f.create_dataset("labels/segmentation", data=segmentation, compression="gzip")
 66            f.create_dataset("labels/classes", data=classes, compression="gzip")
 67
 68
 69def get_lizard_data(path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False):
 70    """Download the Lizard dataset for nucleus segmentation.
 71
 72    Args:
 73        path: Filepath to a folder where the downloaded data will be saved.
 74        split: The choice of data split.
 75        download: Whether to download the data if it is not present.
 76    """
 77    if split not in SPLIT_MAP.keys():
 78        raise ValueError(f"'{split}' is not a valid split.")
 79
 80    image_files = glob(os.path.join(path, split, "*.h5"))
 81    if len(image_files) > 0:
 82        return
 83
 84    os.makedirs(path, exist_ok=True)
 85    util.download_source_kaggle(path=path, dataset_name="aadimator/lizard-dataset", download=download)
 86    zip_path = os.path.join(path, "lizard-dataset.zip")
 87    util.unzip(zip_path=zip_path, dst=path)
 88
 89    image_folder1 = os.path.join(path, "lizard_images1", "Lizard_Images1")
 90    image_folder2 = os.path.join(path, "lizard_images2",  "Lizard_Images2")
 91    label_folder = os.path.join(path, "lizard_labels", "Lizard_Labels")
 92
 93    assert os.path.exists(image_folder1), image_folder1
 94    assert os.path.exists(image_folder2), image_folder2
 95    assert os.path.exists(label_folder), label_folder
 96
 97    # Extract and preprocess images for all splits
 98    for _split in SPLIT_MAP.keys():
 99        _extract_images(_split, image_folder1, os.path.join(label_folder, "Labels"), path)
100        _extract_images(_split, image_folder2, os.path.join(label_folder, "Labels"), path)
101
102    rmtree(os.path.join(path, "lizard_images1"))
103    rmtree(os.path.join(path, "lizard_images2"))
104    rmtree(os.path.join(path, "lizard_labels"))
105    rmtree(os.path.join(path, "overlay"))
106
107
def get_lizard_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the Lizard data.

    Makes sure the data is available via `get_lizard_data` and then collects
    the h5 files of the requested split in natural sort order.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    get_lizard_data(path, split, download)
    return natsorted(glob(os.path.join(path, split, "*.h5")))
124
125
def get_lizard_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the Lizard dataset for nucleus segmentation.

    The h5 files of the split serve as both raw and label source: the raw data is read
    from the `image` key and the labels from the `labels/segmentation` key.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the input images. If set, `kwargs` and `patch_shape`
            are updated by `util.update_kwargs_for_resize_trafo` before the dataset is created.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_lizard_paths(path, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="image",
        label_paths=h5_paths,
        label_key="labels/segmentation",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
    return dataset
165
166
167# TODO implement loading the classification labels
168# TODO implement selecting different tissue types
def get_lizard_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the Lizard dataloader for nucleus segmentation.

    The keyword arguments are split into those accepted by
    `torch_em.default_segmentation_dataset` and the remaining ones, which are
    passed on to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_lizard_dataset(
        path, patch_shape, split, resize_inputs, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader
# Maps each split name to the integer code it is compared against in the `Split` column of `info.csv`.
SPLIT_MAP = {'train': 1, 'val': 2, 'test': 3}
def get_lizard_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False):
 70def get_lizard_data(path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False):
 71    """Download the Lizard dataset for nucleus segmentation.
 72
 73    Args:
 74        path: Filepath to a folder where the downloaded data will be saved.
 75        split: The choice of data split.
 76        download: Whether to download the data if it is not present.
 77    """
 78    if split not in SPLIT_MAP.keys():
 79        raise ValueError(f"'{split}' is not a valid split.")
 80
 81    image_files = glob(os.path.join(path, split, "*.h5"))
 82    if len(image_files) > 0:
 83        return
 84
 85    os.makedirs(path, exist_ok=True)
 86    util.download_source_kaggle(path=path, dataset_name="aadimator/lizard-dataset", download=download)
 87    zip_path = os.path.join(path, "lizard-dataset.zip")
 88    util.unzip(zip_path=zip_path, dst=path)
 89
 90    image_folder1 = os.path.join(path, "lizard_images1", "Lizard_Images1")
 91    image_folder2 = os.path.join(path, "lizard_images2",  "Lizard_Images2")
 92    label_folder = os.path.join(path, "lizard_labels", "Lizard_Labels")
 93
 94    assert os.path.exists(image_folder1), image_folder1
 95    assert os.path.exists(image_folder2), image_folder2
 96    assert os.path.exists(label_folder), label_folder
 97
 98    # Extract and preprocess images for all splits
 99    for _split in SPLIT_MAP.keys():
100        _extract_images(_split, image_folder1, os.path.join(label_folder, "Labels"), path)
101        _extract_images(_split, image_folder2, os.path.join(label_folder, "Labels"), path)
102
103    rmtree(os.path.join(path, "lizard_images1"))
104    rmtree(os.path.join(path, "lizard_images2"))
105    rmtree(os.path.join(path, "lizard_labels"))
106    rmtree(os.path.join(path, "overlay"))

Download the Lizard dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
def get_lizard_paths( path: os.PathLike, split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:
def get_lizard_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the Lizard data.

    Makes sure the data is available via `get_lizard_data` and then collects
    the h5 files of the requested split in natural sort order.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the stored data.
    """
    get_lizard_data(path, split, download)
    return natsorted(glob(os.path.join(path, split, "*.h5")))

Get paths to the Lizard data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the stored data.

def get_lizard_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_lizard_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the Lizard dataset for nucleus segmentation.

    The h5 files of the split serve as both raw and label source: the raw data is read
    from the `image` key and the labels from the `labels/segmentation` key.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the input images. If set, `kwargs` and `patch_shape`
            are updated by `util.update_kwargs_for_resize_trafo` before the dataset is created.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_lizard_paths(path, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="image",
        label_paths=h5_paths,
        label_key="labels/segmentation",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
    return dataset

Get the Lizard dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • resize_inputs: Whether to resize the input images.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_lizard_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_lizard_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the Lizard dataloader for nucleus segmentation.

    The keyword arguments are split into those accepted by
    `torch_em.default_segmentation_dataset` and the remaining ones, which are
    passed on to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_lizard_dataset(
        path, patch_shape, split, resize_inputs, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader

Get the Lizard dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.