torch_em.data.datasets.histopathology.lizard

import os
import warnings
from glob import glob
from shutil import rmtree

import h5py
import imageio.v3 as imageio
import torch_em

from scipy.io import loadmat
from tqdm import tqdm
from .. import util

# TODO: the download links below don't work anymore.
# Workaround (Kaggle still hosts the dataset in the same structure):
#   - download the zip files manually from https://www.kaggle.com/datasets/aadimator/lizard-dataset
#   - or use the Kaggle API (TODO): `kaggle datasets download -d aadimator/lizard-dataset`
URL1 = "https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_images1.zip"
URL2 = "https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_images2.zip"
LABEL_URL = "https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_labels.zip"

CHECKSUM1 = "d2c4e7c83dff634624c9c14d4a1a0b821d4e9ac41e05e3b36303d8f0c510113d"
CHECKSUM2 = "9f529f30d9de66587167991a8bf75aaad07ce1d518b72e825c868ac7c33015ed"
LABEL_CHECKSUM = "79f22ca83ca535682fba340cbc8bb66b74abd1ead4151ffc8593f204fcb97dec"


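# A possible manual workaround (untested sketch; assumes the `kaggle` CLI is installed and
# API credentials are configured):
#
#     kaggle datasets download -d aadimator/lizard-dataset -p /path/to/lizard --unzip
#
# The downloaded data would then need to be arranged so that the zip files (or the extracted
# "Lizard_Images1", "Lizard_Images2" and "Lizard_Labels" folders) end up directly in the
# dataset path passed to `get_lizard_dataset`.

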
def _extract_images(image_folder, label_folder, output_dir):
    """Convert the png images and mat labels into per-image h5 files in output_dir."""
    image_files = glob(os.path.join(image_folder, "*.png"))
    for image_file in tqdm(image_files, desc=f"Extract images from {image_folder}"):
        fname = os.path.basename(image_file)
        label_file = os.path.join(label_folder, fname.replace(".png", ".mat"))
        assert os.path.exists(label_file), label_file

        image = imageio.imread(image_file)
        assert image.ndim == 3 and image.shape[-1] == 3

        # The mat file contains the instance segmentation ("inst_map")
        # and the per-instance class labels ("class").
        labels = loadmat(label_file)
        segmentation = labels["inst_map"]
        assert image.shape[:-1] == segmentation.shape
        classes = labels["class"]

        # Store the image channel-first, as expected by the torch_em datasets.
        image = image.transpose((2, 0, 1))
        assert image.shape[1:] == segmentation.shape

        output_file = os.path.join(output_dir, fname.replace(".png", ".h5"))
        with h5py.File(output_file, "a") as f:
            f.create_dataset("image", data=image, compression="gzip")
            f.create_dataset("labels/segmentation", data=segmentation, compression="gzip")
            f.create_dataset("labels/classes", data=classes, compression="gzip")


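# The resulting h5 files can be inspected like this (illustrative sketch; the file name
# is a placeholder):
#
#     with h5py.File("/path/to/lizard/consep_1.h5", "r") as f:
#         image = f["image"][:]                       # (3, H, W) raw data, channels first
#         segmentation = f["labels/segmentation"][:]  # (H, W) instance ids, 0 = background
#         classes = f["labels/classes"][:]            # per-instance nucleus class labels

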
def _require_lizard_data(path, download):
    """Download the zips, unpack them and convert the data to h5 (unless this was already done)."""
    # If converted h5 files are already present there is nothing to do.
    image_files = glob(os.path.join(path, "*.h5"))
    if len(image_files) > 0:
        return

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "lizard_images1.zip")
    util.download_source(zip_path, URL1, download=download, checksum=CHECKSUM1)
    util.unzip(zip_path, path, remove=True)

    zip_path = os.path.join(path, "lizard_images2.zip")
    util.download_source(zip_path, URL2, download=download, checksum=CHECKSUM2)
    util.unzip(zip_path, path, remove=True)

    zip_path = os.path.join(path, "lizard_labels.zip")
    util.download_source(zip_path, LABEL_URL, download=download, checksum=LABEL_CHECKSUM)
    util.unzip(zip_path, path, remove=True)

    image_folder1 = os.path.join(path, "Lizard_Images1")
    image_folder2 = os.path.join(path, "Lizard_Images2")
    label_folder = os.path.join(path, "Lizard_Labels")

    assert os.path.exists(image_folder1), image_folder1
    assert os.path.exists(image_folder2), image_folder2
    assert os.path.exists(label_folder), label_folder

    # Convert the images and labels to h5 and remove the extracted folders afterwards.
    _extract_images(image_folder1, os.path.join(label_folder, "Labels"), path)
    _extract_images(image_folder2, os.path.join(label_folder, "Labels"), path)

    rmtree(image_folder1)
    rmtree(image_folder2)
    rmtree(label_folder)


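# For manually downloaded zips the checksums above can be verified before unpacking
# (illustrative sketch; the 64-character hex digests suggest sha256, and the file path
# is a placeholder):
#
#     import hashlib
#     with open("/path/to/lizard/lizard_images1.zip", "rb") as f:
#         assert hashlib.sha256(f.read()).hexdigest() == CHECKSUM1

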
def get_lizard_dataset(path, patch_shape, download=False, **kwargs):
    """Dataset for the segmentation of nuclei in histopathology.

    This dataset is from the publication https://doi.org/10.48550/arXiv.2108.11195.
    Please cite it if you use this dataset for a publication.
    """
    if download:
        warnings.warn(
            "The download links do not work right now. "
            "Please download the zip files manually from https://www.kaggle.com/datasets/aadimator/lizard-dataset"
        )

    _require_lizard_data(path, download)

    data_paths = glob(os.path.join(path, "*.h5"))
    data_paths.sort()

    raw_key = "image"
    label_key = "labels/segmentation"
    return torch_em.default_segmentation_dataset(
        data_paths, raw_key, data_paths, label_key, patch_shape, ndim=2, with_channels=True, **kwargs
    )


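# Example usage (illustrative sketch; the path and patch shape are placeholders, and the
# data is expected to already be present since the automatic download is broken):
#
#     ds = get_lizard_dataset("./data/lizard", patch_shape=(512, 512))
#     raw, labels = ds[0]  # should yield a 3-channel raw patch and the matching instance labels

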
# TODO implement loading the classification labels
# TODO implement selecting different tissue types
# TODO implement train / val / test split (it is pre-defined in a csv)
def get_lizard_loader(path, patch_shape, batch_size, download=False, **kwargs):
    """Dataloader for the segmentation of nuclei in histopathology. See 'get_lizard_dataset' for details."""
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    ds = get_lizard_dataset(path, patch_shape, download=download, **ds_kwargs)
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
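
A minimal usage sketch for the loader (the path, patch shape and batch size are placeholders, and the data is assumed to already be present at the given path since the automatic download is broken):

    from torch_em.data.datasets.histopathology.lizard import get_lizard_loader

    loader = get_lizard_loader("./data/lizard", patch_shape=(512, 512), batch_size=4)
    raw, labels = next(iter(loader))
    # raw should be a (4, 3, 512, 512) image batch and labels the corresponding
    # (4, 1, 512, 512) instance segmentation, assuming the default transforms.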