"""torch_em.data.datasets.histopathology.lizard — Lizard nucleus segmentation dataset."""
import os
import warnings
from glob import glob
from shutil import rmtree

import h5py
import imageio.v3 as imageio
import torch_em

from scipy.io import loadmat
from tqdm import tqdm
from .. import util

# TODO: the links don't work anymore (?)
# workaround to still make this work (kaggle still has the dataset in the same structure):
# - download the zip files manually from here - https://www.kaggle.com/datasets/aadimator/lizard-dataset
# - Kaggle API (TODO) - `kaggle datasets download -d aadimator/lizard-dataset`
URL1 = "https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_images1.zip"
URL2 = "https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_images2.zip"
LABEL_URL = "https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_labels.zip"

CHECKSUM1 = "d2c4e7c83dff634624c9c14d4a1a0b821d4e9ac41e05e3b36303d8f0c510113d"
CHECKSUM2 = "9f529f30d9de66587167991a8bf75aaad07ce1d518b72e825c868ac7c33015ed"
LABEL_CHECKSUM = "79f22ca83ca535682fba340cbc8bb66b74abd1ead4151ffc8593f204fcb97dec"


def _extract_images(image_folder, label_folder, output_dir):
    """Convert each (png image, mat label) pair into one h5 file in ``output_dir``.

    Each output file contains:
      - "image": the RGB image, transposed to channels-first (3, H, W)
      - "labels/segmentation": the instance segmentation ("inst_map" in the mat file)
      - "labels/classes": the per-instance class labels ("class" in the mat file)

    Args:
        image_folder: Folder with the ``*.png`` images.
        label_folder: Folder with the matching ``*.mat`` label files.
        output_dir: Folder where the ``*.h5`` files are written.
    """
    image_files = glob(os.path.join(image_folder, "*.png"))
    for image_file in tqdm(image_files, desc=f"Extract images from {image_folder}"):
        fname = os.path.basename(image_file)
        label_file = os.path.join(label_folder, fname.replace(".png", ".mat"))
        assert os.path.exists(label_file), label_file

        image = imageio.imread(image_file)
        assert image.ndim == 3 and image.shape[-1] == 3

        labels = loadmat(label_file)
        segmentation = labels["inst_map"]
        assert image.shape[:-1] == segmentation.shape
        classes = labels["class"]

        # channels-first layout expected by torch_em (with_channels=True)
        image = image.transpose((2, 0, 1))
        assert image.shape[1:] == segmentation.shape

        output_file = os.path.join(output_dir, fname.replace(".png", ".h5"))
        with h5py.File(output_file, "a") as f:
            f.create_dataset("image", data=image, compression="gzip")
            f.create_dataset("labels/segmentation", data=segmentation, compression="gzip")
            f.create_dataset("labels/classes", data=classes, compression="gzip")


def _require_lizard_data(path, download):
    """Download and unpack the lizard data to ``path`` unless h5 files are already present.

    Downloads the two image zips and the label zip, converts all image/label pairs
    to h5 (see ``_extract_images``), and removes the unpacked intermediate folders.

    Args:
        path: Root folder for the dataset.
        download: Whether downloading missing data is allowed
            (forwarded to ``util.download_source``).
    """
    # if any h5 file exists we assume the data was already processed
    image_files = glob(os.path.join(path, "*.h5"))
    if len(image_files) > 0:
        return

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "lizard_images1.zip")
    util.download_source(zip_path, URL1, download=download, checksum=CHECKSUM1)
    util.unzip(zip_path, path, remove=True)

    zip_path = os.path.join(path, "lizard_images2.zip")
    util.download_source(zip_path, URL2, download=download, checksum=CHECKSUM2)
    util.unzip(zip_path, path, remove=True)

    zip_path = os.path.join(path, "lizard_labels.zip")
    util.download_source(zip_path, LABEL_URL, download=download, checksum=LABEL_CHECKSUM)
    util.unzip(zip_path, path, remove=True)

    image_folder1 = os.path.join(path, "Lizard_Images1")
    image_folder2 = os.path.join(path, "Lizard_Images2")
    label_folder = os.path.join(path, "Lizard_Labels")

    assert os.path.exists(image_folder1), image_folder1
    assert os.path.exists(image_folder2), image_folder2
    assert os.path.exists(label_folder), label_folder

    _extract_images(image_folder1, os.path.join(label_folder, "Labels"), path)
    _extract_images(image_folder2, os.path.join(label_folder, "Labels"), path)

    # the h5 files in `path` are now the canonical data; drop the unpacked folders
    rmtree(image_folder1)
    rmtree(image_folder2)
    rmtree(label_folder)


def get_lizard_dataset(path, patch_shape, download=False, **kwargs):
    """Dataset for the segmentation of nuclei in histopathology.

    This dataset is from the publication https://doi.org/10.48550/arXiv.2108.11195.
    Please cite it if you use this dataset for a publication.

    Args:
        path: Root folder where the data is stored (or will be downloaded to).
        patch_shape: The patch shape for the dataset.
        download: Whether to allow downloading the data. NOTE: the original
            download links are broken; see the warning below for a workaround.
        kwargs: Further keyword arguments for ``torch_em.default_segmentation_dataset``.

    Returns:
        The segmentation dataset.
    """
    if download:
        warnings.warn(
            "The download link does not work right now. "
            "Please manually download the zip files from https://www.kaggle.com/datasets/aadimator/lizard-dataset"
        )

    _require_lizard_data(path, download)

    data_paths = glob(os.path.join(path, "*.h5"))
    data_paths.sort()

    raw_key = "image"
    label_key = "labels/segmentation"
    return torch_em.default_segmentation_dataset(
        data_paths, raw_key, data_paths, label_key, patch_shape, ndim=2, with_channels=True, **kwargs
    )


# TODO implement loading the classification labels
# TODO implement selecting different tissue types
# TODO implement train / val / test split (is pre-defined in a csv)
def get_lizard_loader(path, patch_shape, batch_size, download=False, **kwargs):
    """Dataloader for the segmentation of nuclei in histopathology. See 'get_lizard_dataset' for details.

    Args:
        path: Root folder where the data is stored (or will be downloaded to).
        patch_shape: The patch shape for the dataset.
        batch_size: The batch size for the dataloader.
        download: Whether to allow downloading the data.
        kwargs: Further keyword arguments, split between the dataset and the dataloader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    ds = get_lizard_dataset(path, patch_shape, download=download, **ds_kwargs)
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
# --- auto-generated API summary (extraction residue; duplicates the definitions above) ---
# Constants:
#   URL1 = 'https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_images1.zip'
#   URL2 = 'https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_images2.zip'
#   LABEL_URL = 'https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_labels.zip'
#   CHECKSUM1 = 'd2c4e7c83dff634624c9c14d4a1a0b821d4e9ac41e05e3b36303d8f0c510113d'
#   CHECKSUM2 = '9f529f30d9de66587167991a8bf75aaad07ce1d518b72e825c868ac7c33015ed'
#   LABEL_CHECKSUM = '79f22ca83ca535682fba340cbc8bb66b74abd1ead4151ffc8593f204fcb97dec'
# Functions:
#   get_lizard_dataset(path, patch_shape, download=False, **kwargs)
#       Dataset for the segmentation of nuclei in histopathology.
#       From the publication https://doi.org/10.48550/arXiv.2108.11195;
#       please cite it if you use this dataset for a publication.
#   get_lizard_loader(path, patch_shape, batch_size, download=False, **kwargs)
#       Dataloader for the segmentation of nuclei in histopathology.
#       See 'get_lizard_dataset' for details.