torch_em.data.datasets.light_microscopy.vgg_hela
This is a dataset for counting HeLA cells in phase-contrast microscopy.
It is described in the publication https://www.robots.ox.ac.uk/~vgg/publications/2012/Arteta12/. Please cite it if you use this dataset in your research.
"""This is a dataset for counting HeLA cells in phase-contrast microscopy.

It is described in the publication https://www.robots.ox.ac.uk/~vgg/publications/2012/Arteta12/.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from shutil import rmtree
from typing import Tuple, Union

import numpy as np
import imageio.v3 as imageio
from scipy.io import loadmat

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://www.robots.ox.ac.uk/~vgg/software/cell_detection/downloads/CellDetect_v1.0.tar.gz"
CHECKSUM = "09825d6a8e287ddf2c4b1ef3d2f62585ec6876e3bfcd4b9bbcd3dd300e4be282"


def _convert_vgg_hela_split(split_root: str, out_folder: str) -> None:
    """Convert one extracted split into tif images and tif point-label masks.

    Args:
        split_root: Folder that holds the extracted '*.pgm' images and '*.mat' annotation files.
        out_folder: Folder in which the 'images' and 'labels' sub-folders are created.
    """
    out_im_folder = os.path.join(out_folder, "images")
    out_label_folder = os.path.join(out_folder, "labels")
    os.makedirs(out_im_folder, exist_ok=True)
    os.makedirs(out_label_folder, exist_ok=True)

    # Images and annotations are paired purely by their position in the sorted file lists.
    image_files = sorted(glob(os.path.join(split_root, "*.pgm")))
    mat_files = sorted(glob(os.path.join(split_root, "*.mat")))

    for ii, (image_file, mat_file) in enumerate(zip(image_files, mat_files), 1):
        im = imageio.imread(image_file)

        # The 'gt' entry is shifted by one and its two columns are swapped before it is used as an index
        # (presumably 1-based (x, y) points from MATLAB -> 0-based (row, column); TODO confirm).
        coordinates = loadmat(mat_file)["gt"] - 1
        coordinates = (coordinates[:, 1], coordinates[:, 0])

        file_name = f"im{ii:02}.tif"
        imageio.imwrite(os.path.join(out_im_folder, file_name), im, compression="zlib")

        # The label image is zero everywhere except for a 1 at every annotated coordinate.
        labels = np.zeros(im.shape, dtype="uint8")
        labels[coordinates] = 1
        imageio.imwrite(os.path.join(out_label_folder, file_name), labels, compression="zlib")


def get_vgg_hela_data(path: Union[os.PathLike, str], download: bool) -> str:
    """Download the HeLA VGG dataset and convert it to tif images and point labels.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder that contains the converted 'train' and 'test' data
        (this is the `path` that was passed in).
    """
    os.makedirs(path, exist_ok=True)

    train_path = os.path.join(path, "train")
    test_path = os.path.join(path, "test")

    # Both split folders exist: the data is treated as already downloaded and converted.
    if os.path.exists(train_path) and os.path.exists(test_path):
        return path

    dl_path = os.path.join(path, "cell_detect.tar.gz")
    util.download_source(dl_path, URL, download, CHECKSUM)
    util.unzip_tarfile(dl_path, path, True)

    extracted_path = os.path.join(path, "CellDetect_v1.0")
    assert os.path.exists(extracted_path), extracted_path

    splits = {"trainPhasecontrast": train_path, "testPhasecontrast": test_path}

    converted = False
    try:
        for split_in, out_folder in splits.items():
            _convert_vgg_hela_split(os.path.join(extracted_path, "phasecontrast", split_in), out_folder)
        converted = True
    finally:
        # A half-written 'train' / 'test' folder would make the existence check above
        # succeed on the next call, so partial output is removed if the conversion fails.
        if not converted:
            for out_folder in splits.values():
                rmtree(out_folder, ignore_errors=True)

    rmtree(extracted_path)
    return path


def get_vgg_hela_paths(path: Union[os.PathLike, str], split: str, download: bool = False) -> Tuple[str, str]:
    """Get paths for HeLA VGG data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the folder where image data is stored.
        Filepath to the folder where label data is stored.
    """
    get_vgg_hela_data(path, download)

    image_path = os.path.join(path, split, "images")
    label_path = os.path.join(path, split, "labels")

    return image_path, label_path


def get_vgg_hela_dataset(
    path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, **kwargs
) -> Dataset:
    """Get the HeLA VGG dataset for cell counting.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("test", "train"), split

    image_path, label_path = get_vgg_hela_paths(path, split, download)

    kwargs = util.update_kwargs(kwargs, "ndim", 2)
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)

    return torch_em.default_segmentation_dataset(
        raw_paths=image_path,
        raw_key="*.tif",
        label_paths=label_path,
        label_key="*.tif",
        patch_shape=patch_shape,
        **kwargs
    )


def get_vgg_hela_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the HeLA VGG dataloader for cell counting.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_vgg_hela_dataset(path, split, patch_shape, download=download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
'https://www.robots.ox.ac.uk/~vgg/software/cell_detection/downloads/CellDetect_v1.0.tar.gz'
CHECKSUM =
'09825d6a8e287ddf2c4b1ef3d2f62585ec6876e3bfcd4b9bbcd3dd300e4be282'
def
get_vgg_hela_data(path: Union[os.PathLike, str], download: bool) -> str:
def _convert_vgg_hela_split(split_root: str, out_folder: str) -> None:
    """Convert one extracted split into tif images and tif point-label masks.

    Args:
        split_root: Folder that holds the extracted '*.pgm' images and '*.mat' annotation files.
        out_folder: Folder in which the 'images' and 'labels' sub-folders are created.
    """
    out_im_folder = os.path.join(out_folder, "images")
    out_label_folder = os.path.join(out_folder, "labels")
    os.makedirs(out_im_folder, exist_ok=True)
    os.makedirs(out_label_folder, exist_ok=True)

    # Images and annotations are paired purely by their position in the sorted file lists.
    image_files = sorted(glob(os.path.join(split_root, "*.pgm")))
    mat_files = sorted(glob(os.path.join(split_root, "*.mat")))

    for ii, (image_file, mat_file) in enumerate(zip(image_files, mat_files), 1):
        im = imageio.imread(image_file)

        # The 'gt' entry is shifted by one and its two columns are swapped before it is used as an index
        # (presumably 1-based (x, y) points from MATLAB -> 0-based (row, column); TODO confirm).
        coordinates = loadmat(mat_file)["gt"] - 1
        coordinates = (coordinates[:, 1], coordinates[:, 0])

        file_name = f"im{ii:02}.tif"
        imageio.imwrite(os.path.join(out_im_folder, file_name), im, compression="zlib")

        # The label image is zero everywhere except for a 1 at every annotated coordinate.
        labels = np.zeros(im.shape, dtype="uint8")
        labels[coordinates] = 1
        imageio.imwrite(os.path.join(out_label_folder, file_name), labels, compression="zlib")


def get_vgg_hela_data(path: Union[os.PathLike, str], download: bool) -> str:
    """Download the HeLA VGG dataset and convert it to tif images and point labels.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder that contains the converted 'train' and 'test' data
        (this is the `path` that was passed in).
    """
    os.makedirs(path, exist_ok=True)

    train_path = os.path.join(path, "train")
    test_path = os.path.join(path, "test")

    # Both split folders exist: the data is treated as already downloaded and converted.
    if os.path.exists(train_path) and os.path.exists(test_path):
        return path

    dl_path = os.path.join(path, "cell_detect.tar.gz")
    util.download_source(dl_path, URL, download, CHECKSUM)
    util.unzip_tarfile(dl_path, path, True)

    extracted_path = os.path.join(path, "CellDetect_v1.0")
    assert os.path.exists(extracted_path), extracted_path

    splits = {"trainPhasecontrast": train_path, "testPhasecontrast": test_path}

    converted = False
    try:
        for split_in, out_folder in splits.items():
            _convert_vgg_hela_split(os.path.join(extracted_path, "phasecontrast", split_in), out_folder)
        converted = True
    finally:
        # A half-written 'train' / 'test' folder would make the existence check above
        # succeed on the next call, so partial output is removed if the conversion fails.
        if not converted:
            for out_folder in splits.values():
                rmtree(out_folder, ignore_errors=True)

    rmtree(extracted_path)
    return path
Download the HeLA VGG dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the folder that contains the converted 'train' and 'test' data (the same folder that was passed as path).
def
get_vgg_hela_paths( path: Union[os.PathLike, str], split: str, download: bool = False) -> Tuple[str, str]:
def get_vgg_hela_paths(path: Union[os.PathLike, str], split: str, download: bool = False) -> Tuple[str, str]:
    """Return the image folder and the label folder for one split of the HeLA VGG data.

    `get_vgg_hela_data` is called first, so that the data is downloaded and converted
    if it is not available in `path` yet.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the folder where image data is stored.
        Filepath to the folder where label data is stored.
    """
    # Only called to make the data available; the folders are derived from 'path' below.
    get_vgg_hela_data(path, download)

    image_path, label_path = (os.path.join(path, split, folder_name) for folder_name in ("images", "labels"))
    return image_path, label_path
Get paths for HeLA VGG data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use for the dataset. Either 'train' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
Filepath to the folder where image data is stored. Filepath to the folder where label data is stored.
def
get_vgg_hela_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_vgg_hela_dataset(
    path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, **kwargs
) -> Dataset:
    """Create the HeLA VGG cell counting dataset for one split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # The split is checked before any data is fetched.
    assert split in ("test", "train"), split

    image_folder, label_folder = get_vgg_hela_paths(path, split, download)

    # Pass 'ndim' and 'is_seg_dataset' on through the shared kwargs helper.
    for option, value in (("ndim", 2), ("is_seg_dataset", True)):
        kwargs = util.update_kwargs(kwargs, option, value)

    # Images and labels are both addressed as a folder plus a '*.tif' pattern.
    tif_pattern = "*.tif"
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_folder,
        raw_key=tif_pattern,
        label_paths=label_folder,
        label_key=tif_pattern,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
Get the HeLA VGG dataset for cell counting.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use for the dataset. Either 'train' or 'test'.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_vgg_hela_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_vgg_hela_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create a dataloader for the HeLA VGG cell counting dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the keyword arguments into those for the dataset and those for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    return torch_em.get_data_loader(
        get_vgg_hela_dataset(path, split, patch_shape, download=download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )
Get the HeLA VGG dataloader for cell counting.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use for the dataset. Either 'train' or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.