torch_em.data.datasets.light_microscopy.vgg_hela

This is a dataset for counting HeLa cells in phase-contrast microscopy.

It is described in the publication https://www.robots.ox.ac.uk/~vgg/publications/2012/Arteta12/. Please cite it if you use this dataset in your research.

  1"""This is a dataset for counting HeLA cells in phase-contrast microscopy.
  2
  3It is described in the publication https://www.robots.ox.ac.uk/~vgg/publications/2012/Arteta12/.
  4Please cite it if you use this dataset in your research.
  5"""
  6
  7import os
  8from glob import glob
  9from shutil import rmtree
 10from typing import Tuple, Union
 11
 12import numpy as np
 13import imageio.v3 as imageio
 14from scipy.io import loadmat
 15
 16from torch.utils.data import Dataset, DataLoader
 17
 18import torch_em
 19
 20from .. import util
 21
 22
 23URL = "https://www.robots.ox.ac.uk/~vgg/software/cell_detection/downloads/CellDetect_v1.0.tar.gz"
 24CHECKSUM = "09825d6a8e287ddf2c4b1ef3d2f62585ec6876e3bfcd4b9bbcd3dd300e4be282"
 25
 26
 27def get_vgg_hela_data(path: Union[os.PathLike, str], download: bool) -> str:
 28    """Download the HeLA VGG dataset.
 29
 30    Args:
 31        path: Filepath to a folder where the downloaded data will be saved.
 32        download: Whether to download the data if it is not present.
 33
 34    Returns:
 35        The filepath to the training data.
 36    """
 37    os.makedirs(path, exist_ok=True)
 38    url = URL
 39    checksum = CHECKSUM
 40
 41    train_path = os.path.join(path, "train")
 42    test_path = os.path.join(path, "test")
 43
 44    if os.path.exists(train_path) and os.path.exists(test_path):
 45        return path
 46
 47    dl_path = os.path.join(path, "cell_detect.tar.gz")
 48    util.download_source(dl_path, url, download, checksum)
 49    util.unzip_tarfile(dl_path, path, True)
 50
 51    extracted_path = os.path.join(path, "CellDetect_v1.0")
 52    assert os.path.exists(extracted_path), extracted_path
 53
 54    splits_in = ["trainPhasecontrast", "testPhasecontrast"]
 55    splits_out = [train_path, test_path]
 56
 57    for split_in, out_folder in zip(splits_in, splits_out):
 58        out_im_folder = os.path.join(out_folder, "images")
 59        os.makedirs(out_im_folder, exist_ok=True)
 60
 61        out_label_folder = os.path.join(out_folder, "labels")
 62        os.makedirs(out_label_folder, exist_ok=True)
 63
 64        split_root = os.path.join(extracted_path, "phasecontrast", split_in)
 65        image_files = sorted(glob(os.path.join(split_root, "*.pgm")))
 66        mat_files = sorted(glob(os.path.join(split_root, "*.mat")))
 67
 68        for ii, (im, mat) in enumerate(zip(image_files, mat_files), 1):
 69            im = imageio.imread(im)
 70            coordinates = loadmat(mat)["gt"] - 1
 71            coordinates = (coordinates[:, 1], coordinates[:, 0])
 72
 73            out_im = os.path.join(out_im_folder, f"im{ii:02}.tif")
 74            imageio.imwrite(out_im, im, compression="zlib")
 75
 76            labels = np.zeros(im.shape, dtype="uint8")
 77            labels[coordinates] = 1
 78            out_labels = os.path.join(out_label_folder, f"im{ii:02}.tif")
 79            imageio.imwrite(out_labels, labels, compression="zlib")
 80
 81    rmtree(extracted_path)
 82    return path
 83
 84
 85def get_vgg_hela_paths(path: Union[os.PathLike, str], split: str, download: bool = False) -> Tuple[str, str]:
 86    """Get paths for HeLA VGG data.
 87
 88    Args:
 89        path: Filepath to a folder where the downloaded data will be saved.
 90        split: The split to use for the dataset. Either 'train' or 'test'.
 91        download: Whether to download the data if it is not present.
 92
 93    Returns:
 94        Filepath to the folder where image data is stored.
 95        Filepath to the folder where label data is stored.
 96    """
 97    get_vgg_hela_data(path, download)
 98
 99    image_path = os.path.join(path, split, "images")
100    label_path = os.path.join(path, split, "labels")
101
102    return image_path, label_path
103
104
def get_vgg_hela_dataset(
    path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, **kwargs
) -> Dataset:
    """Create the dataset for cell counting on the HeLa VGG data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    assert split in ("test", "train"), split

    raw_folder, label_folder = get_vgg_hela_paths(path, split, download)

    # Settings passed through `util.update_kwargs` for this dataset.
    for key, value in (("ndim", 2), ("is_seg_dataset", True)):
        kwargs = util.update_kwargs(kwargs, key, value)

    # Images and labels are both selected with the same '*.tif' pattern.
    tif_pattern = "*.tif"
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_folder,
        raw_key=tif_pattern,
        label_paths=label_folder,
        label_key=tif_pattern,
        patch_shape=patch_shape,
        **kwargs
    )
135
136
def get_vgg_hela_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the dataloader for cell counting on the HeLa VGG data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_vgg_hela_dataset(path, split, patch_shape, download=download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )
# Location of the archive that `get_vgg_hela_data` passes to `util.download_source`.
URL = (
    "https://www.robots.ox.ac.uk/~vgg/software/cell_detection/downloads/"
    "CellDetect_v1.0.tar.gz"
)
# Checksum passed to `util.download_source` together with the URL
# (presumably a SHA-256 hex digest, it has 64 hex digits - TODO confirm).
CHECKSUM = "09825d6a8e287ddf2c4b1ef3d2f62585ec6876e3bfcd4b9bbcd3dd300e4be282"
def get_vgg_hela_data(path: Union[os.PathLike, str], download: bool) -> str:
    """Download the HeLa VGG dataset and convert it into tif images and point labels.

    Nothing is done if both the 'train' and the 'test' folder already exist in `path`.
    Otherwise the archive is obtained via `util.download_source`, unpacked, and each
    image / annotation pair of a split is written to `<split>/images/imXX.tif` and
    `<split>/labels/imXX.tif`. A label image is a uint8 array with the shape of the
    image that is 1 at the annotated coordinates and 0 everywhere else.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder that contains the 'train' and 'test' data, i.e. `path`.

    Raises:
        RuntimeError: If a split of the extracted data does not contain the same
            number of images and annotation files.
    """
    os.makedirs(path, exist_ok=True)

    train_path = os.path.join(path, "train")
    test_path = os.path.join(path, "test")

    # The converted data is already there, so neither a download nor a conversion is needed.
    if os.path.exists(train_path) and os.path.exists(test_path):
        return path

    dl_path = os.path.join(path, "cell_detect.tar.gz")
    util.download_source(dl_path, URL, download, CHECKSUM)
    # NOTE(review): the last argument presumably removes the archive after unpacking -
    # confirm against `util.unzip_tarfile`.
    util.unzip_tarfile(dl_path, path, True)

    extracted_path = os.path.join(path, "CellDetect_v1.0")
    assert os.path.exists(extracted_path), extracted_path

    splits = (("trainPhasecontrast", train_path), ("testPhasecontrast", test_path))
    for split_in, out_folder in splits:
        split_root = os.path.join(extracted_path, "phasecontrast", split_in)
        image_files = sorted(glob(os.path.join(split_root, "*.pgm")))
        mat_files = sorted(glob(os.path.join(split_root, "*.mat")))

        # Images and annotations are paired by their sorted order. `zip` would silently
        # drop surplus files, so a mismatch is reported before anything is written for the split.
        if len(image_files) != len(mat_files):
            raise RuntimeError(
                f"Expected one annotation file per image in {split_root}, "
                f"but found {len(image_files)} images and {len(mat_files)} annotation files."
            )

        out_im_folder = os.path.join(out_folder, "images")
        os.makedirs(out_im_folder, exist_ok=True)

        out_label_folder = os.path.join(out_folder, "labels")
        os.makedirs(out_label_folder, exist_ok=True)

        # The output files are numbered starting from 1.
        for ii, (im_file, mat_file) in enumerate(zip(image_files, mat_files), 1):
            im = imageio.imread(im_file)

            # The stored annotations are shifted by one
            # (presumably because they use MATLAB's 1-based indexing - TODO confirm).
            coordinates = loadmat(mat_file)["gt"] - 1
            # The two columns are swapped to index the image array
            # (presumably 'gt' stores (x, y) while the array is indexed as (y, x) - TODO confirm).
            index = (coordinates[:, 1], coordinates[:, 0])

            out_name = f"im{ii:02}.tif"
            imageio.imwrite(os.path.join(out_im_folder, out_name), im, compression="zlib")

            labels = np.zeros(im.shape, dtype="uint8")
            labels[index] = 1
            imageio.imwrite(os.path.join(out_label_folder, out_name), labels, compression="zlib")

    # The extracted raw data is not needed anymore once both splits are converted.
    rmtree(extracted_path)
    return path

Download the HeLA VGG dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the folder that contains the 'train' and 'test' data.

def get_vgg_hela_paths(path: Union[os.PathLike, str], split: str, download: bool = False) -> Tuple[str, str]:
    """Return the image and label folders of one split of the HeLa VGG data.

    Calls `get_vgg_hela_data` first, so that the data is prepared in `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the folder where image data is stored.
        Filepath to the folder where label data is stored.
    """
    get_vgg_hela_data(path, download)

    # Both folders are located in the sub-folder of the requested split.
    split_root = os.path.join(path, split)
    return os.path.join(split_root, "images"), os.path.join(split_root, "labels")

Get paths for HeLA VGG data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split to use for the dataset. Either 'train' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

A pair of filepaths: first the folder where the image data is stored, then the folder where the label data is stored.

def get_vgg_hela_dataset(
    path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, **kwargs
) -> Dataset:
    """Create the dataset for cell counting on the HeLa VGG data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    assert split in ("test", "train"), split

    raw_folder, label_folder = get_vgg_hela_paths(path, split, download)

    # Settings passed through `util.update_kwargs` for this dataset.
    for key, value in (("ndim", 2), ("is_seg_dataset", True)):
        kwargs = util.update_kwargs(kwargs, key, value)

    # Images and labels are both selected with the same '*.tif' pattern.
    tif_pattern = "*.tif"
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_folder,
        raw_key=tif_pattern,
        label_paths=label_folder,
        label_key=tif_pattern,
        patch_shape=patch_shape,
        **kwargs
    )

Get the HeLA VGG dataset for cell counting.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split to use for the dataset. Either 'train' or 'test'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_vgg_hela_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the dataloader for cell counting on the HeLa VGG data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_vgg_hela_dataset(path, split, patch_shape, download=download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )

Get the HeLA VGG dataloader for cell counting.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split to use for the dataset. Either 'train' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.