torch_em.data.datasets.histopathology.glysac

The GLySAC dataset contains annotations for nuclei instance segmentation and classification in H&E stained gastric cancer histopathology images.

The dataset contains 59 image tiles of size 1000x1000 pixels with instance segmentation masks and cell type annotations. Three cell classes are provided: lymphocytes, epithelial cells (normal and tumor), and other cells.

NOTE: The dataset is hosted on Google Drive and requires gdown to download. Install it with: conda install -c conda-forge gdown==4.6.3

The dataset is located at https://drive.google.com/file/d/1g1_xYFWgp3cRLKrlSwD2U5JDjooC0yHp/view. This dataset is from the publication https://doi.org/10.1109/jbhi.2022.3149936. Please cite it if you use this dataset in your research.

  1"""The GLySAC dataset contains annotations for nuclei instance segmentation and
  2classification in H&E stained gastric cancer histopathology images.
  3
  4The dataset contains 59 image tiles of size 1000x1000 pixels with instance
  5segmentation masks and cell type annotations. Three cell classes are provided:
  6lymphocytes, epithelial cells (normal and tumor), and other cells.
  7
  8NOTE: The dataset is hosted on Google Drive and requires gdown to download.
  9Install it with: conda install -c conda-forge gdown==4.6.3
 10
 11The dataset is located at https://drive.google.com/file/d/1g1_xYFWgp3cRLKrlSwD2U5JDjooC0yHp/view
 12This dataset is from the publication https://doi.org/10.1109/jbhi.2022.3149936.
 13Please cite it if you use this dataset in your research.
 14"""
 15
 16import os
 17from glob import glob
 18from tqdm import tqdm
 19from natsort import natsorted
 20from typing import List, Literal, Tuple, Union
 21
 22import h5py
 23import imageio.v3 as imageio
 24from scipy.io import loadmat
 25from torch.utils.data import Dataset, DataLoader
 26
 27import torch_em
 28
 29from .. import util
 30
 31
 32GDRIVE_ID = "1g1_xYFWgp3cRLKrlSwD2U5JDjooC0yHp"
 33URL = f"https://drive.google.com/uc?id={GDRIVE_ID}"
 34CHECKSUM = None
 35
 36
 37def _create_h5_files(data_dir: str, split: str) -> None:
 38    folder = "Train" if split == "train" else "Test"
 39    image_dir = os.path.join(data_dir, folder, "Images")
 40    label_dir = os.path.join(data_dir, folder, "Labels")
 41    h5_dir = os.path.join(data_dir, "h5", split)
 42    os.makedirs(h5_dir, exist_ok=True)
 43
 44    image_paths = natsorted(glob(os.path.join(image_dir, "*.png")))
 45    for image_path in tqdm(image_paths, desc=f"Preprocessing {split}"):
 46        fname = os.path.splitext(os.path.basename(image_path))[0]
 47        h5_path = os.path.join(h5_dir, f"{fname}.h5")
 48        if os.path.exists(h5_path):
 49            continue
 50
 51        label_path = os.path.join(label_dir, f"{fname}.mat")
 52        raw = imageio.imread(image_path)[..., :3]
 53        mat = loadmat(label_path)
 54        inst_map = mat["inst_map"].astype("int32")
 55        type_map = mat["type_map"].astype("int32")
 56
 57        with h5py.File(h5_path, "w") as f:
 58            f.create_dataset("raw", data=raw.transpose(2, 0, 1), compression="gzip")
 59            f.create_dataset("labels/instances", data=inst_map, compression="gzip")
 60            f.create_dataset("labels/semantic", data=type_map, compression="gzip")
 61
 62
 63def get_glysac_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 64    """Download the GLySAC dataset.
 65
 66    Args:
 67        path: Filepath to a folder where the downloaded data will be saved.
 68        download: Whether to download the data if it is not present.
 69
 70    Returns:
 71        The filepath to the data directory.
 72    """
 73    data_dir = os.path.join(path, "glysac_dataset")
 74    if os.path.exists(data_dir):
 75        return data_dir
 76
 77    os.makedirs(path, exist_ok=True)
 78    zip_path = os.path.join(path, "glysac_dataset.zip")
 79    util.download_source_gdrive(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
 80    util.unzip(zip_path, path)
 81
 82    return data_dir
 83
 84
 85def get_glysac_paths(
 86    path: Union[os.PathLike, str],
 87    split: Literal["train", "test"],
 88    download: bool = False,
 89) -> List[str]:
 90    """Get paths to the GLySAC data.
 91
 92    Args:
 93        path: Filepath to a folder where the downloaded data will be saved.
 94        split: The data split to use. Either 'train' or 'test'.
 95        download: Whether to download the data if it is not present.
 96
 97    Returns:
 98        List of filepaths for the h5 data.
 99    """
100    if split not in ("train", "test"):
101        raise ValueError(f"'{split}' is not a valid split. Choose from 'train' or 'test'.")
102
103    data_dir = get_glysac_data(path, download)
104    _create_h5_files(data_dir, split)
105
106    h5_paths = natsorted(glob(os.path.join(data_dir, "h5", split, "*.h5")))
107    if len(h5_paths) == 0:
108        raise RuntimeError(f"No data found for split '{split}'. Check the dataset at {data_dir}.")
109
110    return h5_paths
111
112
113def get_glysac_dataset(
114    path: Union[os.PathLike, str],
115    patch_shape: Tuple[int, int],
116    split: Literal["train", "test"],
117    label_choice: Literal["instances", "semantic"] = "instances",
118    download: bool = False,
119    **kwargs,
120) -> Dataset:
121    """Get the GLySAC dataset for gastric nuclei segmentation.
122
123    Args:
124        path: Filepath to a folder where the downloaded data will be saved.
125        patch_shape: The patch shape to use for training.
126        split: The data split to use. Either 'train' or 'test'.
127        label_choice: The type of labels to load. Either 'instances' for instance segmentation
128            or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
129        download: Whether to download the data if it is not present.
130        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
131
132    Returns:
133        The segmentation dataset.
134    """
135    if label_choice not in ("instances", "semantic"):
136        raise ValueError(f"'{label_choice}' is not a valid label choice. Use 'instances' or 'semantic'.")
137
138    h5_paths = get_glysac_paths(path, split, download)
139
140    if label_choice == "instances":
141        kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
142    kwargs = util.ensure_transforms(ndim=2, **kwargs)
143
144    return torch_em.default_segmentation_dataset(
145        raw_paths=h5_paths,
146        raw_key="raw",
147        label_paths=h5_paths,
148        label_key=f"labels/{label_choice}",
149        patch_shape=patch_shape,
150        with_channels=True,
151        ndim=2,
152        **kwargs,
153    )
154
155
156def get_glysac_loader(
157    path: Union[os.PathLike, str],
158    batch_size: int,
159    patch_shape: Tuple[int, int],
160    split: Literal["train", "test"],
161    label_choice: Literal["instances", "semantic"] = "instances",
162    download: bool = False,
163    **kwargs,
164) -> DataLoader:
165    """Get the GLySAC dataloader for gastric nuclei segmentation.
166
167    Args:
168        path: Filepath to a folder where the downloaded data will be saved.
169        batch_size: The batch size for training.
170        patch_shape: The patch shape to use for training.
171        split: The data split to use. Either 'train' or 'test'.
172        label_choice: The type of labels to load. Either 'instances' for instance segmentation
173            or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
174        download: Whether to download the data if it is not present.
175        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
176
177    Returns:
178        The DataLoader.
179    """
180    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
181    dataset = get_glysac_dataset(path, patch_shape, split, label_choice, download, **ds_kwargs)
182    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
GDRIVE_ID = '1g1_xYFWgp3cRLKrlSwD2U5JDjooC0yHp'
URL = 'https://drive.google.com/uc?id=1g1_xYFWgp3cRLKrlSwD2U5JDjooC0yHp'
CHECKSUM = None
def get_glysac_data(path: Union[os.PathLike, str], download: bool = False) -> str:
64def get_glysac_data(path: Union[os.PathLike, str], download: bool = False) -> str:
65    """Download the GLySAC dataset.
66
67    Args:
68        path: Filepath to a folder where the downloaded data will be saved.
69        download: Whether to download the data if it is not present.
70
71    Returns:
72        The filepath to the data directory.
73    """
74    data_dir = os.path.join(path, "glysac_dataset")
75    if os.path.exists(data_dir):
76        return data_dir
77
78    os.makedirs(path, exist_ok=True)
79    zip_path = os.path.join(path, "glysac_dataset.zip")
80    util.download_source_gdrive(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
81    util.unzip(zip_path, path)
82
83    return data_dir

Download the GLySAC dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the data directory.

def get_glysac_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> List[str]:
 86def get_glysac_paths(
 87    path: Union[os.PathLike, str],
 88    split: Literal["train", "test"],
 89    download: bool = False,
 90) -> List[str]:
 91    """Get paths to the GLySAC data.
 92
 93    Args:
 94        path: Filepath to a folder where the downloaded data will be saved.
 95        split: The data split to use. Either 'train' or 'test'.
 96        download: Whether to download the data if it is not present.
 97
 98    Returns:
 99        List of filepaths for the h5 data.
100    """
101    if split not in ("train", "test"):
102        raise ValueError(f"'{split}' is not a valid split. Choose from 'train' or 'test'.")
103
104    data_dir = get_glysac_data(path, download)
105    _create_h5_files(data_dir, split)
106
107    h5_paths = natsorted(glob(os.path.join(data_dir, "h5", split, "*.h5")))
108    if len(h5_paths) == 0:
109        raise RuntimeError(f"No data found for split '{split}'. Check the dataset at {data_dir}.")
110
111    return h5_paths

Get paths to the GLySAC data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the h5 data.

def get_glysac_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], label_choice: Literal['instances', 'semantic'] = 'instances', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
114def get_glysac_dataset(
115    path: Union[os.PathLike, str],
116    patch_shape: Tuple[int, int],
117    split: Literal["train", "test"],
118    label_choice: Literal["instances", "semantic"] = "instances",
119    download: bool = False,
120    **kwargs,
121) -> Dataset:
122    """Get the GLySAC dataset for gastric nuclei segmentation.
123
124    Args:
125        path: Filepath to a folder where the downloaded data will be saved.
126        patch_shape: The patch shape to use for training.
127        split: The data split to use. Either 'train' or 'test'.
128        label_choice: The type of labels to load. Either 'instances' for instance segmentation
129            or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
130        download: Whether to download the data if it is not present.
131        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
132
133    Returns:
134        The segmentation dataset.
135    """
136    if label_choice not in ("instances", "semantic"):
137        raise ValueError(f"'{label_choice}' is not a valid label choice. Use 'instances' or 'semantic'.")
138
139    h5_paths = get_glysac_paths(path, split, download)
140
141    if label_choice == "instances":
142        kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
143    kwargs = util.ensure_transforms(ndim=2, **kwargs)
144
145    return torch_em.default_segmentation_dataset(
146        raw_paths=h5_paths,
147        raw_key="raw",
148        label_paths=h5_paths,
149        label_key=f"labels/{label_choice}",
150        patch_shape=patch_shape,
151        with_channels=True,
152        ndim=2,
153        **kwargs,
154    )

Get the GLySAC dataset for gastric nuclei segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train' or 'test'.
  • label_choice: The type of labels to load. Either 'instances' for instance segmentation or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_glysac_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'], label_choice: Literal['instances', 'semantic'] = 'instances', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
157def get_glysac_loader(
158    path: Union[os.PathLike, str],
159    batch_size: int,
160    patch_shape: Tuple[int, int],
161    split: Literal["train", "test"],
162    label_choice: Literal["instances", "semantic"] = "instances",
163    download: bool = False,
164    **kwargs,
165) -> DataLoader:
166    """Get the GLySAC dataloader for gastric nuclei segmentation.
167
168    Args:
169        path: Filepath to a folder where the downloaded data will be saved.
170        batch_size: The batch size for training.
171        patch_shape: The patch shape to use for training.
172        split: The data split to use. Either 'train' or 'test'.
173        label_choice: The type of labels to load. Either 'instances' for instance segmentation
174            or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
175        download: Whether to download the data if it is not present.
176        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
177
178    Returns:
179        The DataLoader.
180    """
181    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
182    dataset = get_glysac_dataset(path, patch_shape, split, label_choice, download, **ds_kwargs)
183    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the GLySAC dataloader for gastric nuclei segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train' or 'test'.
  • label_choice: The type of labels to load. Either 'instances' for instance segmentation or 'semantic' for cell type classification (4 classes: other, lymphocyte, epithelial, ambiguous).
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.