torch_em.data.datasets.histopathology.consep

The CoNSeP dataset contains annotations for nucleus segmentation in H&E stained histopathology images for multi-tissue regions.

NOTE: The source of this dataset is an open-source version hosted on Kaggle:
- https://www.kaggle.com/datasets/rftexas/tiled-consep-224x224px

This dataset is from the publication https://doi.org/10.1016/j.media.2019.101563. Please cite it if you use this dataset for your research.
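A minimal quick-start sketch for getting a training dataloader. The local folder and patch shape below are placeholder assumptions, and downloading from Kaggle typically requires configured Kaggle API credentials:

from torch_em.data.datasets.histopathology.consep import get_consep_loader

# Placeholder path and patch shape; adjust to your setup.
loader = get_consep_loader(
    path="./data/consep", batch_size=2, patch_shape=(512, 512), split="train", download=True
)
x, y = next(iter(loader))  # image batch and instance label batch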

  1"""The CoNSeP dataset contains annotations for nucleus segmentation in
  2H&E stained histopathology images for multi-tissue regions.
  3
  4NOTE: The source of this dataset is an open-source version hosted on Kaggle:
  5- https://www.kaggle.com/datasets/rftexas/tiled-consep-224x224px
  6
  7This dataset is from the publication https://doi.org/10.1016/j.media.2019.101563.
  8Please cite it if you use this dataset for your research.
  9"""
 10
 11import os
 12from glob import glob
 13from tqdm import tqdm
 14from pathlib import Path
 15from natsort import natsorted
 16from typing import Union, Tuple, List, Literal
 17
 18import h5py
 19import imageio.v3 as imageio
 20import numpy as np
 21import torch_em
 22
 23from elf.segmentation.stitching import stitch_tiled_segmentation
 24from scipy.io import loadmat
 25from skimage.measure import label as connected_components
 26from torch.utils.data import Dataset, DataLoader
 27
 28from .. import util
 29
 30
 31def _preprocess_image(raw_paths, label_paths, output_path):
 32
 33    # Find the start and stop coordinates for all tiles by parsing their filenames.
 34    tile_coordinates = []
 35    for path in raw_paths:
 36        tile_coords = tuple(int(coord) for coord in Path(path).stem.split("_")[2:])
 37        tile_coordinates.append(tile_coords)
 38
 39    # Find the dimension of the image as the maximum of the tile coordinates.
 40    h = max(coord[1] for coord in tile_coordinates)
 41    w = max(coord[3] for coord in tile_coordinates)
 42    shape = (h, w)
 43
 44    # Stitch together the image data.
 45    raw = np.zeros(shape + (3,), dtype="uint8")
 46    for path, coords in zip(raw_paths, tile_coordinates):
 47        tile_data = imageio.imread(path)
 48        y1, y2, x1, x2 = coords
 49        raw[y1:y2, x1:x2] = tile_data
 50
 51    # Stitch together the label data.
 52    # First, we load the labels and apply an offset so that we have unique ids.
 53    # Also, some parts of the labels are over-lapping and we make sure to only write it once.
 54    offset = 0
 55    labels = np.zeros(shape, dtype="uint32")
 56    written = np.zeros(shape, dtype=bool)
 57    for path, coords in zip(label_paths, tile_coordinates):
 58        y1, y2, x1, x2 = coords
 59
 60        tile_labels = loadmat(path)["instance_map"]
 61        tile_labels = connected_components(tile_labels).astype("uint32")
 62
 63        # Find the mask where we have labels in this tile, and where data was already written.
 64        tile_mask = tile_labels != 0
 65        tile_not_written = ~written[y1:y2, x1:x2]
 66
 67        # And intersect them.
 68        tile_mask = np.logical_and(tile_mask, tile_not_written)
 69
 70        # Add up the offset to this tile, unless it is empty.
 71        if tile_mask.sum() > 0:
 72            tile_labels[tile_mask] += offset
 73            offset = int(tile_labels.max())
 74
 75        # Write out what has been written and the labels.
 76        written[y1:y2, x1:x2][tile_mask] = 1
 77        labels[y1:y2, x1:x2][tile_mask] = tile_labels[tile_mask]
 78
 79    # Stitch the labels together.
 80    tile_shape = (224, 224)
 81    stitched_labels = stitch_tiled_segmentation(labels, tile_shape=tile_shape, overlap=1, verbose=False)
 82
 83    with h5py.File(output_path, "w") as f:
 84        f.create_dataset("raw", data=raw.transpose(2, 0, 1), compression="gzip")
 85        f.create_dataset("labels", data=stitched_labels, compression="gzip")
 86
 87
 88def _preprocess_data(data_dir, split):
 89    preprocessed_dir = os.path.join(data_dir, "preprocessed", split)
 90    os.makedirs(preprocessed_dir, exist_ok=True)
 91
 92    n_images = 28 if split == "train" else 15
 93    for image_id in tqdm(range(1, n_images), desc="Preprocessing inputs"):
 94        output_path = os.path.join(preprocessed_dir, f"{image_id}.h5")
 95        if os.path.exists(output_path):
 96            continue
 97
 98        raw_paths = natsorted(glob(os.path.join(data_dir, "tiles", f"{split}_{image_id}_*.png")))
 99        label_paths = [p.replace("tiles", "labels").replace(".png", ".mat") for p in raw_paths]
100        _preprocess_image(raw_paths, label_paths, output_path)
101
102
103def get_consep_data(path: Union[os.PathLike, str], download: bool = False) -> str:
104    """Download the CoNSeP dataset.
105
106    Args:
107        path: Filepath to a folder where the data is downloaded for further processing.
108        download: Whether to download the data if it is not present.
109
110    Returns:
111        Filepath where the data is downloaded and preprocessed.
112    """
113    data_dir = os.path.join(path, "data", "consep")
114    if os.path.exists(data_dir):
115        return data_dir
116
117    os.makedirs(path, exist_ok=True)
118
119    util.download_source_kaggle(path=path, dataset_name="rftexas/tiled-consep-224x224px", download=download)
120    util.unzip(zip_path=os.path.join(
121        path, "tiled-consep-224x224px.zip"), dst=os.path.join(path, "data"), remove=False
122    )
123
124    return data_dir
125
126
127def get_consep_paths(
128    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False
129) -> List[str]:
130    """Get paths to the CoNSeP data.
131
132    Args:
133        path: Filepath to a folder where the data is downloaded for further processing.
134        split: The choice of data split.
135        download: Whether to download the data if it is not present.
136
137    Returns:
138        List of filepaths for the input data.
139    """
140    data_dir = get_consep_data(path, download)
141
142    _preprocess_data(data_dir, "train")
143    _preprocess_data(data_dir, "test")
144
145    if split not in ["train", "test"]:
146        raise ValueError(f"'{split}' is not a valid split.")
147
148    paths = natsorted(glob(os.path.join(data_dir, "preprocessed", split, "*.h5")))
149    return paths
150
151
152def get_consep_dataset(
153    path: Union[os.PathLike, str],
154    patch_shape: Tuple[int, int],
155    split: Literal["train", "test"],
156    download: bool = False,
157    **kwargs
158) -> Dataset:
159    """Get the CoNSeP dataset for nucleus segmentation.
160
161    Args:
162        path: Filepath to a folder where the data is downloaded for further processing.
163        patch_shape: The patch shape to use for training.
164        split: The choice of data split.
165        download: Whether to download the data if it is not present.
166        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
167
168    Returns:
169        The segmentation dataset.
170    """
171    volume_paths = get_consep_paths(path, split, download)
172
173    return torch_em.default_segmentation_dataset(
174        raw_paths=volume_paths,
175        raw_key="raw",
176        label_paths=volume_paths,
177        label_key="labels",
178        is_seg_dataset=True,
179        patch_shape=patch_shape,
180        with_channels=True,
181        ndim=2,
182        **kwargs
183    )
184
185
186def get_consep_loader(
187    path: Union[os.PathLike, str],
188    batch_size: int,
189    patch_shape: Tuple[int, int],
190    split: Literal["train", "test"],
191    download: bool = False,
192    **kwargs
193) -> DataLoader:
194    """Get the CoNSeP dataloader for nucleus segmentation.
195
196    Args:
197        path: Filepath to a folder where the data is downloaded for further processing.
198        batch_size: The batch size for training.
199        patch_shape: The patch shape to use for training.
200        split: The choice of data split.
201        download: Whether to download the data if it is not present.
202        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
203
204    Returns:
205        The DataLoader.
206    """
207    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
208    dataset = get_consep_dataset(path, patch_shape, split, download, **ds_kwargs)
209    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_consep_data(path: Union[os.PathLike, str], download: bool = False) -> str:

Download the CoNSeP dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:
  • Filepath where the data is downloaded and preprocessed.
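A short usage sketch; the path below is a placeholder, and downloading requires access to the Kaggle dataset (typically via configured Kaggle credentials):

data_dir = get_consep_data(path="./data/consep", download=True)
# Returns the extracted data directory, i.e. <path>/data/consep.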

def get_consep_paths(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> List[str]:

Get paths to the CoNSeP data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
Returns:
  • List of filepaths for the input data.
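A usage sketch that lists the preprocessed volumes and inspects one of them with h5py; the path is a placeholder, and note that the first call also runs the tile stitching and preprocessing:

import h5py

paths = get_consep_paths(path="./data/consep", split="test", download=True)
with h5py.File(paths[0], "r") as f:
    print(f["raw"].shape)     # channel-first image data, (3, height, width)
    print(f["labels"].shape)  # stitched instance labels, (height, width)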

def get_consep_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the CoNSeP dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  • The segmentation dataset.
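A minimal sketch for creating the dataset and drawing one sample; the path and patch shape are placeholder assumptions:

dataset = get_consep_dataset(path="./data/consep", patch_shape=(512, 512), split="train", download=True)
x, y = dataset[0]  # image patch and corresponding instance label patch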

def get_consep_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the CoNSeP dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  • The DataLoader.
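Since util.split_kwargs separates the extra keyword arguments, dataset options and DataLoader options can be passed together in a single call. A hedged sketch with placeholder values:

loader = get_consep_loader(
    path="./data/consep",
    batch_size=4,
    patch_shape=(512, 512),
    split="train",
    download=True,
    num_workers=2,  # forwarded to the PyTorch DataLoader
    shuffle=True,   # forwarded to the PyTorch DataLoader
)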