torch_em.data.datasets.histopathology.ignite

The IGNITE dataset contains semantic tissue segmentations in H&E-stained NSCLC and centroid annotations in IHC-stained NSCLC.

The dataset is located at https://doi.org/10.5281/zenodo.15674784. This dataset is from the publication https://doi.org/10.48550/arXiv.2507.16855. Please cite it if you use this dataset in your research.

View Source

  1"""The IGNITE dataset contains semantic tissue segmentations in H&E-stained NSCLC
  2and centroid annotations in IHC-stained NSCLC.
  3
  4The dataset is located at https://doi.org/10.5281/zenodo.15674784.
  5This dataset is from the publication https://doi.org/10.48550/arXiv.2507.16855.
  6Please cite it if you use this dataset in your research.
  7"""
  8
  9import os
 10from pathlib import Path
 11from typing import List, Literal, Optional, Tuple, Union
 12
 13import pandas as pd
 14from natsort import natsorted
 15from torch.utils.data import DataLoader, Dataset
 16
 17import torch_em
 18
 19from .. import util
 20
# Download locations of the files that make up the dataset, keyed by data entity name.
# `get_ignite_data` looks up the annotation type, "images" and "data_overview" in this mapping.
URLS = {
    "tissue_annotations": "https://zenodo.org/records/15674785/files/annotations.zip",
    "images": "https://zenodo.org/records/15674785/files/images.zip",
    "data_overview": "https://zenodo.org/records/15674785/files/data_overview.csv",
}

# Expected checksum for each entry of `URLS`; handed to `util.download_source` alongside the URL.
# NOTE(review): the 64-character hex digests look like SHA-256 - confirm against `util.download_source`.
CHECKSUMS = {
    "tissue_annotations": "b333fab032735de87563c5510de38fc5e2dccc0903a787f7b2b9bd249e66713b",
    "images": "12389313f7f05a6dfb1a15b4aa94a8b16ec4a61a9daf2e86ca6e0a19db2b7628",
    "data_overview": "fa693185d602b9fa91b5556fb622c82c1761759829d593923537f2e774cf8def",
}
 32
 33
 34def get_split_samples(path: Path, split: str):
 35    df = pd.read_csv(path / "data_overview.csv", index_col="image_path")
 36    split_paths = df[(df["split"] == split) & (df["stain"] == "H&E")].index.tolist()
 37    return [Path(p).name for p in split_paths]
 38
 39
 40def get_ignite_data(
 41    path: Union[os.PathLike, str],
 42    download: bool = False,
 43    annotation_type: str = "tissue_annotations",
 44) -> str:
 45    """Download the IGNITE dataset for tissue segmentation.
 46
 47    Args:
 48        path: Filepath to a folder where the downloaded data will be saved.
 49        download: Whether to download the data if it is not present.
 50        annotation_type: The type of annotations. Only "tissue_annotations" is currently supported.
 51
 52    Returns:
 53        The filepath to the downloaded data.
 54    """
 55    path = Path(path)
 56
 57    if annotation_type != "tissue_annotations":
 58        raise NotImplementedError(f"Annotation loading for {annotation_type} is not implemented.")
 59
 60    for data_entity in [annotation_type, "images"]:
 61        data_dir = path / "data" / data_entity
 62        if data_dir.exists():
 63            continue
 64
 65        data_dir.mkdir(parents=True, exist_ok=True)
 66
 67        zip_path = path / f"{data_entity}.zip"
 68        util.download_source(path=zip_path, url=URLS[data_entity], download=download, checksum=CHECKSUMS[data_entity])
 69        util.unzip(zip_path=zip_path, dst=data_dir)
 70
 71    util.download_source(
 72        path=path / "data_overview.csv",
 73        url=URLS["data_overview"],
 74        download=download,
 75        checksum=CHECKSUMS["data_overview"],
 76    )
 77
 78    return path / "data"
 79
 80
 81def get_ignite_paths(
 82    path: Union[os.PathLike, str],
 83    split: Optional[Literal["train", "test"]] = None,
 84    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
 85    download: bool = False,
 86) -> Tuple[List[str], List[str]]:
 87    """Get paths to the IGNITE data.
 88
 89    Args:
 90        path: Filepath to a folder where the downloaded data will be saved.
 91        split: The choice of data split.
 92        annotation_type: The type of annotations.
 93        download: Whether to download the data if it is not present.
 94
 95    Returns:
 96        List of filepaths to the image data.
 97        List of filepaths to the label data.
 98    """
 99    data_dir = get_ignite_data(Path(path), download, annotation_type=annotation_type)
100
101    annotation_dir = data_dir / "tissue_annotations" / "annotations" / "he"
102    img_dir = data_dir / "images" / "images" / "he"
103
104    if split is not None:
105        split_filenames = get_split_samples(Path(path), split)
106        img_paths = natsorted([str(img_dir / fn) for fn in split_filenames])
107        annotation_paths = natsorted([str(annotation_dir / fn) for fn in split_filenames])
108    else:
109        img_paths = natsorted([str(p) for p in img_dir.iterdir() if not p.stem.endswith("context")])
110        annotation_paths = natsorted([str(p) for p in annotation_dir.iterdir() if not p.stem.endswith("context")])
111
112    assert len(img_paths) == len(annotation_paths) and len(img_paths) > 0, "The inputs are not of expected length."
113
114    return img_paths, annotation_paths
115
116
def get_ignite_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "test"]] = None,
    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the IGNITE segmentation dataset for tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation_type: The type of annotations.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, annotation_paths = get_ignite_paths(path, split, annotation_type, download)

    if resize_inputs:
        # The helper returns the updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=annotation_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs,
    )
    return dataset
160
161
def get_ignite_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "test"]] = None,
    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the IGNITE dataloader for tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation_type: The type of annotations.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the remaining ones for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_ignite_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        annotation_type=annotation_type,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader

URLS = {'tissue_annotations': 'https://zenodo.org/records/15674785/files/annotations.zip', 'images': 'https://zenodo.org/records/15674785/files/images.zip', 'data_overview': 'https://zenodo.org/records/15674785/files/data_overview.csv'}

CHECKSUMS = {'tissue_annotations': 'b333fab032735de87563c5510de38fc5e2dccc0903a787f7b2b9bd249e66713b', 'images': '12389313f7f05a6dfb1a15b4aa94a8b16ec4a61a9daf2e86ca6e0a19db2b7628', 'data_overview': 'fa693185d602b9fa91b5556fb622c82c1761759829d593923537f2e774cf8def'}

def get_split_samples(path: pathlib.Path, split: str): View Source

def get_split_samples(path: Path, split: str):
    """Collect the file names of the H&E images that belong to one data split.

    Reads `data_overview.csv` from `path` with its `image_path` column as the index and keeps
    the rows whose `split` column equals `split` and whose `stain` column equals "H&E".

    Args:
        path: Folder that contains `data_overview.csv`.
        split: Value of the `split` column to select.

    Returns:
        The last path component of every selected `image_path` entry, in table order.
    """
    overview = pd.read_csv(path / "data_overview.csv", index_col="image_path")

    # Build both row filters separately, then combine them into one selection.
    in_split = overview["split"] == split
    is_he = overview["stain"] == "H&E"
    selected = overview.loc[in_split & is_he]

    return [Path(image_path).name for image_path in selected.index]

def get_ignite_data( path: Union[os.PathLike, str], download: bool = False, annotation_type: str = 'tissue_annotations') -> str: View Source

def get_ignite_data(
    path: Union[os.PathLike, str],
    download: bool = False,
    annotation_type: str = "tissue_annotations",
) -> Path:
    """Download the IGNITE dataset for tissue segmentation.

    The annotation archive and the image archive are each extracted into their own
    subfolder of `<path>/data`, and `data_overview.csv` is stored directly in `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
        annotation_type: The type of annotations. Only "tissue_annotations" is currently supported.

    Returns:
        The filepath to the downloaded data, i.e. the `data` folder inside `path`.

    Raises:
        NotImplementedError: If `annotation_type` is not "tissue_annotations".
    """
    path = Path(path)

    if annotation_type != "tissue_annotations":
        raise NotImplementedError(f"Annotation loading for {annotation_type} is not implemented.")

    data_root = path / "data"

    for data_entity in (annotation_type, "images"):
        data_dir = data_root / data_entity

        # Only a populated folder counts as "already there". An empty folder can be left
        # behind by an earlier attempt that failed and must not block a retry.
        if data_dir.is_dir() and any(data_dir.iterdir()):
            continue

        # The archive is written next to the `data` folder, so the root has to exist first.
        path.mkdir(parents=True, exist_ok=True)

        zip_path = path / f"{data_entity}.zip"
        util.download_source(path=zip_path, url=URLS[data_entity], download=download, checksum=CHECKSUMS[data_entity])

        # Create the target folder only after the archive is available, so that a failed
        # download does not leave a folder that looks like extracted data.
        data_dir.mkdir(parents=True, exist_ok=True)
        util.unzip(zip_path=zip_path, dst=data_dir)

    util.download_source(
        path=path / "data_overview.csv",
        url=URLS["data_overview"],
        download=download,
        checksum=CHECKSUMS["data_overview"],
    )

    return data_root

Download the IGNITE dataset for tissue segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.
annotation_type: The type of annotations. Only "tissue_annotations" is currently supported.

Returns:

The filepath to the downloaded data.

def get_ignite_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'test']] = None, annotation_type: Optional[Literal['tissue_annotations']] = 'tissue_annotations', download: bool = False) -> Tuple[List[str], List[str]]: View Source

 82def get_ignite_paths(
 83    path: Union[os.PathLike, str],
 84    split: Optional[Literal["train", "test"]] = None,
 85    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
 86    download: bool = False,
 87) -> Tuple[List[str], List[str]]:
 88    """Get paths to the IGNITE data.
 89
 90    Args:
 91        path: Filepath to a folder where the downloaded data will be saved.
 92        split: The choice of data split.
 93        annotation_type: The type of annotations.
 94        download: Whether to download the data if it is not present.
 95
 96    Returns:
 97        List of filepaths to the image data.
 98        List of filepaths to the label data.
 99    """
100    data_dir = get_ignite_data(Path(path), download, annotation_type=annotation_type)
101
102    annotation_dir = data_dir / "tissue_annotations" / "annotations" / "he"
103    img_dir = data_dir / "images" / "images" / "he"
104
105    if split is not None:
106        split_filenames = get_split_samples(Path(path), split)
107        img_paths = natsorted([str(img_dir / fn) for fn in split_filenames])
108        annotation_paths = natsorted([str(annotation_dir / fn) for fn in split_filenames])
109    else:
110        img_paths = natsorted([str(p) for p in img_dir.iterdir() if not p.stem.endswith("context")])
111        annotation_paths = natsorted([str(p) for p in annotation_dir.iterdir() if not p.stem.endswith("context")])
112
113    assert len(img_paths) == len(annotation_paths) and len(img_paths) > 0, "The inputs are not of expected length."
114
115    return img_paths, annotation_paths

Get paths to the IGNITE data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The choice of data split.
annotation_type: The type of annotations.
download: Whether to download the data if it is not present.

Returns:

List of filepaths to the image data. List of filepaths to the label data.

def get_ignite_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[Literal['train', 'test']] = None, annotation_type: Optional[Literal['tissue_annotations']] = 'tissue_annotations', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_ignite_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "test"]] = None,
    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the IGNITE segmentation dataset for tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation_type: The type of annotations.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, annotation_paths = get_ignite_paths(path, split, annotation_type, download)

    if resize_inputs:
        # The helper returns the updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=annotation_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs,
    )
    return dataset

Get the IGNITE dataset for tissue segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The choice of data split.
annotation_type: The type of annotations.
resize_inputs: Whether to resize the inputs.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_ignite_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[Literal['train', 'test']] = None, annotation_type: Optional[Literal['tissue_annotations']] = 'tissue_annotations', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_ignite_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "test"]] = None,
    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the IGNITE dataloader for tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation_type: The type of annotations.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the remaining ones for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_ignite_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        annotation_type=annotation_type,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader

Get the IGNITE dataloader for tissue segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The choice of data split.
annotation_type: The type of annotations.
resize_inputs: Whether to resize the inputs.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.