torch_em.data.datasets.histopathology.ignite
The IGNITE dataset contains semantic tissue segmentations in H&E-stained NSCLC and centroid annotations in IHC-stained NSCLC.
The dataset is located at https://doi.org/10.5281/zenodo.15674784. This dataset is from the publication https://doi.org/10.48550/arXiv.2507.16855. Please cite it if you use this dataset in your research.
1"""The IGNITE dataset contains semantic tissue segmentations in H&E-stained NSCLC 2and centroid annotations in IHC-stained NSCLC. 3 4The dataset is located at https://doi.org/10.5281/zenodo.15674784. 5This dataset is from the publication https://doi.org/10.48550/arXiv.2507.16855. 6Please cite it if you use this dataset in your research. 7""" 8 9import os 10from pathlib import Path 11from typing import List, Literal, Optional, Tuple, Union 12 13import pandas as pd 14from natsort import natsorted 15from torch.utils.data import DataLoader, Dataset 16 17import torch_em 18 19from .. import util 20 21URLS = { 22 "tissue_annotations": "https://zenodo.org/records/15674785/files/annotations.zip", 23 "images": "https://zenodo.org/records/15674785/files/images.zip", 24 "data_overview": "https://zenodo.org/records/15674785/files/data_overview.csv", 25} 26 27CHECKSUMS = { 28 "tissue_annotations": "b333fab032735de87563c5510de38fc5e2dccc0903a787f7b2b9bd249e66713b", 29 "images": "12389313f7f05a6dfb1a15b4aa94a8b16ec4a61a9daf2e86ca6e0a19db2b7628", 30 "data_overview": "fa693185d602b9fa91b5556fb622c82c1761759829d593923537f2e774cf8def", 31} 32 33 34def get_split_samples(path: Path, split: str): 35 df = pd.read_csv(path / "data_overview.csv", index_col="image_path") 36 split_paths = df[(df["split"] == split) & (df["stain"] == "H&E")].index.tolist() 37 return [Path(p).name for p in split_paths] 38 39 40def get_ignite_data( 41 path: Union[os.PathLike, str], 42 download: bool = False, 43 annotation_type: str = "tissue_annotations", 44) -> str: 45 """Download the IGNITE dataset for tissue segmentation. 46 47 Args: 48 path: Filepath to a folder where the downloaded data will be saved. 49 download: Whether to download the data if it is not present. 50 annotation_type: The type of annotations. Only "tissue_annotations" is currently supported. 51 52 Returns: 53 The filepath to the downloaded data. 
54 """ 55 path = Path(path) 56 57 if annotation_type != "tissue_annotations": 58 raise NotImplementedError(f"Annotation loading for {annotation_type} is not implemented.") 59 60 for data_entity in [annotation_type, "images"]: 61 data_dir = path / "data" / data_entity 62 if data_dir.exists(): 63 continue 64 65 data_dir.mkdir(parents=True, exist_ok=True) 66 67 zip_path = path / f"{data_entity}.zip" 68 util.download_source(path=zip_path, url=URLS[data_entity], download=download, checksum=CHECKSUMS[data_entity]) 69 util.unzip(zip_path=zip_path, dst=data_dir) 70 71 util.download_source( 72 path=path / "data_overview.csv", 73 url=URLS["data_overview"], 74 download=download, 75 checksum=CHECKSUMS["data_overview"], 76 ) 77 78 return path / "data" 79 80 81def get_ignite_paths( 82 path: Union[os.PathLike, str], 83 split: Optional[Literal["train", "test"]] = None, 84 annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations", 85 download: bool = False, 86) -> Tuple[List[str], List[str]]: 87 """Get paths to the IGNITE data. 88 89 Args: 90 path: Filepath to a folder where the downloaded data will be saved. 91 split: The choice of data split. 92 annotation_type: The type of annotations. 93 download: Whether to download the data if it is not present. 94 95 Returns: 96 List of filepaths to the image data. 97 List of filepaths to the label data. 
98 """ 99 data_dir = get_ignite_data(Path(path), download, annotation_type=annotation_type) 100 101 annotation_dir = data_dir / "tissue_annotations" / "annotations" / "he" 102 img_dir = data_dir / "images" / "images" / "he" 103 104 if split is not None: 105 split_filenames = get_split_samples(Path(path), split) 106 img_paths = natsorted([str(img_dir / fn) for fn in split_filenames]) 107 annotation_paths = natsorted([str(annotation_dir / fn) for fn in split_filenames]) 108 else: 109 img_paths = natsorted([str(p) for p in img_dir.iterdir() if not p.stem.endswith("context")]) 110 annotation_paths = natsorted([str(p) for p in annotation_dir.iterdir() if not p.stem.endswith("context")]) 111 112 assert len(img_paths) == len(annotation_paths) and len(img_paths) > 0, "The inputs are not of expected length." 113 114 return img_paths, annotation_paths 115 116 117def get_ignite_dataset( 118 path: Union[os.PathLike, str], 119 patch_shape: Tuple[int, int], 120 split: Optional[Literal["train", "test"]] = None, 121 annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations", 122 resize_inputs: bool = False, 123 download: bool = False, 124 **kwargs, 125) -> Dataset: 126 """Get the IGNITE dataset for tissue segmentation. 127 128 Args: 129 path: Filepath to a folder where the downloaded data will be saved. 130 patch_shape: The patch shape to use for training. 131 split: The choice of data split. 132 annotation_type: The type of annotations. 133 resize_inputs: Whether to resize the inputs. 134 download: Whether to download the data if it is not present. 135 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 136 137 Returns: 138 The segmentation dataset. 
139 """ 140 raw_paths, label_paths = get_ignite_paths(path, split, annotation_type, download) 141 142 if resize_inputs: 143 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True} 144 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 145 kwargs=kwargs, 146 patch_shape=patch_shape, 147 resize_inputs=resize_inputs, 148 resize_kwargs=resize_kwargs, 149 ) 150 151 return torch_em.default_segmentation_dataset( 152 raw_paths=raw_paths, 153 raw_key=None, 154 label_paths=label_paths, 155 label_key=None, 156 patch_shape=patch_shape, 157 is_seg_dataset=False, 158 **kwargs, 159 ) 160 161 162def get_ignite_loader( 163 path: Union[os.PathLike, str], 164 batch_size: int, 165 patch_shape: Tuple[int, int], 166 split: Optional[Literal["train", "test"]] = None, 167 annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations", 168 resize_inputs: bool = False, 169 download: bool = False, 170 **kwargs, 171) -> DataLoader: 172 """Get the IGNITE dataloader for tissue segmentation. 173 174 Args: 175 path: Filepath to a folder where the downloaded data will be saved. 176 batch_size: The batch size for training. 177 patch_shape: The patch shape to use for training. 178 split: The choice of data split. 179 annotation_type: The type of annotations. 180 resize_inputs: Whether to resize the inputs. 181 download: Whether to download the data if it is not present. 182 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 183 184 Returns: 185 The DataLoader. 186 """ 187 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 188 dataset = get_ignite_dataset(path, patch_shape, split, annotation_type, resize_inputs, download, **ds_kwargs) 189 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS =
{'tissue_annotations': 'https://zenodo.org/records/15674785/files/annotations.zip', 'images': 'https://zenodo.org/records/15674785/files/images.zip', 'data_overview': 'https://zenodo.org/records/15674785/files/data_overview.csv'}
CHECKSUMS =
{'tissue_annotations': 'b333fab032735de87563c5510de38fc5e2dccc0903a787f7b2b9bd249e66713b', 'images': '12389313f7f05a6dfb1a15b4aa94a8b16ec4a61a9daf2e86ca6e0a19db2b7628', 'data_overview': 'fa693185d602b9fa91b5556fb622c82c1761759829d593923537f2e774cf8def'}
def
get_split_samples(path: pathlib.Path, split: str):
def
get_ignite_data( path: Union[os.PathLike, str], download: bool = False, annotation_type: str = 'tissue_annotations') -> str:
def get_ignite_data(
    path: Union[os.PathLike, str],
    download: bool = False,
    annotation_type: str = "tissue_annotations",
) -> Path:
    """Download the IGNITE dataset for tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
        annotation_type: The type of annotations. Only "tissue_annotations" is currently supported.

    Returns:
        The filepath to the folder with the extracted data (`<path>/data`).

    Raises:
        NotImplementedError: If `annotation_type` is not "tissue_annotations".
    """
    path = Path(path)

    if annotation_type != "tissue_annotations":
        raise NotImplementedError(f"Annotation loading for {annotation_type} is not implemented.")

    # The archives are written directly into the root folder, so it has to exist.
    path.mkdir(parents=True, exist_ok=True)

    for data_entity in (annotation_type, "images"):
        data_dir = path / "data" / data_entity
        if data_dir.exists():
            continue

        zip_path = path / f"{data_entity}.zip"
        util.download_source(path=zip_path, url=URLS[data_entity], download=download, checksum=CHECKSUMS[data_entity])

        # Create the target folder only once the archive is available. Its existence is the
        # "already extracted" marker above, so it must not be left behind by a failed download.
        data_dir.mkdir(parents=True, exist_ok=True)
        util.unzip(zip_path=zip_path, dst=data_dir)

    util.download_source(
        path=path / "data_overview.csv",
        url=URLS["data_overview"],
        download=download,
        checksum=CHECKSUMS["data_overview"],
    )

    return path / "data"
Download the IGNITE dataset for tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
- annotation_type: The type of annotations. Only "tissue_annotations" is currently supported.
Returns:
The filepath to the downloaded data.
def
get_ignite_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'test']] = None, annotation_type: Optional[Literal['tissue_annotations']] = 'tissue_annotations', download: bool = False) -> Tuple[List[str], List[str]]:
def get_ignite_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "test"]] = None,
    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the IGNITE data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split. If None, all files in the H&E folders are used,
            except those whose stem ends with "context".
        annotation_type: The type of annotations.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        AssertionError: If no files were found or the number of images and labels differs.
    """
    data_dir = get_ignite_data(Path(path), download, annotation_type=annotation_type)

    annotation_dir = data_dir / "tissue_annotations" / "annotations" / "he"
    img_dir = data_dir / "images" / "images" / "he"

    if split is not None:
        split_filenames = get_split_samples(Path(path), split)
        img_paths = natsorted([str(img_dir / fn) for fn in split_filenames])
        annotation_paths = natsorted([str(annotation_dir / fn) for fn in split_filenames])
    else:
        img_paths = natsorted([str(p) for p in img_dir.iterdir() if not p.stem.endswith("context")])
        annotation_paths = natsorted([str(p) for p in annotation_dir.iterdir() if not p.stem.endswith("context")])

    # Raised explicitly (instead of via an `assert` statement) so that the check is not
    # stripped under `python -O`. The exception type is kept for backward compatibility.
    if len(img_paths) != len(annotation_paths) or len(img_paths) == 0:
        raise AssertionError("The inputs are not of expected length.")

    return img_paths, annotation_paths
Get paths to the IGNITE data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- annotation_type: The type of annotations.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the image data. List of filepaths to the label data.
def
get_ignite_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[Literal['train', 'test']] = None, annotation_type: Optional[Literal['tissue_annotations']] = 'tissue_annotations', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_ignite_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "test"]] = None,
    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Create the IGNITE tissue segmentation dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation_type: The type of annotations.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_ignite_paths(
        path=path, split=split, annotation_type=annotation_type, download=download
    )

    if resize_inputs:
        # The helper returns the updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    fixed_settings = dict(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
    )
    return torch_em.default_segmentation_dataset(**fixed_settings, **kwargs)
Get the IGNITE dataset for tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotation_type: The type of annotations.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_ignite_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[Literal['train', 'test']] = None, annotation_type: Optional[Literal['tissue_annotations']] = 'tissue_annotations', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_ignite_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "test"]] = None,
    annotation_type: Optional[Literal["tissue_annotations"]] = "tissue_annotations",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Create the IGNITE dataloader for tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation_type: The type of annotations.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ignite_dataset = get_ignite_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        annotation_type=annotation_type,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs,
    )

    return torch_em.get_data_loader(ignite_dataset, batch_size, **dataloader_kwargs)
Get the IGNITE dataloader for tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotation_type: The type of annotations.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.