torch_em.data.datasets.histopathology.panoptils

The PanopTILs dataset contains panoptic segmentation annotations for tumor-infiltrating lymphocyte (TIL) assessment in H&E stained breast cancer histopathology images.

The dataset provides 1,349 ROIs (1024x1024 pixels at 0.25 MPP) from TCGA invasive breast cancer cases with three annotation types: nuclei instance segmentation, nuclei semantic segmentation (type), and tissue semantic segmentation.

Nuclei classes: background (0), neoplastic (1), stromal (2), inflammatory (3), epithelial (4), other (5), unknown (6).

Tissue classes: background (0), tumor (1), stroma (2), epithelium (3), junk/debris (4), blood (5), other (6).

NOTE: This uses the refined version from https://huggingface.co/datasets/histolytics-hub/panoptils_refined. The original dataset is at https://sites.google.com/view/panoptils/. This dataset is from the publication https://doi.org/10.1038/s41523-024-00663-1. Please cite it if you use this dataset in your research.

View Source

  1"""The PanopTILs dataset contains panoptic segmentation annotations for tumor-infiltrating
  2lymphocyte (TIL) assessment in H&E stained breast cancer histopathology images.
  3
  4The dataset provides 1,349 ROIs (1024x1024 pixels at 0.25 MPP) from TCGA invasive breast
  5cancer cases with three annotation types: nuclei instance segmentation, nuclei semantic
  6segmentation (type), and tissue semantic segmentation.
  7
  8Nuclei classes: background (0), neoplastic (1), stromal (2), inflammatory (3),
  9epithelial (4), other (5), unknown (6).
 10
 11Tissue classes: background (0), tumor (1), stroma (2), epithelium (3),
 12junk/debris (4), blood (5), other (6).
 13
 14NOTE: This uses the refined version from https://huggingface.co/datasets/histolytics-hub/panoptils_refined.
 15The original dataset is at https://sites.google.com/view/panoptils/.
 16This dataset is from the publication https://doi.org/10.1038/s41523-024-00663-1.
 17Please cite it if you use this dataset in your research.
 18"""
 19
 20import os
 21from glob import glob
 22from typing import Union, Tuple, List, Literal
 23
 24import numpy as np
 25
 26from torch.utils.data import Dataset, DataLoader
 27
 28import torch_em
 29
 30from .. import util
 31
 32
 33URL = "https://huggingface.co/datasets/histolytics-hub/panoptils_refined/resolve/main/panoptils_refined.parquet"
 34
 35LABEL_CHOICES = ["instances", "type", "semantic"]
 36
 37
 38def _create_images_from_parquet(path):
 39    """Extract images and masks from the parquet file and save as TIF."""
 40    import imageio.v3 as imageio
 41    import pandas as pd
 42    from io import BytesIO
 43    from PIL import Image
 44    from tqdm import tqdm
 45
 46    image_dir = os.path.join(path, "images")
 47    inst_dir = os.path.join(path, "instances")
 48    type_dir = os.path.join(path, "types")
 49    sem_dir = os.path.join(path, "semantic")
 50    for d in [image_dir, inst_dir, type_dir, sem_dir]:
 51        os.makedirs(d, exist_ok=True)
 52
 53    parquet_path = os.path.join(path, "panoptils_refined.parquet")
 54    df = pd.read_parquet(parquet_path)
 55
 56    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting PanopTILs images"):
 57        sample_id = f"{idx:05d}"
 58        img_path = os.path.join(image_dir, f"{sample_id}.tif")
 59
 60        if os.path.exists(img_path):
 61            continue
 62
 63        img = np.array(Image.open(BytesIO(row["image"])).convert("RGB"))
 64        inst = np.array(Image.open(BytesIO(row["inst"])))
 65        ntype = np.array(Image.open(BytesIO(row["type"])))
 66        sem = np.array(Image.open(BytesIO(row["sem"])))
 67
 68        imageio.imwrite(img_path, img, compression="zlib")
 69        imageio.imwrite(os.path.join(inst_dir, f"{sample_id}.tif"), inst.astype("uint32"), compression="zlib")
 70        imageio.imwrite(os.path.join(type_dir, f"{sample_id}.tif"), ntype.astype("uint8"), compression="zlib")
 71        imageio.imwrite(os.path.join(sem_dir, f"{sample_id}.tif"), sem.astype("uint8"), compression="zlib")
 72
 73
 74def get_panoptils_data(
 75    path: Union[os.PathLike, str],
 76    download: bool = False,
 77) -> str:
 78    """Download the PanopTILs dataset.
 79
 80    Args:
 81        path: Filepath to a folder where the downloaded data will be saved.
 82        download: Whether to download the data if it is not present.
 83
 84    Returns:
 85        The filepath to the directory with the data.
 86    """
 87    parquet_path = os.path.join(path, "panoptils_refined.parquet")
 88    if not os.path.exists(parquet_path):
 89        os.makedirs(path, exist_ok=True)
 90        util.download_source(path=parquet_path, url=URL, download=download, checksum=None)
 91
 92    image_dir = os.path.join(path, "images")
 93    if not os.path.exists(image_dir) or len(glob(os.path.join(image_dir, "*.tif"))) == 0:
 94        _create_images_from_parquet(path)
 95
 96    return path
 97
 98
 99def get_panoptils_paths(
100    path: Union[os.PathLike, str],
101    label_choice: Literal["instances", "type", "semantic"] = "instances",
102    download: bool = False,
103) -> Tuple[List[str], List[str]]:
104    """Get paths to the PanopTILs data.
105
106    Args:
107        path: Filepath to a folder where the downloaded data will be saved.
108        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
109            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
110        download: Whether to download the data if it is not present.
111
112    Returns:
113        List of filepaths for the image data.
114        List of filepaths for the label data.
115    """
116    from natsort import natsorted
117
118    assert label_choice in LABEL_CHOICES, f"'{label_choice}' is not valid. Choose from {LABEL_CHOICES}."
119
120    get_panoptils_data(path, download)
121
122    label_dir = label_choice if label_choice != "type" else "types"
123    image_paths = natsorted(glob(os.path.join(path, "images", "*.tif")))
124    label_paths = natsorted(glob(os.path.join(path, label_dir, "*.tif")))
125
126    assert len(image_paths) == len(label_paths) and len(image_paths) > 0
127
128    return image_paths, label_paths
129
130
def get_panoptils_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the PanopTILs dataset for panoptic segmentation of tumor-infiltrating lymphocytes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, target_paths = get_panoptils_paths(path, label_choice, download)

    use_instances = label_choice == "instances"
    if use_instances:
        # Instance labels get the instance label transform (with binary target) added to the kwargs.
        kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)

    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=target_paths,
        label_key=None,
        patch_shape=patch_shape,
        # Only the two semantic label choices are passed with is_seg_dataset=True.
        is_seg_dataset=not use_instances,
        **kwargs,
    )
    return dataset
169
170
def get_panoptils_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the PanopTILs dataloader for panoptic segmentation of tumor-infiltrating lymphocytes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the keyword arguments into those for the dataset and those for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    panoptils_dataset = get_panoptils_dataset(
        path, patch_shape, label_choice, download, **dataset_kwargs
    )

    return torch_em.get_data_loader(panoptils_dataset, batch_size, **dataloader_kwargs)

URL = 'https://huggingface.co/datasets/histolytics-hub/panoptils_refined/resolve/main/panoptils_refined.parquet'

LABEL_CHOICES = ['instances', 'type', 'semantic']

def get_panoptils_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_panoptils_data(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> str:
    """Make the PanopTILs data available in a local folder.

    The parquet file is fetched via `util.download_source` if it is not present yet.
    Afterwards the images and masks are extracted from it, unless the `images`
    sub-folder already contains at least one TIF file.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    source_file = os.path.join(path, "panoptils_refined.parquet")
    if not os.path.exists(source_file):
        os.makedirs(path, exist_ok=True)
        util.download_source(path=source_file, url=URL, download=download, checksum=None)

    # glob finds nothing for a missing folder, so this covers both
    # "folder does not exist" and "folder holds no TIF files".
    extracted_images = glob(os.path.join(path, "images", "*.tif"))
    if not extracted_images:
        _create_images_from_parquet(path)

    return path

Download the PanopTILs dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The filepath to the directory with the data.

def get_panoptils_paths( path: Union[os.PathLike, str], label_choice: Literal['instances', 'type', 'semantic'] = 'instances', download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_panoptils_paths(
    path: Union[os.PathLike, str],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Collect the image and label file paths of the PanopTILs data.

    The data is downloaded and extracted first if necessary. Both lists are
    naturally sorted by file path.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    from natsort import natsorted

    assert label_choice in LABEL_CHOICES, f"'{label_choice}' is not valid. Choose from {LABEL_CHOICES}."

    get_panoptils_data(path, download)

    # The nuclei type labels are stored in the folder 'types';
    # for the other choices the folder name equals the label choice.
    label_folder = "types" if label_choice == "type" else label_choice

    def _sorted_tifs(folder):
        return natsorted(glob(os.path.join(path, folder, "*.tif")))

    image_paths = _sorted_tifs("images")
    label_paths = _sorted_tifs(label_folder)

    n_images = len(image_paths)
    assert n_images == len(label_paths) and n_images > 0

    return image_paths, label_paths

Get paths to the PanopTILs data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation), 'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_panoptils_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], label_choice: Literal['instances', 'type', 'semantic'] = 'instances', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_panoptils_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the PanopTILs dataset for panoptic segmentation of tumor-infiltrating lymphocytes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, target_paths = get_panoptils_paths(path, label_choice, download)

    use_instances = label_choice == "instances"
    if use_instances:
        # Instance labels get the instance label transform (with binary target) added to the kwargs.
        kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)

    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=target_paths,
        label_key=None,
        patch_shape=patch_shape,
        # Only the two semantic label choices are passed with is_seg_dataset=True.
        is_seg_dataset=not use_instances,
        **kwargs,
    )
    return dataset

Get the PanopTILs dataset for panoptic segmentation of tumor-infiltrating lymphocytes.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation), 'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_panoptils_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], label_choice: Literal['instances', 'type', 'semantic'] = 'instances', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_panoptils_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the PanopTILs dataloader for panoptic segmentation of tumor-infiltrating lymphocytes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the keyword arguments into those for the dataset and those for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    panoptils_dataset = get_panoptils_dataset(
        path, patch_shape, label_choice, download, **dataset_kwargs
    )

    return torch_em.get_data_loader(panoptils_dataset, batch_size, **dataloader_kwargs)

Get the PanopTILs dataloader for panoptic segmentation of tumor-infiltrating lymphocytes.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation), 'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.