torch_em.data.datasets.histopathology.panoptils
The PanopTILs dataset contains panoptic segmentation annotations for tumor-infiltrating lymphocyte (TIL) assessment in H&E stained breast cancer histopathology images.
The dataset provides 1,349 ROIs (1024x1024 pixels at 0.25 MPP) from TCGA invasive breast cancer cases with three annotation types: nuclei instance segmentation, nuclei semantic segmentation (type), and tissue semantic segmentation.
Nuclei classes: background (0), neoplastic (1), stromal (2), inflammatory (3), epithelial (4), other (5), unknown (6).
Tissue classes: background (0), tumor (1), stroma (2), epithelium (3), junk/debris (4), blood (5), other (6).
NOTE: This uses the refined version from https://huggingface.co/datasets/histolytics-hub/panoptils_refined. The original dataset is at https://sites.google.com/view/panoptils/. This dataset is from the publication https://doi.org/10.1038/s41523-024-00663-1. Please cite it if you use this dataset in your research.
"""The PanopTILs dataset contains panoptic segmentation annotations for tumor-infiltrating
lymphocyte (TIL) assessment in H&E stained breast cancer histopathology images.

The dataset provides 1,349 ROIs (1024x1024 pixels at 0.25 MPP) from TCGA invasive breast
cancer cases with three annotation types: nuclei instance segmentation, nuclei semantic
segmentation (type), and tissue semantic segmentation.

Nuclei classes: background (0), neoplastic (1), stromal (2), inflammatory (3),
epithelial (4), other (5), unknown (6).

Tissue classes: background (0), tumor (1), stroma (2), epithelium (3),
junk/debris (4), blood (5), other (6).

NOTE: This uses the refined version from https://huggingface.co/datasets/histolytics-hub/panoptils_refined.
The original dataset is at https://sites.google.com/view/panoptils/.
This dataset is from the publication https://doi.org/10.1038/s41523-024-00663-1.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from typing import Union, Tuple, List, Literal

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://huggingface.co/datasets/histolytics-hub/panoptils_refined/resolve/main/panoptils_refined.parquet"

LABEL_CHOICES = ["instances", "type", "semantic"]


def _create_images_from_parquet(path):
    """Extract images and masks from the parquet file and save them as TIF files.

    Reads `panoptils_refined.parquet` from `path` and writes one zlib-compressed TIF per row
    into each of the sub-folders `images`, `instances`, `types` and `semantic`. Files are
    named after the zero-padded dataframe index of the row.

    A row is skipped only if all four of its output files already exist, so an extraction
    that was interrupted between two writes is completed on the next call.

    Args:
        path: Folder that contains the parquet file and receives the extracted sub-folders.
    """
    import imageio.v3 as imageio
    import pandas as pd
    from io import BytesIO
    from PIL import Image
    from tqdm import tqdm

    image_dir = os.path.join(path, "images")
    inst_dir = os.path.join(path, "instances")
    type_dir = os.path.join(path, "types")
    sem_dir = os.path.join(path, "semantic")
    for d in [image_dir, inst_dir, type_dir, sem_dir]:
        os.makedirs(d, exist_ok=True)

    parquet_path = os.path.join(path, "panoptils_refined.parquet")
    df = pd.read_parquet(parquet_path)

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting PanopTILs images"):
        fname = f"{idx:05d}.tif"
        img_path = os.path.join(image_dir, fname)
        inst_path = os.path.join(inst_dir, fname)
        type_path = os.path.join(type_dir, fname)
        sem_path = os.path.join(sem_dir, fname)

        # Checking only the image would leave the labels of a half-written sample missing forever.
        if all(os.path.exists(p) for p in (img_path, inst_path, type_path, sem_path)):
            continue

        # Each column holds an encoded image as bytes; decode it via PIL.
        img = np.array(Image.open(BytesIO(row["image"])).convert("RGB"))
        inst = np.array(Image.open(BytesIO(row["inst"])))
        ntype = np.array(Image.open(BytesIO(row["type"])))
        sem = np.array(Image.open(BytesIO(row["sem"])))

        imageio.imwrite(img_path, img, compression="zlib")
        imageio.imwrite(inst_path, inst.astype("uint32"), compression="zlib")
        imageio.imwrite(type_path, ntype.astype("uint8"), compression="zlib")
        imageio.imwrite(sem_path, sem.astype("uint8"), compression="zlib")


def get_panoptils_data(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> str:
    """Download the PanopTILs dataset.

    If extracted TIF images are already present in `<path>/images`, nothing is downloaded
    or extracted. Otherwise the parquet file is fetched (if it is missing) and unpacked.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    image_dir = os.path.join(path, "images")
    if glob(os.path.join(image_dir, "*.tif")):
        # The data is already extracted, so the parquet file is not needed anymore.
        return path

    parquet_path = os.path.join(path, "panoptils_refined.parquet")
    if not os.path.exists(parquet_path):
        os.makedirs(path, exist_ok=True)
        util.download_source(path=parquet_path, url=URL, download=download, checksum=None)

    _create_images_from_parquet(path)

    return path


def get_panoptils_paths(
    path: Union[os.PathLike, str],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the PanopTILs data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If `label_choice` is invalid, if no images are found,
            or if the number of images and labels differs.
    """
    from natsort import natsorted

    assert label_choice in LABEL_CHOICES, f"'{label_choice}' is not valid. Choose from {LABEL_CHOICES}."

    get_panoptils_data(path, download)

    # The 'type' labels are stored in the folder 'types'; the other folders match the label choice.
    label_dir = label_choice if label_choice != "type" else "types"
    image_paths = natsorted(glob(os.path.join(path, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(path, label_dir, "*.tif")))

    assert len(image_paths) > 0, f"No extracted PanopTILs images were found in '{path}'."
    assert len(image_paths) == len(label_paths), (
        f"Found {len(image_paths)} images but {len(label_paths)} '{label_dir}' labels in '{path}'. "
        "The extraction may be incomplete."
    )

    return image_paths, label_paths


def get_panoptils_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the PanopTILs dataset for panoptic segmentation of tumor-infiltrating lymphocytes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, target_paths = get_panoptils_paths(path, label_choice, download)

    use_instances = label_choice == "instances"
    if use_instances:
        # Only instance labels go through the instance label transform helper.
        kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)

    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    # NOTE(review): the images are written as RGB arrays by the extraction step. Confirm that
    # `is_seg_dataset=True` (used for 'type' and 'semantic') handles the channel axis - TODO confirm.
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=target_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=not use_instances,
        **kwargs,
    )


def get_panoptils_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the PanopTILs dataloader for panoptic segmentation of tumor-infiltrating lymphocytes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset factory from everything else.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_panoptils_dataset(path, patch_shape, label_choice, download, **dataset_kwargs)
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
def get_panoptils_data(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> str:
    """Download the PanopTILs dataset.

    If extracted TIF images are already present in `<path>/images`, nothing is downloaded
    or extracted. Otherwise the parquet file is fetched (if it is missing) and unpacked.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    image_dir = os.path.join(path, "images")
    if glob(os.path.join(image_dir, "*.tif")):
        # The data is already extracted, so the parquet file is not needed anymore.
        # Checking this first avoids a forced re-download after the parquet was removed.
        return path

    parquet_path = os.path.join(path, "panoptils_refined.parquet")
    if not os.path.exists(parquet_path):
        os.makedirs(path, exist_ok=True)
        util.download_source(path=parquet_path, url=URL, download=download, checksum=None)

    _create_images_from_parquet(path)

    return path
Download the PanopTILs dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the directory with the data.
def get_panoptils_paths(
    path: Union[os.PathLike, str],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the PanopTILs data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If `label_choice` is invalid, if no images are found,
            or if the number of images and labels differs.
    """
    from natsort import natsorted

    assert label_choice in LABEL_CHOICES, f"'{label_choice}' is not valid. Choose from {LABEL_CHOICES}."

    get_panoptils_data(path, download)

    # The 'type' labels are stored in the folder 'types'; the other folders match the label choice.
    label_dir = label_choice if label_choice != "type" else "types"
    image_paths = natsorted(glob(os.path.join(path, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(path, label_dir, "*.tif")))

    # Separate checks with messages, so that an empty or partial extraction is easy to diagnose.
    assert len(image_paths) > 0, f"No extracted PanopTILs images were found in '{path}'."
    assert len(image_paths) == len(label_paths), (
        f"Found {len(image_paths)} images but {len(label_paths)} '{label_dir}' labels in '{path}'. "
        "The extraction may be incomplete."
    )

    return image_paths, label_paths
Get paths to the PanopTILs data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation), 'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def get_panoptils_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the PanopTILs dataset for panoptic segmentation of tumor-infiltrating lymphocytes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, target_paths = get_panoptils_paths(path, label_choice, download)

    use_instances = label_choice == "instances"
    if use_instances:
        # Only instance labels go through the instance label transform helper.
        kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)

    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    # NOTE(review): the extracted images are RGB. Confirm that `is_seg_dataset=True`
    # (used for 'type' and 'semantic') handles the channel axis as intended - TODO confirm.
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=target_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=not use_instances,
        **kwargs,
    )
Get the PanopTILs dataset for panoptic segmentation of tumor-infiltrating lymphocytes.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation), 'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_panoptils_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    label_choice: Literal["instances", "type", "semantic"] = "instances",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the PanopTILs dataloader for panoptic segmentation of tumor-infiltrating lymphocytes.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation),
            'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset factory from everything else.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_panoptils_dataset(path, patch_shape, label_choice, download, **dataset_kwargs)
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
Get the PanopTILs dataloader for panoptic segmentation of tumor-infiltrating lymphocytes.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- label_choice: The type of labels to use. One of 'instances' (nuclei instance segmentation), 'type' (nuclei semantic segmentation), or 'semantic' (tissue semantic segmentation).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.