torch_em.data.datasets.histopathology.puma
The PUMA dataset contains annotations for nucleus and tissue segmentation in melanoma H&E stained histopathology images.
This dataset is located at https://zenodo.org/records/13859989. This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/. The dataset is from the publication https://doi.org/10.1093/gigascience/giaf011. Please cite them if you use this dataset for your research.
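A minimal usage sketch (the download directory "./data/puma" and the patch shape are illustrative choices, not part of the API):

    from torch_em.data.datasets.histopathology.puma import get_puma_loader

    # Downloads and preprocesses the data on first use, then yields batches of
    # H&E patches with the matching nucleus instance labels.
    loader = get_puma_loader(
        path="./data/puma",
        batch_size=2,
        patch_shape=(512, 512),
        split="train",
        annotations="nuclei",
        label_choice="instances",
        download=True,
    )
    for x, y in loader:
        print(x.shape, y.shape)  # e.g. torch.Size([2, 3, 512, 512]) for the raw RGB patches.
        break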
1"""The PUMA dataset contains annotations for nucleus and tissue segmentation 2in melanoma H&E stained histopathology images. 3 4This dataset is located at https://zenodo.org/records/13859989. 5This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/. 6The dataset is from the publication https://doi.org/10.1093/gigascience/giaf011. 7Please cite them if you use this dataset for your research. 8""" 9 10import os 11from glob import glob 12from tqdm import tqdm 13from pathlib import Path 14from natsort import natsorted 15from typing import Union, Literal, List, Tuple 16 17import json 18import numpy as np 19import pandas as pd 20import imageio.v3 as imageio 21from sklearn.model_selection import train_test_split 22 23from torch.utils.data import Dataset, DataLoader 24 25import torch_em 26 27from .. import util 28 29 30URL = { 31 "data": "https://zenodo.org/records/15050523/files/01_training_dataset_tif_ROIs.zip", 32 "annotations": { 33 "nuclei": "https://zenodo.org/records/15050523/files/01_training_dataset_geojson_nuclei.zip", 34 "tissue": "https://zenodo.org/records/15050523/files/01_training_dataset_geojson_tissue.zip", 35 } 36} 37 38CHECKSUM = { 39 "data": "af48b879f8ff7e74b84a7114924881606f13f108aa0f9bcc21d3593b717ee022", 40 "annotations": { 41 "nuclei": "eda271225900d6de0759e0281f3731a570e09f2adab58bd36425b9d2dfad91a0", 42 "tissue": "fc2835135cc28324f52eac131327f0f12c554c0b1f334a108bf4b65e0f18c42b", 43 } 44} 45 46NUCLEI_CLASS_DICT = { 47 "nuclei_stroma": 1, 48 "nuclei_tumor": 2, 49 "nuclei_plasma_cell": 3, 50 "nuclei_histiocyte": 4, 51 "nuclei_lymphocyte": 5, 52 "nuclei_melanophage": 6, 53 "nuclei_neutrophil": 7, 54 "nuclei_endothelium": 8, 55 "nuclei_epithelium": 9, 56 "nuclei_apoptosis": 10, 57} 58 59TISSUE_CLASS_DICT = { 60 "tissue_stroma": 1, 61 "tissue_tumor": 2, 62 "tissue_epidermis": 3, 63 "tissue_blood_vessel": 4, 64 "tissue_necrosis": 5, 65 "tissue_white_background": 6, 66} 67 68CLASS_DICT = { 69 "nuclei": NUCLEI_CLASS_DICT, 70 "tissue": TISSUE_CLASS_DICT, 71} 72 73 74def _create_split_csv(path, annotations, split): 75 "This creates a split saved to a .csv file in the dataset directory" 76 csv_path = os.path.join(path, "puma_split.csv") 77 78 if os.path.exists(csv_path): 79 df = pd.read_csv(csv_path) 80 df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"'))) # ensures all items from column in list. 81 split_list = df.iloc[0][split] 82 else: 83 print(f"Creating a new split file at '{csv_path}'.") 84 metastatic_ids = [ 85 os.path.basename(image).split(".")[0] 86 for image in glob(os.path.join(path, "data", "01_training_dataset_tif_ROIs", "*metastatic*")) 87 ] 88 primary_ids = [ 89 os.path.basename(image).split(".")[0] 90 for image in glob(os.path.join(path, "data", "01_training_dataset_tif_ROIs", "*primary*")) 91 ] 92 93 # Create random splits per dataset. 94 train_ids, test_ids = train_test_split(metastatic_ids, test_size=0.2) # 20% for test. 95 train_ids, val_ids = train_test_split(train_ids, test_size=0.15) # 15% of the train set for val. 96 ptrain_ids, ptest_ids = train_test_split(primary_ids, test_size=0.2) # do same as above for 'primary' samples. 97 ptrain_ids, pval_ids = train_test_split(ptrain_ids, test_size=0.15) # do same as above for 'primary' samples. 
98 train_ids.extend(ptrain_ids) 99 val_ids.extend(pval_ids) 100 test_ids.extend(ptest_ids) 101 102 split_ids = {"train": train_ids, "val": val_ids, "test": test_ids} 103 104 df = pd.DataFrame.from_dict([split_ids]) 105 df.to_csv(csv_path, index=False) 106 107 split_list = split_ids[split] 108 109 return split_list 110 111 112def _preprocess_inputs(path, annotations, split): 113 import h5py 114 try: 115 import geopandas as gpd 116 except ModuleNotFoundError: 117 raise RuntimeError("Please install 'geopandas': 'conda install -c conda-forge geopandas'.") 118 119 try: 120 from rasterio.features import rasterize 121 from rasterio.transform import from_bounds 122 except ModuleNotFoundError: 123 raise RuntimeError("Please install 'rasterio': 'conda install -c conda-forge rasterio'.") 124 125 annotation_paths = glob( 126 os.path.join(path, "annotations", annotations, f"01_training_dataset_geojson_{annotations}", "*.geojson") 127 ) 128 roi_dir = os.path.join(path, "data", "01_training_dataset_tif_ROIs") 129 preprocessed_dir = os.path.join(path, split, "preprocessed") 130 os.makedirs(preprocessed_dir, exist_ok=True) 131 132 split_list = _create_split_csv(path, annotations, split) 133 print(f"The data split '{split}' has '{len(split_list)}' samples!") 134 135 for ann_path in tqdm(annotation_paths, desc=f"Preprocessing '{annotations}'"): 136 fname = os.path.basename(ann_path).replace(f"_{annotations}.geojson", ".tif") 137 image_path = os.path.join(roi_dir, fname) 138 139 # Handle inconsistent extension for sample 103 (.tiff instead of .tif). 140 if not os.path.exists(image_path): 141 image_path = image_path + "f" # Retrying with .tiff 142 143 if os.path.basename(image_path).split(".")[0] not in split_list: 144 continue 145 146 assert os.path.exists(image_path), image_path 147 148 volume_path = os.path.join(preprocessed_dir, Path(fname).with_suffix(".h5")) 149 gdf = gpd.read_file(ann_path) 150 minx, miny, maxx, maxy = gdf.total_bounds 151 152 width, height = 1024, 1024 # roi shape 153 transform = from_bounds(minx, miny, maxx, maxy, width, height) 154 155 # Extract class ids mapped to each class name. 
156 class_dict = CLASS_DICT[annotations] 157 class_ids = [class_dict[cls_entry["name"]] for cls_entry in gdf["classification"]] 158 semantic_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, class_ids)) 159 semantic_mask = rasterize( 160 semantic_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.uint8 161 ) 162 163 gdf['id'] = range(1, len(gdf) + 1) 164 instance_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, gdf['id'])) 165 instance_mask = rasterize( 166 instance_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.int32 167 ) 168 169 # Transform labels to match expected orientation 170 instance_mask = np.flip(instance_mask) 171 instance_mask = np.fliplr(instance_mask) 172 173 semantic_mask = np.flip(semantic_mask) 174 semantic_mask = np.fliplr(semantic_mask) 175 176 image = imageio.imread(image_path) 177 image = image[..., :-1].transpose(2, 0, 1) 178 179 with h5py.File(volume_path, "a") as f: 180 if "raw" not in f.keys(): 181 f.create_dataset("raw", data=image, compression="gzip") 182 183 if f"labels/instances/{annotations}" not in f.keys(): 184 f.create_dataset(f"labels/instances/{annotations}", data=instance_mask, compression="gzip") 185 186 if f"labels/semantic/{annotations}" not in f.keys(): 187 f.create_dataset(f"labels/semantic/{annotations}", data=semantic_mask, compression="gzip") 188 189 190def _annotations_are_stored(data_dir, annotations): 191 import h5py 192 volume_paths = glob(os.path.join(data_dir, "preprocessed", "*.h5")) 193 if not volume_paths: 194 return 195 f = h5py.File(volume_paths[0], "r") 196 return f"labels/instances/{annotations}" in f.keys() 197 198 199def get_puma_data( 200 path: Union[os.PathLike, str], 201 split: Literal["train", "val", "test"], 202 annotations: Literal['nuclei', 'tissue'] = "nuclei", 203 download: bool = False, 204) -> str: 205 """Download the PUMA data. 206 207 Args: 208 path: Filepath to a folder where the downloaded data will be saved. 209 split: The choice of data split. 210 annotations: The choice of annotations. 211 download: Whether to download the data if it is not present. 212 213 Returns: 214 Filepath where the dataset is downloaded and stored for further preprocessing. 215 """ 216 if annotations not in ["nuclei", "tissue"]: 217 raise ValueError(f"'{annotations}' is not a valid annotation for the data.") 218 219 data_dir = os.path.join(path, split) 220 if os.path.exists(data_dir) and _annotations_are_stored(data_dir, annotations): 221 return data_dir 222 223 os.makedirs(path, exist_ok=True) 224 225 if not os.path.exists(os.path.join(path, "data")): 226 # Download the data. 227 zip_path = os.path.join(path, "roi.zip") 228 util.download_source(path=zip_path, url=URL["data"], download=download, checksum=CHECKSUM["data"]) 229 util.unzip(zip_path=zip_path, dst=os.path.join(path, "data")) 230 231 # Download the annotations. 232 zip_path = os.path.join(path, "annotations.zip") 233 util.download_source( 234 path=zip_path, 235 url=URL["annotations"][annotations], 236 download=download, 237 checksum=CHECKSUM["annotations"][annotations] 238 ) 239 util.unzip(zip_path=zip_path, dst=os.path.join(path, "annotations", annotations)) 240 241 _preprocess_inputs(path, annotations, split) 242 243 return data_dir 244 245 246def get_puma_paths( 247 path: Union[os.PathLike, str], 248 split: Literal["train", "val", "test"], 249 annotations: Literal['nuclei', 'tissue'] = "nuclei", 250 download: bool = False 251) -> List[str]: 252 """Get paths to the PUMA dataset. 
253 254 Args: 255 path: Filepath to a folder where the downloaded data will be saved. 256 split: The choice of data split. 257 annotations: The choice of annotations. 258 download: Whether to download the data if it is not present. 259 260 Returns: 261 List of filepaths for the input data. 262 """ 263 data_dir = get_puma_data(path, split, annotations, download) 264 volume_paths = natsorted(glob(os.path.join(data_dir, "preprocessed", "*.h5"))) 265 return volume_paths 266 267 268def get_puma_dataset( 269 path: Union[os.PathLike, str], 270 patch_shape: Tuple[int, int], 271 split: Literal["train", "val", "test"], 272 annotations: Literal['nuclei', 'tissue'] = "nuclei", 273 label_choice: Literal["instances", "semantic"] = "instances", 274 resize_inputs: bool = False, 275 download: bool = False, 276 **kwargs 277) -> Dataset: 278 """Get the PUMA dataset for nuclei and tissue segmentation. 279 280 Args: 281 path: Filepath to a folder where the downloaded data will be saved. 282 patch_shape: The patch shape to use for training. 283 split: The choice of data split. 284 annotations: The choice of annotations. 285 label_choice: The choice of segmentation type. 286 resize_inputs: Whether to resize the inputs. 287 download: Whether to download the data if it is not present. 288 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 289 290 Returns: 291 The segmentation dataset. 292 """ 293 volume_paths = get_puma_paths(path, split, annotations, download) 294 295 if resize_inputs: 296 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True} 297 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 298 kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs 299 ) 300 301 return torch_em.default_segmentation_dataset( 302 raw_paths=volume_paths, 303 raw_key="raw", 304 label_paths=volume_paths, 305 label_key=f"labels/{label_choice}/{annotations}", 306 patch_shape=patch_shape, 307 with_channels=True, 308 is_seg_dataset=True, 309 ndim=2, 310 **kwargs 311 ) 312 313 314def get_puma_loader( 315 path: Union[os.PathLike, str], 316 batch_size: int, 317 patch_shape: Tuple[int, int], 318 split: Literal["train", "val", "test"], 319 annotations: Literal['nuclei', 'tissue'] = "nuclei", 320 label_choice: Literal["instances", "semantic"] = "instances", 321 resize_inputs: bool = False, 322 download: bool = False, 323 **kwargs 324) -> DataLoader: 325 """Get the PUMA dataloader for nuclei and tissue segmentation. 326 327 Args: 328 path: Filepath to a folder where the downloaded data will be saved. 329 batch_size: The batch size for training. 330 patch_shape: The patch shape to use for training. 331 split: The choice of data split. 332 annotations: The choice of annotations. 333 label_choice: The choice of segmentation type. 334 resize_inputs: Whether to resize the inputs. 335 download: Whether to download the data if it is not present. 336 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 337 338 Returns: 339 The DataLoader. 340 """ 341 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 342 dataset = get_puma_dataset( 343 path, patch_shape, split, annotations, label_choice, resize_inputs, download, **ds_kwargs 344 ) 345 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL = {
    'data': 'https://zenodo.org/records/15050523/files/01_training_dataset_tif_ROIs.zip',
    'annotations': {
        'nuclei': 'https://zenodo.org/records/15050523/files/01_training_dataset_geojson_nuclei.zip',
        'tissue': 'https://zenodo.org/records/15050523/files/01_training_dataset_geojson_tissue.zip',
    },
}

CHECKSUM = {
    'data': 'af48b879f8ff7e74b84a7114924881606f13f108aa0f9bcc21d3593b717ee022',
    'annotations': {
        'nuclei': 'eda271225900d6de0759e0281f3731a570e09f2adab58bd36425b9d2dfad91a0',
        'tissue': 'fc2835135cc28324f52eac131327f0f12c554c0b1f334a108bf4b65e0f18c42b',
    },
}

NUCLEI_CLASS_DICT = {
    'nuclei_stroma': 1,
    'nuclei_tumor': 2,
    'nuclei_plasma_cell': 3,
    'nuclei_histiocyte': 4,
    'nuclei_lymphocyte': 5,
    'nuclei_melanophage': 6,
    'nuclei_neutrophil': 7,
    'nuclei_endothelium': 8,
    'nuclei_epithelium': 9,
    'nuclei_apoptosis': 10,
}

TISSUE_CLASS_DICT = {
    'tissue_stroma': 1,
    'tissue_tumor': 2,
    'tissue_epidermis': 3,
    'tissue_blood_vessel': 4,
    'tissue_necrosis': 5,
    'tissue_white_background': 6,
}

CLASS_DICT = {
    'nuclei': NUCLEI_CLASS_DICT,
    'tissue': TISSUE_CLASS_DICT,
}
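The class dictionaries map annotation names to the integer ids used in the semantic masks (0 is the background fill value). They can be inverted to decode a mask, e.g. with this small sketch (the helper name is hypothetical, not part of the module):

    import numpy as np

    from torch_em.data.datasets.histopathology.puma import NUCLEI_CLASS_DICT

    # Invert the mapping: integer id -> nucleus class name.
    ID_TO_NUCLEI_CLASS = {v: k for k, v in NUCLEI_CLASS_DICT.items()}

    def count_pixels_per_class(semantic_mask: np.ndarray) -> dict:
        """Hypothetical helper: count the pixels per nucleus class in a semantic mask."""
        ids, counts = np.unique(semantic_mask, return_counts=True)
        return {ID_TO_NUCLEI_CLASS.get(int(i), "background"): int(c) for i, c in zip(ids, counts)}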
def get_puma_data(
    path: Union[os.PathLike, str],
    split: Literal['train', 'val', 'test'],
    annotations: Literal['nuclei', 'tissue'] = 'nuclei',
    download: bool = False
) -> str:
Download the PUMA data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- annotations: The choice of annotations.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the dataset is downloaded and stored for further preprocessing.
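For example (the target folder is an arbitrary choice):

    from torch_em.data.datasets.histopathology.puma import get_puma_data

    data_dir = get_puma_data(path="./data/puma", split="train", annotations="tissue", download=True)
    # data_dir points to './data/puma/train'; the preprocessed HDF5 volumes
    # are written to its 'preprocessed' subfolder.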
def get_puma_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'val', 'test'],
    annotations: Literal['nuclei', 'tissue'] = 'nuclei',
    download: bool = False
) -> List[str]:
Get paths to the PUMA dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- annotations: The choice of annotations.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
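Each returned path is an HDF5 file written by the preprocessing step, holding the raw image and the label masks. A quick way to inspect one (a sketch, assuming h5py is installed and the 'nuclei' annotations were fetched):

    import h5py

    from torch_em.data.datasets.histopathology.puma import get_puma_paths

    volume_paths = get_puma_paths("./data/puma", split="train", annotations="nuclei")
    with h5py.File(volume_paths[0], "r") as f:
        print(f["raw"].shape)                      # Channel-first RGB image, (3, 1024, 1024).
        print(f["labels/instances/nuclei"].shape)  # Instance mask, (1024, 1024).
        print(f["labels/semantic/nuclei"].shape)   # Semantic mask, (1024, 1024).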
def get_puma_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    annotations: Literal['nuclei', 'tissue'] = 'nuclei',
    label_choice: Literal['instances', 'semantic'] = 'instances',
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> torch.utils.data.dataset.Dataset:
Get the PUMA dataset for nuclei and tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotations: The choice of annotations.
- label_choice: The choice of segmentation type.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
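A short construction sketch (the path and patch shape are illustrative):

    from torch_em.data.datasets.histopathology.puma import get_puma_dataset

    # Semantic tissue segmentation instead of the default nucleus instances.
    dataset = get_puma_dataset(
        path="./data/puma",
        patch_shape=(512, 512),
        split="val",
        annotations="tissue",
        label_choice="semantic",
        download=True,
    )
    x, y = dataset[0]  # A raw patch and the matching semantic tissue labels.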
def get_puma_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    annotations: Literal['nuclei', 'tissue'] = 'nuclei',
    label_choice: Literal['instances', 'semantic'] = 'instances',
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> torch.utils.data.dataloader.DataLoader:
Get the PUMA dataloader for nuclei and tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotations: The choice of annotations.
- label_choice: The choice of segmentation type.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.
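Since the keyword arguments are split between the dataset and the PyTorch DataLoader (via `util.split_kwargs`), loader-specific options can be passed directly (values below are illustrative):

    from torch_em.data.datasets.histopathology.puma import get_puma_loader

    loader = get_puma_loader(
        path="./data/puma",
        batch_size=4,
        patch_shape=(512, 512),
        split="test",
        annotations="nuclei",
        label_choice="instances",
        download=True,
        shuffle=True,    # Forwarded to the PyTorch DataLoader.
        num_workers=4,   # Forwarded to the PyTorch DataLoader.
    )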