torch_em.data.datasets.histopathology.puma
The PUMA dataset contains annotations for nucleus and tissue segmentation in melanoma H&E-stained histopathology images.
The dataset is hosted at https://zenodo.org/records/13859989 and is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/
- Preprint with details about the data: https://doi.org/10.1101/2024.10.07.24315039
Please cite these resources if you use this dataset for your research.
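A minimal usage sketch (the directory "./data/puma" below is a placeholder; everything else uses the functions documented on this page):

from torch_em.data.datasets.histopathology.puma import get_puma_loader

# Download (on the first call), preprocess, and build a training loader
# for nucleus instance segmentation.
loader = get_puma_loader(
    path="./data/puma",
    batch_size=2,
    patch_shape=(512, 512),
    split="train",
    annotations="nuclei",
    label_choice="instances",
    download=True,
)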
1"""The PUMA dataset contains annotations for nucleus and tissue segmentation 2in melanoma H&E stained histopathology images. 3 4This dataset is located at https://zenodo.org/records/13859989. 5This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/ 6- Preprint with details about the data: https://doi.org/10.1101/2024.10.07.24315039 7 8Please cite them if you use this dataset for your research. 9""" 10 11import os 12from glob import glob 13from tqdm import tqdm 14from pathlib import Path 15from natsort import natsorted 16from typing import Union, Literal, List, Tuple 17 18import json 19import numpy as np 20import pandas as pd 21import imageio.v3 as imageio 22from sklearn.model_selection import train_test_split 23 24from torch.utils.data import Dataset, DataLoader 25 26import torch_em 27 28from .. import util 29 30 31URL = { 32 "data": "https://zenodo.org/records/13859989/files/01_training_dataset_tif_ROIs.zip", 33 "annotations": { 34 "nuclei": "https://zenodo.org/records/13859989/files/01_training_dataset_geojson_nuclei.zip", 35 "tissue": "https://zenodo.org/records/13859989/files/01_training_dataset_geojson_tissue.zip", 36 } 37} 38 39CHECKSUM = { 40 "data": "a69fd0d8443da29233df103ece5674fb50e8f0cc4b448dc60508cfe883881993", 41 "annotations": { 42 "nuclei": "17f77ca83fb8fccd918ce723a7b3e5cb5a1730b342ad486628f8885d14a1acbd", 43 "tissue": "3b7d6697dd728e3481df0b779ad1e76962f36fc8c871c50edd9aa56ec44c4cc9", 44 } 45} 46 47CLASS_DICT = { 48 "nuclei_stroma": 1, 49 "nuclei_tumor": 2, 50 "nuclei_plasma_cell": 3, 51 "nuclei_histiocyte": 4, 52 "nuclei_lymphocyte": 5, 53 "nuclei_melanophage": 6, 54 "nuclei_neutrophil": 7, 55 "nuclei_endothelium": 8, 56 "nuclei_epithelium": 9, 57 "nuclei_apoptosis": 10, 58} 59 60 61def _create_split_csv(path, split): 62 "This creates a split saved to a .csv file in the dataset directory" 63 csv_path = os.path.join(path, "puma_split.csv") 64 65 if os.path.exists(csv_path): 66 df = pd.read_csv(csv_path) 67 df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"'))) # ensures all items from column in list. 68 split_list = df.iloc[0][split] 69 else: 70 print(f"Creating a new split file at '{csv_path}'.") 71 metastatic_ids = [ 72 os.path.basename(image).split(".")[0] for image in glob(os.path.join(path, "data", "*metastatic*")) 73 ] 74 primary_ids = [ 75 os.path.basename(image).split(".")[0] for image in glob(os.path.join(path, "data", "*primary*")) 76 ] 77 78 # Create random splits per dataset. 79 train_ids, test_ids = train_test_split(metastatic_ids, test_size=0.2) # 20% for test. 80 train_ids, val_ids = train_test_split(train_ids, test_size=0.15) # 15% of the train set for val. 81 ptrain_ids, ptest_ids = train_test_split(primary_ids, test_size=0.2) # do same as above for 'primary' samples. 82 ptrain_ids, pval_ids = train_test_split(ptrain_ids, test_size=0.15) # do same as above for 'primary' samples. 
83 train_ids.extend(ptrain_ids) 84 val_ids.extend(pval_ids) 85 test_ids.extend(ptest_ids) 86 87 split_ids = {"train": train_ids, "val": val_ids, "test": test_ids} 88 89 df = pd.DataFrame.from_dict([split_ids]) 90 df.to_csv(csv_path, index=False) 91 92 split_list = split_ids[split] 93 94 return split_list 95 96 97def _preprocess_inputs(path, annotations, split): 98 import ast 99 import h5py 100 import geopandas as gpd 101 from rasterio.features import rasterize 102 from rasterio.transform import from_bounds 103 104 annotation_paths = glob(os.path.join(path, "annotations", annotations, "*.geojson")) 105 roi_dir = os.path.join(path, "data") 106 preprocessed_dir = os.path.join(path, split, "preprocessed") 107 os.makedirs(preprocessed_dir, exist_ok=True) 108 109 split_list = _create_split_csv(path, split) 110 print(f"The data split '{split}' has '{len(split_list)}' samples!") 111 112 for ann_path in tqdm(annotation_paths, desc=f"Preprocessing '{annotations}'"): 113 fname = os.path.basename(ann_path).replace(f"_{annotations}.geojson", ".tif") 114 image_path = os.path.join(roi_dir, fname) 115 116 if os.path.basename(image_path).split(".")[0] not in split_list: 117 continue 118 119 volume_path = os.path.join(preprocessed_dir, Path(fname).with_suffix(".h5")) 120 gdf = gpd.read_file(ann_path) 121 minx, miny, maxx, maxy = gdf.total_bounds 122 123 width, height = 1024, 1024 # roi shape 124 transform = from_bounds(minx, miny, maxx, maxy, width, height) 125 126 # Extract class ids mapped to each class name. 127 class_ids = [ 128 CLASS_DICT[nuc_class["name"]] for nuc_class in gdf["classification"].apply(lambda x: ast.literal_eval(x)) 129 ] 130 semantic_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, class_ids)) 131 semantic_mask = rasterize( 132 semantic_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.uint8 133 ) 134 135 gdf['id'] = range(1, len(gdf) + 1) 136 instance_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, gdf['id'])) 137 instance_mask = rasterize( 138 instance_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.int32 139 ) 140 141 # Transform labels to match expected orientation 142 instance_mask = np.flip(instance_mask) 143 instance_mask = np.fliplr(instance_mask) 144 145 semantic_mask = np.flip(semantic_mask) 146 semantic_mask = np.fliplr(semantic_mask) 147 148 image = imageio.imread(image_path) 149 image = image[..., :-1].transpose(2, 0, 1) 150 151 with h5py.File(volume_path, "a") as f: 152 if "raw" not in f.keys(): 153 f.create_dataset("raw", data=image, compression="gzip") 154 155 if f"labels/instances/{annotations}" not in f.keys(): 156 f.create_dataset(f"labels/instances/{annotations}", data=instance_mask, compression="gzip") 157 158 if f"labels/semantic/{annotations}" not in f.keys(): 159 f.create_dataset(f"labels/semantic/{annotations}", data=semantic_mask, compression="gzip") 160 161 162def get_puma_data( 163 path: Union[os.PathLike, str], 164 split: Literal["train", "val", "test"], 165 annotations: Literal['nuclei', 'tissue'] = "nuclei", 166 download: bool = False, 167) -> str: 168 """Download the PUMA data. 169 170 Args: 171 path: Filepath to a folder where the downloaded data will be saved. 172 split: The choice of data split. 173 annotations: The choice of annotations. 174 download: Whether to download the data if it is not present. 175 176 Returns: 177 Filepath where the dataset is downloaded and stored for further preprocessing. 
178 """ 179 if annotations not in ["nuclei", "tissue"]: 180 raise ValueError(f"'{annotations}' is not a valid annotation for the data.") 181 182 data_dir = os.path.join(path, split) 183 if os.path.exists(data_dir): 184 return data_dir 185 186 os.makedirs(path, exist_ok=True) 187 188 if not os.path.exists(os.path.join(path, "data")): 189 # Download the data. 190 zip_path = os.path.join(path, "roi.zip") 191 util.download_source(path=zip_path, url=URL["data"], download=download, checksum=CHECKSUM["data"]) 192 util.unzip(zip_path=zip_path, dst=os.path.join(path, "data")) 193 194 # Download the annotations. 195 zip_path = os.path.join(path, "annotations.zip") 196 util.download_source( 197 path=zip_path, 198 url=URL["annotations"][annotations], 199 download=download, 200 checksum=CHECKSUM["annotations"][annotations] 201 ) 202 util.unzip(zip_path=zip_path, dst=os.path.join(path, "annotations", annotations)) 203 204 _preprocess_inputs(path, annotations, split) 205 206 return data_dir 207 208 209def get_puma_paths( 210 path: Union[os.PathLike, str], 211 split: Literal["train", "val", "test"], 212 annotations: Literal['nuclei', 'tissue'] = "nuclei", 213 download: bool = False 214) -> List[str]: 215 """Get paths to the PUMA dataset. 216 217 Args: 218 path: Filepath to a folder where the downloaded data will be saved. 219 split: The choice of data split. 220 annotations: The choice of annotations. 221 download: Whether to download the data if it is not present. 222 223 Returns: 224 List of filepaths for the input data. 225 """ 226 data_dir = get_puma_data(path, split, annotations, download) 227 volume_paths = natsorted(glob(os.path.join(data_dir, "preprocessed", "*.h5"))) 228 return volume_paths 229 230 231def get_puma_dataset( 232 path: Union[os.PathLike, str], 233 patch_shape: Tuple[int, int], 234 split: Literal["train", "val", "test"], 235 annotations: Literal['nuclei', 'tissue'] = "nuclei", 236 label_choice: Literal["instance", "semantic"] = "instance", 237 resize_inputs: bool = False, 238 download: bool = False, 239 **kwargs 240) -> Dataset: 241 """Get the PUMA dataset for nuclei and tissue segmentation. 242 243 Args: 244 path: Filepath to a folder where the downloaded data will be saved. 245 patch_shape: The patch shape to use for training. 246 split: The choice of data split. 247 annotations: The choice of annotations. 248 label_choice: The choice of segmentation type. 249 resize_inputs: Whether to resize the inputs. 250 download: Whether to download the data if it is not present. 251 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 252 253 Returns: 254 The segmentation dataset. 
255 """ 256 volume_paths = get_puma_paths(path, split, annotations, download) 257 258 if resize_inputs: 259 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True} 260 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 261 kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs 262 ) 263 264 return torch_em.default_segmentation_dataset( 265 raw_paths=volume_paths, 266 raw_key="raw", 267 label_paths=volume_paths, 268 label_key=f"labels/{label_choice}/{annotations}", 269 patch_shape=patch_shape, 270 with_channels=True, 271 is_seg_dataset=True, 272 ndim=2, 273 **kwargs 274 ) 275 276 277def get_puma_loader( 278 path: Union[os.PathLike, str], 279 batch_size: int, 280 patch_shape: Tuple[int, int], 281 split: Literal["train", "val", "test"], 282 annotations: Literal['nuclei', 'tissue'] = "nuclei", 283 label_choice: Literal["instances", "semantic"] = "instances", 284 resize_inputs: bool = False, 285 download: bool = False, 286 **kwargs 287) -> DataLoader: 288 """Get the PUMA dataloader for nuclei and tissue segmentation. 289 290 Args: 291 path: Filepath to a folder where the downloaded data will be saved. 292 batch_size: The batch size for training. 293 patch_shape: The patch shape to use for training. 294 split: The choice of data split. 295 annotations: The choice of annotations. 296 label_choice: The choice of segmentation type. 297 resize_inputs: Whether to resize the inputs. 298 download: Whether to download the data if it is not present. 299 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 300 301 Returns: 302 The DataLoader. 303 """ 304 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 305 dataset = get_puma_dataset( 306 path, patch_shape, split, annotations, label_choice, resize_inputs, download, **ds_kwargs 307 ) 308 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL = {
    'data': 'https://zenodo.org/records/13859989/files/01_training_dataset_tif_ROIs.zip',
    'annotations': {
        'nuclei': 'https://zenodo.org/records/13859989/files/01_training_dataset_geojson_nuclei.zip',
        'tissue': 'https://zenodo.org/records/13859989/files/01_training_dataset_geojson_tissue.zip',
    },
}
CHECKSUM = {
    'data': 'a69fd0d8443da29233df103ece5674fb50e8f0cc4b448dc60508cfe883881993',
    'annotations': {
        'nuclei': '17f77ca83fb8fccd918ce723a7b3e5cb5a1730b342ad486628f8885d14a1acbd',
        'tissue': '3b7d6697dd728e3481df0b779ad1e76962f36fc8c871c50edd9aa56ec44c4cc9',
    },
}
CLASS_DICT = {
    'nuclei_stroma': 1,
    'nuclei_tumor': 2,
    'nuclei_plasma_cell': 3,
    'nuclei_histiocyte': 4,
    'nuclei_lymphocyte': 5,
    'nuclei_melanophage': 6,
    'nuclei_neutrophil': 7,
    'nuclei_endothelium': 8,
    'nuclei_epithelium': 9,
    'nuclei_apoptosis': 10,
}
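Since semantic masks store the integer ids above, the mapping can be inverted to decode them back to class names (a small sketch; 0 is the fill value used for background during rasterization):

# Look up class names from semantic label values.
ID_TO_CLASS = {class_id: name for name, class_id in CLASS_DICT.items()}
ID_TO_CLASS[0] = "background"  # 0 is the rasterization fill value.

print(ID_TO_CLASS[2])  # -> 'nuclei_tumor'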
def get_puma_data(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> str:
Download the PUMA data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- annotations: The choice of annotations.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the dataset is downloaded and stored for further preprocessing.
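For example (placeholder path; the first call downloads the data and then preprocesses the requested split):

data_dir = get_puma_data("./data/puma", split="train", annotations="nuclei", download=True)
# data_dir points to "./data/puma/train", which holds the preprocessed HDF5 volumes.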
def get_puma_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> List[str]:
Get paths to the PUMA dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- annotations: The choice of annotations.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
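For example, listing the preprocessed volumes of the validation split (placeholder path; triggers download and preprocessing if needed):

volume_paths = get_puma_paths("./data/puma", split="val", annotations="nuclei", download=True)
print(len(volume_paths), volume_paths[:2])  # natsorted paths to the .h5 volumes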
def get_puma_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
Get the PUMA dataset for nuclei and tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotations: The choice of annotations.
- label_choice: The choice of segmentation type.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
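For example, a dataset over semantic nucleus labels (placeholder path):

dataset = get_puma_dataset(
    path="./data/puma",
    patch_shape=(512, 512),
    split="test",
    annotations="nuclei",
    label_choice="semantic",
    download=True,
)
print(len(dataset))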
def get_puma_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
Get the PUMA dataloader for nuclei and tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotations: The choice of annotations.
- label_choice: The choice of segmentation type.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
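For example (placeholder path; num_workers and shuffle are split off by util.split_kwargs and forwarded to the PyTorch DataLoader):

loader = get_puma_loader(
    path="./data/puma",
    batch_size=4,
    patch_shape=(512, 512),
    split="val",
    annotations="nuclei",
    download=True,
    num_workers=2,  # forwarded to torch.utils.data.DataLoader
    shuffle=True,   # forwarded to torch.utils.data.DataLoader
)
x, y = next(iter(loader))
print(x.shape, y.shape)  # e.g. (4, 3, 512, 512) for the images, plus the matching label shape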