torch_em.data.datasets.histopathology.puma
The PUMA dataset contains annotations for nucleus and tissue segmentation in melanoma H&E stained histopathology images.
This dataset is located at https://zenodo.org/records/13859989. This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/
- Preprint with details about the data: https://doi.org/10.1101/2024.10.07.24315039
Please cite them if you use this dataset for your research.
"""The PUMA dataset contains annotations for nucleus and tissue segmentation
in melanoma H&E stained histopathology images.

This dataset is located at https://zenodo.org/records/13859989.
This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/
- Preprint with details about the data: https://doi.org/10.1101/2024.10.07.24315039

Please cite them if you use this dataset for your research.
"""

import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
from typing import Union, Literal, List, Tuple

import json
import numpy as np
import pandas as pd
import imageio.v3 as imageio
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = {
    "data": "https://zenodo.org/records/13859989/files/01_training_dataset_tif_ROIs.zip",
    "annotations": {
        "nuclei": "https://zenodo.org/records/13859989/files/01_training_dataset_geojson_nuclei.zip",
        "tissue": "https://zenodo.org/records/13859989/files/01_training_dataset_geojson_tissue.zip",
    }
}

CHECKSUM = {
    "data": "a69fd0d8443da29233df103ece5674fb50e8f0cc4b448dc60508cfe883881993",
    "annotations": {
        "nuclei": "17f77ca83fb8fccd918ce723a7b3e5cb5a1730b342ad486628f8885d14a1acbd",
        "tissue": "3b7d6697dd728e3481df0b779ad1e76962f36fc8c871c50edd9aa56ec44c4cc9",
    }
}

# Integer id written to the semantic masks for each annotated class name (0 is background).
CLASS_DICT = {
    "nuclei_stroma": 1,
    "nuclei_tumor": 2,
    "nuclei_plasma_cell": 3,
    "nuclei_histiocyte": 4,
    "nuclei_lymphocyte": 5,
    "nuclei_melanophage": 6,
    "nuclei_neutrophil": 7,
    "nuclei_endothelium": 8,
    "nuclei_epithelium": 9,
    "nuclei_apoptosis": 10,
}


def _create_split_csv(path, split):
    """Return the sample ids of one data split, creating the split file on first use.

    The split is stored in 'puma_split.csv' inside `path` and reused on later calls.
    A new split is drawn separately for the 'metastatic' and 'primary' samples
    (20% test, then 15% of the remainder for validation) and the two are concatenated.

    Args:
        path: The dataset root folder; the images are expected in its 'data' subfolder.
        split: The name of the split to return, one of 'train', 'val' or 'test'.

    Returns:
        List of image ids (file names without extension) that belong to the split.
    """
    csv_path = os.path.join(path, "puma_split.csv")

    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        # Each cell stores the string form of a Python list; swap the quotes so that it parses as JSON.
        return json.loads(df.iloc[0][split].replace("'", '"'))

    print(f"Creating a new split file at '{csv_path}'.")

    # A fixed seed together with sorted ids makes a freshly created split reproducible,
    # independent of the order in which the filesystem lists the images.
    seed = 42
    split_ids = {"train": [], "val": [], "test": []}
    for subtype in ("metastatic", "primary"):
        ids = sorted(
            os.path.basename(image).split(".")[0] for image in glob(os.path.join(path, "data", f"*{subtype}*"))
        )
        train_ids, test_ids = train_test_split(ids, test_size=0.2, random_state=seed)  # 20% for test.
        train_ids, val_ids = train_test_split(train_ids, test_size=0.15, random_state=seed)  # 15% of train for val.
        split_ids["train"].extend(train_ids)
        split_ids["val"].extend(val_ids)
        split_ids["test"].extend(test_ids)

    # One row, one column per split, each cell holding the full id list.
    pd.DataFrame.from_dict([split_ids]).to_csv(csv_path, index=False)

    return split_ids[split]


def _preprocess_inputs(path, annotations, split):
    """Rasterize the geojson annotations of one split and store them with the images as hdf5 files.

    For every annotation file whose image belongs to `split`, an hdf5 file is written to
    '<path>/<split>/preprocessed' with the datasets 'raw', 'labels/instances/<annotations>'
    and 'labels/semantic/<annotations>'. Existing datasets are left untouched, so the
    function can be re-run to add a second annotation type or to resume an interrupted run.

    Args:
        path: The dataset root folder containing the 'data' and 'annotations' subfolders.
        annotations: The annotation type, used as subfolder name, file suffix and label key.
        split: The name of the data split to preprocess.

    Raises:
        RuntimeError: If 'geopandas' or 'rasterio' is not installed.
    """
    import ast
    import h5py
    try:
        import geopandas as gpd
    except ModuleNotFoundError as err:
        raise RuntimeError("Please install 'geopandas': 'conda install -c conda-forge geopandas'.") from err

    try:
        from rasterio.features import rasterize
        from rasterio.transform import from_bounds
    except ModuleNotFoundError as err:
        raise RuntimeError("Please install 'rasterio': 'conda install -c conda-forge rasterio'.") from err

    annotation_paths = glob(os.path.join(path, "annotations", annotations, "*.geojson"))
    roi_dir = os.path.join(path, "data")
    preprocessed_dir = os.path.join(path, split, "preprocessed")
    os.makedirs(preprocessed_dir, exist_ok=True)

    split_list = _create_split_csv(path, split)
    print(f"The data split '{split}' has '{len(split_list)}' samples!")
    split_ids = set(split_list)  # constant-time membership tests inside the loop.

    width, height = 1024, 1024  # roi shape

    for ann_path in tqdm(annotation_paths, desc=f"Preprocessing '{annotations}'"):
        fname = os.path.basename(ann_path).replace(f"_{annotations}.geojson", ".tif")
        image_path = os.path.join(roi_dir, fname)

        # Only the samples of the requested split are processed.
        if os.path.basename(image_path).split(".")[0] not in split_ids:
            continue

        volume_path = os.path.join(preprocessed_dir, Path(fname).with_suffix(".h5"))
        gdf = gpd.read_file(ann_path)
        minx, miny, maxx, maxy = gdf.total_bounds
        # NOTE(review): the bounds of the annotations (not of the image) are mapped onto the roi shape.
        transform = from_bounds(minx, miny, maxx, maxy, width, height)

        # Extract class ids mapped to each class name.
        # NOTE(review): CLASS_DICT only lists nuclei classes; class names of the 'tissue'
        # annotations would presumably raise a KeyError here - confirm against the data.
        class_ids = [
            CLASS_DICT[nuc_class["name"]] for nuc_class in gdf["classification"].apply(ast.literal_eval)
        ]
        semantic_mask = rasterize(
            zip(gdf.geometry, class_ids), out_shape=(height, width), transform=transform, fill=0, dtype=np.uint8
        )

        # Every annotated object gets a unique, consecutive instance id starting at 1.
        gdf['id'] = range(1, len(gdf) + 1)
        instance_mask = rasterize(
            zip(gdf.geometry, gdf['id']), out_shape=(height, width), transform=transform, fill=0, dtype=np.int32
        )

        # Transform labels to match expected orientation: flipping both axes and then
        # flipping left-right again, as done previously, is a single up-down flip.
        instance_mask = np.flipud(instance_mask)
        semantic_mask = np.flipud(semantic_mask)

        image = imageio.imread(image_path)
        image = image[..., :-1].transpose(2, 0, 1)  # drop the last channel and move channels first.

        # Append mode keeps the datasets that were written for another annotation type.
        with h5py.File(volume_path, "a") as f:
            if "raw" not in f:
                f.create_dataset("raw", data=image, compression="gzip")

            if f"labels/instances/{annotations}" not in f:
                f.create_dataset(f"labels/instances/{annotations}", data=instance_mask, compression="gzip")

            if f"labels/semantic/{annotations}" not in f:
                f.create_dataset(f"labels/semantic/{annotations}", data=semantic_mask, compression="gzip")


def _puma_labels_ready(data_dir, annotations):
    """Check whether all preprocessed volumes in `data_dir` already contain labels for `annotations`."""
    volume_paths = glob(os.path.join(data_dir, "preprocessed", "*.h5"))
    if not volume_paths:
        return False

    import h5py

    label_keys = (f"labels/instances/{annotations}", f"labels/semantic/{annotations}")
    for volume_path in volume_paths:
        with h5py.File(volume_path, "r") as f:
            if any(key not in f for key in label_keys):
                return False

    return True


def get_puma_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    download: bool = False,
) -> str:
    """Download the PUMA data.

    The images and the requested annotations are downloaded if they are missing and
    the requested split is preprocessed to hdf5 files. The work is only skipped when
    the split was already preprocessed for the requested annotation type.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        annotations: The choice of annotations.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the dataset is downloaded and stored for further preprocessing.

    Raises:
        ValueError: If `annotations` or `split` is not one of the supported choices.
    """
    if annotations not in ["nuclei", "tissue"]:
        raise ValueError(f"'{annotations}' is not a valid annotation for the data.")

    # Validate before touching the filesystem, so that a typo does not create a stray folder.
    if split not in ["train", "val", "test"]:
        raise ValueError(f"'{split}' is not a valid split for the data.")

    data_dir = os.path.join(path, split)
    # The existence of the split folder alone is not sufficient: it may have been created
    # for the other annotation type or by a preprocessing run that did not finish.
    if _puma_labels_ready(data_dir, annotations):
        return data_dir

    os.makedirs(path, exist_ok=True)

    if not os.path.exists(os.path.join(path, "data")):
        # Download the data.
        zip_path = os.path.join(path, "roi.zip")
        util.download_source(path=zip_path, url=URL["data"], download=download, checksum=CHECKSUM["data"])
        util.unzip(zip_path=zip_path, dst=os.path.join(path, "data"))

    annotation_dir = os.path.join(path, "annotations", annotations)
    if not os.path.exists(annotation_dir):
        # Download the annotations. The archive name contains the annotation type,
        # so that the two archives can never be mistaken for each other.
        zip_path = os.path.join(path, f"annotations_{annotations}.zip")
        util.download_source(
            path=zip_path,
            url=URL["annotations"][annotations],
            download=download,
            checksum=CHECKSUM["annotations"][annotations]
        )
        util.unzip(zip_path=zip_path, dst=annotation_dir)

    _preprocess_inputs(path, annotations, split)

    return data_dir


def get_puma_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    download: bool = False
) -> List[str]:
    """Get paths to the PUMA dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        annotations: The choice of annotations.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    data_dir = get_puma_data(path, split, annotations, download)
    volume_paths = natsorted(glob(os.path.join(data_dir, "preprocessed", "*.h5")))
    return volume_paths


def get_puma_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    label_choice: Literal["instances", "semantic"] = "instances",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the PUMA dataset for nuclei and tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotations: The choice of annotations.
        label_choice: The choice of segmentation type.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_puma_paths(path, split, annotations, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}/{annotations}",
        patch_shape=patch_shape,
        with_channels=True,
        is_seg_dataset=True,
        ndim=2,
        **kwargs
    )


def get_puma_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    label_choice: Literal["instances", "semantic"] = "instances",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PUMA dataloader for nuclei and tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotations: The choice of annotations.
        label_choice: The choice of segmentation type.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_puma_dataset(
        path, patch_shape, split, annotations, label_choice, resize_inputs, download, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
{'data': 'https://zenodo.org/records/13859989/files/01_training_dataset_tif_ROIs.zip', 'annotations': {'nuclei': 'https://zenodo.org/records/13859989/files/01_training_dataset_geojson_nuclei.zip', 'tissue': 'https://zenodo.org/records/13859989/files/01_training_dataset_geojson_tissue.zip'}}
CHECKSUM =
{'data': 'a69fd0d8443da29233df103ece5674fb50e8f0cc4b448dc60508cfe883881993', 'annotations': {'nuclei': '17f77ca83fb8fccd918ce723a7b3e5cb5a1730b342ad486628f8885d14a1acbd', 'tissue': '3b7d6697dd728e3481df0b779ad1e76962f36fc8c871c50edd9aa56ec44c4cc9'}}
CLASS_DICT =
{'nuclei_stroma': 1, 'nuclei_tumor': 2, 'nuclei_plasma_cell': 3, 'nuclei_histiocyte': 4, 'nuclei_lymphocyte': 5, 'nuclei_melanophage': 6, 'nuclei_neutrophil': 7, 'nuclei_endothelium': 8, 'nuclei_epithelium': 9, 'nuclei_apoptosis': 10}
def
get_puma_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> str:
def _puma_labels_ready(data_dir, annotations):
    """Check whether all preprocessed volumes in `data_dir` already contain labels for `annotations`."""
    volume_paths = glob(os.path.join(data_dir, "preprocessed", "*.h5"))
    if not volume_paths:
        return False

    import h5py

    label_keys = (f"labels/instances/{annotations}", f"labels/semantic/{annotations}")
    for volume_path in volume_paths:
        with h5py.File(volume_path, "r") as f:
            if any(key not in f for key in label_keys):
                return False

    return True


def get_puma_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    download: bool = False,
) -> str:
    """Download the PUMA data.

    The images and the requested annotations are downloaded if they are missing and
    the requested split is preprocessed to hdf5 files. The work is only skipped when
    the split was already preprocessed for the requested annotation type.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        annotations: The choice of annotations.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the dataset is downloaded and stored for further preprocessing.

    Raises:
        ValueError: If `annotations` or `split` is not one of the supported choices.
    """
    if annotations not in ["nuclei", "tissue"]:
        raise ValueError(f"'{annotations}' is not a valid annotation for the data.")

    # Validate before touching the filesystem, so that a typo does not create a stray folder.
    if split not in ["train", "val", "test"]:
        raise ValueError(f"'{split}' is not a valid split for the data.")

    data_dir = os.path.join(path, split)
    # The existence of the split folder alone is not sufficient: it may have been created
    # for the other annotation type or by a preprocessing run that did not finish.
    if _puma_labels_ready(data_dir, annotations):
        return data_dir

    os.makedirs(path, exist_ok=True)

    if not os.path.exists(os.path.join(path, "data")):
        # Download the data.
        zip_path = os.path.join(path, "roi.zip")
        util.download_source(path=zip_path, url=URL["data"], download=download, checksum=CHECKSUM["data"])
        util.unzip(zip_path=zip_path, dst=os.path.join(path, "data"))

    annotation_dir = os.path.join(path, "annotations", annotations)
    if not os.path.exists(annotation_dir):
        # Download the annotations. The archive name contains the annotation type,
        # so that the two archives can never be mistaken for each other.
        zip_path = os.path.join(path, f"annotations_{annotations}.zip")
        util.download_source(
            path=zip_path,
            url=URL["annotations"][annotations],
            download=download,
            checksum=CHECKSUM["annotations"][annotations]
        )
        util.unzip(zip_path=zip_path, dst=annotation_dir)

    _preprocess_inputs(path, annotations, split)

    return data_dir
Download the PUMA data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- annotations: The choice of annotations.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the dataset is downloaded and stored for further preprocessing.
def
get_puma_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> List[str]:
def get_puma_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    download: bool = False
) -> List[str]:
    """Collect the preprocessed PUMA volumes of one data split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        annotations: The choice of annotations.
        download: Whether to download the data if it is not present.

    Returns:
        Naturally sorted list of filepaths to the hdf5 files of the split.
    """
    split_dir = get_puma_data(path, split, annotations, download)
    # The hdf5 volumes live in the 'preprocessed' subfolder of the split folder.
    h5_pattern = os.path.join(split_dir, "preprocessed", "*.h5")
    return natsorted(glob(h5_pattern))
Get paths to the PUMA dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- annotations: The choice of annotations.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
def
get_puma_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_puma_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    label_choice: Literal["instances", "semantic"] = "instances",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the PUMA segmentation dataset for nuclei or tissue annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotations: The choice of annotations.
        label_choice: The choice of segmentation type.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_puma_paths(path, split, annotations, download)

    if resize_inputs:
        # Let the shared helper update the dataset arguments and the patch shape for resizing RGB inputs.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # Images and labels are stored in the same hdf5 file, so both path lists are identical.
    label_key = f"labels/{label_choice}/{annotations}"
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        with_channels=True,
        is_seg_dataset=True,
        ndim=2,
        **kwargs
    )
    return dataset
Get the PUMA dataset for nuclei and tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotations: The choice of annotations.
- label_choice: The choice of segmentation type.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_puma_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_puma_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    label_choice: Literal["instances", "semantic"] = "instances",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the PUMA dataloader for nuclei or tissue annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotations: The choice of annotations.
        label_choice: The choice of segmentation type.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset from the ones of the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    puma_dataset = get_puma_dataset(
        path,
        patch_shape,
        split,
        annotations,
        label_choice,
        resize_inputs,
        download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(puma_dataset, batch_size, **dataloader_kwargs)
Get the PUMA dataloader for nuclei and tissue segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotations: The choice of annotations.
- label_choice: The choice of segmentation type.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.