torch_em.data.datasets.histopathology.puma

The PUMA dataset contains annotations for nucleus and tissue segmentation in melanoma H&E stained histopathology images.

This dataset is located at https://zenodo.org/records/13859989. This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/
- Preprint with details about the data: https://doi.org/10.1101/2024.10.07.24315039

Please cite them if you use this dataset for your research.
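
As a quick start, the sketch below shows how to get a PyTorch dataloader for this dataset. The storage folder "./data/puma" is only an example path; with download=True the data is fetched from Zenodo on first use.

from torch_em.data.datasets.histopathology.puma import get_puma_loader

# Create a training dataloader for nucleus instance segmentation.
loader = get_puma_loader(
    path="./data/puma",       # example storage folder
    batch_size=2,
    patch_shape=(512, 512),
    split="train",
    annotations="nuclei",
    label_choice="instances",
    download=True,
)
raw, labels = next(iter(loader))  # one batch of image patches and label patches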

  1"""The PUMA dataset contains annotations for nucleus and tissue segmentation
  2in melanoma H&E stained histopathology images.
  3
  4This dataset is located at https://zenodo.org/records/13859989.
  5This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/
  6- Preprint with details about the data: https://doi.org/10.1101/2024.10.07.24315039
  7
  8Please cite them if you use this dataset for your research.
  9"""

import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
from typing import Union, Literal, List, Tuple

import json
import numpy as np
import pandas as pd
import imageio.v3 as imageio
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = {
    "data": "https://zenodo.org/records/13859989/files/01_training_dataset_tif_ROIs.zip",
    "annotations": {
        "nuclei": "https://zenodo.org/records/13859989/files/01_training_dataset_geojson_nuclei.zip",
        "tissue": "https://zenodo.org/records/13859989/files/01_training_dataset_geojson_tissue.zip",
    }
}

CHECKSUM = {
    "data": "a69fd0d8443da29233df103ece5674fb50e8f0cc4b448dc60508cfe883881993",
    "annotations": {
        "nuclei": "17f77ca83fb8fccd918ce723a7b3e5cb5a1730b342ad486628f8885d14a1acbd",
        "tissue": "3b7d6697dd728e3481df0b779ad1e76962f36fc8c871c50edd9aa56ec44c4cc9",
    }
}

CLASS_DICT = {
    "nuclei_stroma": 1,
    "nuclei_tumor": 2,
    "nuclei_plasma_cell": 3,
    "nuclei_histiocyte": 4,
    "nuclei_lymphocyte": 5,
    "nuclei_melanophage": 6,
    "nuclei_neutrophil": 7,
    "nuclei_endothelium": 8,
    "nuclei_epithelium": 9,
    "nuclei_apoptosis": 10,
}


def _create_split_csv(path, split):
    """Create a data split and save it to a .csv file in the dataset directory.

    The csv stores a single row with 'train', 'val' and 'test' columns,
    each holding the list of sample ids assigned to that split.
    """
    csv_path = os.path.join(path, "puma_split.csv")

    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))  # parse the stored list of ids.
        split_list = df.iloc[0][split]
    else:
        print(f"Creating a new split file at '{csv_path}'.")
        metastatic_ids = [
            os.path.basename(image).split(".")[0] for image in glob(os.path.join(path, "data", "*metastatic*"))
        ]
        primary_ids = [
            os.path.basename(image).split(".")[0] for image in glob(os.path.join(path, "data", "*primary*"))
        ]

        # Create random splits per sample type, so that 'metastatic' and 'primary'
        # samples are distributed proportionally over the train, val and test splits.
        train_ids, test_ids = train_test_split(metastatic_ids, test_size=0.2)  # 20% for test.
        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% of the train set for val.
        ptrain_ids, ptest_ids = train_test_split(primary_ids, test_size=0.2)  # do the same for 'primary' samples.
        ptrain_ids, pval_ids = train_test_split(ptrain_ids, test_size=0.15)
        train_ids.extend(ptrain_ids)
        val_ids.extend(pval_ids)
        test_ids.extend(ptest_ids)

        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}

        df = pd.DataFrame.from_dict([split_ids])
        df.to_csv(csv_path, index=False)

        split_list = split_ids[split]

    return split_list


def _preprocess_inputs(path, annotations, split):
    import ast
    import h5py
    import geopandas as gpd
    from rasterio.features import rasterize
    from rasterio.transform import from_bounds

    annotation_paths = glob(os.path.join(path, "annotations", annotations, "*.geojson"))
    roi_dir = os.path.join(path, "data")
    preprocessed_dir = os.path.join(path, split, "preprocessed")
    os.makedirs(preprocessed_dir, exist_ok=True)

    split_list = _create_split_csv(path, split)
    print(f"The data split '{split}' has {len(split_list)} samples!")

    for ann_path in tqdm(annotation_paths, desc=f"Preprocessing '{annotations}'"):
        fname = os.path.basename(ann_path).replace(f"_{annotations}.geojson", ".tif")
        image_path = os.path.join(roi_dir, fname)

        if os.path.basename(image_path).split(".")[0] not in split_list:
            continue

        volume_path = os.path.join(preprocessed_dir, Path(fname).with_suffix(".h5"))
        gdf = gpd.read_file(ann_path)
        minx, miny, maxx, maxy = gdf.total_bounds

        width, height = 1024, 1024  # shape of the ROIs.
        transform = from_bounds(minx, miny, maxx, maxy, width, height)

        # Extract the class id mapped to each class name.
        class_ids = [
            CLASS_DICT[nuc_class["name"]] for nuc_class in gdf["classification"].apply(lambda x: ast.literal_eval(x))
        ]
        semantic_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, class_ids))
        semantic_mask = rasterize(
            semantic_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.uint8
        )

        gdf['id'] = range(1, len(gdf) + 1)
        instance_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, gdf['id']))
        instance_mask = rasterize(
            instance_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.int32
        )

        # Transform the labels to match the expected orientation:
        # flipping along both axes and then flipping left-right amounts to a vertical flip.
        instance_mask = np.flip(instance_mask)
        instance_mask = np.fliplr(instance_mask)

        semantic_mask = np.flip(semantic_mask)
        semantic_mask = np.fliplr(semantic_mask)

        # Drop the alpha channel and convert the image to channel-first layout.
        image = imageio.imread(image_path)
        image = image[..., :-1].transpose(2, 0, 1)

        with h5py.File(volume_path, "a") as f:
            if "raw" not in f.keys():
                f.create_dataset("raw", data=image, compression="gzip")

            if f"labels/instances/{annotations}" not in f.keys():
                f.create_dataset(f"labels/instances/{annotations}", data=instance_mask, compression="gzip")

            if f"labels/semantic/{annotations}" not in f.keys():
                f.create_dataset(f"labels/semantic/{annotations}", data=semantic_mask, compression="gzip")


def get_puma_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    download: bool = False,
) -> str:
    """Download the PUMA data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        annotations: The choice of annotations.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the dataset is downloaded and stored for further preprocessing.
    """
    if annotations not in ["nuclei", "tissue"]:
        raise ValueError(f"'{annotations}' is not a valid annotation for the data.")

    data_dir = os.path.join(path, split)
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    if not os.path.exists(os.path.join(path, "data")):
        # Download the data.
        zip_path = os.path.join(path, "roi.zip")
        util.download_source(path=zip_path, url=URL["data"], download=download, checksum=CHECKSUM["data"])
        util.unzip(zip_path=zip_path, dst=os.path.join(path, "data"))

    # Download the annotations.
    zip_path = os.path.join(path, "annotations.zip")
    util.download_source(
        path=zip_path,
        url=URL["annotations"][annotations],
        download=download,
        checksum=CHECKSUM["annotations"][annotations]
    )
    util.unzip(zip_path=zip_path, dst=os.path.join(path, "annotations", annotations))

    _preprocess_inputs(path, annotations, split)

    return data_dir


def get_puma_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    download: bool = False
) -> List[str]:
    """Get paths to the PUMA dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        annotations: The choice of annotations.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    data_dir = get_puma_data(path, split, annotations, download)
    volume_paths = natsorted(glob(os.path.join(data_dir, "preprocessed", "*.h5")))
    return volume_paths


def get_puma_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    label_choice: Literal["instances", "semantic"] = "instances",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the PUMA dataset for nuclei and tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotations: The choice of annotations.
        label_choice: The choice of segmentation type.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_puma_paths(path, split, annotations, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}/{annotations}",
        patch_shape=patch_shape,
        with_channels=True,
        is_seg_dataset=True,
        ndim=2,
        **kwargs
    )


def get_puma_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    annotations: Literal['nuclei', 'tissue'] = "nuclei",
    label_choice: Literal["instances", "semantic"] = "instances",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PUMA dataloader for nuclei and tissue segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotations: The choice of annotations.
        label_choice: The choice of segmentation type.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_puma_dataset(
        path, patch_shape, split, annotations, label_choice, resize_inputs, download, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL = {'data': 'https://zenodo.org/records/13859989/files/01_training_dataset_tif_ROIs.zip', 'annotations': {'nuclei': 'https://zenodo.org/records/13859989/files/01_training_dataset_geojson_nuclei.zip', 'tissue': 'https://zenodo.org/records/13859989/files/01_training_dataset_geojson_tissue.zip'}}
CHECKSUM = {'data': 'a69fd0d8443da29233df103ece5674fb50e8f0cc4b448dc60508cfe883881993', 'annotations': {'nuclei': '17f77ca83fb8fccd918ce723a7b3e5cb5a1730b342ad486628f8885d14a1acbd', 'tissue': '3b7d6697dd728e3481df0b779ad1e76962f36fc8c871c50edd9aa56ec44c4cc9'}}
CLASS_DICT = {'nuclei_stroma': 1, 'nuclei_tumor': 2, 'nuclei_plasma_cell': 3, 'nuclei_histiocyte': 4, 'nuclei_lymphocyte': 5, 'nuclei_melanophage': 6, 'nuclei_neutrophil': 7, 'nuclei_endothelium': 8, 'nuclei_epithelium': 9, 'nuclei_apoptosis': 10}
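
The ids above are the values written to the semantic masks during preprocessing; pixels not covered by any annotation are filled with 0 (background). A small illustrative sketch for decoding a semantic mask back to class names (the helper function is hypothetical):

import numpy as np

from torch_em.data.datasets.histopathology.puma import CLASS_DICT

# Invert the name -> id mapping; id 0 is the background fill value.
ID_TO_CLASS = {v: k for k, v in CLASS_DICT.items()}

def count_pixels_per_class(semantic_mask: np.ndarray) -> dict:
    """Hypothetical helper: count the pixels per nucleus class in a semantic mask."""
    ids, counts = np.unique(semantic_mask, return_counts=True)
    return {ID_TO_CLASS.get(int(i), "background"): int(c) for i, c in zip(ids, counts)}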
def get_puma_data(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> str:

Download the PUMA data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • download: Whether to download the data if it is not present.
Returns:
  Filepath where the dataset is downloaded and stored for further preprocessing.
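
A minimal call sketch, assuming "./data/puma" as the storage folder:

from torch_em.data.datasets.histopathology.puma import get_puma_data

data_dir = get_puma_data("./data/puma", split="train", annotations="nuclei", download=True)
# 'data_dir' points to "./data/puma/train", which holds the preprocessed
# HDF5 volumes in its "preprocessed" subfolder.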

def get_puma_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> List[str]:

Get paths to the PUMA dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • download: Whether to download the data if it is not present.
Returns:
  List of filepaths for the input data.
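
Each returned path points to an HDF5 volume written during preprocessing. A short sketch for inspecting one of them (the storage folder is again an example path):

import h5py

from torch_em.data.datasets.histopathology.puma import get_puma_paths

volume_paths = get_puma_paths("./data/puma", split="val", annotations="nuclei", download=True)
with h5py.File(volume_paths[0], "r") as f:
    print(f["raw"].shape)                      # channel-first RGB image, e.g. (3, 1024, 1024)
    print(f["labels/instances/nuclei"].shape)  # instance mask, e.g. (1024, 1024)
    print(f["labels/semantic/nuclei"].shape)   # semantic mask with the CLASS_DICT ids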

def get_puma_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the PUMA dataset for nuclei and tissue segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • label_choice: The choice of segmentation type.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  The segmentation dataset.
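
A usage sketch; note that label_choice selects the label group written during preprocessing, i.e. "instances" or "semantic":

from torch_em.data.datasets.histopathology.puma import get_puma_dataset

dataset = get_puma_dataset(
    path="./data/puma",       # example storage folder
    patch_shape=(512, 512),
    split="train",
    annotations="nuclei",
    label_choice="semantic",
    download=True,
)
raw, label = dataset[0]  # one raw patch and the matching label patch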

def get_puma_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the PUMA dataloader for nuclei and tissue segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • label_choice: The choice of segmentation type.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  The DataLoader.
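
Since the keyword arguments are split between the dataset and the DataLoader, loader-specific options such as shuffle or num_workers can be passed directly; a sketch:

from torch_em.data.datasets.histopathology.puma import get_puma_loader

loader = get_puma_loader(
    path="./data/puma",       # example storage folder
    batch_size=4,
    patch_shape=(512, 512),
    split="train",
    annotations="nuclei",
    download=True,
    shuffle=True,     # forwarded to the PyTorch DataLoader
    num_workers=4,    # forwarded to the PyTorch DataLoader
)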