torch_em.data.datasets.histopathology.puma

The PUMA dataset contains annotations for nucleus and tissue segmentation in melanoma H&E stained histopathology images.

This dataset is located at https://zenodo.org/records/13859989. This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/. The dataset is from the publication https://doi.org/10.1093/gigascience/giaf011. Please cite them if you use this dataset for your research.
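
A minimal usage sketch (assumptions: torch-em and its optional dependencies geopandas, rasterio and h5py are installed; "./data/puma" is a hypothetical local folder; the first call downloads and preprocesses the data):

from torch_em.data.datasets.histopathology.puma import get_puma_loader

loader = get_puma_loader(
    path="./data/puma",        # data is downloaded and preprocessed below this folder
    batch_size=2,
    patch_shape=(512, 512),
    split="train",
    annotations="nuclei",      # or "tissue"
    label_choice="instances",  # or "semantic"
    download=True,
)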

  1"""The PUMA dataset contains annotations for nucleus and tissue segmentation
  2in melanoma H&E stained histopathology images.
  3
  4This dataset is located at https://zenodo.org/records/13859989.
  5This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/.
  6The dataset is from the publication https://doi.org/10.1093/gigascience/giaf011.
  7Please cite them if you use this dataset for your research.
  8"""
  9
 10import os
 11from glob import glob
 12from tqdm import tqdm
 13from pathlib import Path
 14from natsort import natsorted
 15from typing import Union, Literal, List, Tuple
 16
 17import json
 18import numpy as np
 19import pandas as pd
 20import imageio.v3 as imageio
 21from sklearn.model_selection import train_test_split
 22
 23from torch.utils.data import Dataset, DataLoader
 24
 25import torch_em
 26
 27from .. import util
 28
 29
 30URL = {
 31    "data": "https://zenodo.org/records/15050523/files/01_training_dataset_tif_ROIs.zip",
 32    "annotations": {
 33        "nuclei": "https://zenodo.org/records/15050523/files/01_training_dataset_geojson_nuclei.zip",
 34        "tissue": "https://zenodo.org/records/15050523/files/01_training_dataset_geojson_tissue.zip",
 35    }
 36}
 37
 38CHECKSUM = {
 39    "data": "af48b879f8ff7e74b84a7114924881606f13f108aa0f9bcc21d3593b717ee022",
 40    "annotations": {
 41        "nuclei": "eda271225900d6de0759e0281f3731a570e09f2adab58bd36425b9d2dfad91a0",
 42        "tissue": "fc2835135cc28324f52eac131327f0f12c554c0b1f334a108bf4b65e0f18c42b",
 43    }
 44}
 45
 46NUCLEI_CLASS_DICT = {
 47    "nuclei_stroma": 1,
 48    "nuclei_tumor": 2,
 49    "nuclei_plasma_cell": 3,
 50    "nuclei_histiocyte": 4,
 51    "nuclei_lymphocyte": 5,
 52    "nuclei_melanophage": 6,
 53    "nuclei_neutrophil": 7,
 54    "nuclei_endothelium": 8,
 55    "nuclei_epithelium": 9,
 56    "nuclei_apoptosis": 10,
 57}
 58
 59TISSUE_CLASS_DICT = {
 60    "tissue_stroma": 1,
 61    "tissue_tumor": 2,
 62    "tissue_epidermis": 3,
 63    "tissue_blood_vessel": 4,
 64    "tissue_necrosis": 5,
 65    "tissue_white_background": 6,
 66}
 67
 68CLASS_DICT = {
 69    "nuclei": NUCLEI_CLASS_DICT,
 70    "tissue": TISSUE_CLASS_DICT,
 71}
 72
 73
 74def _create_split_csv(path, annotations, split):
 75    """Create a data split and save it to a .csv file in the dataset directory."""
 76    csv_path = os.path.join(path, "puma_split.csv")
 77
 78    if os.path.exists(csv_path):
 79        df = pd.read_csv(csv_path)
 80        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))  # parse the string-serialized list back into a Python list.
 81        split_list = df.iloc[0][split]
 82    else:
 83        print(f"Creating a new split file at '{csv_path}'.")
 84        metastatic_ids = [
 85            os.path.basename(image).split(".")[0]
 86            for image in glob(os.path.join(path, "data", "01_training_dataset_tif_ROIs", "*metastatic*"))
 87        ]
 88        primary_ids = [
 89            os.path.basename(image).split(".")[0]
 90            for image in glob(os.path.join(path, "data", "01_training_dataset_tif_ROIs", "*primary*"))
 91        ]
 92
 93        # Create random splits per dataset.
 94        train_ids, test_ids = train_test_split(metastatic_ids, test_size=0.2)  # 20% for test.
 95        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% of the train set for val.
 96        ptrain_ids, ptest_ids = train_test_split(primary_ids, test_size=0.2)  # do same as above for 'primary' samples.
 97        ptrain_ids, pval_ids = train_test_split(ptrain_ids, test_size=0.15)  # do same as above for 'primary' samples.
 98        train_ids.extend(ptrain_ids)
 99        val_ids.extend(pval_ids)
100        test_ids.extend(ptest_ids)
101
102        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}
103
104        df = pd.DataFrame.from_dict([split_ids])
105        df.to_csv(csv_path, index=False)
106
107        split_list = split_ids[split]
108
109    return split_list
110
111
112def _preprocess_inputs(path, annotations, split):
113    import h5py
114    try:
115        import geopandas as gpd
116    except ModuleNotFoundError:
117        raise RuntimeError("Please install 'geopandas': 'conda install -c conda-forge geopandas'.")
118
119    try:
120        from rasterio.features import rasterize
121        from rasterio.transform import from_bounds
122    except ModuleNotFoundError:
123        raise RuntimeError("Please install 'rasterio': 'conda install -c conda-forge rasterio'.")
124
125    annotation_paths = glob(
126        os.path.join(path, "annotations", annotations, f"01_training_dataset_geojson_{annotations}", "*.geojson")
127    )
128    roi_dir = os.path.join(path, "data", "01_training_dataset_tif_ROIs")
129    preprocessed_dir = os.path.join(path, split, "preprocessed")
130    os.makedirs(preprocessed_dir, exist_ok=True)
131
132    split_list = _create_split_csv(path, annotations, split)
133    print(f"The data split '{split}' has '{len(split_list)}' samples!")
134
135    for ann_path in tqdm(annotation_paths, desc=f"Preprocessing '{annotations}'"):
136        fname = os.path.basename(ann_path).replace(f"_{annotations}.geojson", ".tif")
137        image_path = os.path.join(roi_dir, fname)
138
139        # Handle inconsistent extension for sample 103 (.tiff instead of .tif).
140        if not os.path.exists(image_path):
141            image_path = image_path + "f"  # Retrying with .tiff
142
143        if os.path.basename(image_path).split(".")[0] not in split_list:
144            continue
145
146        assert os.path.exists(image_path), image_path
147
148        volume_path = os.path.join(preprocessed_dir, Path(fname).with_suffix(".h5"))
149        gdf = gpd.read_file(ann_path)
150        minx, miny, maxx, maxy = gdf.total_bounds
151
152        width, height = 1024, 1024  # roi shape
153        transform = from_bounds(minx, miny, maxx, maxy, width, height)
154
155        # Extract class ids mapped to each class name.
156        class_dict = CLASS_DICT[annotations]
157        class_ids = [class_dict[cls_entry["name"]] for cls_entry in gdf["classification"]]
158        semantic_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, class_ids))
159        semantic_mask = rasterize(
160            semantic_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.uint8
161        )
162
163        gdf['id'] = range(1, len(gdf) + 1)
164        instance_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, gdf['id']))
165        instance_mask = rasterize(
166            instance_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.int32
167        )
168
169        # Transform labels to match expected orientation
170        instance_mask = np.flip(instance_mask)
171        instance_mask = np.fliplr(instance_mask)
172
173        semantic_mask = np.flip(semantic_mask)
174        semantic_mask = np.fliplr(semantic_mask)
175
176        image = imageio.imread(image_path)
177        image = image[..., :-1].transpose(2, 0, 1)  # drop the last channel and move the channel axis to the front.
178
179        with h5py.File(volume_path, "a") as f:
180            if "raw" not in f.keys():
181                f.create_dataset("raw", data=image, compression="gzip")
182
183            if f"labels/instances/{annotations}" not in f.keys():
184                f.create_dataset(f"labels/instances/{annotations}", data=instance_mask, compression="gzip")
185
186            if f"labels/semantic/{annotations}" not in f.keys():
187                f.create_dataset(f"labels/semantic/{annotations}", data=semantic_mask, compression="gzip")
188
189
190def _annotations_are_stored(data_dir, annotations):
191    import h5py
192    volume_paths = glob(os.path.join(data_dir, "preprocessed", "*.h5"))
193    if not volume_paths:
194        return False
195    with h5py.File(volume_paths[0], "r") as f:
196        return f"labels/instances/{annotations}" in f.keys()
197
198
199def get_puma_data(
200    path: Union[os.PathLike, str],
201    split: Literal["train", "val", "test"],
202    annotations: Literal['nuclei', 'tissue'] = "nuclei",
203    download: bool = False,
204) -> str:
205    """Download and preprocess the PUMA data.
206
207    Args:
208        path: Filepath to a folder where the downloaded data will be saved.
209        split: The choice of data split.
210        annotations: The choice of annotations.
211        download: Whether to download the data if it is not present.
212
213    Returns:
214        Filepath where the downloaded and preprocessed data is stored.
215    """
216    if annotations not in ["nuclei", "tissue"]:
217        raise ValueError(f"'{annotations}' is not a valid annotation for the data.")
218
219    data_dir = os.path.join(path, split)
220    if os.path.exists(data_dir) and _annotations_are_stored(data_dir, annotations):
221        return data_dir
222
223    os.makedirs(path, exist_ok=True)
224
225    if not os.path.exists(os.path.join(path, "data")):
226        # Download the data.
227        zip_path = os.path.join(path, "roi.zip")
228        util.download_source(path=zip_path, url=URL["data"], download=download, checksum=CHECKSUM["data"])
229        util.unzip(zip_path=zip_path, dst=os.path.join(path, "data"))
230
231    # Download the annotations.
232    zip_path = os.path.join(path, "annotations.zip")
233    util.download_source(
234        path=zip_path,
235        url=URL["annotations"][annotations],
236        download=download,
237        checksum=CHECKSUM["annotations"][annotations]
238    )
239    util.unzip(zip_path=zip_path, dst=os.path.join(path, "annotations", annotations))
240
241    _preprocess_inputs(path, annotations, split)
242
243    return data_dir
244
245
246def get_puma_paths(
247    path: Union[os.PathLike, str],
248    split: Literal["train", "val", "test"],
249    annotations: Literal['nuclei', 'tissue'] = "nuclei",
250    download: bool = False
251) -> List[str]:
252    """Get paths to the PUMA dataset.
253
254    Args:
255        path: Filepath to a folder where the downloaded data will be saved.
256        split: The choice of data split.
257        annotations: The choice of annotations.
258        download: Whether to download the data if it is not present.
259
260    Returns:
261        List of filepaths for the input data.
262    """
263    data_dir = get_puma_data(path, split, annotations, download)
264    volume_paths = natsorted(glob(os.path.join(data_dir, "preprocessed", "*.h5")))
265    return volume_paths
266
267
268def get_puma_dataset(
269    path: Union[os.PathLike, str],
270    patch_shape: Tuple[int, int],
271    split: Literal["train", "val", "test"],
272    annotations: Literal['nuclei', 'tissue'] = "nuclei",
273    label_choice: Literal["instances", "semantic"] = "instances",
274    resize_inputs: bool = False,
275    download: bool = False,
276    **kwargs
277) -> Dataset:
278    """Get the PUMA dataset for nuclei and tissue segmentation.
279
280    Args:
281        path: Filepath to a folder where the downloaded data will be saved.
282        patch_shape: The patch shape to use for training.
283        split: The choice of data split.
284        annotations: The choice of annotations.
285        label_choice: The choice of segmentation type.
286        resize_inputs: Whether to resize the inputs.
287        download: Whether to download the data if it is not present.
288        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
289
290    Returns:
291        The segmentation dataset.
292    """
293    volume_paths = get_puma_paths(path, split, annotations, download)
294
295    if resize_inputs:
296        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
297        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
298            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
299        )
300
301    return torch_em.default_segmentation_dataset(
302        raw_paths=volume_paths,
303        raw_key="raw",
304        label_paths=volume_paths,
305        label_key=f"labels/{label_choice}/{annotations}",
306        patch_shape=patch_shape,
307        with_channels=True,
308        is_seg_dataset=True,
309        ndim=2,
310        **kwargs
311    )
312
313
314def get_puma_loader(
315    path: Union[os.PathLike, str],
316    batch_size: int,
317    patch_shape: Tuple[int, int],
318    split: Literal["train", "val", "test"],
319    annotations: Literal['nuclei', 'tissue'] = "nuclei",
320    label_choice: Literal["instances", "semantic"] = "instances",
321    resize_inputs: bool = False,
322    download: bool = False,
323    **kwargs
324) -> DataLoader:
325    """Get the PUMA dataloader for nuclei and tissue segmentation.
326
327    Args:
328        path: Filepath to a folder where the downloaded data will be saved.
329        batch_size: The batch size for training.
330        patch_shape: The patch shape to use for training.
331        split: The choice of data split.
332        annotations: The choice of annotations.
333        label_choice: The choice of segmentation type.
334        resize_inputs: Whether to resize the inputs.
335        download: Whether to download the data if it is not present.
336        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
337
338    Returns:
339        The DataLoader.
340    """
341    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
342    dataset = get_puma_dataset(
343        path, patch_shape, split, annotations, label_choice, resize_inputs, download, **ds_kwargs
344    )
345    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

def get_puma_data(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> str:

Download and preprocess the PUMA data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • download: Whether to download the data if it is not present.
Returns:
  Filepath where the downloaded and preprocessed data is stored.
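
For example, a minimal sketch ("./data/puma" is a hypothetical target folder; download=True is only needed for the first call):

from torch_em.data.datasets.histopathology.puma import get_puma_data

data_dir = get_puma_data(path="./data/puma", split="train", annotations="nuclei", download=True)
# data_dir points to "./data/puma/train", which contains the preprocessed h5 volumes under "preprocessed/".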

def get_puma_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> List[str]:

Get paths to the PUMA dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • download: Whether to download the data if it is not present.
Returns:
  List of filepaths for the input data.
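
For example, a minimal sketch ("./data/puma" is a hypothetical target folder; h5py is assumed to be installed):

import h5py
from torch_em.data.datasets.histopathology.puma import get_puma_paths

volume_paths = get_puma_paths(path="./data/puma", split="val", annotations="nuclei", download=True)
with h5py.File(volume_paths[0], "r") as f:
    print(f["raw"].shape)                      # channel-first image, e.g. (3, 1024, 1024)
    print(f["labels/instances/nuclei"].shape)  # instance segmentation, e.g. (1024, 1024)
    print(f["labels/semantic/nuclei"].shape)   # semantic segmentation, e.g. (1024, 1024)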

def get_puma_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the PUMA dataset for nuclei and tissue segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • label_choice: The choice of segmentation type.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  The segmentation dataset.
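
For example, a minimal sketch ("./data/puma" is a hypothetical target folder):

from torch_em.data.datasets.histopathology.puma import get_puma_dataset

dataset = get_puma_dataset(
    path="./data/puma",
    patch_shape=(512, 512),
    split="train",
    annotations="nuclei",
    label_choice="semantic",  # per-pixel class ids following NUCLEI_CLASS_DICT
    download=True,
)
raw, label = dataset[0]  # a single (image, label) patch pair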

def get_puma_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the PUMA dataloader for nuclei and tissue segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • label_choice: The choice of segmentation type.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  The DataLoader.
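
For example, a minimal sketch ("./data/puma" is a hypothetical target folder; shuffle and num_workers are split off and forwarded to the PyTorch DataLoader):

from torch_em.data.datasets.histopathology.puma import get_puma_loader

loader = get_puma_loader(
    path="./data/puma",
    batch_size=4,
    patch_shape=(512, 512),
    split="train",
    annotations="tissue",
    label_choice="instances",
    download=True,
    shuffle=True,     # DataLoader keyword argument
    num_workers=2,    # DataLoader keyword argument
)
x, y = next(iter(loader))  # x: batch of raw image patches, y: batch of label patches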