torch_em.data.datasets.histopathology.puma

The PUMA dataset contains annotations for nucleus and tissue segmentation in melanoma H&E stained histopathology images.

This dataset is located at https://zenodo.org/records/13859989. This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/. The dataset is from the publication https://doi.org/10.1093/gigascience/giaf011. Please cite them if you use this dataset for your research.
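
A minimal usage sketch (assumptions: torch-em and its optional dependencies geopandas, rasterio and h5py are installed; "./data/puma" is a hypothetical local folder; the first call downloads and preprocesses the data):

from torch_em.data.datasets.histopathology.puma import get_puma_loader

loader = get_puma_loader(
    path="./data/puma",        # data is downloaded and preprocessed below this folder
    batch_size=2,
    patch_shape=(512, 512),
    split="train",
    annotations="nuclei",      # or "tissue"
    label_choice="instances",  # or "semantic"
    download=True,
)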

  1"""The PUMA dataset contains annotations for nucleus and tissue segmentation
  2in melanoma H&E stained histopathology images.
  3
  4This dataset is located at https://zenodo.org/records/13859989.
  5This is part of the PUMA Grand Challenge: https://puma.grand-challenge.org/.
  6The dataset is from the publication https://doi.org/10.1093/gigascience/giaf011.
  7Please cite them if you use this dataset for your research.
  8"""
  9
 10import os
 11from glob import glob
 12from tqdm import tqdm
 13from pathlib import Path
 14from natsort import natsorted
 15from typing import Union, Literal, List, Tuple
 16
 17import json
 18import numpy as np
 19import pandas as pd
 20import imageio.v3 as imageio
 21from sklearn.model_selection import train_test_split
 22
 23from torch.utils.data import Dataset, DataLoader
 24
 25import torch_em
 26
 27from .. import util
 28
 29
 30URL = {
 31    "data": "https://zenodo.org/records/15050523/files/01_training_dataset_tif_ROIs.zip",
 32    "annotations": {
 33        "nuclei": "https://zenodo.org/records/15050523/files/01_training_dataset_geojson_nuclei.zip",
 34        "tissue": "https://zenodo.org/records/15050523/files/01_training_dataset_geojson_tissue.zip",
 35    }
 36}
 37
 38CHECKSUM = {
 39    "data": "af48b879f8ff7e74b84a7114924881606f13f108aa0f9bcc21d3593b717ee022",
 40    "annotations": {
 41        "nuclei": "eda271225900d6de0759e0281f3731a570e09f2adab58bd36425b9d2dfad91a0",
 42        "tissue": "fc2835135cc28324f52eac131327f0f12c554c0b1f334a108bf4b65e0f18c42b",
 43    }
 44}
 45
 46NUCLEI_CLASS_DICT = {
 47    "nuclei_stroma": 1,
 48    "nuclei_tumor": 2,
 49    "nuclei_plasma_cell": 3,
 50    "nuclei_histiocyte": 4,
 51    "nuclei_lymphocyte": 5,
 52    "nuclei_melanophage": 6,
 53    "nuclei_neutrophil": 7,
 54    "nuclei_endothelium": 8,
 55    "nuclei_epithelium": 9,
 56    "nuclei_apoptosis": 10,
 57}
 58
 59TISSUE_CLASS_DICT = {
 60    "tissue_stroma": 1,
 61    "tissue_tumor": 2,
 62    "tissue_epidermis": 3,
 63    "tissue_blood_vessel": 4,
 64    "tissue_necrosis": 5,
 65    "tissue_white_background": 6,
 66}
 67
 68CLASS_DICT = {
 69    "nuclei": NUCLEI_CLASS_DICT,
 70    "tissue": TISSUE_CLASS_DICT,
 71}
 72
 73
 74def _create_split_csv(path, annotations, split):
 75    """Create a data split and save it to a .csv file in the dataset directory."""
 76    csv_path = os.path.join(path, "puma_split.csv")
 77
 78    if os.path.exists(csv_path):
 79        df = pd.read_csv(csv_path)
 80        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))  # parse the string-serialized list back into a Python list.
 81        split_list = df.iloc[0][split]
 82    else:
 83        print(f"Creating a new split file at '{csv_path}'.")
 84        metastatic_ids = [
 85            os.path.basename(image).split(".")[0]
 86            for image in glob(os.path.join(path, "data", "01_training_dataset_tif_ROIs", "*metastatic*"))
 87        ]
 88        primary_ids = [
 89            os.path.basename(image).split(".")[0]
 90            for image in glob(os.path.join(path, "data", "01_training_dataset_tif_ROIs", "*primary*"))
 91        ]
 92
 93        # Create random splits per dataset.
 94        train_ids, test_ids = train_test_split(metastatic_ids, test_size=0.2)  # 20% for test.
 95        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% of the train set for val.
 96        ptrain_ids, ptest_ids = train_test_split(primary_ids, test_size=0.2)  # do same as above for 'primary' samples.
 97        ptrain_ids, pval_ids = train_test_split(ptrain_ids, test_size=0.15)  # do same as above for 'primary' samples.
 98        train_ids.extend(ptrain_ids)
 99        val_ids.extend(pval_ids)
100        test_ids.extend(ptest_ids)
101
102        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}
103
104        df = pd.DataFrame.from_dict([split_ids])
105        df.to_csv(csv_path, index=False)
106
107        split_list = split_ids[split]
108
109    return split_list
110
111
112def _preprocess_inputs(path, annotations, split):
113    import h5py
114    try:
115        import geopandas as gpd
116    except ModuleNotFoundError:
117        raise RuntimeError("Please install 'geopandas': 'conda install -c conda-forge geopandas'.")
118
119    try:
120        from rasterio.features import rasterize
121        from rasterio.transform import from_bounds
122    except ModuleNotFoundError:
123        raise RuntimeError("Please install 'rasterio': 'conda install -c conda-forge rasterio'.")
124
125    annotation_paths = glob(
126        os.path.join(path, "annotations", annotations, f"01_training_dataset_geojson_{annotations}", "*.geojson")
127    )
128    roi_dir = os.path.join(path, "data", "01_training_dataset_tif_ROIs")
129    preprocessed_dir = os.path.join(path, split, "preprocessed")
130    os.makedirs(preprocessed_dir, exist_ok=True)
131
132    split_list = _create_split_csv(path, annotations, split)
133    print(f"The data split '{split}' has '{len(split_list)}' samples!")
134
135    for ann_path in tqdm(annotation_paths, desc=f"Preprocessing '{annotations}'"):
136        fname = os.path.basename(ann_path).replace(f"_{annotations}.geojson", ".tif")
137        image_path = os.path.join(roi_dir, fname)
138
139        # Handle inconsistent extension for sample 103 (.tiff instead of .tif).
140        if not os.path.exists(image_path):
141            image_path = image_path + "f"  # Retrying with .tiff
142
143        if os.path.basename(image_path).split(".")[0] not in split_list:
144            continue
145
146        assert os.path.exists(image_path), image_path
147
148        volume_path = os.path.join(preprocessed_dir, Path(fname).with_suffix(".h5"))
149        gdf = gpd.read_file(ann_path)
150        minx, miny, maxx, maxy = gdf.total_bounds
151
152        width, height = 1024, 1024  # roi shape
153        transform = from_bounds(minx, miny, maxx, maxy, width, height)
154
155        # Extract class ids mapped to each class name.
156        class_dict = CLASS_DICT[annotations]
157        class_ids = [class_dict[cls_entry["name"]] for cls_entry in gdf["classification"]]
158        semantic_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, class_ids))
159        semantic_mask = rasterize(
160            semantic_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.uint8
161        )
162
163        gdf['id'] = range(1, len(gdf) + 1)
164        instance_shapes = ((geom, unique_id) for geom, unique_id in zip(gdf.geometry, gdf['id']))
165        instance_mask = rasterize(
166            instance_shapes, out_shape=(height, width), transform=transform, fill=0, dtype=np.int32
167        )
168
169        # Transform labels to match expected orientation
170        instance_mask = np.flip(instance_mask)
171        instance_mask = np.fliplr(instance_mask)
172
173        semantic_mask = np.flip(semantic_mask)
174        semantic_mask = np.fliplr(semantic_mask)
175
176        image = imageio.imread(image_path)
177        image = image[..., :-1].transpose(2, 0, 1)  # drop the last channel and move the channel axis to the front.
178
179        with h5py.File(volume_path, "a") as f:
180            if "raw" not in f.keys():
181                f.create_dataset("raw", data=image, compression="gzip")
182
183            if f"labels/instances/{annotations}" not in f.keys():
184                f.create_dataset(f"labels/instances/{annotations}", data=instance_mask, compression="gzip")
185
186            if f"labels/semantic/{annotations}" not in f.keys():
187                f.create_dataset(f"labels/semantic/{annotations}", data=semantic_mask, compression="gzip")
188
189
190def _annotations_are_stored(data_dir, annotations):
191    import h5py
192    volume_paths = glob(os.path.join(data_dir, "preprocessed", "*.h5"))
193    if not volume_paths:
194        return False
195    with h5py.File(volume_paths[0], "r") as f:
196        return f"labels/instances/{annotations}" in f.keys()
197
198
199def get_puma_data(
200    path: Union[os.PathLike, str],
201    split: Literal["train", "val", "test"],
202    annotations: Literal['nuclei', 'tissue'] = "nuclei",
203    download: bool = False,
204) -> str:
205    """Download and preprocess the PUMA data.
206
207    Args:
208        path: Filepath to a folder where the downloaded data will be saved.
209        split: The choice of data split.
210        annotations: The choice of annotations.
211        download: Whether to download the data if it is not present.
212
213    Returns:
214        Filepath where the downloaded and preprocessed data is stored.
215    """
216    if annotations not in ["nuclei", "tissue"]:
217        raise ValueError(f"'{annotations}' is not a valid annotation for the data.")
218
219    data_dir = os.path.join(path, split)
220    if os.path.exists(data_dir) and _annotations_are_stored(data_dir, annotations):
221        return data_dir
222
223    os.makedirs(path, exist_ok=True)
224
225    if not os.path.exists(os.path.join(path, "data")):
226        # Download the data.
227        zip_path = os.path.join(path, "roi.zip")
228        util.download_source(path=zip_path, url=URL["data"], download=download, checksum=CHECKSUM["data"])
229        util.unzip(zip_path=zip_path, dst=os.path.join(path, "data"))
230
231    # Download the annotations.
232    zip_path = os.path.join(path, "annotations.zip")
233    util.download_source(
234        path=zip_path,
235        url=URL["annotations"][annotations],
236        download=download,
237        checksum=CHECKSUM["annotations"][annotations]
238    )
239    util.unzip(zip_path=zip_path, dst=os.path.join(path, "annotations", annotations))
240
241    _preprocess_inputs(path, annotations, split)
242
243    return data_dir
244
245
246def get_puma_paths(
247    path: Union[os.PathLike, str],
248    split: Literal["train", "val", "test"],
249    annotations: Literal['nuclei', 'tissue'] = "nuclei",
250    download: bool = False
251) -> List[str]:
252    """Get paths to the PUMA dataset.
253
254    Args:
255        path: Filepath to a folder where the downloaded data will be saved.
256        split: The choice of data split.
257        annotations: The choice of annotations.
258        download: Whether to download the data if it is not present.
259
260    Returns:
261        List of filepaths for the input data.
262    """
263    data_dir = get_puma_data(path, split, annotations, download)
264    volume_paths = natsorted(glob(os.path.join(data_dir, "preprocessed", "*.h5")))
265    return volume_paths
266
267
268def get_puma_dataset(
269    path: Union[os.PathLike, str],
270    patch_shape: Tuple[int, int],
271    split: Literal["train", "val", "test"],
272    annotations: Literal['nuclei', 'tissue'] = "nuclei",
273    label_choice: Literal["instances", "semantic"] = "instances",
274    resize_inputs: bool = False,
275    download: bool = False,
276    **kwargs
277) -> Dataset:
278    """Get the PUMA dataset for nuclei and tissue segmentation.
279
280    Args:
281        path: Filepath to a folder where the downloaded data will be saved.
282        patch_shape: The patch shape to use for training.
283        split: The choice of data split.
284        annotations: The choice of annotations.
285        label_choice: The choice of segmentation type.
286        resize_inputs: Whether to resize the inputs.
287        download: Whether to download the data if it is not present.
288        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
289
290    Returns:
291        The segmentation dataset.
292    """
293    volume_paths = get_puma_paths(path, split, annotations, download)
294
295    if resize_inputs:
296        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
297        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
298            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
299        )
300
301    return torch_em.default_segmentation_dataset(
302        raw_paths=volume_paths,
303        raw_key="raw",
304        label_paths=volume_paths,
305        label_key=f"labels/{label_choice}/{annotations}",
306        patch_shape=patch_shape,
307        with_channels=True,
308        is_seg_dataset=True,
309        ndim=2,
310        **kwargs
311    )
312
313
314def get_puma_loader(
315    path: Union[os.PathLike, str],
316    batch_size: int,
317    patch_shape: Tuple[int, int],
318    split: Literal["train", "val", "test"],
319    annotations: Literal['nuclei', 'tissue'] = "nuclei",
320    label_choice: Literal["instances", "semantic"] = "instances",
321    resize_inputs: bool = False,
322    download: bool = False,
323    **kwargs
324) -> DataLoader:
325    """Get the PUMA dataloader for nuclei and tissue segmentation.
326
327    Args:
328        path: Filepath to a folder where the downloaded data will be saved.
329        batch_size: The batch size for training.
330        patch_shape: The patch shape to use for training.
331        split: The choice of data split.
332        annotations: The choice of annotations.
333        label_choice: The choice of segmentation type.
334        resize_inputs: Whether to resize the inputs.
335        download: Whether to download the data if it is not present.
336        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
337
338    Returns:
339        The DataLoader.
340    """
341    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
342    dataset = get_puma_dataset(
343        path, patch_shape, split, annotations, label_choice, resize_inputs, download, **ds_kwargs
344    )
345    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

def get_puma_data(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> str:

Download and preprocess the PUMA data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • download: Whether to download the data if it is not present.
Returns:
  Filepath where the downloaded and preprocessed data is stored.
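
For example, a minimal sketch ("./data/puma" is a hypothetical target folder; download=True is only needed for the first call):

from torch_em.data.datasets.histopathology.puma import get_puma_data

data_dir = get_puma_data(path="./data/puma", split="train", annotations="nuclei", download=True)
# data_dir points to "./data/puma/train", which contains the preprocessed h5 volumes under "preprocessed/".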

def get_puma_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', download: bool = False) -> List[str]:

Get paths to the PUMA dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • download: Whether to download the data if it is not present.
Returns:
  List of filepaths for the input data.
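
For example, a minimal sketch ("./data/puma" is a hypothetical target folder; h5py is assumed to be installed):

import h5py
from torch_em.data.datasets.histopathology.puma import get_puma_paths

volume_paths = get_puma_paths(path="./data/puma", split="val", annotations="nuclei", download=True)
with h5py.File(volume_paths[0], "r") as f:
    print(f["raw"].shape)                      # channel-first image, e.g. (3, 1024, 1024)
    print(f["labels/instances/nuclei"].shape)  # instance segmentation, e.g. (1024, 1024)
    print(f["labels/semantic/nuclei"].shape)   # semantic segmentation, e.g. (1024, 1024)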

def get_puma_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the PUMA dataset for nuclei and tissue segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • label_choice: The choice of segmentation type.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  The segmentation dataset.
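
For example, a minimal sketch ("./data/puma" is a hypothetical target folder):

from torch_em.data.datasets.histopathology.puma import get_puma_dataset

dataset = get_puma_dataset(
    path="./data/puma",
    patch_shape=(512, 512),
    split="train",
    annotations="nuclei",
    label_choice="semantic",  # per-pixel class ids following NUCLEI_CLASS_DICT
    download=True,
)
raw, label = dataset[0]  # a single (image, label) patch pair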

def get_puma_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], annotations: Literal['nuclei', 'tissue'] = 'nuclei', label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the PUMA dataloader for nuclei and tissue segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • annotations: The choice of annotations.
  • label_choice: The choice of segmentation type.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  The DataLoader.
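
For example, a minimal sketch ("./data/puma" is a hypothetical target folder; shuffle and num_workers are split off and forwarded to the PyTorch DataLoader):

from torch_em.data.datasets.histopathology.puma import get_puma_loader

loader = get_puma_loader(
    path="./data/puma",
    batch_size=4,
    patch_shape=(512, 512),
    split="train",
    annotations="tissue",
    label_choice="instances",
    download=True,
    shuffle=True,     # DataLoader keyword argument
    num_workers=2,    # DataLoader keyword argument
)
x, y = next(iter(loader))  # x: batch of raw image patches, y: batch of label patches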