The PanNuke dataset contains annotations for nucleus segmentation in histopathology images across different tissue types.

This dataset is from the PanNuke publication. Please cite it if you use this dataset for your research.

"""The PanNuke dataset contains annotations for nucleus segmentation
in histopathology images across different tissue types.

This dataset is from the PanNuke publication.
Please cite it if you use this dataset for your research.
"""
import os
import shutil
from glob import glob
from typing import List, Union, Dict, Tuple

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util
 22# PanNuke Dataset -
 23URLS = {
 24    "fold_1": "",
 25    "fold_2": "",
 26    "fold_3": ""
 30    "fold_1": "6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b",
 31    "fold_2": "5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07",
 32    "fold_3": "c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d"
 36def get_pannuke_data(path, download, folds):
 37    """Download the PanNuke data.
 39    Args:
 40        path: Filepath to a folder where the downloaded data will be saved.
 41        download: Whether to download the data if it is not present.
 42        folds: The data fold(s) of choice to be used.
 43    """
 44    os.makedirs(path, exist_ok=True)
 45    for tmp_fold in folds:
 46        if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")):
 47            return
 49        util.download_source(os.path.join(path, f"{tmp_fold}.zip"), URLS[tmp_fold], download, CHECKSUM[tmp_fold])
 51        print(f"Unzipping the PanNuke dataset in {tmp_fold} directories...")
 52        util.unzip(os.path.join(path, f"{tmp_fold}.zip"), os.path.join(path, f"{tmp_fold}"), True)
 54        _convert_to_hdf5(path, tmp_fold)
 57def _convert_to_hdf5(path, fold):
 58    """Here, we create the h5 files from the input data into 4 essentials (keys):
 59        - "images" - the raw input images (transposed into the expected format) (S x 3 x H x W)
 60        - "labels/masks" - the raw input masks (transposed as above) (S x 6 x H x W)
 61        - "labels/instances" - the converted all-instance labels (S x H x W)
 62        - "labels/semantic" - the converted semantic labels (S x H x W)
 63            - where, the semantic instance representation is as follows:
 64                (0: Background, 1: Neoplastic cells, 2: Inflammatory,
 65                 3: Connective/Soft tissue cells, 4: Dead Cells, 5: Epithelial)
 66    """
 67    import h5py
 69    if os.path.exists(os.path.join(path, f"pannuke_{fold}.h5")):
 70        return
 72    print(f"Converting {fold} into h5 file format...")
 73    img_paths = glob(os.path.join(path, "**", "images.npy"), recursive=True)
 74    gt_paths = glob(os.path.join(path, "**", "masks.npy"), recursive=True)
 76    for img_path, gt_path in zip(img_paths, gt_paths):
 77        # original (raw) shape : S x H x W x C -> transposed shape (expected) : C x S x H x W
 78        img = np.load(img_path)
 79        labels = np.load(gt_path)
 81        instances = _channels_to_instances(labels)
 82        semantic = _channels_to_semantics(labels)
 84        img = img.transpose(3, 0, 1, 2)
 85        labels = labels.transpose(3, 0, 1, 2)
 87        # img.shape -> (3, 2656, 256, 256) --- img_chunks -> (3, 1, 256, 256)
 88        # (same logic as above for labels)
 89        img_chunks = (img.shape[0], 1) + img.shape[2:]
 90        label_chunks = (labels.shape[0], 1) + labels.shape[2:]
 91        other_label_chunks = (1,) + labels.shape[2:]  # for instance and semantic labels
 93        with h5py.File(os.path.join(path, f"pannuke_{fold}.h5"), "w") as f:
 94            f.create_dataset("images", data=img, compression="gzip", chunks=img_chunks)
 95            f.create_dataset("labels/masks", data=labels, compression="gzip", chunks=label_chunks)
 96            f.create_dataset("labels/instances", data=instances, compression="gzip", chunks=other_label_chunks)
 97            f.create_dataset("labels/semantic", data=semantic, compression="gzip", chunks=other_label_chunks)
 99    dir_to_rm = glob(os.path.join(path, "*[!.h5]"))
100    for tmp_dir in dir_to_rm:
101        shutil.rmtree(tmp_dir)
def _channels_to_instances(labels):
    """Merge the per-class instance channels into a single instance label image per sample.

    Channel info of the input (S x H x W x 6) -
    (0: Neoplastic cells, 1: Inflammatory, 2: Connective/Soft tissue cells, 3: Dead Cells, 4: Epithelial,
    5: Background). The last channel is skipped.

    Returns:
        - instance labels of dimensions -> (S x H x W)
    """
    import vigra

    channel_first = labels.transpose(0, 3, 1, 2)  # S x H x W x 6 -> S x 6 x H x W
    spatial_shape = channel_first.shape[2:]

    merged_slices = []
    for channels in channel_first:  # one sample with its 6 channels of H x W labels
        merged = np.zeros(spatial_shape)
        last_id = 0  # highest id handed out so far within this sample
        for channel in channels[:-1]:
            # 'start_label' makes the ids of this channel continue after the previous channels
            relabeled, max_id, _ = vigra.analysis.relabelConsecutive(
                channel.astype("uint64"), start_label=last_id + 1
            )

            # channels might not have labels, hence only a positive maximum moves the offset
            if max_id > 0:
                last_id = max_id

            foreground = relabeled > 0
            merged[foreground] = relabeled[foreground]

        merged_slices.append(merged)

    return np.stack(merged_slices)
139def _channels_to_semantics(labels):
140    """Converting the ground-truth of 6 (instance) channels  into semantic labels, ollowing below the id info as:
141    (1 -> Neoplastic cells, 2 -> Inflammatory, 3 -> Connective/Soft tissue cells,
142    4 -> Dead Cells, 5 -> Epithelial, 0 -> Background)
144    Returns:
145        - semantic labels of dimensions -> (C x H x W)
146    """
147    labels = labels.transpose(0, 3, 1, 2)
148    list_of_semantic = []
150    for label_slice in labels:
151        segmentation = np.zeros(labels.shape[2:])
152        for i, label_channel in enumerate(label_slice[:-1]):
153            segmentation[label_channel > 0] = i + 1
154        list_of_semantic.append(segmentation)
156    f_segmentation = np.stack(list_of_semantic)
158    return f_segmentation
def get_pannuke_paths(
    path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False,
) -> List[str]:
    """Get paths to the PanNuke data.

    The data of the requested folds is fetched via `get_pannuke_data` first.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the stored data, one `pannuke_<fold>.h5` per entry of `folds`.
    """
    get_pannuke_data(path, download, folds)

    h5_files = []
    for fold_name in folds:
        h5_files.append(os.path.join(path, f"pannuke_{fold_name}.h5"))
    return h5_files
def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = "instances",
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the PanNuke dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        folds: The data fold(s) of choice to be used.
        rois: The choice of rois per fold to create the dataloader for training.
            A dictionary that maps fold names to rois; folds without an entry use `np.s_[:, :, :]`.
            `None` is treated like an empty dictionary.
        download: Whether to download the data if it is not present.
        custom_label_choice: The choice of labels to be used for training,
            one of "masks", "instances" or "semantic".
        with_channels: Whether the inputs have channels.
        with_label_channels: Whether the labels have channels.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset
    """
    assert custom_label_choice in [
        "masks", "instances", "semantic"
    ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)"

    # `None` is an accepted value, but the per-fold lookup below needs a dictionary.
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    data_paths = get_pannuke_paths(path, folds, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="images",
        label_paths=data_paths,
        label_key=f"labels/{custom_label_choice}",
        patch_shape=patch_shape,
        rois=[rois.get(fold, np.s_[:, :, :]) for fold in folds],
        with_channels=with_channels,
        with_label_channels=with_label_channels,
        **kwargs
    )
def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = "instances",
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PanNuke dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.
        rois: The choice of rois per fold to create the dataloader for training.
        custom_label_choice: The choice of labels to be used for training.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader
    """
    # Keyword arguments are split by `util.split_kwargs` into those for the dataset
    # and the remaining ones, which are handed to the data loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_pannuke_dataset(
        path=path,
        patch_shape=patch_shape,
        folds=folds,
        rois=rois,
        download=download,
        custom_label_choice=custom_label_choice,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
# Download URL per fold (empty strings in this copy of the file).
URLS = {
    "fold_1": "",
    "fold_2": "",
    "fold_3": "",
}
# Checksum per fold, used together with the URL of the same fold.
CHECKSUM = {
    "fold_1": "6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b",
    "fold_2": "5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07",
    "fold_3": "c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d",
}
def get_pannuke_data(path, download, folds):
    """Download the PanNuke data and convert each requested fold to HDF5.

    Folds whose converted file `pannuke_<fold>.h5` already exists in `path` are skipped;
    every other requested fold is downloaded, unzipped and converted.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
        folds: The data fold(s) of choice to be used.
    """
    os.makedirs(path, exist_ok=True)
    for tmp_fold in folds:
        # Skip only this fold when it was converted already. Returning here instead
        # would leave every later fold in `folds` without data.
        if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")):
            continue

        zip_path = os.path.join(path, f"{tmp_fold}.zip")
        util.download_source(zip_path, URLS[tmp_fold], download, CHECKSUM[tmp_fold])

        print(f"Unzipping the PanNuke dataset in {tmp_fold} directories...")
        util.unzip(zip_path, os.path.join(path, f"{tmp_fold}"), True)

        _convert_to_hdf5(path, tmp_fold)

Download the PanNuke data.

  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
  • folds: The data fold(s) of choice to be used.
def get_pannuke_paths(
    path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False,
) -> List[str]:
    """Get paths to the PanNuke data.

    The data of the requested folds is fetched via `get_pannuke_data` first.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the stored data, one `pannuke_<fold>.h5` per entry of `folds`.
    """
    get_pannuke_data(path, download, folds)

    h5_files = []
    for fold_name in folds:
        h5_files.append(os.path.join(path, f"pannuke_{fold_name}.h5"))
    return h5_files

Get paths to the PanNuke data.

  • path: Filepath to a folder where the downloaded data will be saved.
  • folds: The data fold(s) of choice to be used.
  • download: Whether to download the data if it is not present.

List of filepaths to the stored data.

def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = "instances",
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the PanNuke dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        folds: The data fold(s) of choice to be used.
        rois: The choice of rois per fold to create the dataloader for training.
            A dictionary that maps fold names to rois; folds without an entry use `np.s_[:, :, :]`.
            `None` is treated like an empty dictionary.
        download: Whether to download the data if it is not present.
        custom_label_choice: The choice of labels to be used for training,
            one of "masks", "instances" or "semantic".
        with_channels: Whether the inputs have channels.
        with_label_channels: Whether the labels have channels.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset
    """
    assert custom_label_choice in [
        "masks", "instances", "semantic"
    ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)"

    # `None` is an accepted value, but the per-fold lookup below needs a dictionary.
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    data_paths = get_pannuke_paths(path, folds, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="images",
        label_paths=data_paths,
        label_key=f"labels/{custom_label_choice}",
        patch_shape=patch_shape,
        rois=[rois.get(fold, np.s_[:, :, :]) for fold in folds],
        with_channels=with_channels,
        with_label_channels=with_label_channels,
        **kwargs
    )

Get the PanNuke dataset for nucleus segmentation.

  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • folds: The data fold(s) of choice to be used.
  • download: Whether to download the data if it is not present.
  • rois: The choice of rois per fold to create the dataloader for training.
  • custom_label_choice: The choice of labels to be used for training.
  • with_channels: Whether the inputs have channels.
  • with_label_channels: Whether the labels have channels.
  • resize_inputs: Whether to resize the inputs.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

The segmentation dataset

def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = "instances",
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PanNuke dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.
        rois: The choice of rois per fold to create the dataloader for training.
        custom_label_choice: The choice of labels to be used for training.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader
    """
    # Keyword arguments are split by `util.split_kwargs` into those for the dataset
    # and the remaining ones, which are handed to the data loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_pannuke_dataset(
        path=path,
        patch_shape=patch_shape,
        folds=folds,
        rois=rois,
        download=download,
        custom_label_choice=custom_label_choice,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)

Get the PanNuke dataloader for nucleus segmentation.

  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • folds: The data fold(s) of choice to be used.
  • download: Whether to download the data if it is not present.
  • rois: The choice of rois per fold to create the dataloader for training.
  • custom_label_choice: The choice of labels to be used for training.
  • resize_inputs: Whether to resize the inputs.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

The DataLoader