torch_em.data.datasets.histopathology.pannuke

The PanNuke dataset contains annotations for nucleus segmentation in histopathology images across different tissue types.

This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778. Please cite it if you use this dataset for your research.
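
A minimal usage sketch (all names are from this module; "./pannuke" is a hypothetical download folder, and the patch shape (1, 256, 256) selects single 256x256 slices):

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_loader

loader = get_pannuke_loader(
    path="./pannuke",
    patch_shape=(1, 256, 256),
    batch_size=2,
    folds=["fold_1"],
    download=True,
)
for x, y in loader:
    print(x.shape, y.shape)  # exact shapes depend on patch_shape and transforms
    break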

  1"""The PanNuke datasets contains annotations for nucleus segmentation
  2in histopathology images across different tissue types.
  3
  4This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778.
  5Please cite it if you use this dataset for your research.
  6"""

import os
import shutil
from glob import glob
from typing import List, Union, Dict, Tuple

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# PanNuke Dataset - https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke
URLS = {
    "fold_1": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip",
    "fold_2": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip",
    "fold_3": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip"
}

CHECKSUM = {
    "fold_1": "6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b",
    "fold_2": "5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07",
    "fold_3": "c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d"
}


def get_pannuke_data(path, download, folds):
    """Download the PanNuke data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
        folds: The data fold(s) of choice to be used.
    """
    os.makedirs(path, exist_ok=True)
    for tmp_fold in folds:
        assert tmp_fold in URLS.keys(), "Please choose one or more of the existing folds: 'fold_1' / 'fold_2' / 'fold_3'."
        if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")):
            continue  # this fold has already been downloaded and converted

        util.download_source(os.path.join(path, f"{tmp_fold}.zip"), URLS[tmp_fold], download, CHECKSUM[tmp_fold])

        print(f"Unzipping the PanNuke data for {tmp_fold}...")
        util.unzip(os.path.join(path, f"{tmp_fold}.zip"), os.path.join(path, f"{tmp_fold}"), True)

        _convert_to_hdf5(path, tmp_fold)

def _convert_to_hdf5(path, fold):
    """Create an h5 file per fold from the input data, with 4 essential keys:
        - "images" - the raw input images (transposed into the expected format) (3 x S x H x W)
        - "labels/masks" - the raw input masks (transposed as above) (6 x S x H x W)
        - "labels/instances" - the converted all-instance labels (S x H x W)
        - "labels/semantic" - the converted semantic labels (S x H x W),
            where the semantic class ids are:
                (0: Background, 1: Neoplastic cells, 2: Inflammatory,
                 3: Connective/Soft tissue cells, 4: Dead Cells, 5: Epithelial)
    """
    import h5py

    if os.path.exists(os.path.join(path, f"pannuke_{fold}.h5")):
        return

    print(f"Converting {fold} into h5 file format...")
    # sort the paths so that the images and masks from the same directory are paired up
    img_paths = sorted(glob(os.path.join(path, "**", "images.npy"), recursive=True))
    gt_paths = sorted(glob(os.path.join(path, "**", "masks.npy"), recursive=True))

    for img_path, gt_path in zip(img_paths, gt_paths):
        # original (raw) shape : S x H x W x C -> transposed shape (expected) : C x S x H x W
        img = np.load(img_path)
        labels = np.load(gt_path)

        instances = _channels_to_instances(labels)
        semantic = _channels_to_semantics(labels)

        img = img.transpose(3, 0, 1, 2)
        labels = labels.transpose(3, 0, 1, 2)

        # img.shape -> (3, 2656, 256, 256) --- img_chunks -> (3, 1, 256, 256)
        # (same logic as above for labels)
        img_chunks = (img.shape[0], 1) + img.shape[2:]
        label_chunks = (labels.shape[0], 1) + labels.shape[2:]
        other_label_chunks = (1,) + labels.shape[2:]  # for instance and semantic labels

        with h5py.File(os.path.join(path, f"pannuke_{fold}.h5"), "w") as f:
            f.create_dataset("images", data=img, compression="gzip", chunks=img_chunks)
            f.create_dataset("labels/masks", data=labels, compression="gzip", chunks=label_chunks)
            f.create_dataset("labels/instances", data=instances, compression="gzip", chunks=other_label_chunks)
            f.create_dataset("labels/semantic", data=semantic, compression="gzip", chunks=other_label_chunks)

    # clean up the unzipped directories, keeping only the h5 files
    dir_to_rm = glob(os.path.join(path, "*[!.h5]"))
    for tmp_dir in dir_to_rm:
        shutil.rmtree(tmp_dir)


def _channels_to_instances(labels):
    """Convert the ground-truth of 6 (instance) channels into 1 label image with instances from all channels.
    The channel info is:
    (0: Neoplastic cells, 1: Inflammatory, 2: Connective/Soft tissue cells, 3: Dead Cells, 4: Epithelial, 5: Background)

    Returns:
        - instance labels of dimensions -> (S x H x W)
    """
    import vigra

    labels = labels.transpose(0, 3, 1, 2)  # to access with the shape S x 6 x H x W
    list_of_instances = []

    for label_slice in labels:  # access the slices (each with 6 channels of H x W labels)
        segmentation = np.zeros(labels.shape[2:])
        max_ids = []
        for label_channel in label_slice[:-1]:  # access the channels, skipping the background channel
            # the 'start_label' takes care of where to start allocating the instance ids from
            this_labels, max_id, _ = vigra.analysis.relabelConsecutive(
                label_channel.astype("uint64"),
                start_label=max_ids[-1] + 1 if len(max_ids) > 0 else 1)

            # some trailing channels might not have labels, hence appending only for elements with RoIs
            if max_id > 0:
                max_ids.append(max_id)

            segmentation[this_labels > 0] = this_labels[this_labels > 0]

        list_of_instances.append(segmentation)

    f_segmentation = np.stack(list_of_instances)

    return f_segmentation


def _channels_to_semantics(labels):
    """Convert the ground-truth of 6 (instance) channels into semantic labels, following this id mapping:
    (1 -> Neoplastic cells, 2 -> Inflammatory, 3 -> Connective/Soft tissue cells,
    4 -> Dead Cells, 5 -> Epithelial, 0 -> Background)

    Returns:
        - semantic labels of dimensions -> (S x H x W)
    """
    labels = labels.transpose(0, 3, 1, 2)
    list_of_semantic = []

    for label_slice in labels:
        segmentation = np.zeros(labels.shape[2:])
        for i, label_channel in enumerate(label_slice[:-1]):
            segmentation[label_channel > 0] = i + 1
        list_of_semantic.append(segmentation)

    f_segmentation = np.stack(list_of_semantic)

    return f_segmentation


def get_pannuke_paths(
    path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False,
) -> List[str]:
    """Get paths to the PanNuke data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the stored data.
    """
    get_pannuke_data(path, download, folds)

    data_paths = [os.path.join(path, f"pannuke_{fold}.h5") for fold in folds]
    return data_paths


def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = "instances",
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the PanNuke dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        folds: The data fold(s) of choice to be used.
        rois: The choice of rois per fold to create the dataloader for training.
        download: Whether to download the data if it is not present.
        custom_label_choice: The choice of labels to be used for training.
        with_channels: Whether the inputs have channels.
        with_label_channels: Whether the labels have channels.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert custom_label_choice in [
        "masks", "instances", "semantic"
    ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)"

    if rois is not None:
        assert isinstance(rois, dict)

    data_paths = get_pannuke_paths(path, folds, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="images",
        label_paths=data_paths,
        label_key=f"labels/{custom_label_choice}",
        patch_shape=patch_shape,
        rois=[rois.get(fold, np.s_[:, :, :]) for fold in folds],
        with_channels=with_channels,
        with_label_channels=with_label_channels,
        **kwargs
    )


def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = "instances",
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PanNuke dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.
        rois: The choice of rois per fold to create the dataloader for training.
        custom_label_choice: The choice of labels to be used for training.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_pannuke_dataset(
        path=path,
        patch_shape=patch_shape,
        folds=folds,
        rois=rois,
        download=download,
        custom_label_choice=custom_label_choice,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
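
To make the channel-to-label conversion above concrete, here is a small numpy-only sketch that mirrors the logic of `_channels_to_semantics` on a toy array (the array contents are made up for illustration):

import numpy as np

# toy ground truth for a single 4 x 4 image: 6 channels, the last one being background
toy = np.zeros((1, 4, 4, 6))
toy[0, :2, :2, 0] = 1  # a nucleus in channel 0 (Neoplastic cells)
toy[0, 2:, 2:, 1] = 7  # a nucleus with instance id 7 in channel 1 (Inflammatory)

# mirror of the `_channels_to_semantics` logic: channel index + 1 becomes the class id
semantic = np.zeros(toy.shape[:3])
for i in range(5):  # the background channel (5) is skipped
    semantic[toy[..., i] > 0] = i + 1

print(np.unique(semantic))  # [0. 1. 2.]
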
URLS = {'fold_1': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip', 'fold_2': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip', 'fold_3': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip'}
CHECKSUM = {'fold_1': '6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b', 'fold_2': '5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07', 'fold_3': 'c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d'}
def get_pannuke_data(path, download, folds):

Download the PanNuke data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
  • folds: The data fold(s) of choice to be used.
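
For example (a sketch; "./pannuke" is a hypothetical target folder), after the call below the folder contains pannuke_fold_1.h5 with the keys described in `_convert_to_hdf5`:

import h5py

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_data

get_pannuke_data("./pannuke", download=True, folds=["fold_1"])

with h5py.File("./pannuke/pannuke_fold_1.h5", "r") as f:
    print(f["images"].shape)            # (3, S, 256, 256), channels first
    print(f["labels/instances"].shape)  # (S, 256, 256)
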
def get_pannuke_paths( path: Union[os.PathLike, str], folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], download: bool = False) -> List[str]:

Get paths to the PanNuke data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • folds: The data fold(s) of choice to be used.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths to the stored data.
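
For instance (a sketch, with "./pannuke" as a hypothetical folder):

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_paths

paths = get_pannuke_paths("./pannuke", folds=["fold_1", "fold_2"], download=True)
print(paths)  # ['./pannuke/pannuke_fold_1.h5', './pannuke/pannuke_fold_2.h5']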

def get_pannuke_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], rois: Dict = {}, download: bool = False, custom_label_choice: str = 'instances', with_channels: bool = True, with_label_channels: bool = False, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the PanNuke dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • folds: The data fold(s) of choice to be used.
  • rois: The choice of rois per fold to create the dataloader for training.
  • download: Whether to download the data if it is not present.
  • custom_label_choice: The choice of labels to be used for training.
  • with_channels: Whether the inputs have channels.
  • with_label_channels: Whether the labels have channels.
  • resize_inputs: Whether to resize the inputs.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.
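
A sketch of creating a dataset restricted to part of a fold via `rois` (the slice bounds here are hypothetical; keys of `rois` are fold names and values are numpy slice expressions over the S x H x W volume):

import numpy as np

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_dataset

ds = get_pannuke_dataset(
    path="./pannuke",
    patch_shape=(1, 256, 256),
    folds=["fold_1"],
    rois={"fold_1": np.s_[:100, :, :]},  # hypothetical: use only the first 100 slices
    custom_label_choice="semantic",
    download=True,
)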

def get_pannuke_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], batch_size: int, folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], download: bool = False, rois: Dict = {}, custom_label_choice: str = 'instances', resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the PanNuke dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • folds: The data fold(s) of choice to be used.
  • download: Whether to download the data if it is not present.
  • rois: The choice of rois per fold to create the dataloader for training.
  • custom_label_choice: The choice of labels to be used for training.
  • resize_inputs: Whether to resize the inputs.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.
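
Because the keyword arguments are split between the dataset and the DataLoader (via `util.split_kwargs`), loader options can be passed directly; a sketch:

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_loader

loader = get_pannuke_loader(
    path="./pannuke",
    patch_shape=(1, 256, 256),
    batch_size=2,
    folds=["fold_1"],
    download=True,
    shuffle=True,    # split off and forwarded to the PyTorch DataLoader
    num_workers=2,   # split off and forwarded to the PyTorch DataLoader
)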