torch_em.data.datasets.histopathology.pannuke

The PanNuke dataset contains annotations for nucleus segmentation in histopathology images across different tissue types.

This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778. Please cite it if you use this dataset for your research.

"""The PanNuke dataset contains annotations for nucleus segmentation
in histopathology images across different tissue types.

This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778.
Please cite it if you use this dataset for your research.
"""
  7
  8import os
  9import shutil
 10from glob import glob
 11from typing import List, Union, Dict, Tuple
 12
 13import numpy as np
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20
 21
 22# PanNuke Dataset - https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke
 23URLS = {
 24    "fold_1": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip",
 25    "fold_2": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip",
 26    "fold_3": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip"
 27}
 28
 29CHECKSUM = {
 30    "fold_1": "6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b",
 31    "fold_2": "5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07",
 32    "fold_3": "c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d"
 33}
 34
 35
 36def get_pannuke_data(path, download, folds):
 37    """Download the PanNuke data.
 38
 39    Args:
 40        path: Filepath to a folder where the downloaded data will be saved.
 41        download: Whether to download the data if it is not present.
 42        folds: The data fold(s) of choice to be used.
 43    """
 44    os.makedirs(path, exist_ok=True)
 45    for tmp_fold in folds:
 46        assert tmp_fold in URLS.keys(), "Please choose one or more of existing folds: 'fold_1' / 'fold_2' / 'fold_3'."
 47        if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")):
 48            return
 49
 50        util.download_source(os.path.join(path, f"{tmp_fold}.zip"), URLS[tmp_fold], download, CHECKSUM[tmp_fold])
 51
 52        print(f"Unzipping the PanNuke dataset in {tmp_fold} directories...")
 53        util.unzip(os.path.join(path, f"{tmp_fold}.zip"), os.path.join(path, f"{tmp_fold}"), True)
 54
 55        _convert_to_hdf5(path, tmp_fold)
 56
 57
 58def _convert_to_hdf5(path, fold):
 59    """Here, we create the h5 files from the input data into 4 essentials (keys):
 60        - "images" - the raw input images (transposed into the expected format) (S x 3 x H x W)
 61        - "labels/masks" - the raw input masks (transposed as above) (S x 6 x H x W)
 62        - "labels/instances" - the converted all-instance labels (S x H x W)
 63        - "labels/semantic" - the converted semantic labels (S x H x W)
 64            - where, the semantic instance representation is as follows:
 65                (0: Background, 1: Neoplastic cells, 2: Inflammatory,
 66                 3: Connective/Soft tissue cells, 4: Dead Cells, 5: Epithelial)
 67    """
 68    import h5py
 69
 70    if os.path.exists(os.path.join(path, f"pannuke_{fold}.h5")):
 71        return
 72
 73    print(f"Converting {fold} into h5 file format...")
 74    img_paths = glob(os.path.join(path, "**", "images.npy"), recursive=True)
 75    gt_paths = glob(os.path.join(path, "**", "masks.npy"), recursive=True)
 76
 77    for img_path, gt_path in zip(img_paths, gt_paths):
 78        # original (raw) shape : S x H x W x C -> transposed shape (expected) : C x S x H x W
 79        img = np.load(img_path)
 80        labels = np.load(gt_path)
 81
 82        instances = _channels_to_instances(labels)
 83        semantic = _channels_to_semantics(labels)
 84
 85        img = img.transpose(3, 0, 1, 2)
 86        labels = labels.transpose(3, 0, 1, 2)
 87
 88        # img.shape -> (3, 2656, 256, 256) --- img_chunks -> (3, 1, 256, 256)
 89        # (same logic as above for labels)
 90        img_chunks = (img.shape[0], 1) + img.shape[2:]
 91        label_chunks = (labels.shape[0], 1) + labels.shape[2:]
 92        other_label_chunks = (1,) + labels.shape[2:]  # for instance and semantic labels
 93
 94        with h5py.File(os.path.join(path, f"pannuke_{fold}.h5"), "w") as f:
 95            f.create_dataset("images", data=img, compression="gzip", chunks=img_chunks)
 96            f.create_dataset("labels/masks", data=labels, compression="gzip", chunks=label_chunks)
 97            f.create_dataset("labels/instances", data=instances, compression="gzip", chunks=other_label_chunks)
 98            f.create_dataset("labels/semantic", data=semantic, compression="gzip", chunks=other_label_chunks)
 99
100    dir_to_rm = glob(os.path.join(path, "*[!.h5]"))
101    for tmp_dir in dir_to_rm:
102        shutil.rmtree(tmp_dir)
103
104
def _channels_to_instances(labels):
    """Merge the 6 (instance) ground-truth channels into 1 instance label map per sample.

    The input is transposed from (S x H x W x 6) to (S x 6 x H x W). For every sample, all
    channels except the last one are relabeled one after another and written into a single
    map; where channels overlap, the channel processed later wins.
    channel info -
    (0: Neoplastic cells, 1: Inflammatory, 2: Connective/Soft tissue cells, 3: Dead Cells, 4: Epithelial, 5: Background)

    Returns:
        - instance labels of dimensions -> (S x H x W)
    """
    import bioimage_cpp as bic

    per_sample = labels.transpose(0, 3, 1, 2)  # S x 6 x H x W
    spatial_shape = per_sample.shape[2:]
    merged = []

    for channels in per_sample:  # one sample: 6 channels of H x W labels
        canvas = np.zeros(spatial_shape)
        last_max_id = 0  # highest id assigned so far within this sample (0: none yet)

        for channel in channels[:-1]:  # the last channel is left out
            # the 'offset' takes care of where to start allocating the instance ids from
            relabeled, _, _ = bic.segmentation.relabel_sequential(
                channel.astype("uint64"), offset=last_max_id + 1
            )
            channel_max = int(relabeled.max())

            # channels without any labels must not reset the running id
            if channel_max > 0:
                last_max_id = channel_max

            foreground = relabeled > 0
            canvas[foreground] = relabeled[foreground]

        merged.append(canvas)

    return np.stack(merged)
139
140
141def _channels_to_semantics(labels):
142    """Converting the ground-truth of 6 (instance) channels  into semantic labels, ollowing below the id info as:
143    (1 -> Neoplastic cells, 2 -> Inflammatory, 3 -> Connective/Soft tissue cells,
144    4 -> Dead Cells, 5 -> Epithelial, 0 -> Background)
145
146    Returns:
147        - semantic labels of dimensions -> (C x H x W)
148    """
149    labels = labels.transpose(0, 3, 1, 2)
150    list_of_semantic = []
151
152    for label_slice in labels:
153        segmentation = np.zeros(labels.shape[2:])
154        for i, label_channel in enumerate(label_slice[:-1]):
155            segmentation[label_channel > 0] = i + 1
156        list_of_semantic.append(segmentation)
157
158    f_segmentation = np.stack(list_of_semantic)
159
160    return f_segmentation
161
162
def get_pannuke_paths(
    path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False,
) -> List[str]:
    """Prepare the PanNuke data and return the location of the h5 file of each fold.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the stored data, one `pannuke_<fold>.h5` per requested fold.
    """
    # Make sure the requested folds are available before building their paths.
    get_pannuke_data(path, download, folds)
    return [os.path.join(path, f"pannuke_{name}.h5") for name in folds]
180
181
def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = "instances",
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the PanNuke dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        folds: The data fold(s) of choice to be used.
        rois: The choice of rois per fold to create the dataloader for training.
            A dictionary keyed by fold name; folds without an entry use the full data.
            Passing None is the same as passing an empty dictionary.
        download: Whether to download the data if it is not present.
        custom_label_choice: The choice of labels to be used for training.
            One of "masks", "instances" or "semantic".
        with_channels: Whether the inputs have channels.
        with_label_channels: Whether the labels have channels.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset
    """
    assert custom_label_choice in [
        "masks", "instances", "semantic"
    ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)"

    # None used to pass the validation and then fail on `rois.get`; treat it as "no rois".
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    data_paths = get_pannuke_paths(path, folds, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    # One roi per fold, in the same order as the data paths; the default selects everything.
    fold_rois = [rois.get(fold, np.s_[:, :, :]) for fold in folds]

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="images",
        label_paths=data_paths,
        label_key=f"labels/{custom_label_choice}",
        patch_shape=patch_shape,
        rois=fold_rois,
        with_channels=with_channels,
        with_label_channels=with_label_channels,
        **kwargs
    )
237
238
def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = "instances",
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PanNuke dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.
        rois: The choice of rois per fold to create the dataloader for training.
        custom_label_choice: The choice of labels to be used for training.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_pannuke_dataset(
        path=path,
        patch_shape=patch_shape,
        folds=folds,
        rois=rois,
        download=download,
        custom_label_choice=custom_label_choice,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
URLS = {
    fold: f"https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/{fold}.zip"
    for fold in ("fold_1", "fold_2", "fold_3")
}
CHECKSUM = dict(
    fold_1="6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b",
    fold_2="5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07",
    fold_3="c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d",
)
def get_pannuke_data(path, download, folds):
def get_pannuke_data(path, download, folds):
    """Download the PanNuke data and convert every requested fold to an h5 file.

    Each fold is handled on its own: a fold whose converted file `pannuke_<fold>.h5`
    already exists in `path` is skipped, all other requested folds are downloaded,
    unzipped and converted.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
        folds: The data fold(s) of choice to be used.
    """
    os.makedirs(path, exist_ok=True)
    for tmp_fold in folds:
        assert tmp_fold in URLS.keys(), "Please choose one or more of existing folds: 'fold_1' / 'fold_2' / 'fold_3'."

        # Skip only this fold. Returning here would leave later folds unprepared
        # whenever an earlier fold has already been converted.
        if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")):
            continue

        zip_path = os.path.join(path, f"{tmp_fold}.zip")
        util.download_source(zip_path, URLS[tmp_fold], download, CHECKSUM[tmp_fold])

        print(f"Unzipping the PanNuke dataset in {tmp_fold} directories...")
        # The archive is extracted into a folder named after the fold.
        util.unzip(zip_path, os.path.join(path, f"{tmp_fold}"), True)

        _convert_to_hdf5(path, tmp_fold)

Download the PanNuke data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
  • folds: The data fold(s) of choice to be used.
def get_pannuke_paths( path: Union[os.PathLike, str], folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], download: bool = False) -> List[str]:
def get_pannuke_paths(
    path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False,
) -> List[str]:
    """Prepare the PanNuke data and return the location of the h5 file of each fold.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the stored data, one `pannuke_<fold>.h5` per requested fold.
    """
    # Make sure the requested folds are available before building their paths.
    get_pannuke_data(path, download, folds)
    return [os.path.join(path, f"pannuke_{name}.h5") for name in folds]

Get paths to the PanNuke data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • folds: The data fold(s) of choice to be used.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths to the stored data.

def get_pannuke_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], rois: Dict = {}, download: bool = False, custom_label_choice: str = 'instances', with_channels: bool = True, with_label_channels: bool = False, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = "instances",
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the PanNuke dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        folds: The data fold(s) of choice to be used.
        rois: The choice of rois per fold to create the dataloader for training.
            A dictionary keyed by fold name; folds without an entry use the full data.
            Passing None is the same as passing an empty dictionary.
        download: Whether to download the data if it is not present.
        custom_label_choice: The choice of labels to be used for training.
            One of "masks", "instances" or "semantic".
        with_channels: Whether the inputs have channels.
        with_label_channels: Whether the labels have channels.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset
    """
    assert custom_label_choice in [
        "masks", "instances", "semantic"
    ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)"

    # None used to pass the validation and then fail on `rois.get`; treat it as "no rois".
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    data_paths = get_pannuke_paths(path, folds, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    # One roi per fold, in the same order as the data paths; the default selects everything.
    fold_rois = [rois.get(fold, np.s_[:, :, :]) for fold in folds]

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="images",
        label_paths=data_paths,
        label_key=f"labels/{custom_label_choice}",
        patch_shape=patch_shape,
        rois=fold_rois,
        with_channels=with_channels,
        with_label_channels=with_label_channels,
        **kwargs
    )

Get the PanNuke dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • folds: The data fold(s) of choice to be used.
  • download: Whether to download the data if it is not present.
  • rois: The choice of rois per fold to create the dataloader for training.
  • custom_label_choice: The choice of labels to be used for training.
  • with_channels: Whether the inputs have channels.
  • with_label_channels: Whether the labels have channels.
  • resize_inputs: Whether to resize the inputs.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset

def get_pannuke_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], batch_size: str, folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], download: bool = False, rois: Dict = {}, custom_label_choice: str = 'instances', resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = "instances",
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PanNuke dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.
        rois: The choice of rois per fold to create the dataloader for training.
        custom_label_choice: The choice of labels to be used for training.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_pannuke_dataset(
        path=path,
        patch_shape=patch_shape,
        folds=folds,
        rois=rois,
        download=download,
        custom_label_choice=custom_label_choice,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)

Get the PanNuke dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • folds: The data fold(s) of choice to be used.
  • download: Whether to download the data if it is not present.
  • rois: The choice of rois per fold to create the dataloader for training.
  • custom_label_choice: The choice of labels to be used for training.
  • resize_inputs: Whether to resize the inputs.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.