torch_em.data.datasets.histopathology.pannuke

The PanNuke dataset contains annotations for nucleus segmentation in histopathology images across different tissue types.

This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778. Please cite it if you use this dataset for your research.

View Source

  1"""The PanNuke datasets contains annotations for nucleus segmentation
  2in histopathology images across different tissue types.
  3
  4This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778.
  5Please cite it if you use this dataset for your research.
  6"""
  7
  8import os
  9import shutil
 10from glob import glob
 11from typing import List, Union, Dict, Tuple
 12
 13import numpy as np
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20
 21
 22# PanNuke Dataset - https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke
 23URLS = {
 24    "fold_1": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip",
 25    "fold_2": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip",
 26    "fold_3": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip"
 27}
 28
 29CHECKSUM = {
 30    "fold_1": "6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b",
 31    "fold_2": "5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07",
 32    "fold_3": "c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d"
 33}
 34
 35
 36def get_pannuke_data(path, download, folds):
 37    """Download the PanNuke data.
 38
 39    Args:
 40        path: Filepath to a folder where the downloaded data will be saved.
 41        download: Whether to download the data if it is not present.
 42        folds: The data fold(s) of choice to be used.
 43    """
 44    os.makedirs(path, exist_ok=True)
 45    for tmp_fold in folds:
 46        if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")):
 47            return
 48
 49        util.download_source(os.path.join(path, f"{tmp_fold}.zip"), URLS[tmp_fold], download, CHECKSUM[tmp_fold])
 50
 51        print(f"Unzipping the PanNuke dataset in {tmp_fold} directories...")
 52        util.unzip(os.path.join(path, f"{tmp_fold}.zip"), os.path.join(path, f"{tmp_fold}"), True)
 53
 54        _convert_to_hdf5(path, tmp_fold)
 55
 56
 57def _convert_to_hdf5(path, fold):
 58    """Here, we create the h5 files from the input data into 4 essentials (keys):
 59        - "images" - the raw input images (transposed into the expected format) (S x 3 x H x W)
 60        - "labels/masks" - the raw input masks (transposed as above) (S x 6 x H x W)
 61        - "labels/instances" - the converted all-instance labels (S x H x W)
 62        - "labels/semantic" - the converted semantic labels (S x H x W)
 63            - where, the semantic instance representation is as follows:
 64                (0: Background, 1: Neoplastic cells, 2: Inflammatory,
 65                 3: Connective/Soft tissue cells, 4: Dead Cells, 5: Epithelial)
 66    """
 67    import h5py
 68
 69    if os.path.exists(os.path.join(path, f"pannuke_{fold}.h5")):
 70        return
 71
 72    print(f"Converting {fold} into h5 file format...")
 73    img_paths = glob(os.path.join(path, "**", "images.npy"), recursive=True)
 74    gt_paths = glob(os.path.join(path, "**", "masks.npy"), recursive=True)
 75
 76    for img_path, gt_path in zip(img_paths, gt_paths):
 77        # original (raw) shape : S x H x W x C -> transposed shape (expected) : C x S x H x W
 78        img = np.load(img_path)
 79        labels = np.load(gt_path)
 80
 81        instances = _channels_to_instances(labels)
 82        semantic = _channels_to_semantics(labels)
 83
 84        img = img.transpose(3, 0, 1, 2)
 85        labels = labels.transpose(3, 0, 1, 2)
 86
 87        # img.shape -> (3, 2656, 256, 256) --- img_chunks -> (3, 1, 256, 256)
 88        # (same logic as above for labels)
 89        img_chunks = (img.shape[0], 1) + img.shape[2:]
 90        label_chunks = (labels.shape[0], 1) + labels.shape[2:]
 91        other_label_chunks = (1,) + labels.shape[2:]  # for instance and semantic labels
 92
 93        with h5py.File(os.path.join(path, f"pannuke_{fold}.h5"), "w") as f:
 94            f.create_dataset("images", data=img, compression="gzip", chunks=img_chunks)
 95            f.create_dataset("labels/masks", data=labels, compression="gzip", chunks=label_chunks)
 96            f.create_dataset("labels/instances", data=instances, compression="gzip", chunks=other_label_chunks)
 97            f.create_dataset("labels/semantic", data=semantic, compression="gzip", chunks=other_label_chunks)
 98
 99    dir_to_rm = glob(os.path.join(path, "*[!.h5]"))
100    for tmp_dir in dir_to_rm:
101        shutil.rmtree(tmp_dir)
102
103
def _channels_to_instances(labels):
    """Merge the per-class instance channels of every sample into one instance label image.

    The input is expected as S x H x W x 6, with the channels
    (0: Neoplastic cells, 1: Inflammatory, 2: Connective/Soft tissue cells, 3: Dead Cells, 4: Epithelial,
    5: Background). The last channel is skipped. The remaining channels are relabeled one after
    the other so that the ids of a channel continue after the ids already used by the previous ones.
    Where channels overlap, the later channel wins.

    Returns:
        - instance labels of dimensions -> (S x H x W)
    """
    import vigra

    channel_first = labels.transpose(0, 3, 1, 2)  # S x H x W x 6 -> S x 6 x H x W
    spatial_shape = channel_first.shape[2:]

    merged = []
    for sample in channel_first:  # one sample: 6 channels of H x W labels
        instances = np.zeros(spatial_shape)
        last_id = 0  # highest instance id assigned so far within this sample
        for channel in sample[:-1]:
            # 'start_label' makes the ids of this channel continue after the previous ones
            relabeled, max_id, _ = vigra.analysis.relabelConsecutive(
                channel.astype("uint64"), start_label=last_id + 1
            )

            # only advance the offset if a positive maximum id was reported
            if max_id > 0:
                last_id = max_id

            foreground = relabeled > 0
            instances[foreground] = relabeled[foreground]

        merged.append(instances)

    return np.stack(merged)
137
138
139def _channels_to_semantics(labels):
140    """Converting the ground-truth of 6 (instance) channels  into semantic labels, ollowing below the id info as:
141    (1 -> Neoplastic cells, 2 -> Inflammatory, 3 -> Connective/Soft tissue cells,
142    4 -> Dead Cells, 5 -> Epithelial, 0 -> Background)
143
144    Returns:
145        - semantic labels of dimensions -> (C x H x W)
146    """
147    labels = labels.transpose(0, 3, 1, 2)
148    list_of_semantic = []
149
150    for label_slice in labels:
151        segmentation = np.zeros(labels.shape[2:])
152        for i, label_channel in enumerate(label_slice[:-1]):
153            segmentation[label_channel > 0] = i + 1
154        list_of_semantic.append(segmentation)
155
156    f_segmentation = np.stack(list_of_semantic)
157
158    return f_segmentation
159
160
def get_pannuke_paths(
    path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False,
) -> List[str]:
    """Make sure the requested PanNuke folds are available and return their file paths.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the stored data, one `pannuke_<fold>.h5` file per entry of `folds`.
    """
    # Downloads and converts whatever is still missing.
    get_pannuke_data(path, download, folds)

    h5_names = (f"pannuke_{fold}.h5" for fold in folds)
    return [os.path.join(path, h5_name) for h5_name in h5_names]
178
179
def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = "instances",
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the PanNuke dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        folds: The data fold(s) of choice to be used.
        rois: The choice of rois per fold to create the dataloader for training.
            A dictionary that maps fold names to rois; folds without an entry use the full data.
            `None` is treated like an empty dictionary.
        download: Whether to download the data if it is not present.
        custom_label_choice: The choice of labels to be used for training.
            One of "masks", "instances" or "semantic" (See `_convert_to_hdf5` for details).
        with_channels: Whether the inputs have channels.
        with_label_channels: Whether the labels have channels.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset

    Raises:
        AssertionError: If `custom_label_choice` is not supported or `rois` is not a dictionary.
    """
    assert custom_label_choice in [
        "masks", "instances", "semantic"
    ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)"

    # `None` is accepted as "no rois". Without this normalization the `rois.get`
    # lookup below fails with an AttributeError.
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    data_paths = get_pannuke_paths(path, folds, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="images",
        label_paths=data_paths,
        label_key=f"labels/{custom_label_choice}",
        patch_shape=patch_shape,
        # one roi per fold, in the same order as `data_paths`
        rois=[rois.get(fold, np.s_[:, :, :]) for fold in folds],
        with_channels=with_channels,
        with_label_channels=with_label_channels,
        **kwargs
    )
235
236
def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = "instances",
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PanNuke dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.
        rois: The choice of rois per fold to create the dataloader for training.
        custom_label_choice: The choice of labels to be used for training.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader
    """
    # Separate the keyword arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_pannuke_dataset(
        path=path,
        patch_shape=patch_shape,
        folds=folds,
        rois=rois,
        download=download,
        custom_label_choice=custom_label_choice,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)

URLS = {'fold_1': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip', 'fold_2': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip', 'fold_3': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip'}

CHECKSUM = {'fold_1': '6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b', 'fold_2': '5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07', 'fold_3': 'c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d'}

def get_pannuke_data(path, download, folds): View Source

def get_pannuke_data(path, download, folds):
    """Download the PanNuke data and convert it to one HDF5 file per fold.

    For every requested fold the zip archive is downloaded to `<path>/<fold>.zip`,
    extracted to `<path>/<fold>` and converted to `<path>/pannuke_<fold>.h5`.
    Folds whose HDF5 file already exists are skipped.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
        folds: The data fold(s) of choice to be used.
    """
    os.makedirs(path, exist_ok=True)
    for tmp_fold in folds:
        # Skip only this fold when it was converted before. Returning here would
        # leave every remaining fold in `folds` unprocessed.
        if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")):
            continue

        zip_path = os.path.join(path, f"{tmp_fold}.zip")
        util.download_source(zip_path, URLS[tmp_fold], download, CHECKSUM[tmp_fold])

        print(f"Unzipping the PanNuke dataset in {tmp_fold} directories...")
        # NOTE(review): the third positional argument presumably removes the archive
        # after extraction - confirm against `util.unzip`.
        util.unzip(zip_path, os.path.join(path, f"{tmp_fold}"), True)

        _convert_to_hdf5(path, tmp_fold)

Download the PanNuke data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.
folds: The data fold(s) of choice to be used.

def get_pannuke_paths( path: Union[os.PathLike, str], folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], download: bool = False) -> List[str]: View Source

def get_pannuke_paths(
    path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False,
) -> List[str]:
    """Make sure the requested PanNuke folds are available and return their file paths.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the stored data, one `pannuke_<fold>.h5` file per entry of `folds`.
    """
    # Downloads and converts whatever is still missing.
    get_pannuke_data(path, download, folds)

    h5_names = (f"pannuke_{fold}.h5" for fold in folds)
    return [os.path.join(path, h5_name) for h5_name in h5_names]

Get paths to the PanNuke data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
folds: The data fold(s) of choice to be used.
download: Whether to download the data if it is not present.

Returns:

List of filepaths to the stored data.

def get_pannuke_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], rois: Dict = {}, download: bool = False, custom_label_choice: str = 'instances', with_channels: bool = True, with_label_channels: bool = False, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = "instances",
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the PanNuke dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        folds: The data fold(s) of choice to be used.
        rois: The choice of rois per fold to create the dataloader for training.
            A dictionary that maps fold names to rois; folds without an entry use the full data.
            `None` is treated like an empty dictionary.
        download: Whether to download the data if it is not present.
        custom_label_choice: The choice of labels to be used for training.
            One of "masks", "instances" or "semantic" (See `_convert_to_hdf5` for details).
        with_channels: Whether the inputs have channels.
        with_label_channels: Whether the labels have channels.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset

    Raises:
        AssertionError: If `custom_label_choice` is not supported or `rois` is not a dictionary.
    """
    assert custom_label_choice in [
        "masks", "instances", "semantic"
    ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)"

    # `None` is accepted as "no rois". Without this normalization the `rois.get`
    # lookup below fails with an AttributeError.
    if rois is None:
        rois = {}
    assert isinstance(rois, dict)

    data_paths = get_pannuke_paths(path, folds, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="images",
        label_paths=data_paths,
        label_key=f"labels/{custom_label_choice}",
        patch_shape=patch_shape,
        # one roi per fold, in the same order as `data_paths`
        rois=[rois.get(fold, np.s_[:, :, :]) for fold in folds],
        with_channels=with_channels,
        with_label_channels=with_label_channels,
        **kwargs
    )

Get the PanNuke dataset for nucleus segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
folds: The data fold(s) of choice to be used.
download: Whether to download the data if it is not present.
rois: The choice of rois per fold to create the dataloader for training.
custom_label_choice: The choice of labels to be used for training.
with_channels: Whether the inputs have channels.
with_label_channels: Whether the labels have channels.
resize_inputs: Whether to resize the inputs.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset

def get_pannuke_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], batch_size: str, folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], download: bool = False, rois: Dict = {}, custom_label_choice: str = 'instances', resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = "instances",
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PanNuke dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.
        rois: The choice of rois per fold to create the dataloader for training.
        custom_label_choice: The choice of labels to be used for training.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader
    """
    # Separate the keyword arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_pannuke_dataset(
        path=path,
        patch_shape=patch_shape,
        folds=folds,
        rois=rois,
        download=download,
        custom_label_choice=custom_label_choice,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)

Get the PanNuke dataloader for nucleus segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
batch_size: The batch size for training.
folds: The data fold(s) of choice to be used.
download: Whether to download the data if it is not present.
rois: The choice of rois per fold to create the dataloader for training.
custom_label_choice: The choice of labels to be used for training.
resize_inputs: Whether to resize the inputs.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader