torch_em.data.datasets.histopathology.pannuke
The PanNuke dataset contains annotations for nucleus segmentation in histopathology images across different tissue types.
This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778. Please cite it if you use this dataset for your research.
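A minimal usage sketch of the main entry point (the download directory and patch shape below are arbitrary choices, not part of the API):

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_loader

# "./data/pannuke" is an arbitrary local directory for the download.
loader = get_pannuke_loader(
    path="./data/pannuke",
    patch_shape=(1, 256, 256),  # one 256 x 256 image per sample
    batch_size=2,
    folds=["fold_1"],
    download=True,
)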
1"""The PanNuke datasets contains annotations for nucleus segmentation 2in histopathology images across different tissue types. 3 4This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778. 5Please cite it if you use this dataset for your research. 6""" 7 8import os 9import shutil 10from glob import glob 11from typing import List, Union, Dict, Tuple 12 13import numpy as np 14 15from torch.utils.data import Dataset, DataLoader 16 17import torch_em 18 19from .. import util 20 21 22# PanNuke Dataset - https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke 23URLS = { 24 "fold_1": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip", 25 "fold_2": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip", 26 "fold_3": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip" 27} 28 29CHECKSUM = { 30 "fold_1": "6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b", 31 "fold_2": "5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07", 32 "fold_3": "c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d" 33} 34 35 36def get_pannuke_data(path, download, folds): 37 """Download the PanNuke data. 38 39 Args: 40 path: Filepath to a folder where the downloaded data will be saved. 41 download: Whether to download the data if it is not present. 42 folds: The data fold(s) of choice to be used. 43 """ 44 os.makedirs(path, exist_ok=True) 45 for tmp_fold in folds: 46 assert tmp_fold in URLS.keys(), "Please choose one or more of existing folds: 'fold_1' / 'fold_2' / 'fold_3'." 47 if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")): 48 return 49 50 util.download_source(os.path.join(path, f"{tmp_fold}.zip"), URLS[tmp_fold], download, CHECKSUM[tmp_fold]) 51 52 print(f"Unzipping the PanNuke dataset in {tmp_fold} directories...") 53 util.unzip(os.path.join(path, f"{tmp_fold}.zip"), os.path.join(path, f"{tmp_fold}"), True) 54 55 _convert_to_hdf5(path, tmp_fold) 56 57 58def _convert_to_hdf5(path, fold): 59 """Here, we create the h5 files from the input data into 4 essentials (keys): 60 - "images" - the raw input images (transposed into the expected format) (S x 3 x H x W) 61 - "labels/masks" - the raw input masks (transposed as above) (S x 6 x H x W) 62 - "labels/instances" - the converted all-instance labels (S x H x W) 63 - "labels/semantic" - the converted semantic labels (S x H x W) 64 - where, the semantic instance representation is as follows: 65 (0: Background, 1: Neoplastic cells, 2: Inflammatory, 66 3: Connective/Soft tissue cells, 4: Dead Cells, 5: Epithelial) 67 """ 68 import h5py 69 70 if os.path.exists(os.path.join(path, f"pannuke_{fold}.h5")): 71 return 72 73 print(f"Converting {fold} into h5 file format...") 74 img_paths = glob(os.path.join(path, "**", "images.npy"), recursive=True) 75 gt_paths = glob(os.path.join(path, "**", "masks.npy"), recursive=True) 76 77 for img_path, gt_path in zip(img_paths, gt_paths): 78 # original (raw) shape : S x H x W x C -> transposed shape (expected) : C x S x H x W 79 img = np.load(img_path) 80 labels = np.load(gt_path) 81 82 instances = _channels_to_instances(labels) 83 semantic = _channels_to_semantics(labels) 84 85 img = img.transpose(3, 0, 1, 2) 86 labels = labels.transpose(3, 0, 1, 2) 87 88 # img.shape -> (3, 2656, 256, 256) --- img_chunks -> (3, 1, 256, 256) 89 # (same logic as above for labels) 90 img_chunks = (img.shape[0], 1) + img.shape[2:] 91 label_chunks = (labels.shape[0], 1) + labels.shape[2:] 92 other_label_chunks = (1,) + labels.shape[2:] # for instance and 
semantic labels 93 94 with h5py.File(os.path.join(path, f"pannuke_{fold}.h5"), "w") as f: 95 f.create_dataset("images", data=img, compression="gzip", chunks=img_chunks) 96 f.create_dataset("labels/masks", data=labels, compression="gzip", chunks=label_chunks) 97 f.create_dataset("labels/instances", data=instances, compression="gzip", chunks=other_label_chunks) 98 f.create_dataset("labels/semantic", data=semantic, compression="gzip", chunks=other_label_chunks) 99 100 dir_to_rm = glob(os.path.join(path, "*[!.h5]")) 101 for tmp_dir in dir_to_rm: 102 shutil.rmtree(tmp_dir) 103 104 105def _channels_to_instances(labels): 106 """Converting the ground-truth of 6 (instance) channels into 1 label with instances from all channels 107 channel info - 108 (0: Neoplastic cells, 1: Inflammatory, 2: Connective/Soft tissue cells, 3: Dead Cells, 4: Epithelial, 6: Background) 109 110 Returns: 111 - instance labels of dimensions -> (C x H x W) 112 """ 113 import vigra 114 115 labels = labels.transpose(0, 3, 1, 2) # to access with the shape S x 6 x H x W 116 list_of_instances = [] 117 118 for label_slice in labels: # access the slices (each with 6 channels of H x W labels) 119 segmentation = np.zeros(labels.shape[2:]) 120 max_ids = [] 121 for label_channel in label_slice[:-1]: # access the channels 122 # the 'start_label' takes care of where to start allocating the instance ids from 123 this_labels, max_id, _ = vigra.analysis.relabelConsecutive( 124 label_channel.astype("uint64"), 125 start_label=max_ids[-1] + 1 if len(max_ids) > 0 else 1) 126 127 # some trailing channels might not have labels, hence appending only for elements with RoIs 128 if max_id > 0: 129 max_ids.append(max_id) 130 131 segmentation[this_labels > 0] = this_labels[this_labels > 0] 132 133 list_of_instances.append(segmentation) 134 135 f_segmentation = np.stack(list_of_instances) 136 137 return f_segmentation 138 139 140def _channels_to_semantics(labels): 141 """Converting the ground-truth of 6 (instance) channels into semantic labels, ollowing below the id info as: 142 (1 -> Neoplastic cells, 2 -> Inflammatory, 3 -> Connective/Soft tissue cells, 143 4 -> Dead Cells, 5 -> Epithelial, 0 -> Background) 144 145 Returns: 146 - semantic labels of dimensions -> (C x H x W) 147 """ 148 labels = labels.transpose(0, 3, 1, 2) 149 list_of_semantic = [] 150 151 for label_slice in labels: 152 segmentation = np.zeros(labels.shape[2:]) 153 for i, label_channel in enumerate(label_slice[:-1]): 154 segmentation[label_channel > 0] = i + 1 155 list_of_semantic.append(segmentation) 156 157 f_segmentation = np.stack(list_of_semantic) 158 159 return f_segmentation 160 161 162def get_pannuke_paths( 163 path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False, 164) -> List[str]: 165 """Get paths to the PanNuke data. 166 167 Args: 168 path: Filepath to a folder where the downloaded data will be saved. 169 folds: The data fold(s) of choice to be used. 170 download: Whether to download the data if it is not present. 171 172 Returns: 173 List of filepaths to the stored data. 
174 """ 175 get_pannuke_data(path, download, folds) 176 177 data_paths = [os.path.join(path, f"pannuke_{fold}.h5") for fold in folds] 178 return data_paths 179 180 181def get_pannuke_dataset( 182 path: Union[os.PathLike, str], 183 patch_shape: Tuple[int, ...], 184 folds: List[str] = ["fold_1", "fold_2", "fold_3"], 185 rois: Dict = {}, 186 download: bool = False, 187 custom_label_choice: str = "instances", 188 with_channels: bool = True, 189 with_label_channels: bool = False, 190 resize_inputs: bool = False, 191 **kwargs 192) -> Dataset: 193 """Get the PanNuke dataset for nucleus segmentation. 194 195 Args: 196 path: Filepath to a folder where the downloaded data will be saved. 197 patch_shape: The patch shape to use for training. 198 folds: The data fold(s) of choice to be used. 199 download: Whether to download the data if it is not present. 200 rois: The choice of rois per fold to create the dataloader for training. 201 custom_label_choice: The choice of labels to be used for training. 202 with_channels: Whether the inputs have channels. 203 with_label_channels: Whether the labels have channels. 204 resize_inputs: Whether to resize the inputs. 205 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 206 207 Returns: 208 The segmentation dataset 209 """ 210 assert custom_label_choice in [ 211 "masks", "instances", "semantic" 212 ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)" 213 214 if rois is not None: 215 assert isinstance(rois, dict) 216 217 data_paths = get_pannuke_paths(path, folds, download) 218 219 if resize_inputs: 220 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True} 221 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 222 kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs 223 ) 224 225 return torch_em.default_segmentation_dataset( 226 raw_paths=data_paths, 227 raw_key="images", 228 label_paths=data_paths, 229 label_key=f"labels/{custom_label_choice}", 230 patch_shape=patch_shape, 231 rois=[rois.get(fold, np.s_[:, :, :]) for fold in folds], 232 with_channels=with_channels, 233 with_label_channels=with_label_channels, 234 **kwargs 235 ) 236 237 238def get_pannuke_loader( 239 path: Union[os.PathLike, str], 240 patch_shape: Tuple[int, ...], 241 batch_size: str, 242 folds: List[str] = ["fold_1", "fold_2", "fold_3"], 243 download: bool = False, 244 rois: Dict = {}, 245 custom_label_choice: str = "instances", 246 resize_inputs: bool = False, 247 **kwargs 248) -> DataLoader: 249 """Get the PanNuke dataloader for nucleus segmentation. 250 251 Args: 252 path: Filepath to a folder where the downloaded data will be saved. 253 patch_shape: The patch shape to use for training. 254 batch_size: The batch size for training. 255 folds: The data fold(s) of choice to be used. 256 download: Whether to download the data if it is not present. 257 rois: The choice of rois per fold to create the dataloader for training. 258 custom_label_choice: The choice of labels to be used for training. 259 resize_inputs: Whether to resize the inputs. 260 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 
261 262 Returns: 263 The DataLoader 264 """ 265 dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 266 ds = get_pannuke_dataset( 267 path=path, 268 patch_shape=patch_shape, 269 folds=folds, 270 rois=rois, 271 download=download, 272 custom_label_choice=custom_label_choice, 273 resize_inputs=resize_inputs, 274 **dataset_kwargs 275 ) 276 return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
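The HDF5 layout written by _convert_to_hdf5 can be inspected directly with h5py; a sketch, assuming fold_1 has already been downloaded and converted into the (arbitrary) directory ./data/pannuke:

import h5py

with h5py.File("./data/pannuke/pannuke_fold_1.h5", "r") as f:
    print(f["images"].shape)            # (3, S, 256, 256) - RGB channels first
    print(f["labels/masks"].shape)      # (6, S, 256, 256) - one channel per class
    print(f["labels/instances"].shape)  # (S, 256, 256) - merged instance ids
    print(f["labels/semantic"].shape)   # (S, 256, 256) - class ids 0-5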
URLS = {
    'fold_1': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip',
    'fold_2': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip',
    'fold_3': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip'
}
CHECKSUM = {
    'fold_1': '6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b',
    'fold_2': '5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07',
    'fold_3': 'c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d'
}
def get_pannuke_data(path, download, folds):
Download the PanNuke data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
- folds: The data fold(s) of choice to be used.
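For example (the target directory is an arbitrary choice):

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_data

# Downloads and converts only the folds that are not yet present as h5 files.
get_pannuke_data(path="./data/pannuke", download=True, folds=["fold_1", "fold_2"])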
def get_pannuke_paths(
    path: Union[os.PathLike, str],
    folds: List[str] = ['fold_1', 'fold_2', 'fold_3'],
    download: bool = False
) -> List[str]:
Get paths to the PanNuke data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- folds: The data fold(s) of choice to be used.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the stored data.
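For example, this downloads fold_1 if needed and returns the path to its converted h5 file (the directory is an arbitrary choice):

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_paths

paths = get_pannuke_paths("./data/pannuke", folds=["fold_1"], download=True)
print(paths)  # ['./data/pannuke/pannuke_fold_1.h5']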
def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ['fold_1', 'fold_2', 'fold_3'],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = 'instances',
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> torch.utils.data.dataset.Dataset:
Get the PanNuke dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- folds: The data fold(s) of choice to be used.
- download: Whether to download the data if it is not present.
- rois: The choice of rois per fold to create the dataloader for training.
- custom_label_choice: The choice of labels to be used for training.
- with_channels: Whether the inputs have channels.
- with_label_channels: Whether the labels have channels.
- resize_inputs: Whether to resize the inputs.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset
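A sketch of creating a dataset with semantic labels, restricting fold_1 to part of its image stack via rois (the directory and the RoI below are arbitrary choices):

import numpy as np
from torch_em.data.datasets.histopathology.pannuke import get_pannuke_dataset

dataset = get_pannuke_dataset(
    path="./data/pannuke",
    patch_shape=(1, 256, 256),
    folds=["fold_1"],
    rois={"fold_1": np.s_[:100, :, :]},  # folds without an entry use the full stack
    custom_label_choice="semantic",
    download=True,
)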
def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ['fold_1', 'fold_2', 'fold_3'],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = 'instances',
    resize_inputs: bool = False,
    **kwargs
) -> torch.utils.data.dataloader.DataLoader:
Get the PanNuke dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- folds: The data fold(s) of choice to be used.
- download: Whether to download the data if it is not present.
- rois: The choice of rois per fold to create the dataloader for training.
- custom_label_choice: The choice of labels to be used for training.
- resize_inputs: Whether to resize the inputs.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader
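Since split_kwargs separates the extra keyword arguments, dataset options and DataLoader options can be mixed in one call; a sketch, assuming DataLoader-style keywords such as shuffle and num_workers are forwarded (the directory is an arbitrary choice):

from torch_em.data.datasets.histopathology.pannuke import get_pannuke_loader

loader = get_pannuke_loader(
    path="./data/pannuke",
    patch_shape=(1, 256, 256),
    batch_size=4,
    folds=["fold_3"],
    custom_label_choice="instances",
    download=True,
    shuffle=True,    # forwarded to the PyTorch DataLoader
    num_workers=2,   # forwarded to the PyTorch DataLoader
)
x, y = next(iter(loader))  # x: raw image batch, y: instance label batch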