torch_em.data.datasets.histopathology.pannuke
The PanNuke dataset contains annotations for nucleus segmentation in histopathology images across different tissue types.
This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778. Please cite it if you use this dataset for your research.
1"""The PanNuke datasets contains annotations for nucleus segmentation 2in histopathology images across different tissue types. 3 4This dataset is from the publication https://doi.org/10.48550/arXiv.2003.10778. 5Please cite it if you use this dataset for your research. 6""" 7 8import os 9import shutil 10from glob import glob 11from typing import List, Union, Dict, Tuple 12 13import numpy as np 14 15from torch.utils.data import Dataset, DataLoader 16 17import torch_em 18 19from .. import util 20 21 22# PanNuke Dataset - https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke 23URLS = { 24 "fold_1": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip", 25 "fold_2": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip", 26 "fold_3": "https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip" 27} 28 29CHECKSUM = { 30 "fold_1": "6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b", 31 "fold_2": "5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07", 32 "fold_3": "c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d" 33} 34 35 36def get_pannuke_data(path, download, folds): 37 """Download the PanNuke data. 38 39 Args: 40 path: Filepath to a folder where the downloaded data will be saved. 41 download: Whether to download the data if it is not present. 42 folds: The data fold(s) of choice to be used. 43 """ 44 os.makedirs(path, exist_ok=True) 45 for tmp_fold in folds: 46 if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")): 47 return 48 49 util.download_source(os.path.join(path, f"{tmp_fold}.zip"), URLS[tmp_fold], download, CHECKSUM[tmp_fold]) 50 51 print(f"Unzipping the PanNuke dataset in {tmp_fold} directories...") 52 util.unzip(os.path.join(path, f"{tmp_fold}.zip"), os.path.join(path, f"{tmp_fold}"), True) 53 54 _convert_to_hdf5(path, tmp_fold) 55 56 57def _convert_to_hdf5(path, fold): 58 """Here, we create the h5 files from the input data into 4 essentials (keys): 59 - "images" - the raw input images (transposed into the expected format) (S x 3 x H x W) 60 - "labels/masks" - the raw input masks (transposed as above) (S x 6 x H x W) 61 - "labels/instances" - the converted all-instance labels (S x H x W) 62 - "labels/semantic" - the converted semantic labels (S x H x W) 63 - where, the semantic instance representation is as follows: 64 (0: Background, 1: Neoplastic cells, 2: Inflammatory, 65 3: Connective/Soft tissue cells, 4: Dead Cells, 5: Epithelial) 66 """ 67 import h5py 68 69 if os.path.exists(os.path.join(path, f"pannuke_{fold}.h5")): 70 return 71 72 print(f"Converting {fold} into h5 file format...") 73 img_paths = glob(os.path.join(path, "**", "images.npy"), recursive=True) 74 gt_paths = glob(os.path.join(path, "**", "masks.npy"), recursive=True) 75 76 for img_path, gt_path in zip(img_paths, gt_paths): 77 # original (raw) shape : S x H x W x C -> transposed shape (expected) : C x S x H x W 78 img = np.load(img_path) 79 labels = np.load(gt_path) 80 81 instances = _channels_to_instances(labels) 82 semantic = _channels_to_semantics(labels) 83 84 img = img.transpose(3, 0, 1, 2) 85 labels = labels.transpose(3, 0, 1, 2) 86 87 # img.shape -> (3, 2656, 256, 256) --- img_chunks -> (3, 1, 256, 256) 88 # (same logic as above for labels) 89 img_chunks = (img.shape[0], 1) + img.shape[2:] 90 label_chunks = (labels.shape[0], 1) + labels.shape[2:] 91 other_label_chunks = (1,) + labels.shape[2:] # for instance and semantic labels 92 93 with h5py.File(os.path.join(path, f"pannuke_{fold}.h5"), "w") as f: 94 f.create_dataset("images", 
data=img, compression="gzip", chunks=img_chunks) 95 f.create_dataset("labels/masks", data=labels, compression="gzip", chunks=label_chunks) 96 f.create_dataset("labels/instances", data=instances, compression="gzip", chunks=other_label_chunks) 97 f.create_dataset("labels/semantic", data=semantic, compression="gzip", chunks=other_label_chunks) 98 99 dir_to_rm = glob(os.path.join(path, "*[!.h5]")) 100 for tmp_dir in dir_to_rm: 101 shutil.rmtree(tmp_dir) 102 103 104def _channels_to_instances(labels): 105 """Converting the ground-truth of 6 (instance) channels into 1 label with instances from all channels 106 channel info - 107 (0: Neoplastic cells, 1: Inflammatory, 2: Connective/Soft tissue cells, 3: Dead Cells, 4: Epithelial, 6: Background) 108 109 Returns: 110 - instance labels of dimensions -> (C x H x W) 111 """ 112 import vigra 113 114 labels = labels.transpose(0, 3, 1, 2) # to access with the shape S x 6 x H x W 115 list_of_instances = [] 116 117 for label_slice in labels: # access the slices (each with 6 channels of H x W labels) 118 segmentation = np.zeros(labels.shape[2:]) 119 max_ids = [] 120 for label_channel in label_slice[:-1]: # access the channels 121 # the 'start_label' takes care of where to start allocating the instance ids from 122 this_labels, max_id, _ = vigra.analysis.relabelConsecutive( 123 label_channel.astype("uint64"), 124 start_label=max_ids[-1] + 1 if len(max_ids) > 0 else 1) 125 126 # some trailing channels might not have labels, hence appending only for elements with RoIs 127 if max_id > 0: 128 max_ids.append(max_id) 129 130 segmentation[this_labels > 0] = this_labels[this_labels > 0] 131 132 list_of_instances.append(segmentation) 133 134 f_segmentation = np.stack(list_of_instances) 135 136 return f_segmentation 137 138 139def _channels_to_semantics(labels): 140 """Converting the ground-truth of 6 (instance) channels into semantic labels, ollowing below the id info as: 141 (1 -> Neoplastic cells, 2 -> Inflammatory, 3 -> Connective/Soft tissue cells, 142 4 -> Dead Cells, 5 -> Epithelial, 0 -> Background) 143 144 Returns: 145 - semantic labels of dimensions -> (C x H x W) 146 """ 147 labels = labels.transpose(0, 3, 1, 2) 148 list_of_semantic = [] 149 150 for label_slice in labels: 151 segmentation = np.zeros(labels.shape[2:]) 152 for i, label_channel in enumerate(label_slice[:-1]): 153 segmentation[label_channel > 0] = i + 1 154 list_of_semantic.append(segmentation) 155 156 f_segmentation = np.stack(list_of_semantic) 157 158 return f_segmentation 159 160 161def get_pannuke_paths( 162 path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False, 163) -> List[str]: 164 """Get paths to the PanNuke data. 165 166 Args: 167 path: Filepath to a folder where the downloaded data will be saved. 168 folds: The data fold(s) of choice to be used. 169 download: Whether to download the data if it is not present. 170 171 Returns: 172 List of filepaths to the stored data. 
173 """ 174 get_pannuke_data(path, download, folds) 175 176 data_paths = [os.path.join(path, f"pannuke_{fold}.h5") for fold in folds] 177 return data_paths 178 179 180def get_pannuke_dataset( 181 path: Union[os.PathLike, str], 182 patch_shape: Tuple[int, ...], 183 folds: List[str] = ["fold_1", "fold_2", "fold_3"], 184 rois: Dict = {}, 185 download: bool = False, 186 custom_label_choice: str = "instances", 187 with_channels: bool = True, 188 with_label_channels: bool = False, 189 resize_inputs: bool = False, 190 **kwargs 191) -> Dataset: 192 """Get the PanNuke dataset for nucleus segmentation. 193 194 Args: 195 path: Filepath to a folder where the downloaded data will be saved. 196 patch_shape: The patch shape to use for training. 197 folds: The data fold(s) of choice to be used. 198 download: Whether to download the data if it is not present. 199 rois: The choice of rois per fold to create the dataloader for training. 200 custom_label_choice: The choice of labels to be used for training. 201 with_channels: Whether the inputs have channels. 202 with_label_channels: Whether the labels have channels. 203 resize_inputs: Whether to resize the inputs. 204 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 205 206 Returns: 207 The segmentation dataset 208 """ 209 assert custom_label_choice in [ 210 "masks", "instances", "semantic" 211 ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)" 212 213 if rois is not None: 214 assert isinstance(rois, dict) 215 216 data_paths = get_pannuke_paths(path, folds, download) 217 218 if resize_inputs: 219 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True} 220 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 221 kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs 222 ) 223 224 return torch_em.default_segmentation_dataset( 225 raw_paths=data_paths, 226 raw_key="images", 227 label_paths=data_paths, 228 label_key=f"labels/{custom_label_choice}", 229 patch_shape=patch_shape, 230 rois=[rois.get(fold, np.s_[:, :, :]) for fold in folds], 231 with_channels=with_channels, 232 with_label_channels=with_label_channels, 233 **kwargs 234 ) 235 236 237def get_pannuke_loader( 238 path: Union[os.PathLike, str], 239 patch_shape: Tuple[int, ...], 240 batch_size: str, 241 folds: List[str] = ["fold_1", "fold_2", "fold_3"], 242 download: bool = False, 243 rois: Dict = {}, 244 custom_label_choice: str = "instances", 245 resize_inputs: bool = False, 246 **kwargs 247) -> DataLoader: 248 """Get the PanNuke dataloader for nucleus segmentation. 249 250 Args: 251 path: Filepath to a folder where the downloaded data will be saved. 252 patch_shape: The patch shape to use for training. 253 batch_size: The batch size for training. 254 folds: The data fold(s) of choice to be used. 255 download: Whether to download the data if it is not present. 256 rois: The choice of rois per fold to create the dataloader for training. 257 custom_label_choice: The choice of labels to be used for training. 258 resize_inputs: Whether to resize the inputs. 259 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 
260 261 Returns: 262 The DataLoader 263 """ 264 dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 265 ds = get_pannuke_dataset( 266 path=path, 267 patch_shape=patch_shape, 268 folds=folds, 269 rois=rois, 270 download=download, 271 custom_label_choice=custom_label_choice, 272 resize_inputs=resize_inputs, 273 **dataset_kwargs 274 ) 275 return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
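The conversion in _convert_to_hdf5 fixes the on-disk layout that all the loading functions below rely on. A minimal sketch of inspecting a converted fold with h5py (the ./pannuke path is illustrative and assumes the fold has already been downloaded and converted):

    import h5py

    # Open the converted file for fold_1 and check the dataset layout
    # described in _convert_to_hdf5 (shapes are illustrative).
    with h5py.File("./pannuke/pannuke_fold_1.h5", "r") as f:
        print(f["images"].shape)            # (3, S, 256, 256) - RGB channels first
        print(f["labels/masks"].shape)      # (6, S, 256, 256) - one channel per class
        print(f["labels/instances"].shape)  # (S, 256, 256) - merged instance ids
        print(f["labels/semantic"].shape)   # (S, 256, 256) - class ids 0-5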
URLS = {
    'fold_1': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_1.zip',
    'fold_2': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_2.zip',
    'fold_3': 'https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_3.zip'
}
CHECKSUM = {
    'fold_1': '6e19ad380300e8ce9480f9ab6a14cc91fa4b6a511609b40e3d70bdf9c881ed0b',
    'fold_2': '5bc540cc509f64b5f5a274d6e5a245527dbd3e6d3155d43555115c5d54709b07',
    'fold_3': 'c14d372981c42f611ebc80afad01702b89cad8c1b3089daa31931cf5a4b1a39d'
}
def get_pannuke_data(path, download, folds):
def get_pannuke_data(path, download, folds):
    """Download the PanNuke data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
        folds: The data fold(s) of choice to be used.
    """
    os.makedirs(path, exist_ok=True)
    for tmp_fold in folds:
        if os.path.exists(os.path.join(path, f"pannuke_{tmp_fold}.h5")):
            continue  # this fold has already been downloaded and converted

        util.download_source(os.path.join(path, f"{tmp_fold}.zip"), URLS[tmp_fold], download, CHECKSUM[tmp_fold])

        print(f"Unzipping the PanNuke data for {tmp_fold}...")
        util.unzip(os.path.join(path, f"{tmp_fold}.zip"), os.path.join(path, f"{tmp_fold}"), True)

        _convert_to_hdf5(path, tmp_fold)
Download the PanNuke data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
- folds: The data fold(s) of choice to be used.
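A minimal usage sketch (the ./pannuke target directory is illustrative). Note that download comes before folds in the argument order, so keyword arguments are used here for clarity:

    from torch_em.data.datasets.histopathology.pannuke import get_pannuke_data

    # Download and convert fold_1 and fold_2 into ./pannuke.
    # Folds that already have a pannuke_<fold>.h5 file are skipped.
    get_pannuke_data("./pannuke", download=True, folds=["fold_1", "fold_2"])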
def get_pannuke_paths(path: Union[os.PathLike, str], folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], download: bool = False) -> List[str]:
def get_pannuke_paths(
    path: Union[os.PathLike, str], folds: List[str] = ["fold_1", "fold_2", "fold_3"], download: bool = False,
) -> List[str]:
    """Get paths to the PanNuke data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the stored data.
    """
    get_pannuke_data(path, download, folds)

    data_paths = [os.path.join(path, f"pannuke_{fold}.h5") for fold in folds]
    return data_paths
Get paths to the PanNuke data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- folds: The data fold(s) of choice to be used.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the stored data.
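A minimal sketch, using the same illustrative ./pannuke directory as above:

    from torch_em.data.datasets.histopathology.pannuke import get_pannuke_paths

    # Triggers download and conversion if needed, then returns the h5 filepaths.
    paths = get_pannuke_paths("./pannuke", folds=["fold_1"], download=True)
    print(paths)  # ['./pannuke/pannuke_fold_1.h5']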
def get_pannuke_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], rois: Dict = {}, download: bool = False, custom_label_choice: str = 'instances', with_channels: bool = True, with_label_channels: bool = False, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_pannuke_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    rois: Dict = {},
    download: bool = False,
    custom_label_choice: str = "instances",
    with_channels: bool = True,
    with_label_channels: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the PanNuke dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        folds: The data fold(s) of choice to be used.
        rois: The choice of rois per fold to create the dataloader for training.
        download: Whether to download the data if it is not present.
        custom_label_choice: The choice of labels to be used for training.
        with_channels: Whether the inputs have channels.
        with_label_channels: Whether the labels have channels.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert custom_label_choice in [
        "masks", "instances", "semantic"
    ], "Select the type of labels you want from [masks/instances/semantic] (See `_convert_to_hdf5` for details)"

    if rois is not None:
        assert isinstance(rois, dict)

    data_paths = get_pannuke_paths(path, folds, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="images",
        label_paths=data_paths,
        label_key=f"labels/{custom_label_choice}",
        patch_shape=patch_shape,
        rois=[rois.get(fold, np.s_[:, :, :]) for fold in folds],
        with_channels=with_channels,
        with_label_channels=with_label_channels,
        **kwargs
    )
Get the PanNuke dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- folds: The data fold(s) of choice to be used.
- rois: The choice of rois per fold to create the dataloader for training.
- download: Whether to download the data if it is not present.
- custom_label_choice: The choice of labels to be used for training.
- with_channels: Whether the inputs have channels.
- with_label_channels: Whether the labels have channels.
- resize_inputs: Whether to resize the inputs.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
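A minimal sketch of creating the dataset (directory and patch shape are illustrative). The data is stored as a stack of 256 x 256 images, so a patch shape of (1, 256, 256) yields one full image per sample; rois are passed per fold as numpy slice expressions, matching the np.s_[:, :, :] default used internally:

    import numpy as np
    from torch_em.data.datasets.histopathology.pannuke import get_pannuke_dataset

    dataset = get_pannuke_dataset(
        "./pannuke",
        patch_shape=(1, 256, 256),           # (S, H, W); one 256x256 image per sample
        folds=["fold_1"],
        rois={"fold_1": np.s_[:100, :, :]},  # restrict fold_1 to its first 100 images
        custom_label_choice="instances",     # or "masks" / "semantic"
        download=True,
    )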
def get_pannuke_loader(path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], batch_size: int, folds: List[str] = ['fold_1', 'fold_2', 'fold_3'], download: bool = False, rois: Dict = {}, custom_label_choice: str = 'instances', resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_pannuke_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    folds: List[str] = ["fold_1", "fold_2", "fold_3"],
    download: bool = False,
    rois: Dict = {},
    custom_label_choice: str = "instances",
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the PanNuke dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        folds: The data fold(s) of choice to be used.
        download: Whether to download the data if it is not present.
        rois: The choice of rois per fold to create the dataloader for training.
        custom_label_choice: The choice of labels to be used for training.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_pannuke_dataset(
        path=path,
        patch_shape=patch_shape,
        folds=folds,
        rois=rois,
        download=download,
        custom_label_choice=custom_label_choice,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
Get the PanNuke dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- folds: The data fold(s) of choice to be used.
- download: Whether to download the data if it is not present.
- rois: The choice of rois per fold to create the dataloader for training.
- custom_label_choice: The choice of labels to be used for training.
- resize_inputs: Whether to resize the inputs.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
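A minimal sketch of building the loader (all values illustrative). Keyword arguments that do not belong to torch_em.default_segmentation_dataset, such as shuffle or num_workers, are split off and forwarded to the PyTorch DataLoader:

    from torch_em.data.datasets.histopathology.pannuke import get_pannuke_loader

    loader = get_pannuke_loader(
        "./pannuke",
        patch_shape=(1, 256, 256),
        batch_size=4,
        folds=["fold_1"],
        custom_label_choice="semantic",
        download=True,
        shuffle=True,     # forwarded to the PyTorch DataLoader
        num_workers=2,    # forwarded to the PyTorch DataLoader
    )
    x, y = next(iter(loader))  # x: raw image batch, y: label batch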