torch_em.data.datasets.light_microscopy.hpa
This dataset was part of the HPA Kaggle challenge for protein identification. It contains confocal microscopy images and annotations for cell segmentation.
The dataset is described in the publication https://doi.org/10.1038/s41592-019-0658-6. Please cite it if you use this dataset in your research.
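A minimal usage sketch (the target folder ./hpa and the patch shape are illustrative choices, not part of the API; the first call downloads and preprocesses the data):

from torch_em.data.datasets.light_microscopy.hpa import get_hpa_segmentation_loader

# Build a training loader; downloads and preprocesses the data on first use.
loader = get_hpa_segmentation_loader(
    "./hpa", split="train", patch_shape=(512, 512), batch_size=2,
    boundaries=True, download=True,
)
x, y = next(iter(loader))  # x: raw with one channel per stain, y: boundary target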
1"""This dataset was part of the HPA Kaggle challenge for protein identification. 2It contains confocal microscopy images and annotations for cell segmentation. 3 4The dataset is described in the publication https://doi.org/10.1038/s41592-019-0658-6. 5Please cite it if you use this dataset in your research. 6""" 7 8import os 9import json 10import shutil 11from concurrent import futures 12from functools import partial 13from glob import glob 14from typing import List, Optional, Sequence, Tuple, Union 15 16import imageio 17import h5py 18import numpy as np 19import torch_em 20from PIL import Image, ImageDraw 21from skimage import draw as skimage_draw 22from skimage import morphology 23from tqdm import tqdm 24from torch.utils.data import Dataset, DataLoader 25 26from .. import util 27 28 29URLS = { 30 "segmentation": "https://zenodo.org/record/4665863/files/hpa_dataset_v2.zip" 31} 32CHECKSUMS = { 33 "segmentation": "dcd6072293d88d49c71376d3d99f3f4f102e4ee83efb0187faa89c95ec49faa9" 34} 35 36 37def _download_hpa_data(path, name, download): 38 os.makedirs(path, exist_ok=True) 39 url = URLS[name] 40 checksum = CHECKSUMS[name] 41 zip_path = os.path.join(path, "data.zip") 42 util.download_source(zip_path, url, download=download, checksum=checksum) 43 util.unzip(zip_path, path, remove=True) 44 45 46def _load_features(features): 47 # Loop over list and create simple dictionary & get size of annotations 48 annot_dict = {} 49 skipped = [] 50 51 for feat_idx, feat in enumerate(features): 52 if feat["geometry"]["type"] not in ["Polygon", "LineString"]: 53 skipped.append(feat["geometry"]["type"]) 54 continue 55 56 # skip empty roi 57 if len(feat["geometry"]["coordinates"][0]) <= 0: 58 continue 59 60 key_annot = "annot_" + str(feat_idx) 61 annot_dict[key_annot] = {} 62 annot_dict[key_annot]["type"] = feat["geometry"]["type"] 63 annot_dict[key_annot]["pos"] = np.squeeze( 64 np.asarray(feat["geometry"]["coordinates"]) 65 ) 66 annot_dict[key_annot]["properties"] = feat["properties"] 67 68 # print("Skipped geometry type(s):", skipped) 69 return annot_dict 70 71 72def _generate_binary_masks(annot_dict, shape, erose_size=5, obj_size_rem=500, save_indiv=False): 73 # Get dimensions of image and created masks of same size 74 # This we need to save somewhere (e.g. as part of the geojson file?) 
75 76 # Filled masks and edge mask for polygons 77 mask_fill = np.zeros(shape, dtype=np.uint8) 78 mask_edge = np.zeros(shape, dtype=np.uint8) 79 mask_labels = np.zeros(shape, dtype=np.uint16) 80 81 rr_all = [] 82 cc_all = [] 83 84 if save_indiv is True: 85 mask_edge_indiv = np.zeros( 86 (shape[0], shape[1], len(annot_dict)), dtype="bool" 87 ) 88 mask_fill_indiv = np.zeros( 89 (shape[0], shape[1], len(annot_dict)), dtype="bool" 90 ) 91 92 # Image used to draw lines - for edge mask for freelines 93 im_freeline = Image.new("1", (shape[1], shape[0]), color=0) 94 draw = ImageDraw.Draw(im_freeline) 95 96 # Loop over all roi 97 i_roi = 0 98 for roi_key, roi in annot_dict.items(): 99 roi_pos = roi["pos"] 100 101 # Check region type 102 103 # freeline - line 104 if roi["type"] == "freeline" or roi["type"] == "LineString": 105 106 # Loop over all pairs of points to draw the line 107 108 for ind in range(roi_pos.shape[0] - 1): 109 line_pos = ( 110 roi_pos[ind, 1], 111 roi_pos[ind, 0], 112 roi_pos[ind + 1, 1], 113 roi_pos[ind + 1, 0], 114 ) 115 draw.line(line_pos, fill=1, width=erose_size) 116 117 # freehand - polygon 118 elif ( 119 roi["type"] == "freehand" 120 or roi["type"] == "polygon" 121 or roi["type"] == "polyline" 122 or roi["type"] == "Polygon" 123 ): 124 125 # Draw polygon 126 rr, cc = skimage_draw.polygon( 127 [shape[0] - r for r in roi_pos[:, 1]], roi_pos[:, 0] 128 ) 129 130 # Make sure it's not outside 131 rr[rr < 0] = 0 132 rr[rr > shape[0] - 1] = shape[0] - 1 133 134 cc[cc < 0] = 0 135 cc[cc > shape[1] - 1] = shape[1] - 1 136 137 # Test if this region has already been added 138 if any(np.array_equal(rr, rr_test) for rr_test in rr_all) and any( 139 np.array_equal(cc, cc_test) for cc_test in cc_all 140 ): 141 # print('Region #{} has already been used'.format(i + 142 # 1)) 143 continue 144 145 rr_all.append(rr) 146 cc_all.append(cc) 147 148 # Generate mask 149 mask_fill_roi = np.zeros(shape, dtype=np.uint8) 150 mask_fill_roi[rr, cc] = 1 151 152 # Erode to get cell edge - both arrays are boolean to be used as 153 # index arrays later 154 mask_fill_roi_erode = morphology.binary_erosion( 155 mask_fill_roi, np.ones((erose_size, erose_size)) 156 ) 157 mask_edge_roi = ( 158 mask_fill_roi.astype("int") - mask_fill_roi_erode.astype("int") 159 ).astype("bool") 160 161 # Save array for mask and edge 162 mask_fill[mask_fill_roi > 0] = 1 163 mask_edge[mask_edge_roi] = 1 164 mask_labels[mask_fill_roi > 0] = i_roi + 1 165 166 if save_indiv is True: 167 mask_edge_indiv[:, :, i_roi] = mask_edge_roi.astype("bool") 168 mask_fill_indiv[:, :, i_roi] = mask_fill_roi_erode.astype("bool") 169 170 i_roi = i_roi + 1 171 172 else: 173 roi_type = roi["type"] 174 raise NotImplementedError( 175 f'Mask for roi type "{roi_type}" can not be created' 176 ) 177 178 del draw 179 180 # Convert mask from free-lines to numpy array 181 mask_edge_freeline = np.asarray(im_freeline) 182 mask_edge_freeline = mask_edge_freeline.astype("bool") 183 184 # Post-processing of fill and edge mask - if defined 185 mask_dict = {} 186 if np.any(mask_fill): 187 188 # (1) remove edges , (2) remove small objects 189 mask_fill = mask_fill & ~mask_edge 190 mask_fill = morphology.remove_small_objects( 191 mask_fill.astype("bool"), obj_size_rem 192 ) 193 194 # For edge - consider also freeline edge mask 195 196 mask_edge = mask_edge.astype("bool") 197 mask_edge = np.logical_or(mask_edge, mask_edge_freeline) 198 199 # Assign to dictionary for return 200 mask_dict["edge"] = mask_edge 201 mask_dict["fill"] = mask_fill.astype("bool") 202 
mask_dict["labels"] = mask_labels.astype("uint16") 203 204 if save_indiv is True: 205 mask_dict["edge_indiv"] = mask_edge_indiv 206 mask_dict["fill_indiv"] = mask_fill_indiv 207 else: 208 mask_dict["edge_indiv"] = np.zeros(shape + (1,), dtype=np.uint8) 209 mask_dict["fill_indiv"] = np.zeros(shape + (1,), dtype=np.uint8) 210 211 # Only edge mask present 212 elif np.any(mask_edge_freeline): 213 mask_dict["edge"] = mask_edge_freeline 214 mask_dict["fill"] = mask_fill.astype("bool") 215 mask_dict["labels"] = mask_labels.astype("uint16") 216 217 mask_dict["edge_indiv"] = np.zeros(shape + (1,), dtype=np.uint8) 218 mask_dict["fill_indiv"] = np.zeros(shape + (1,), dtype=np.uint8) 219 220 else: 221 raise Exception("No mask has been created.") 222 223 return mask_dict 224 225 226# adapted from 227# https://github.com/imjoy-team/kaibu-utils/blob/main/kaibu_utils/__init__.py#L267 228def _get_labels(annotation_file, shape, label="*"): 229 with open(annotation_file) as f: 230 features = json.load(f)["features"] 231 if len(features) == 0: 232 return np.zeros(shape, dtype="uint16") 233 234 annot_dict_all = _load_features(features) 235 annot_types = set( 236 annot_dict_all[k]["properties"].get("label", "default") 237 for k in annot_dict_all.keys() 238 ) 239 for annot_type in annot_types: 240 if label and label != "*" and annot_type != label: 241 continue 242 # print("annot_type: ", annot_type) 243 # Filter the annotations by label 244 annot_dict = { 245 k: annot_dict_all[k] 246 for k in annot_dict_all.keys() 247 if label == "*" 248 or annot_dict_all[k]["properties"].get("label", "default") == annot_type 249 } 250 mask_dict = _generate_binary_masks( 251 annot_dict, shape, 252 erose_size=5, 253 obj_size_rem=500, 254 save_indiv=True, 255 ) 256 mask = mask_dict["labels"] 257 return mask 258 raise RuntimeError 259 260 261def _process_image(in_folder, out_path, channels, with_labels): 262 # TODO double check the default order and color matching 263 # correspondence to the HPA kaggle data: 264 # microtubules: red 265 # nuclei: blue 266 # er: yellow 267 # protein: green 268 # default order: rgby = micro, prot, nuclei, er 269 all_channels = {"microtubules", "protein", "nuclei", "er"} 270 assert len(list(set(channels) - all_channels)) == 0 271 raw = [] 272 for chan in channels: 273 im_path = os.path.join(in_folder, f"{chan}.png") 274 assert os.path.exists(im_path), im_path 275 raw.append(imageio.imread(im_path)[None]) 276 raw = np.concatenate(raw, axis=0) 277 278 if with_labels: 279 annotation_file = os.path.join(in_folder, "annotation.json") 280 assert os.path.exists(annotation_file), annotation_file 281 labels = _get_labels(annotation_file, raw.shape[1:]) 282 assert labels.shape == raw.shape[1:] 283 284 with h5py.File(out_path, "w") as f: 285 f.create_dataset("raw", data=raw, compression="gzip") 286 if with_labels: 287 f.create_dataset("labels", data=labels, compression="gzip") 288 289 290def _process_split(root_in, root_out, channels, n_workers, with_labels): 291 os.makedirs(root_out, exist_ok=True) 292 inputs = glob(os.path.join(root_in, "*")) 293 outputs = [os.path.join(root_out, f"{os.path.split(inp)[1]}.h5") for inp in inputs] 294 process = partial(_process_image, channels=channels, with_labels=with_labels) 295 with futures.ProcessPoolExecutor(n_workers) as pp: 296 list(tqdm(pp.map(process, inputs, outputs), total=len(inputs), desc=f"Process data in {root_in}")) 297 298 299# save data as h5 with 4 channel raw data and labels extracted from the geo json 300def _process_hpa_data(path, channels, n_workers, 
remove): 301 in_path = os.path.join(path, "hpa_dataset_v2") 302 assert os.path.exists(in_path), in_path 303 for split in ("train", "test", "valid"): 304 out_split = "val" if split == "valid" else split 305 _process_split(os.path.join(in_path, split), os.path.join(path, out_split), 306 channels=channels, n_workers=n_workers, with_labels=split != "test") 307 if remove: 308 shutil.rmtree(in_path) 309 310 311def _check_data(path): 312 have_train = len(glob(os.path.join(path, "train", "*.h5"))) == 257 313 have_test = len(glob(os.path.join(path, "test", "*.h5"))) == 10 314 have_val = len(glob(os.path.join(path, "val", "*.h5"))) == 9 315 return have_train and have_test and have_val 316 317 318def get_hpa_segmentation_data( 319 path: Union[os.PathLike, str], 320 download: bool, 321 channels: Sequence[str], 322 n_workers_preproc: int = 8 323) -> str: 324 """Download the HPA training data. 325 326 Args: 327 path: Filepath to a folder where the downloaded data will be saved. 328 download: Whether to download the data if it is not present. 329 channels: The image channels to extract. Available channels are 330 'microtubules', 'protein', 'nuclei' or 'er'. 331 n_workers_preproc: The number of workers to use for preprocessing. 332 333 Returns: 334 The filepath to the training data. 335 """ 336 data_is_complete = _check_data(path) 337 if not data_is_complete: 338 _download_hpa_data(path, "segmentation", download) 339 _process_hpa_data(path, channels, n_workers_preproc, remove=True) 340 return path 341 342 343def get_hpa_segmentation_dataset( 344 path: Union[os.PathLike, str], 345 split: str, 346 patch_shape: Tuple[int, int], 347 offsets: Optional[List[List[int]]] = None, 348 boundaries: bool = False, 349 binary: bool = False, 350 channels: Sequence[str] = ["microtubules", "protein", "nuclei", "er"], 351 download: bool = False, 352 n_workers_preproc: int = 8, 353 **kwargs 354) -> Dataset: 355 """Get the HPA dataset for segmenting cells in confocal microscopy. 356 357 Args: 358 path: Filepath to a folder where the downloaded data will be saved. 359 split: The split for the dataset. Available splits are 'train', 'val' or 'test'. 360 patch_shape: The patch shape to use for training. 361 offsets: Offset values for affinity computation used as target. 362 boundaries: Whether to compute boundaries as the target. 363 binary: Whether to use a binary segmentation target. 364 channels: The image channels to extract. Available channels are 365 'microtubules', 'protein', 'nuclei' or 'er'. 366 download: Whether to download the data if it is not present. 367 n_workers_preproc: The number of workers to use for preprocessing. 368 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 369 370 Returns: 371 The segmentation dataset. 
372 """ 373 get_hpa_segmentation_data(path, download, channels, n_workers_preproc) 374 375 kwargs, _ = util.add_instance_label_transform( 376 kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets 377 ) 378 kwargs = util.update_kwargs(kwargs, "ndim", 2) 379 kwargs = util.update_kwargs(kwargs, "with_channels", True) 380 381 paths = glob(os.path.join(path, split, "*.h5")) 382 raw_key = "raw" 383 label_key = "labels" 384 385 return torch_em.default_segmentation_dataset(paths, raw_key, paths, label_key, patch_shape, **kwargs) 386 387 388def get_hpa_segmentation_loader( 389 path: Union[os.PathLike, str], 390 split: str, 391 patch_shape: Tuple[int, int], 392 batch_size: int, 393 offsets: Optional[List[List[int]]] = None, 394 boundaries: bool = False, 395 binary: bool = False, 396 channels: Sequence[str] = ["microtubules", "protein", "nuclei", "er"], 397 download: bool = False, 398 n_workers_preproc: int = 8, 399 **kwargs 400) -> DataLoader: 401 """Get the HPA dataloader for segmenting cells in confocal microscopy. 402 403 Args: 404 path: Filepath to a folder where the downloaded data will be saved. 405 split: The split for the dataset. Available splits are 'train', 'val' or 'test'. 406 patch_shape: The patch shape to use for training. 407 batch_size: The batch size for training. 408 offsets: Offset values for affinity computation used as target. 409 boundaries: Whether to compute boundaries as the target. 410 binary: Whether to use a binary segmentation target. 411 channels: The image channels to extract. Available channels are 412 'microtubules', 'protein', 'nuclei' or 'er'. 413 download: Whether to download the data if it is not present. 414 n_workers_preproc: The number of workers to use for preprocessing. 415 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 416 417 Returns: 418 The DataLoader. 419 """ 420 ds_kwargs, loader_kwargs = util.split_kwargs( 421 torch_em.default_segmentation_dataset, **kwargs 422 ) 423 dataset = get_hpa_segmentation_dataset( 424 path, split, patch_shape, 425 offsets=offsets, boundaries=boundaries, binary=binary, 426 channels=channels, download=download, n_workers_preproc=n_workers_preproc, 427 **ds_kwargs 428 ) 429 loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs) 430 return loader
URLS = {'segmentation': 'https://zenodo.org/record/4665863/files/hpa_dataset_v2.zip'}

CHECKSUMS = {'segmentation': 'dcd6072293d88d49c71376d3d99f3f4f102e4ee83efb0187faa89c95ec49faa9'}
def get_hpa_segmentation_data(path: Union[os.PathLike, str], download: bool, channels: Sequence[str], n_workers_preproc: int = 8) -> str:
Download the HPA training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
- channels: The image channels to extract. Available channels are 'microtubules', 'protein', 'nuclei' or 'er'.
- n_workers_preproc: The number of workers to use for preprocessing.
Returns:
The filepath to the training data.
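For example, to download and preprocess all four channels up front (the folder ./hpa is an arbitrary choice):

from torch_em.data.datasets.light_microscopy.hpa import get_hpa_segmentation_data

# Downloads the zip from zenodo, extracts it and converts the images to h5.
root = get_hpa_segmentation_data(
    "./hpa", download=True,
    channels=["microtubules", "protein", "nuclei", "er"],
    n_workers_preproc=8,
)
# root now contains train/, val/ and test/ subfolders with h5 files.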
def get_hpa_segmentation_dataset(path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, channels: Sequence[str] = ['microtubules', 'protein', 'nuclei', 'er'], download: bool = False, n_workers_preproc: int = 8, **kwargs) -> torch.utils.data.dataset.Dataset:
Get the HPA dataset for segmenting cells in confocal microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split for the dataset. Available splits are 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- channels: The image channels to extract. Available channels are 'microtubules', 'protein', 'nuclei' or 'er'.
- download: Whether to download the data if it is not present.
- n_workers_preproc: The number of workers to use for preprocessing.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
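A sketch of constructing the dataset with boundary targets (path and patch shape are illustrative):

from torch_em.data.datasets.light_microscopy.hpa import get_hpa_segmentation_dataset

dataset = get_hpa_segmentation_dataset(
    "./hpa", split="train", patch_shape=(512, 512),
    boundaries=True, download=True,
)
x, y = dataset[0]  # x: one channel per requested stain, y: boundary target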
def get_hpa_segmentation_loader(path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, channels: Sequence[str] = ['microtubules', 'protein', 'nuclei', 'er'], download: bool = False, n_workers_preproc: int = 8, **kwargs) -> torch.utils.data.dataloader.DataLoader:
Get the HPA dataloader for segmenting cells in confocal microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split for the dataset. Available splits are 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- channels: The image channels to extract. Available channels are 'microtubules', 'protein', 'nuclei' or 'er'.
- download: Whether to download the data if it is not present.
- n_workers_preproc: The number of workers to use for preprocessing.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
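Since the kwargs are split between the dataset and the DataLoader, loader-specific arguments such as num_workers or shuffle can be passed directly; a sketch with illustrative values:

from torch_em.data.datasets.light_microscopy.hpa import get_hpa_segmentation_loader

# num_workers and shuffle are routed to the PyTorch DataLoader,
# the remaining keyword arguments to the underlying dataset.
loader = get_hpa_segmentation_loader(
    "./hpa", split="val", patch_shape=(512, 512), batch_size=1,
    binary=True, download=True, num_workers=2, shuffle=True,
)
x, y = next(iter(loader))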