torch_em.data.datasets.light_microscopy.hpa

This dataset was part of the HPA Kaggle challenge for protein identification. It contains confocal microscopy images and annotations for cell segmentation.

The dataset is described in the publication https://doi.org/10.1038/s41592-019-0658-6. Please cite it if you use this dataset in your research.
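
A minimal usage sketch (the "./hpa" target folder, the patch shape and the batch size are illustrative choices, not prescribed by the module):

from torch_em.data.datasets.light_microscopy.hpa import get_hpa_segmentation_loader

# Downloads and preprocesses the data on first use.
loader = get_hpa_segmentation_loader(
    "./hpa", split="train", patch_shape=(512, 512), batch_size=1, download=True
)
x, y = next(iter(loader))
print(x.shape, y.shape)  # batch of raw channels and the segmentation target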

  1"""This dataset was part of the HPA Kaggle challenge for protein identification.
  2It contains confocal microscopy images and annotations for cell segmentation.
  3
  4The dataset is described in the publication https://doi.org/10.1038/s41592-019-0658-6.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9import json
 10import shutil
 11from concurrent import futures
 12from functools import partial
 13from glob import glob
 14from typing import List, Optional, Sequence, Tuple, Union
 15
 16import imageio
 17import h5py
 18import numpy as np
 19import torch_em
 20from PIL import Image, ImageDraw
 21from skimage import draw as skimage_draw
 22from skimage import morphology
 23from tqdm import tqdm
 24from torch.utils.data import Dataset, DataLoader
 25
 26from .. import util
 27
 28
 29URLS = {
 30    "segmentation": "https://zenodo.org/record/4665863/files/hpa_dataset_v2.zip"
 31}
 32CHECKSUMS = {
 33    "segmentation": "dcd6072293d88d49c71376d3d99f3f4f102e4ee83efb0187faa89c95ec49faa9"
 34}
 35
 36
 37def _download_hpa_data(path, name, download):
 38    os.makedirs(path, exist_ok=True)
 39    url = URLS[name]
 40    checksum = CHECKSUMS[name]
 41    zip_path = os.path.join(path, "data.zip")
 42    util.download_source(zip_path, url, download=download, checksum=checksum)
 43    util.unzip(zip_path, path, remove=True)
 44
 45
 46def _load_features(features):
 47    # Loop over list and create simple dictionary & get size of annotations
 48    annot_dict = {}
 49    skipped = []
 50
 51    for feat_idx, feat in enumerate(features):
 52        if feat["geometry"]["type"] not in ["Polygon", "LineString"]:
 53            skipped.append(feat["geometry"]["type"])
 54            continue
 55
 56        # skip empty roi
 57        if len(feat["geometry"]["coordinates"][0]) <= 0:
 58            continue
 59
 60        key_annot = "annot_" + str(feat_idx)
 61        annot_dict[key_annot] = {}
 62        annot_dict[key_annot]["type"] = feat["geometry"]["type"]
 63        annot_dict[key_annot]["pos"] = np.squeeze(
 64            np.asarray(feat["geometry"]["coordinates"])
 65        )
 66        annot_dict[key_annot]["properties"] = feat["properties"]
 67
 68    # print("Skipped geometry type(s):", skipped)
 69    return annot_dict
 70
 71
 72def _generate_binary_masks(annot_dict, shape, erose_size=5, obj_size_rem=500, save_indiv=False):
 73    # Get dimensions of image and created masks of same size
 74    # This we need to save somewhere (e.g. as part of the geojson file?)
 75
 76    # Filled masks and edge mask for polygons
 77    mask_fill = np.zeros(shape, dtype=np.uint8)
 78    mask_edge = np.zeros(shape, dtype=np.uint8)
 79    mask_labels = np.zeros(shape, dtype=np.uint16)
 80
 81    rr_all = []
 82    cc_all = []
 83
 84    if save_indiv is True:
 85        mask_edge_indiv = np.zeros(
 86            (shape[0], shape[1], len(annot_dict)), dtype="bool"
 87        )
 88        mask_fill_indiv = np.zeros(
 89            (shape[0], shape[1], len(annot_dict)), dtype="bool"
 90        )
 91
 92    # Image used to draw lines - for edge mask for freelines
 93    im_freeline = Image.new("1", (shape[1], shape[0]), color=0)
 94    draw = ImageDraw.Draw(im_freeline)
 95
 96    # Loop over all roi
 97    i_roi = 0
 98    for roi_key, roi in annot_dict.items():
 99        roi_pos = roi["pos"]
100
101        # Check region type
102
103        # freeline - line
104        if roi["type"] == "freeline" or roi["type"] == "LineString":
105
106            # Loop over all pairs of points to draw the line
107
108            for ind in range(roi_pos.shape[0] - 1):
109                line_pos = (
110                    roi_pos[ind, 1],
111                    roi_pos[ind, 0],
112                    roi_pos[ind + 1, 1],
113                    roi_pos[ind + 1, 0],
114                )
115                draw.line(line_pos, fill=1, width=erose_size)
116
117        # freehand - polygon
118        elif (
119            roi["type"] == "freehand"
120            or roi["type"] == "polygon"
121            or roi["type"] == "polyline"
122            or roi["type"] == "Polygon"
123        ):
124
125            # Draw polygon
126            rr, cc = skimage_draw.polygon(
127                [shape[0] - r for r in roi_pos[:, 1]], roi_pos[:, 0]
128            )
129
130            # Make sure it's not outside
131            rr[rr < 0] = 0
132            rr[rr > shape[0] - 1] = shape[0] - 1
133
134            cc[cc < 0] = 0
135            cc[cc > shape[1] - 1] = shape[1] - 1
136
137            # Test if this region has already been added
138            if any(np.array_equal(rr, rr_test) for rr_test in rr_all) and any(
139                np.array_equal(cc, cc_test) for cc_test in cc_all
140            ):
141                # print('Region #{} has already been used'.format(i +
142                # 1))
143                continue
144
145            rr_all.append(rr)
146            cc_all.append(cc)
147
148            # Generate mask
149            mask_fill_roi = np.zeros(shape, dtype=np.uint8)
150            mask_fill_roi[rr, cc] = 1
151
152            # Erode to get cell edge - both arrays are boolean to be used as
153            # index arrays later
154            mask_fill_roi_erode = morphology.binary_erosion(
155                mask_fill_roi, np.ones((erose_size, erose_size))
156            )
157            mask_edge_roi = (
158                mask_fill_roi.astype("int") - mask_fill_roi_erode.astype("int")
159            ).astype("bool")
160
161            # Save array for mask and edge
162            mask_fill[mask_fill_roi > 0] = 1
163            mask_edge[mask_edge_roi] = 1
164            mask_labels[mask_fill_roi > 0] = i_roi + 1
165
166            if save_indiv is True:
167                mask_edge_indiv[:, :, i_roi] = mask_edge_roi.astype("bool")
168                mask_fill_indiv[:, :, i_roi] = mask_fill_roi_erode.astype("bool")
169
170            i_roi = i_roi + 1
171
172        else:
173            roi_type = roi["type"]
174            raise NotImplementedError(
175                f'Mask for roi type "{roi_type}" can not be created'
176            )
177
178    del draw
179
180    # Convert mask from free-lines to numpy array
181    mask_edge_freeline = np.asarray(im_freeline)
182    mask_edge_freeline = mask_edge_freeline.astype("bool")
183
184    # Post-processing of fill and edge mask - if defined
185    mask_dict = {}
186    if np.any(mask_fill):
187
188        # (1) remove edges , (2) remove small  objects
189        mask_fill = mask_fill & ~mask_edge
190        mask_fill = morphology.remove_small_objects(
191            mask_fill.astype("bool"), obj_size_rem
192        )
193
194        # For edge - consider also freeline edge mask
195
196        mask_edge = mask_edge.astype("bool")
197        mask_edge = np.logical_or(mask_edge, mask_edge_freeline)
198
199        # Assign to dictionary for return
200        mask_dict["edge"] = mask_edge
201        mask_dict["fill"] = mask_fill.astype("bool")
202        mask_dict["labels"] = mask_labels.astype("uint16")
203
204        if save_indiv is True:
205            mask_dict["edge_indiv"] = mask_edge_indiv
206            mask_dict["fill_indiv"] = mask_fill_indiv
207        else:
208            mask_dict["edge_indiv"] = np.zeros(shape + (1,), dtype=np.uint8)
209            mask_dict["fill_indiv"] = np.zeros(shape + (1,), dtype=np.uint8)
210
211    # Only edge mask present
212    elif np.any(mask_edge_freeline):
213        mask_dict["edge"] = mask_edge_freeline
214        mask_dict["fill"] = mask_fill.astype("bool")
215        mask_dict["labels"] = mask_labels.astype("uint16")
216
217        mask_dict["edge_indiv"] = np.zeros(shape + (1,), dtype=np.uint8)
218        mask_dict["fill_indiv"] = np.zeros(shape + (1,), dtype=np.uint8)
219
220    else:
221        raise Exception("No mask has been created.")
222
223    return mask_dict
224
225
226# adapted from
227# https://github.com/imjoy-team/kaibu-utils/blob/main/kaibu_utils/__init__.py#L267
228def _get_labels(annotation_file, shape, label="*"):
229    with open(annotation_file) as f:
230        features = json.load(f)["features"]
231    if len(features) == 0:
232        return np.zeros(shape, dtype="uint16")
233
234    annot_dict_all = _load_features(features)
235    annot_types = set(
236        annot_dict_all[k]["properties"].get("label", "default")
237        for k in annot_dict_all.keys()
238    )
239    for annot_type in annot_types:
240        if label and label != "*" and annot_type != label:
241            continue
242        # print("annot_type: ", annot_type)
243        # Filter the annotations by label
244        annot_dict = {
245            k: annot_dict_all[k]
246            for k in annot_dict_all.keys()
247            if label == "*"
248            or annot_dict_all[k]["properties"].get("label", "default") == annot_type
249        }
250        mask_dict = _generate_binary_masks(
251            annot_dict, shape,
252            erose_size=5,
253            obj_size_rem=500,
254            save_indiv=True,
255        )
256        mask = mask_dict["labels"]
257        return mask
258    raise RuntimeError
259
260
261def _process_image(in_folder, out_path, channels, with_labels):
262    # TODO double check the default order and color matching
263    # correspondence to the HPA kaggle data:
264    # microtubules: red
265    # nuclei: blue
266    # er: yellow
267    # protein: green
268    # default order: rgby = micro, prot, nuclei, er
269    all_channels = {"microtubules", "protein", "nuclei", "er"}
270    assert len(list(set(channels) - all_channels)) == 0
271    raw = []
272    for chan in channels:
273        im_path = os.path.join(in_folder, f"{chan}.png")
274        assert os.path.exists(im_path), im_path
275        raw.append(imageio.imread(im_path)[None])
276    raw = np.concatenate(raw, axis=0)
277
278    if with_labels:
279        annotation_file = os.path.join(in_folder, "annotation.json")
280        assert os.path.exists(annotation_file), annotation_file
281        labels = _get_labels(annotation_file, raw.shape[1:])
282        assert labels.shape == raw.shape[1:]
283
284    with h5py.File(out_path, "w") as f:
285        f.create_dataset("raw", data=raw, compression="gzip")
286        if with_labels:
287            f.create_dataset("labels", data=labels, compression="gzip")
288
289
290def _process_split(root_in, root_out, channels, n_workers, with_labels):
291    os.makedirs(root_out, exist_ok=True)
292    inputs = glob(os.path.join(root_in, "*"))
293    outputs = [os.path.join(root_out, f"{os.path.split(inp)[1]}.h5") for inp in inputs]
294    process = partial(_process_image, channels=channels, with_labels=with_labels)
295    with futures.ProcessPoolExecutor(n_workers) as pp:
296        list(tqdm(pp.map(process, inputs, outputs), total=len(inputs), desc=f"Process data in {root_in}"))
297
298
299# save data as h5 with 4 channel raw data and labels extracted from the geo json
300def _process_hpa_data(path, channels, n_workers, remove):
301    in_path = os.path.join(path, "hpa_dataset_v2")
302    assert os.path.exists(in_path), in_path
303    for split in ("train", "test", "valid"):
304        out_split = "val" if split == "valid" else split
305        _process_split(os.path.join(in_path, split), os.path.join(path, out_split),
306                       channels=channels, n_workers=n_workers, with_labels=split != "test")
307    if remove:
308        shutil.rmtree(in_path)
309
310
311def _check_data(path):
312    have_train = len(glob(os.path.join(path, "train", "*.h5"))) == 257
313    have_test = len(glob(os.path.join(path, "test", "*.h5"))) == 10
314    have_val = len(glob(os.path.join(path, "val", "*.h5"))) == 9
315    return have_train and have_test and have_val
316
317
318def get_hpa_segmentation_data(
319    path: Union[os.PathLike, str],
320    download: bool,
321    channels: Sequence[str],
322    n_workers_preproc: int = 8
323) -> str:
324    """Download the HPA training data.
325
326    Args:
327        path: Filepath to a folder where the downloaded data will be saved.
328        download: Whether to download the data if it is not present.
329        channels: The image channels to extract. Available channels are
330            'microtubules', 'protein', 'nuclei' or 'er'.
331        n_workers_preproc: The number of workers to use for preprocessing.
332
333    Returns:
334        The filepath to the training data.
335    """
336    data_is_complete = _check_data(path)
337    if not data_is_complete:
338        _download_hpa_data(path, "segmentation", download)
339        _process_hpa_data(path, channels, n_workers_preproc, remove=True)
340    return path
341
342
343def get_hpa_segmentation_dataset(
344    path: Union[os.PathLike, str],
345    split: str,
346    patch_shape: Tuple[int, int],
347    offsets: Optional[List[List[int]]] = None,
348    boundaries: bool = False,
349    binary: bool = False,
350    channels: Sequence[str] = ["microtubules", "protein", "nuclei", "er"],
351    download: bool = False,
352    n_workers_preproc: int = 8,
353    **kwargs
354) -> Dataset:
355    """Get the HPA dataset for segmenting cells in confocal microscopy.
356
357    Args:
358        path: Filepath to a folder where the downloaded data will be saved.
359        split: The split for the dataset. Available splits are 'train', 'val' or 'test'.
360        patch_shape: The patch shape to use for training.
361        offsets: Offset values for affinity computation used as target.
362        boundaries: Whether to compute boundaries as the target.
363        binary: Whether to use a binary segmentation target.
364        channels: The image channels to extract. Available channels are
365            'microtubules', 'protein', 'nuclei' or 'er'.
366        download: Whether to download the data if it is not present.
367        n_workers_preproc: The number of workers to use for preprocessing.
368        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
369
370    Returns:
371       The segmentation dataset.
372    """
373    get_hpa_segmentation_data(path, download, channels, n_workers_preproc)
374
375    kwargs, _ = util.add_instance_label_transform(
376        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
377    )
378    kwargs = util.update_kwargs(kwargs, "ndim", 2)
379    kwargs = util.update_kwargs(kwargs, "with_channels", True)
380
381    paths = glob(os.path.join(path, split, "*.h5"))
382    raw_key = "raw"
383    label_key = "labels"
384
385    return torch_em.default_segmentation_dataset(paths, raw_key, paths, label_key, patch_shape, **kwargs)
386
387
388def get_hpa_segmentation_loader(
389    path: Union[os.PathLike, str],
390    split: str,
391    patch_shape: Tuple[int, int],
392    batch_size: int,
393    offsets: Optional[List[List[int]]] = None,
394    boundaries: bool = False,
395    binary: bool = False,
396    channels: Sequence[str] = ["microtubules", "protein", "nuclei", "er"],
397    download: bool = False,
398    n_workers_preproc: int = 8,
399    **kwargs
400) -> DataLoader:
401    """Get the HPA dataloader for segmenting cells in confocal microscopy.
402
403    Args:
404        path: Filepath to a folder where the downloaded data will be saved.
405        split: The split for the dataset. Available splits are 'train', 'val' or 'test'.
406        patch_shape: The patch shape to use for training.
407        batch_size: The batch size for training.
408        offsets: Offset values for affinity computation used as target.
409        boundaries: Whether to compute boundaries as the target.
410        binary: Whether to use a binary segmentation target.
411        channels: The image channels to extract. Available channels are
412            'microtubules', 'protein', 'nuclei' or 'er'.
413        download: Whether to download the data if it is not present.
414        n_workers_preproc: The number of workers to use for preprocessing.
415        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
416
417    Returns:
418       The DataLoader.
419    """
420    ds_kwargs, loader_kwargs = util.split_kwargs(
421        torch_em.default_segmentation_dataset, **kwargs
422    )
423    dataset = get_hpa_segmentation_dataset(
424        path, split, patch_shape,
425        offsets=offsets, boundaries=boundaries, binary=binary,
426        channels=channels, download=download, n_workers_preproc=n_workers_preproc,
427        **ds_kwargs
428    )
429    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
430    return loader
URLS = {'segmentation': 'https://zenodo.org/record/4665863/files/hpa_dataset_v2.zip'}
CHECKSUMS = {'segmentation': 'dcd6072293d88d49c71376d3d99f3f4f102e4ee83efb0187faa89c95ec49faa9'}
def get_hpa_segmentation_data(path: Union[os.PathLike, str], download: bool, channels: Sequence[str], n_workers_preproc: int = 8) -> str:

Download the HPA training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
  • channels: The image channels to extract. Available channels are 'microtubules', 'protein', 'nuclei' or 'er'.
  • n_workers_preproc: The number of workers to use for preprocessing.
Returns:
  The filepath to the training data.
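
A short sketch of downloading and preprocessing the data directly (the "./hpa" folder is an assumption; the channel list shown is the full set):

data_root = get_hpa_segmentation_data(
    "./hpa", download=True, channels=["microtubules", "protein", "nuclei", "er"]
)
# data_root now contains train/, val/ and test/ with one h5 file per image;
# the train and val files hold "raw" and "labels", the test files only "raw".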

def get_hpa_segmentation_dataset(path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, channels: Sequence[str] = ['microtubules', 'protein', 'nuclei', 'er'], download: bool = False, n_workers_preproc: int = 8, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the HPA dataset for segmenting cells in confocal microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split for the dataset. Available splits are 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • channels: The image channels to extract. Available channels are 'microtubules', 'protein', 'nuclei' or 'er'.
  • download: Whether to download the data if it is not present.
  • n_workers_preproc: The number of workers to use for preprocessing.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  The segmentation dataset.
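
For example, to build a training dataset with boundary targets (the folder and patch shape are illustrative):

dataset = get_hpa_segmentation_dataset(
    "./hpa", split="train", patch_shape=(512, 512),
    boundaries=True, download=True,
)
x, y = dataset[0]  # raw channels and the boundary target for one patch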

def get_hpa_segmentation_loader(path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, channels: Sequence[str] = ['microtubules', 'protein', 'nuclei', 'er'], download: bool = False, n_workers_preproc: int = 8, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the HPA dataloader for segmenting cells in confocal microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split for the dataset. Available splits are 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • channels: The image channels to extract. Available channels are 'microtubules', 'protein', 'nuclei' or 'er'.
  • download: Whether to download the data if it is not present.
  • n_workers_preproc: The number of workers to use for preprocessing.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  The DataLoader.
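
A sketch that also forwards DataLoader arguments; as the docstring above describes, keyword arguments that `util.split_kwargs` does not match to `torch_em.default_segmentation_dataset` are passed on to the PyTorch DataLoader:

loader = get_hpa_segmentation_loader(
    "./hpa", split="val", patch_shape=(512, 512), batch_size=2,
    binary=True, download=True,
    num_workers=4, shuffle=True,  # not dataset arguments, so they go to the DataLoader
)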