torch_em.data.datasets.light_microscopy.arvidsson

This dataset contains annotations for nucleus segmentation in high-content fluorescence microscopy images.

The dataset is located at https://zenodo.org/records/6657260. This dataset is from the publication https://doi.org/10.1016/j.dib.2022.108769. Please cite it if you use this dataset in your research.

  1"""This dataset contains annotations for nucleus segmentation in
  2high-content fluorescence microscopy images.
  3
  4The dataset is located at https://zenodo.org/records/6657260.
  5This dataset is from the publication https://doi.org/10.1016/j.dib.2022.108769.
  6Please cite it if you use this dataset in your research.
  7"""
  8
  9import os
 10from glob import glob
 11from tqdm import tqdm
 12from natsort import natsorted
 13from typing import Union, Tuple, Literal, List
 14
 15import numpy as np
 16import imageio.v3 as imageio
 17from skimage.measure import label as connected_components
 18
 19import torch_em
 20
 21from torch.utils.data import Dataset, DataLoader
 22
 23from .. import util
 24
 25
# Download URLs of the zip archives on Zenodo (record 6657260), one per data split.
URLS = {
    "train": "https://zenodo.org/records/6657260/files/training_nuclei.zip",
    "val": "https://zenodo.org/records/6657260/files/development_nuclei.zip",
    "test": "https://zenodo.org/records/6657260/files/test_nuclei.zip",
}

# Checksums of the archives above, keyed by the same split names. They are passed to
# `util.download_source` together with the matching URL.
# NOTE(review): presumably SHA256 digests (64 hex characters) - confirm against `util.download_source`.
CHECKSUMS = {
    "train": "df075941f4e561f9ef82d4c48d22cf97e3627a0b63fa136675197614813fff90",
    "val": "722530a93fd5b67f61d52964651c715be6227c1c0508c4c95ef2b04b52fc1dd1",
    "test": "377dc719c4eaf9bfa30273f7e3a4042d98dbbfc4a1c4af2a467879237bff592f",
}
 37
 38
 39def get_arvidsson_data(
 40    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
 41) -> str:
 42    """Download the Arvidsson dataset.
 43
 44    Args:
 45        path: Filepath to a folder where the downloaded data will be saved.
 46        split: The data split to use. Either 'train', 'val' or 'test'.
 47        download: Whether to download the data if it is not present.
 48
 49    Returns:
 50        The filepath to the training data.
 51    """
 52    if split == "train":
 53        dname = "training_nuclei"
 54    elif split == "val":
 55        dname = "development_nuclei"
 56    elif split == "test":
 57        dname = "test_nuclei"
 58    else:
 59        raise ValueError(f"'{split}' is not a valid split.")
 60
 61    data_dir = os.path.join(path, dname)
 62    if os.path.exists(data_dir):
 63        return data_dir
 64
 65    os.makedirs(path, exist_ok=True)
 66
 67    zip_path = os.path.join(path, f"{dname}.zip")
 68    util.download_source(path=zip_path, url=URLS[split], download=download, checksum=CHECKSUMS[split])
 69    util.unzip(zip_path=os.path.join(path, f"{dname}.zip"), dst=path)
 70
 71    return data_dir
 72
 73
 74def get_arvidsson_paths(
 75    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False,
 76) -> Tuple[List[int], List[int]]:
 77    """Get paths to the Arvidsson data.
 78
 79    Args:
 80        path: Filepath to a folder where the downloaded data will be saved.
 81        split: The data split to use. Either 'train', 'val' or 'test'.
 82        download: Whether to download the data if it is not present.
 83
 84    Returns:
 85        List of filepaths for the image data.
 86        List of filepaths for the label data.
 87    """
 88    data_dir = get_arvidsson_data(path, split, download)
 89
 90    raw_paths = natsorted(glob(os.path.join(data_dir, "images", "*.png")))
 91    label_paths = natsorted(glob(os.path.join(data_dir, "annotations", "*_preprocessed.tif")))
 92    if len(raw_paths) == len(label_paths):
 93        return raw_paths, label_paths
 94
 95    channel_label_paths = natsorted(glob(os.path.join(data_dir, "annotations", "*.png")))
 96    instance_paths = []
 97    for rpath, lpath in tqdm(
 98        zip(raw_paths, channel_label_paths), desc=f"Preprocessing labels for '{split}' split", total=len(raw_paths)
 99    ):
100        instance_path = lpath.replace(".png", "_preprocessed.tif")
101        instance_paths.append(instance_path)
102        if os.path.exists(instance_path):
103            continue
104
105        raw = imageio.imread(rpath)
106        labels = imageio.imread(lpath)
107
108        # NOTE: Converting the RGB-style instance labels to single channel instance labels.
109        # We do not operate over the backgroun region (with known pixel values: [0, 0, 0])
110        background_mask = np.all(labels == [0, 0, 0], axis=-1)
111        _, indices = np.unique(labels[~background_mask].reshape(-1, 3), axis=0, return_inverse=True)
112
113        instances = np.zeros(labels.shape[:2], dtype=np.int32)
114        instances[~background_mask] = indices + 1
115        instances = connected_components(instances)
116
117        assert raw.shape == instances.shape
118
119        imageio.imwrite(instance_path, instances, compression="zlib")
120
121    return raw_paths, instance_paths
122
123
def get_arvidsson_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the Arvidsson dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Resolves the image / label file lists (downloading and preprocessing on demand).
    image_paths, instance_paths = get_arvidsson_paths(path, split, download)

    # The file lists are handed over without internal keys (`raw_key` / `label_key` are None).
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=instance_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset
154
155
def get_arvidsson_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the Arvidsson dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the remaining (loader) ones.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    arvidsson_dataset = get_arvidsson_dataset(path, patch_shape, split, download, **dataset_kwargs)
    loader = torch_em.get_data_loader(arvidsson_dataset, batch_size, **dataloader_kwargs)
    return loader
# Download URLs of the zip archives on Zenodo (record 6657260), one per data split.
URLS = {'train': 'https://zenodo.org/records/6657260/files/training_nuclei.zip', 'val': 'https://zenodo.org/records/6657260/files/development_nuclei.zip', 'test': 'https://zenodo.org/records/6657260/files/test_nuclei.zip'}
# Checksums of the archives above, keyed by the same split names.
CHECKSUMS = {'train': 'df075941f4e561f9ef82d4c48d22cf97e3627a0b63fa136675197614813fff90', 'val': '722530a93fd5b67f61d52964651c715be6227c1c0508c4c95ef2b04b52fc1dd1', 'test': '377dc719c4eaf9bfa30273f7e3a4042d98dbbfc4a1c4af2a467879237bff592f'}
def get_arvidsson_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str:
def get_arvidsson_data(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
) -> str:
    """Download and unpack one split of the Arvidsson dataset.

    If the extracted folder of the requested split already exists below `path`,
    it is returned directly and nothing is downloaded.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder that contains the data of the requested split.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Folder name expected below `path` after extraction; the archive is stored as '<name>.zip'.
    split_to_dirname = {"train": "training_nuclei", "val": "development_nuclei", "test": "test_nuclei"}
    try:
        dname = split_to_dirname[split]
    except (KeyError, TypeError):  # TypeError: unhashable value passed as split.
        raise ValueError(f"'{split}' is not a valid split.") from None

    data_dir = os.path.join(path, dname)
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, f"{dname}.zip")
    util.download_source(path=zip_path, url=URLS[split], download=download, checksum=CHECKSUMS[split])
    util.unzip(zip_path=zip_path, dst=path)

    return data_dir

Download the Arvidsson dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the folder that contains the data of the requested split.

def get_arvidsson_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[str], List[str]]:
 75def get_arvidsson_paths(
 76    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False,
 77) -> Tuple[List[int], List[int]]:
 78    """Get paths to the Arvidsson data.
 79
 80    Args:
 81        path: Filepath to a folder where the downloaded data will be saved.
 82        split: The data split to use. Either 'train', 'val' or 'test'.
 83        download: Whether to download the data if it is not present.
 84
 85    Returns:
 86        List of filepaths for the image data.
 87        List of filepaths for the label data.
 88    """
 89    data_dir = get_arvidsson_data(path, split, download)
 90
 91    raw_paths = natsorted(glob(os.path.join(data_dir, "images", "*.png")))
 92    label_paths = natsorted(glob(os.path.join(data_dir, "annotations", "*_preprocessed.tif")))
 93    if len(raw_paths) == len(label_paths):
 94        return raw_paths, label_paths
 95
 96    channel_label_paths = natsorted(glob(os.path.join(data_dir, "annotations", "*.png")))
 97    instance_paths = []
 98    for rpath, lpath in tqdm(
 99        zip(raw_paths, channel_label_paths), desc=f"Preprocessing labels for '{split}' split", total=len(raw_paths)
100    ):
101        instance_path = lpath.replace(".png", "_preprocessed.tif")
102        instance_paths.append(instance_path)
103        if os.path.exists(instance_path):
104            continue
105
106        raw = imageio.imread(rpath)
107        labels = imageio.imread(lpath)
108
109        # NOTE: Converting the RGB-style instance labels to single channel instance labels.
110        # We do not operate over the backgroun region (with known pixel values: [0, 0, 0])
111        background_mask = np.all(labels == [0, 0, 0], axis=-1)
112        _, indices = np.unique(labels[~background_mask].reshape(-1, 3), axis=0, return_inverse=True)
113
114        instances = np.zeros(labels.shape[:2], dtype=np.int32)
115        instances[~background_mask] = indices + 1
116        instances = connected_components(instances)
117
118        assert raw.shape == instances.shape
119
120        imageio.imwrite(instance_path, instances, compression="zlib")
121
122    return raw_paths, instance_paths

Get paths to the Arvidsson data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_arvidsson_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_arvidsson_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the Arvidsson dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Resolves the image / label file lists (downloading and preprocessing on demand).
    image_paths, instance_paths = get_arvidsson_paths(path, split, download)

    # The file lists are handed over without internal keys (`raw_key` / `label_key` are None).
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=instance_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset

Get the Arvidsson dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_arvidsson_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_arvidsson_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the Arvidsson dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the remaining (loader) ones.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    arvidsson_dataset = get_arvidsson_dataset(path, patch_shape, split, download, **dataset_kwargs)
    loader = torch_em.get_data_loader(arvidsson_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the Arvidsson dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.