torch_em.data.datasets.light_microscopy.plantseg

This dataset contains confocal and lightsheet microscopy images of plant cells with annotations for cell and nucleus segmentation.

The dataset is part of the publication https://doi.org/10.7554/eLife.57613. Please cite it if you use this dataset in your research.

  1"""This dataset contains confocal and lightsheet microscopy images of plant cells
  2with annotations for cell and nucleus segmentation.
  3
  4The dataset is part of the publication https://doi.org/10.7554/eLife.57613.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9from glob import glob
 10from typing import List, Optional, Tuple, Union
 11
 12import torch_em
 13from torch.utils.data import Dataset, DataLoader
 14from .. import util
 15
 16URLS = {
 17    "root": {
 18        "train": "https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=",
 19        "val": "https://files.de-1.osf.io/v1/resources/vs6gb/providers/osfstorage/?zip=",
 20        "test": "https://files.de-1.osf.io/v1/resources/tn4xj/providers/osfstorage/?zip=",
 21    },
 22    "nuclei": {
 23        "train": "https://files.de-1.osf.io/v1/resources/thxzn/providers/osfstorage/?zip=",
 24    },
 25    "ovules": {
 26        "train": "https://files.de-1.osf.io/v1/resources/x9yns/providers/osfstorage/?zip=",
 27        "val": "https://files.de-1.osf.io/v1/resources/xp5uf/providers/osfstorage/?zip=",
 28        "test": "https://files.de-1.osf.io/v1/resources/8jz7e/providers/osfstorage/?zip=",
 29    }
 30}
 31
 32# FIXME somehow the checksums are not reliably, this is a bit weird.
 33CHECKSUMS = {
 34    "root": {
 35        "train": None, "val": None, "test": None
 36        # "train": "f72e9525ff716ef14b70ab1318efd4bf303bbf9e0772bf2981a2db6e22a75794",
 37        # "val": "987280d9a56828c840e508422786431dcc3603e0ba4814aa06e7bf4424efcd9e",
 38        # "test": "ad71b8b9d20effba85fb5e1b42594ae35939d1a0cf905f3403789fc9e6afbc58",
 39    },
 40    "nuclei": {
 41        "train": None
 42        # "train": "9d19ddb61373e2a97effb6cf8bd8baae5f8a50f87024273070903ea8b1160396",
 43    },
 44    "ovules": {
 45        "train": None, "val": None, "test": None
 46        # "train": "70379673f1ab1866df6eb09d5ce11db7d3166d6d15b53a9c8b47376f04bae413",
 47        # "val": "872f516cb76879c30782d9a76d52df95236770a866f75365902c60c37b14fa36",
 48        # "test": "a7272f6ad1d765af6d121e20f436ac4f3609f1a90b1cb2346aa938d8c52800b9",
 49    }
 50}
 51# The resolution previous used for the resizing
 52# I have removed this feature since it was not reliable,
 53# but leaving this here for reference
 54# (also implementing resizing would be a good idea,
 55#  but more general and not for each dataset individually)
 56# NATIVE_RESOLUTION = (0.235, 0.075, 0.075)
 57
 58
 59def get_plantseg_data(path: Union[os.PathLike, str], download: bool, name: str, split: str) -> str:
 60    """Download the PlantSeg training data.
 61
 62    Args:
 63        path: Filepath to a folder where the downloaded data will be saved.
 64        download: Whether to download the data if it is not present.
 65        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
 66        split: The split to download. Either 'train', 'val' or 'test'.
 67
 68    Returns:
 69        The filepath to the training data.
 70    """
 71    url = URLS[name][split]
 72    checksum = CHECKSUMS[name][split]
 73    os.makedirs(path, exist_ok=True)
 74    out_path = os.path.join(path, f"{name}_{split}")
 75    if os.path.exists(out_path):
 76        return out_path
 77    tmp_path = os.path.join(path, f"{name}_{split}.zip")
 78    util.download_source(tmp_path, url, download, checksum)
 79    util.unzip(tmp_path, out_path, remove=True)
 80    return out_path
 81
 82
 83def get_plantseg_dataset(
 84    path: Union[os.PathLike, str],
 85    name: str,
 86    split: str,
 87    patch_shape: Tuple[int, int, int],
 88    download: bool = False,
 89    offsets: Optional[List[List[int]]] = None,
 90    boundaries: bool = False,
 91    binary: bool = False,
 92    **kwargs,
 93) -> Dataset:
 94    """Get the PlantSeg dataset for segmenting nuclei or cells.
 95
 96    Args:
 97        path: Filepath to a folder where the downloaded data will be saved.
 98        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
 99        split: The split to download. Either 'train', 'val' or 'test'.
100        patch_shape: The patch shape to use for training.
101        download: Whether to download the data if it is not present.
102        offsets: Offset values for affinity computation used as target.
103        boundaries: Whether to compute boundaries as the target.
104        binary: Whether to use a binary segmentation target.
105        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
106
107    Returns:
108       The segmentation dataset.
109    """
110    assert len(patch_shape) == 3
111    data_path = get_plantseg_data(path, download, name, split)
112
113    file_paths = glob(os.path.join(data_path, "*.h5"))
114    file_paths.sort()
115
116    kwargs, _ = util.add_instance_label_transform(
117        kwargs, add_binary_target=binary, binary=binary, boundaries=boundaries,
118        offsets=offsets, binary_is_exclusive=False
119    )
120
121    raw_key, label_key = "raw", "label"
122    return torch_em.default_segmentation_dataset(file_paths, raw_key, file_paths, label_key, patch_shape, **kwargs)
123
124
# TODO add support for ignore label, key: "/label_with_ignore"
def get_plantseg_loader(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the PlantSeg dataloader for segmenting nuclei or cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
       The DataLoader.
    """
    # Separate the dataset arguments from the loader arguments.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_plantseg_dataset(
        path, name, split, patch_shape, download=download,
        offsets=offsets, boundaries=boundaries, binary=binary, **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS = {'root': {'train': 'https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=', 'val': 'https://files.de-1.osf.io/v1/resources/vs6gb/providers/osfstorage/?zip=', 'test': 'https://files.de-1.osf.io/v1/resources/tn4xj/providers/osfstorage/?zip='}, 'nuclei': {'train': 'https://files.de-1.osf.io/v1/resources/thxzn/providers/osfstorage/?zip='}, 'ovules': {'train': 'https://files.de-1.osf.io/v1/resources/x9yns/providers/osfstorage/?zip=', 'val': 'https://files.de-1.osf.io/v1/resources/xp5uf/providers/osfstorage/?zip=', 'test': 'https://files.de-1.osf.io/v1/resources/8jz7e/providers/osfstorage/?zip='}}
CHECKSUMS = {'root': {'train': None, 'val': None, 'test': None}, 'nuclei': {'train': None}, 'ovules': {'train': None, 'val': None, 'test': None}}
def get_plantseg_data( path: Union[os.PathLike, str], download: bool, name: str, split: str) -> str:
60def get_plantseg_data(path: Union[os.PathLike, str], download: bool, name: str, split: str) -> str:
61    """Download the PlantSeg training data.
62
63    Args:
64        path: Filepath to a folder where the downloaded data will be saved.
65        download: Whether to download the data if it is not present.
66        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
67        split: The split to download. Either 'train', 'val' or 'test'.
68
69    Returns:
70        The filepath to the training data.
71    """
72    url = URLS[name][split]
73    checksum = CHECKSUMS[name][split]
74    os.makedirs(path, exist_ok=True)
75    out_path = os.path.join(path, f"{name}_{split}")
76    if os.path.exists(out_path):
77        return out_path
78    tmp_path = os.path.join(path, f"{name}_{split}.zip")
79    util.download_source(tmp_path, url, download, checksum)
80    util.unzip(tmp_path, out_path, remove=True)
81    return out_path

Download the PlantSeg training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
Returns:

The filepath to the training data.

def get_plantseg_dataset( path: Union[os.PathLike, str], name: str, split: str, patch_shape: Tuple[int, int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 84def get_plantseg_dataset(
 85    path: Union[os.PathLike, str],
 86    name: str,
 87    split: str,
 88    patch_shape: Tuple[int, int, int],
 89    download: bool = False,
 90    offsets: Optional[List[List[int]]] = None,
 91    boundaries: bool = False,
 92    binary: bool = False,
 93    **kwargs,
 94) -> Dataset:
 95    """Get the PlantSeg dataset for segmenting nuclei or cells.
 96
 97    Args:
 98        path: Filepath to a folder where the downloaded data will be saved.
 99        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
100        split: The split to download. Either 'train', 'val' or 'test'.
101        patch_shape: The patch shape to use for training.
102        download: Whether to download the data if it is not present.
103        offsets: Offset values for affinity computation used as target.
104        boundaries: Whether to compute boundaries as the target.
105        binary: Whether to use a binary segmentation target.
106        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
107
108    Returns:
109       The segmentation dataset.
110    """
111    assert len(patch_shape) == 3
112    data_path = get_plantseg_data(path, download, name, split)
113
114    file_paths = glob(os.path.join(data_path, "*.h5"))
115    file_paths.sort()
116
117    kwargs, _ = util.add_instance_label_transform(
118        kwargs, add_binary_target=binary, binary=binary, boundaries=boundaries,
119        offsets=offsets, binary_is_exclusive=False
120    )
121
122    raw_key, label_key = "raw", "label"
123    return torch_em.default_segmentation_dataset(file_paths, raw_key, file_paths, label_key, patch_shape, **kwargs)

Get the PlantSeg dataset for segmenting nuclei or cells.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_plantseg_loader( path: Union[os.PathLike, str], name: str, split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
127def get_plantseg_loader(
128    path: Union[os.PathLike, str],
129    name: str,
130    split: str,
131    patch_shape: Tuple[int, int, int],
132    batch_size: int,
133    download: bool = False,
134    offsets: Optional[List[List[int]]] = None,
135    boundaries: bool = False,
136    binary: bool = False,
137    **kwargs,
138) -> DataLoader:
139    """Get the PlantSeg dataloader for segmenting nuclei or cells.
140
141    Args:
142        path: Filepath to a folder where the downloaded data will be saved.
143        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
144        split: The split to download. Either 'train', 'val' or 'test'.
145        patch_shape: The patch shape to use for training.
146        batch_size: The batch size for training.
147        download: Whether to download the data if it is not present.
148        offsets: Offset values for affinity computation used as target.
149        boundaries: Whether to compute boundaries as the target.
150        binary: Whether to use a binary segmentation target.
151        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
152
153    Returns:
154       The DataLoader.
155    """
156    ds_kwargs, loader_kwargs = util.split_kwargs(
157        torch_em.default_segmentation_dataset, **kwargs
158    )
159    dataset = get_plantseg_dataset(
160        path, name, split, patch_shape,
161        download=download, offsets=offsets, boundaries=boundaries, binary=binary,
162        **ds_kwargs
163    )
164    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
165    return loader

Get the PlantSeg dataloader for segmenting nuclei or cells.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.