torch_em.data.datasets.light_microscopy.plantseg

This dataset contains confocal and lightsheet microscopy images of plant cells with annotations for cell and nucleus segmentation.

The dataset is part of the publication https://doi.org/10.7554/eLife.57613. Please cite it if you use this dataset in your research.

"""This dataset contains confocal and lightsheet microscopy images of plant cells
with annotations for cell and nucleus segmentation.

The dataset is part of the publication https://doi.org/10.7554/eLife.57613.
Please cite it if you use this dataset in your research.
"""
  7
  8import os
  9from glob import glob
 10from tqdm import tqdm
 11from typing import List, Optional, Tuple, Union
 12
 13from torch.utils.data import Dataset, DataLoader
 14
 15import torch_em
 16
 17from .. import util
 18
 19
# OSF download URLs per dataset variant ('root', 'nuclei', 'ovules') and split.
# Note: the 'nuclei' data only provides a training split.
URLS = {
    "root": {
        "train": "https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=",
        "val": "https://files.de-1.osf.io/v1/resources/vs6gb/providers/osfstorage/?zip=",
        "test": "https://files.de-1.osf.io/v1/resources/tn4xj/providers/osfstorage/?zip=",
    },
    "nuclei": {
        "train": "https://files.de-1.osf.io/v1/resources/thxzn/providers/osfstorage/?zip=",
    },
    "ovules": {
        "train": "https://files.de-1.osf.io/v1/resources/x9yns/providers/osfstorage/?zip=",
        "val": "https://files.de-1.osf.io/v1/resources/xp5uf/providers/osfstorage/?zip=",
        "test": "https://files.de-1.osf.io/v1/resources/8jz7e/providers/osfstorage/?zip=",
    }
}
 35
# FIXME: the checksums do not verify reliably for these downloads, so they are
# disabled (set to None). The previously recorded values are kept below for reference.
CHECKSUMS = {
    "root": {
        "train": None, "val": None, "test": None
        # "train": "f72e9525ff716ef14b70ab1318efd4bf303bbf9e0772bf2981a2db6e22a75794",
        # "val": "987280d9a56828c840e508422786431dcc3603e0ba4814aa06e7bf4424efcd9e",
        # "test": "ad71b8b9d20effba85fb5e1b42594ae35939d1a0cf905f3403789fc9e6afbc58",
    },
    "nuclei": {
        "train": None
        # "train": "9d19ddb61373e2a97effb6cf8bd8baae5f8a50f87024273070903ea8b1160396",
    },
    "ovules": {
        "train": None, "val": None, "test": None
        # "train": "70379673f1ab1866df6eb09d5ce11db7d3166d6d15b53a9c8b47376f04bae413",
        # "val": "872f516cb76879c30782d9a76d52df95236770a866f75365902c60c37b14fa36",
        # "test": "a7272f6ad1d765af6d121e20f436ac4f3609f1a90b1cb2346aa938d8c52800b9",
    }
}
 55
# Volumes with known inconsistent sections, mapped to the slice that is applied
# along the first axis of both 'raw' and 'label' to crop them
# (see _fix_inconsistent_volumes).
CROPPING_VOLUMES = {
    # root (train)
    "Movie2_T00006_crop_gt.h5": slice(4, None),
    "Movie2_T00008_crop_gt.h5": slice(None, -18),
    "Movie2_T00010_crop_gt.h5": slice(None, -32),
    "Movie2_T00012_crop_gt.h5": slice(None, -39),
    "Movie2_T00014_crop_gt.h5": slice(None, -40),
    "Movie2_T00016_crop_gt.h5": slice(None, -42),
    # root (test)
    "Movie2_T00020_crop_gt.h5": slice(None, -50),
    # ovules (train)
    "N_487_ds2x.h5": slice(17, None),
    "N_535_ds2x.h5": slice(None, -1),
    "N_534_ds2x.h5": slice(None, -1),
    "N_451_ds2x.h5": slice(None, -1),
    "N_425_ds2x.h5": slice(None, -1),
    # ovules (val)
    "N_420_ds2x.h5": slice(None, -1),
}
 75
# The resolution previously used for the resizing.
 77# I have removed this feature since it was not reliable,
 78# but leaving this here for reference
 79# (also implementing resizing would be a good idea,
 80#  but more general and not for each dataset individually)
 81# NATIVE_RESOLUTION = (0.235, 0.075, 0.075)
 82
 83
 84def _fix_inconsistent_volumes(data_path, name, split):
 85    import h5py
 86
 87    file_paths = glob(os.path.join(data_path, "*.h5"))
 88    if name not in ["root", "ovules"] and split not in ["train", "val"]:
 89        return
 90
 91    for vol_path in tqdm(file_paths, desc="Fixing inconsistencies in volumes"):
 92        fname = os.path.basename(vol_path)
 93
 94        # avoid duplicated volumes in 'train' and 'test'.
 95        if fname == "Movie1_t00045_crop_gt.h5" and (name == "root" and split == "train"):
 96            os.remove(vol_path)
 97            continue
 98
 99        if fname not in CROPPING_VOLUMES:
100            continue
101
102        with h5py.File(vol_path, "r+") as f:
103            raw, labels = f["raw"], f["label"]
104
105            crop_slices = CROPPING_VOLUMES[fname]
106            resized_raw, resized_labels = raw[:][crop_slices], labels[:][crop_slices]
107
108            cropped_shape = resized_raw.shape
109            raw.resize(cropped_shape)
110            labels.resize(cropped_shape)
111
112            raw[...] = resized_raw
113            labels[...] = resized_labels
114
115
def get_plantseg_data(path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
    """Download the PlantSeg training data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the training data.
    """
    url, checksum = URLS[name][split], CHECKSUMS[name][split]
    os.makedirs(path, exist_ok=True)

    data_dir = os.path.join(path, f"{name}_{split}")
    if not os.path.exists(data_dir):
        # Download the zip, unpack it (removing the archive) and fix known
        # inconsistencies in the unpacked volumes.
        zip_path = os.path.join(path, f"{name}_{split}.zip")
        util.download_source(zip_path, url, download, checksum)
        util.unzip(zip_path, data_dir, remove=True)
        _fix_inconsistent_volumes(data_dir, name, split)

    return data_dir
139
140
def get_plantseg_paths(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    download: bool = False
) -> List[str]:
    """Get paths to the PlantSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the data.
    """
    # Ensure the data is present, then collect the h5 volumes in sorted order.
    data_dir = get_plantseg_data(path, name, split, download)
    return sorted(glob(os.path.join(data_dir, "*.h5")))
161
162
def get_plantseg_dataset(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the PlantSeg dataset for segmenting nuclei or cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    # The data is volumetric, so a 3d patch shape is required.
    assert len(patch_shape) == 3

    volume_paths = get_plantseg_paths(path, name, split, download)

    # Translate the label target options into the corresponding label transform.
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=binary, binary=binary,
        boundaries=boundaries, offsets=offsets, binary_is_exclusive=False,
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="label",
        patch_shape=patch_shape,
        **kwargs,
    )
207
208
# TODO add support for ignore label, key: "/label_with_ignore"
def get_plantseg_loader(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the PlantSeg dataloader for segmenting nuclei or cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
       The DataLoader.
    """
    # Separate dataset kwargs from DataLoader kwargs before constructing either.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_plantseg_dataset(
        path, name, split, patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **ds_kwargs,
    )

    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS = {'root': {'train': 'https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=', 'val': 'https://files.de-1.osf.io/v1/resources/vs6gb/providers/osfstorage/?zip=', 'test': 'https://files.de-1.osf.io/v1/resources/tn4xj/providers/osfstorage/?zip='}, 'nuclei': {'train': 'https://files.de-1.osf.io/v1/resources/thxzn/providers/osfstorage/?zip='}, 'ovules': {'train': 'https://files.de-1.osf.io/v1/resources/x9yns/providers/osfstorage/?zip=', 'val': 'https://files.de-1.osf.io/v1/resources/xp5uf/providers/osfstorage/?zip=', 'test': 'https://files.de-1.osf.io/v1/resources/8jz7e/providers/osfstorage/?zip='}}
CHECKSUMS = {'root': {'train': None, 'val': None, 'test': None}, 'nuclei': {'train': None}, 'ovules': {'train': None, 'val': None, 'test': None}}
CROPPING_VOLUMES = {'Movie2_T00006_crop_gt.h5': slice(4, None, None), 'Movie2_T00008_crop_gt.h5': slice(None, -18, None), 'Movie2_T00010_crop_gt.h5': slice(None, -32, None), 'Movie2_T00012_crop_gt.h5': slice(None, -39, None), 'Movie2_T00014_crop_gt.h5': slice(None, -40, None), 'Movie2_T00016_crop_gt.h5': slice(None, -42, None), 'Movie2_T00020_crop_gt.h5': slice(None, -50, None), 'N_487_ds2x.h5': slice(17, None, None), 'N_535_ds2x.h5': slice(None, -1, None), 'N_534_ds2x.h5': slice(None, -1, None), 'N_451_ds2x.h5': slice(None, -1, None), 'N_425_ds2x.h5': slice(None, -1, None), 'N_420_ds2x.h5': slice(None, -1, None)}
def get_plantseg_data( path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
117def get_plantseg_data(path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
118    """Download the PlantSeg training data.
119
120    Args:
121        path: Filepath to a folder where the downloaded data will be saved.
122        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
123        split: The split to download. Either 'train', 'val' or 'test'.
124        download: Whether to download the data if it is not present.
125
126    Returns:
127        The filepath to the training data.
128    """
129    url = URLS[name][split]
130    checksum = CHECKSUMS[name][split]
131    os.makedirs(path, exist_ok=True)
132    out_path = os.path.join(path, f"{name}_{split}")
133    if os.path.exists(out_path):
134        return out_path
135    tmp_path = os.path.join(path, f"{name}_{split}.zip")
136    util.download_source(tmp_path, url, download, checksum)
137    util.unzip(tmp_path, out_path, remove=True)
138    _fix_inconsistent_volumes(out_path, name, split)
139    return out_path

Download the PlantSeg training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the training data.

def get_plantseg_paths( path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> List[str]:
142def get_plantseg_paths(
143    path: Union[os.PathLike, str],
144    name: str,
145    split: str,
146    download: bool = False
147) -> List[str]:
148    """Get paths to the PlantSeg data.
149
150    Args:
151        path: Filepath to a folder where the downloaded data will be saved.
152        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
153        split: The split to download. Either 'train', 'val' or 'test'.
154        download: Whether to download the data if it is not present.
155
156    Returns:
157        List of filepaths for the data.
158    """
159    data_path = get_plantseg_data(path, name, split, download)
160    file_paths = sorted(glob(os.path.join(data_path, "*.h5")))
161    return file_paths

Get paths to the PlantSeg data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the data.

def get_plantseg_dataset( path: Union[os.PathLike, str], name: str, split: str, patch_shape: Tuple[int, int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
164def get_plantseg_dataset(
165    path: Union[os.PathLike, str],
166    name: str,
167    split: str,
168    patch_shape: Tuple[int, int, int],
169    download: bool = False,
170    offsets: Optional[List[List[int]]] = None,
171    boundaries: bool = False,
172    binary: bool = False,
173    **kwargs,
174) -> Dataset:
175    """Get the PlantSeg dataset for segmenting nuclei or cells.
176
177    Args:
178        path: Filepath to a folder where the downloaded data will be saved.
179        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
180        split: The split to download. Either 'train', 'val' or 'test'.
181        patch_shape: The patch shape to use for training.
182        download: Whether to download the data if it is not present.
183        offsets: Offset values for affinity computation used as target.
184        boundaries: Whether to compute boundaries as the target.
185        binary: Whether to use a binary segmentation target.
186        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
187
188    Returns:
189       The segmentation dataset.
190    """
191    assert len(patch_shape) == 3
192
193    file_paths = get_plantseg_paths(path, name, split, download)
194
195    kwargs, _ = util.add_instance_label_transform(
196        kwargs, add_binary_target=binary, binary=binary, boundaries=boundaries,
197        offsets=offsets, binary_is_exclusive=False
198    )
199
200    return torch_em.default_segmentation_dataset(
201        raw_paths=file_paths,
202        raw_key="raw",
203        label_paths=file_paths,
204        label_key="label",
205        patch_shape=patch_shape,
206        **kwargs
207    )

Get the PlantSeg dataset for segmenting nuclei or cells.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_plantseg_loader( path: Union[os.PathLike, str], name: str, split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
211def get_plantseg_loader(
212    path: Union[os.PathLike, str],
213    name: str,
214    split: str,
215    patch_shape: Tuple[int, int, int],
216    batch_size: int,
217    download: bool = False,
218    offsets: Optional[List[List[int]]] = None,
219    boundaries: bool = False,
220    binary: bool = False,
221    **kwargs,
222) -> DataLoader:
223    """Get the PlantSeg dataloader for segmenting nuclei or cells.
224
225    Args:
226        path: Filepath to a folder where the downloaded data will be saved.
227        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
228        split: The split to download. Either 'train', 'val' or 'test'.
229        patch_shape: The patch shape to use for training.
230        batch_size: The batch size for training.
231        download: Whether to download the data if it is not present.
232        offsets: Offset values for affinity computation used as target.
233        boundaries: Whether to compute boundaries as the target.
234        binary: Whether to use a binary segmentation target.
235        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
236
237    Returns:
238       The DataLoader.
239    """
240    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
241    dataset = get_plantseg_dataset(
242        path, name, split, patch_shape, download=download, offsets=offsets,
243        boundaries=boundaries, binary=binary, **ds_kwargs
244    )
245    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the PlantSeg dataloader for segmenting nuclei or cells.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.