torch_em.data.datasets.light_microscopy.plantseg

This dataset contains confocal and lightsheet microscopy images of plant cells with annotations for cell and nucleus segmentation.

The dataset is part of the publication https://doi.org/10.7554/eLife.57613. Please cite it if you use this dataset in your research.

"""This dataset contains confocal and lightsheet microscopy images of plant cells
with annotations for cell and nucleus segmentation.

The dataset is part of the publication https://doi.org/10.7554/eLife.57613.
Please cite it if you use this dataset in your research.
"""
  7
  8import os
  9from glob import glob
 10from tqdm import tqdm
 11from typing import List, Optional, Tuple, Union
 12
 13from torch.utils.data import Dataset, DataLoader
 14
 15import torch_em
 16
 17from .. import util
 18
 19
# OSF download URLs per dataset variant ('root', 'nuclei', 'ovules') and split.
# Note: the 'nuclei' data only provides a training split.
URLS = {
    "root": {
        "train": "https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=",
        "val": "https://files.de-1.osf.io/v1/resources/vs6gb/providers/osfstorage/?zip=",
        "test": "https://files.de-1.osf.io/v1/resources/tn4xj/providers/osfstorage/?zip=",
    },
    "nuclei": {
        "train": "https://files.de-1.osf.io/v1/resources/thxzn/providers/osfstorage/?zip=",
    },
    "ovules": {
        "train": "https://files.de-1.osf.io/v1/resources/x9yns/providers/osfstorage/?zip=",
        "val": "https://files.de-1.osf.io/v1/resources/xp5uf/providers/osfstorage/?zip=",
        "test": "https://files.de-1.osf.io/v1/resources/8jz7e/providers/osfstorage/?zip=",
    }
}
 35
# FIXME: the checksums do not verify reliably for these downloads, so they are
# disabled (set to None). The previously recorded values are kept below for reference.
CHECKSUMS = {
    "root": {
        "train": None, "val": None, "test": None
        # "train": "f72e9525ff716ef14b70ab1318efd4bf303bbf9e0772bf2981a2db6e22a75794",
        # "val": "987280d9a56828c840e508422786431dcc3603e0ba4814aa06e7bf4424efcd9e",
        # "test": "ad71b8b9d20effba85fb5e1b42594ae35939d1a0cf905f3403789fc9e6afbc58",
    },
    "nuclei": {
        "train": None
        # "train": "9d19ddb61373e2a97effb6cf8bd8baae5f8a50f87024273070903ea8b1160396",
    },
    "ovules": {
        "train": None, "val": None, "test": None
        # "train": "70379673f1ab1866df6eb09d5ce11db7d3166d6d15b53a9c8b47376f04bae413",
        # "val": "872f516cb76879c30782d9a76d52df95236770a866f75365902c60c37b14fa36",
        # "test": "a7272f6ad1d765af6d121e20f436ac4f3609f1a90b1cb2346aa938d8c52800b9",
    }
}
 55
# Volumes with known inconsistent sections, mapped to the slice that is applied
# along the first axis of both 'raw' and 'label' to crop them
# (see _fix_inconsistent_volumes).
CROPPING_VOLUMES = {
    # root (train)
    "Movie2_T00006_crop_gt.h5": slice(4, None),
    "Movie2_T00008_crop_gt.h5": slice(None, -18),
    "Movie2_T00010_crop_gt.h5": slice(None, -32),
    "Movie2_T00012_crop_gt.h5": slice(None, -39),
    "Movie2_T00014_crop_gt.h5": slice(None, -40),
    "Movie2_T00016_crop_gt.h5": slice(None, -42),
    # root (test)
    "Movie2_T00020_crop_gt.h5": slice(None, -50),
    # ovules (train)
    "N_487_ds2x.h5": slice(17, None),
    "N_535_ds2x.h5": slice(None, -1),
    "N_534_ds2x.h5": slice(None, -1),
    "N_451_ds2x.h5": slice(None, -1),
    "N_425_ds2x.h5": slice(None, -1),
    # ovules (val)
    "N_420_ds2x.h5": slice(None, -1),
}
 75
# The resolution previously used for the resizing.
 77# I have removed this feature since it was not reliable,
 78# but leaving this here for reference
 79# (also implementing resizing would be a good idea,
 80#  but more general and not for each dataset individually)
 81# NATIVE_RESOLUTION = (0.235, 0.075, 0.075)
 82
 83
 84def _fix_inconsistent_volumes(data_path, name, split):
 85    import h5py
 86
 87    file_paths = glob(os.path.join(data_path, "*.h5"))
 88    if name not in ["root", "ovules"] and split not in ["train", "val"]:
 89        return
 90
 91    for vol_path in tqdm(file_paths, desc="Fixing inconsistencies in volumes"):
 92        fname = os.path.basename(vol_path)
 93
 94        # avoid duplicated volumes in 'train' and 'test'.
 95        if fname == "Movie1_t00045_crop_gt.h5" and (name == "root" and split == "train"):
 96            os.remove(vol_path)
 97            continue
 98
 99        if fname not in CROPPING_VOLUMES:
100            continue
101
102        with h5py.File(vol_path, "r+") as f:
103            raw, labels = f["raw"], f["label"]
104
105            crop_slices = CROPPING_VOLUMES[fname]
106            resized_raw, resized_labels = raw[:][crop_slices], labels[:][crop_slices]
107
108            cropped_shape = resized_raw.shape
109            raw.resize(cropped_shape)
110            labels.resize(cropped_shape)
111
112            raw[...] = resized_raw
113            labels[...] = resized_labels
114
115
def get_plantseg_data(path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
    """Download the PlantSeg training data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the training data.
    """
    url, checksum = URLS[name][split], CHECKSUMS[name][split]
    os.makedirs(path, exist_ok=True)

    data_dir = os.path.join(path, f"{name}_{split}")
    if not os.path.exists(data_dir):
        # Download the zip, unpack it (removing the archive) and fix known
        # inconsistencies in the unpacked volumes.
        zip_path = os.path.join(path, f"{name}_{split}.zip")
        util.download_source(zip_path, url, download, checksum)
        util.unzip(zip_path, data_dir, remove=True)
        _fix_inconsistent_volumes(data_dir, name, split)

    return data_dir
139
140
def get_plantseg_paths(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    download: bool = False
) -> List[str]:
    """Get paths to the PlantSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the data.
    """
    # Ensure the data is present, then collect the h5 volumes in sorted order.
    data_dir = get_plantseg_data(path, name, split, download)
    return sorted(glob(os.path.join(data_dir, "*.h5")))
161
162
def get_plantseg_dataset(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the PlantSeg dataset for segmenting nuclei or cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    # The data is volumetric, so a 3d patch shape is required.
    assert len(patch_shape) == 3

    volume_paths = get_plantseg_paths(path, name, split, download)

    # Translate the label target options into the corresponding label transform.
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=binary, binary=binary,
        boundaries=boundaries, offsets=offsets, binary_is_exclusive=False,
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="label",
        patch_shape=patch_shape,
        **kwargs,
    )
207
208
# TODO add support for ignore label, key: "/label_with_ignore"
def get_plantseg_loader(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the PlantSeg dataloader for segmenting nuclei or cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
       The DataLoader.
    """
    # Separate dataset kwargs from DataLoader kwargs before constructing either.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_plantseg_dataset(
        path, name, split, patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **ds_kwargs,
    )

    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS = {'root': {'train': 'https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=', 'val': 'https://files.de-1.osf.io/v1/resources/vs6gb/providers/osfstorage/?zip=', 'test': 'https://files.de-1.osf.io/v1/resources/tn4xj/providers/osfstorage/?zip='}, 'nuclei': {'train': 'https://files.de-1.osf.io/v1/resources/thxzn/providers/osfstorage/?zip='}, 'ovules': {'train': 'https://files.de-1.osf.io/v1/resources/x9yns/providers/osfstorage/?zip=', 'val': 'https://files.de-1.osf.io/v1/resources/xp5uf/providers/osfstorage/?zip=', 'test': 'https://files.de-1.osf.io/v1/resources/8jz7e/providers/osfstorage/?zip='}}
CHECKSUMS = {'root': {'train': None, 'val': None, 'test': None}, 'nuclei': {'train': None}, 'ovules': {'train': None, 'val': None, 'test': None}}
CROPPING_VOLUMES = {'Movie2_T00006_crop_gt.h5': slice(4, None, None), 'Movie2_T00008_crop_gt.h5': slice(None, -18, None), 'Movie2_T00010_crop_gt.h5': slice(None, -32, None), 'Movie2_T00012_crop_gt.h5': slice(None, -39, None), 'Movie2_T00014_crop_gt.h5': slice(None, -40, None), 'Movie2_T00016_crop_gt.h5': slice(None, -42, None), 'Movie2_T00020_crop_gt.h5': slice(None, -50, None), 'N_487_ds2x.h5': slice(17, None, None), 'N_535_ds2x.h5': slice(None, -1, None), 'N_534_ds2x.h5': slice(None, -1, None), 'N_451_ds2x.h5': slice(None, -1, None), 'N_425_ds2x.h5': slice(None, -1, None), 'N_420_ds2x.h5': slice(None, -1, None)}
def get_plantseg_data( path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
117def get_plantseg_data(path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
118    """Download the PlantSeg training data.
119
120    Args:
121        path: Filepath to a folder where the downloaded data will be saved.
122        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
123        split: The split to download. Either 'train', 'val' or 'test'.
124        download: Whether to download the data if it is not present.
125
126    Returns:
127        The filepath to the training data.
128    """
129    url = URLS[name][split]
130    checksum = CHECKSUMS[name][split]
131    os.makedirs(path, exist_ok=True)
132    out_path = os.path.join(path, f"{name}_{split}")
133    if os.path.exists(out_path):
134        return out_path
135    tmp_path = os.path.join(path, f"{name}_{split}.zip")
136    util.download_source(tmp_path, url, download, checksum)
137    util.unzip(tmp_path, out_path, remove=True)
138    _fix_inconsistent_volumes(out_path, name, split)
139    return out_path

Download the PlantSeg training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the training data.

def get_plantseg_paths( path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> List[str]:
142def get_plantseg_paths(
143    path: Union[os.PathLike, str],
144    name: str,
145    split: str,
146    download: bool = False
147) -> List[str]:
148    """Get paths to the PlantSeg data.
149
150    Args:
151        path: Filepath to a folder where the downloaded data will be saved.
152        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
153        split: The split to download. Either 'train', 'val' or 'test'.
154        download: Whether to download the data if it is not present.
155
156    Returns:
157        List of filepaths for the data.
158    """
159    data_path = get_plantseg_data(path, name, split, download)
160    file_paths = sorted(glob(os.path.join(data_path, "*.h5")))
161    return file_paths

Get paths to the PlantSeg data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the data.

def get_plantseg_dataset( path: Union[os.PathLike, str], name: str, split: str, patch_shape: Tuple[int, int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
164def get_plantseg_dataset(
165    path: Union[os.PathLike, str],
166    name: str,
167    split: str,
168    patch_shape: Tuple[int, int, int],
169    download: bool = False,
170    offsets: Optional[List[List[int]]] = None,
171    boundaries: bool = False,
172    binary: bool = False,
173    **kwargs,
174) -> Dataset:
175    """Get the PlantSeg dataset for segmenting nuclei or cells.
176
177    Args:
178        path: Filepath to a folder where the downloaded data will be saved.
179        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
180        split: The split to download. Either 'train', 'val' or 'test'.
181        patch_shape: The patch shape to use for training.
182        download: Whether to download the data if it is not present.
183        offsets: Offset values for affinity computation used as target.
184        boundaries: Whether to compute boundaries as the target.
185        binary: Whether to use a binary segmentation target.
186        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
187
188    Returns:
189       The segmentation dataset.
190    """
191    assert len(patch_shape) == 3
192
193    file_paths = get_plantseg_paths(path, name, split, download)
194
195    kwargs, _ = util.add_instance_label_transform(
196        kwargs, add_binary_target=binary, binary=binary, boundaries=boundaries,
197        offsets=offsets, binary_is_exclusive=False
198    )
199
200    return torch_em.default_segmentation_dataset(
201        raw_paths=file_paths,
202        raw_key="raw",
203        label_paths=file_paths,
204        label_key="label",
205        patch_shape=patch_shape,
206        **kwargs
207    )

Get the PlantSeg dataset for segmenting nuclei or cells.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_plantseg_loader( path: Union[os.PathLike, str], name: str, split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
211def get_plantseg_loader(
212    path: Union[os.PathLike, str],
213    name: str,
214    split: str,
215    patch_shape: Tuple[int, int, int],
216    batch_size: int,
217    download: bool = False,
218    offsets: Optional[List[List[int]]] = None,
219    boundaries: bool = False,
220    binary: bool = False,
221    **kwargs,
222) -> DataLoader:
223    """Get the PlantSeg dataloader for segmenting nuclei or cells.
224
225    Args:
226        path: Filepath to a folder where the downloaded data will be saved.
227        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
228        split: The split to download. Either 'train', 'val' or 'test'.
229        patch_shape: The patch shape to use for training.
230        batch_size: The batch size for training.
231        download: Whether to download the data if it is not present.
232        offsets: Offset values for affinity computation used as target.
233        boundaries: Whether to compute boundaries as the target.
234        binary: Whether to use a binary segmentation target.
235        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
236
237    Returns:
238       The DataLoader.
239    """
240    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
241    dataset = get_plantseg_dataset(
242        path, name, split, patch_shape, download=download, offsets=offsets,
243        boundaries=boundaries, binary=binary, **ds_kwargs
244    )
245    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the PlantSeg dataloader for segmenting nuclei or cells.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
  • split: The split to download. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.