torch_em.data.datasets.light_microscopy.plantseg
This dataset contains confocal and lightsheet microscopy images of plant cells with annotations for cell and nucleus segmentation.
The dataset is part of the publication https://doi.org/10.7554/eLife.57613. Please cite it if you use this dataset in your research.
"""This dataset contains confocal and lightsheet microscopy images of plant cells
with annotations for cell and nucleus segmentation.

The dataset is part of the publication https://doi.org/10.7554/eLife.57613.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from tqdm import tqdm
from typing import List, Optional, Tuple, Union

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# Download locations of the zipped data, indexed by dataset name and then by split.
URLS = {
    "root": {
        "train": "https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=",
        "val": "https://files.de-1.osf.io/v1/resources/vs6gb/providers/osfstorage/?zip=",
        "test": "https://files.de-1.osf.io/v1/resources/tn4xj/providers/osfstorage/?zip=",
    },
    "nuclei": {
        "train": "https://files.de-1.osf.io/v1/resources/thxzn/providers/osfstorage/?zip=",
    },
    "ovules": {
        "train": "https://files.de-1.osf.io/v1/resources/x9yns/providers/osfstorage/?zip=",
        "val": "https://files.de-1.osf.io/v1/resources/xp5uf/providers/osfstorage/?zip=",
        "test": "https://files.de-1.osf.io/v1/resources/8jz7e/providers/osfstorage/?zip=",
    }
}

# FIXME somehow the checksums are not reliable, this is a bit weird.
# All checksums are therefore set to None; the previously used values are kept in comments.
CHECKSUMS = {
    "root": {
        "train": None, "val": None, "test": None
        # "train": "f72e9525ff716ef14b70ab1318efd4bf303bbf9e0772bf2981a2db6e22a75794",
        # "val": "987280d9a56828c840e508422786431dcc3603e0ba4814aa06e7bf4424efcd9e",
        # "test": "ad71b8b9d20effba85fb5e1b42594ae35939d1a0cf905f3403789fc9e6afbc58",
    },
    "nuclei": {
        "train": None
        # "train": "9d19ddb61373e2a97effb6cf8bd8baae5f8a50f87024273070903ea8b1160396",
    },
    "ovules": {
        "train": None, "val": None, "test": None
        # "train": "70379673f1ab1866df6eb09d5ce11db7d3166d6d15b53a9c8b47376f04bae413",
        # "val": "872f516cb76879c30782d9a76d52df95236770a866f75365902c60c37b14fa36",
        # "test": "a7272f6ad1d765af6d121e20f436ac4f3609f1a90b1cb2346aa938d8c52800b9",
    }
}

# Slices applied along the first axis of the listed volumes, indexed by file name.
CROPPING_VOLUMES = {
    # root (train)
    "Movie2_T00006_crop_gt.h5": slice(4, None),
    "Movie2_T00008_crop_gt.h5": slice(None, -18),
    "Movie2_T00010_crop_gt.h5": slice(None, -32),
    "Movie2_T00012_crop_gt.h5": slice(None, -39),
    "Movie2_T00014_crop_gt.h5": slice(None, -40),
    "Movie2_T00016_crop_gt.h5": slice(None, -42),
    # root (test)
    "Movie2_T00020_crop_gt.h5": slice(None, -50),
    # ovules (train)
    "N_487_ds2x.h5": slice(17, None),
    "N_535_ds2x.h5": slice(None, -1),
    "N_534_ds2x.h5": slice(None, -1),
    "N_451_ds2x.h5": slice(None, -1),
    "N_425_ds2x.h5": slice(None, -1),
    # ovules (val)
    "N_420_ds2x.h5": slice(None, -1),
}

# The resolution previously used for the resizing.
# I have removed this feature since it was not reliable,
# but leaving this here for reference
# (also implementing resizing would be a good idea,
# but more general and not for each dataset individually)
# NATIVE_RESOLUTION = (0.235, 0.075, 0.075)


def _fix_inconsistent_volumes(data_path, name, split):
    """Remove a duplicated volume and crop the volumes listed in `CROPPING_VOLUMES` in place.

    Args:
        data_path: Folder that contains the extracted h5 volumes of one split.
        name: The name of the data. Either 'root', 'nuclei' or 'ovules'.
        split: The split of the data. Either 'train', 'val' or 'test'.
    """
    # Only 'root' and 'ovules' volumes are listed in CROPPING_VOLUMES, and the duplicate
    # removal below only applies to 'root', so nothing has to be done for other data.
    # All splits of these two datasets are processed: CROPPING_VOLUMES also lists a 'test' volume.
    if name not in ("root", "ovules"):
        return

    import h5py

    file_paths = glob(os.path.join(data_path, "*.h5"))
    for vol_path in tqdm(file_paths, desc="Fixing inconsistencies in volumes"):
        fname = os.path.basename(vol_path)

        # avoid duplicated volumes in 'train' and 'test'.
        if fname == "Movie1_t00045_crop_gt.h5" and (name == "root" and split == "train"):
            os.remove(vol_path)
            continue

        if fname not in CROPPING_VOLUMES:
            continue

        with h5py.File(vol_path, "r+") as f:
            raw, labels = f["raw"], f["label"]

            # Load the full volumes and crop them along the first axis.
            crop_slices = CROPPING_VOLUMES[fname]
            resized_raw, resized_labels = raw[:][crop_slices], labels[:][crop_slices]

            # Shrink both datasets to the cropped shape before writing the cropped data back.
            cropped_shape = resized_raw.shape
            raw.resize(cropped_shape)
            labels.resize(cropped_shape)

            raw[...] = resized_raw
            labels[...] = resized_labels


def get_plantseg_data(path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
    """Download the PlantSeg data for the given name and split.

    The data is extracted to the folder `<path>/<name>_<split>`. If this folder
    already exists it is returned directly, without downloading anything.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder with the data.
    """
    url = URLS[name][split]
    checksum = CHECKSUMS[name][split]
    os.makedirs(path, exist_ok=True)
    out_path = os.path.join(path, f"{name}_{split}")
    if os.path.exists(out_path):
        return out_path
    tmp_path = os.path.join(path, f"{name}_{split}.zip")
    util.download_source(tmp_path, url, download, checksum)
    util.unzip(tmp_path, out_path, remove=True)
    _fix_inconsistent_volumes(out_path, name, split)
    return out_path


def get_plantseg_paths(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    download: bool = False
) -> List[str]:
    """Get paths to the PlantSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Sorted list of filepaths to the h5 files of the data.
    """
    data_path = get_plantseg_data(path, name, split, download)
    file_paths = sorted(glob(os.path.join(data_path, "*.h5")))
    return file_paths


def get_plantseg_dataset(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the PlantSeg dataset for segmenting nuclei or cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training. Must have three entries.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    file_paths = get_plantseg_paths(path, name, split, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=binary, binary=binary, boundaries=boundaries,
        offsets=offsets, binary_is_exclusive=False
    )

    # Raw data and labels are stored in the same h5 files, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=file_paths,
        raw_key="raw",
        label_paths=file_paths,
        label_key="label",
        patch_shape=patch_shape,
        **kwargs
    )


# TODO add support for ignore label, key: "/label_with_ignore"
def get_plantseg_loader(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the PlantSeg dataloader for segmenting nuclei or cells.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_plantseg_dataset(
        path, name, split, patch_shape, download=download, offsets=offsets,
        boundaries=boundaries, binary=binary, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
# Download locations of the zipped data, indexed by dataset name and then by split.
URLS = {
    "root": {
        "train": "https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=",
        "val": "https://files.de-1.osf.io/v1/resources/vs6gb/providers/osfstorage/?zip=",
        "test": "https://files.de-1.osf.io/v1/resources/tn4xj/providers/osfstorage/?zip=",
    },
    "nuclei": {
        "train": "https://files.de-1.osf.io/v1/resources/thxzn/providers/osfstorage/?zip=",
    },
    "ovules": {
        "train": "https://files.de-1.osf.io/v1/resources/x9yns/providers/osfstorage/?zip=",
        "val": "https://files.de-1.osf.io/v1/resources/xp5uf/providers/osfstorage/?zip=",
        "test": "https://files.de-1.osf.io/v1/resources/8jz7e/providers/osfstorage/?zip=",
    },
}
# Expected checksums of the downloads, indexed like the URLs; every entry is None.
CHECKSUMS = {
    "root": {"train": None, "val": None, "test": None},
    "nuclei": {"train": None},
    "ovules": {"train": None, "val": None, "test": None},
}
# Slice objects indexed by the file name of a volume.
CROPPING_VOLUMES = {
    "Movie2_T00006_crop_gt.h5": slice(4, None),
    "Movie2_T00008_crop_gt.h5": slice(None, -18),
    "Movie2_T00010_crop_gt.h5": slice(None, -32),
    "Movie2_T00012_crop_gt.h5": slice(None, -39),
    "Movie2_T00014_crop_gt.h5": slice(None, -40),
    "Movie2_T00016_crop_gt.h5": slice(None, -42),
    "Movie2_T00020_crop_gt.h5": slice(None, -50),
    "N_487_ds2x.h5": slice(17, None),
    "N_535_ds2x.h5": slice(None, -1),
    "N_534_ds2x.h5": slice(None, -1),
    "N_451_ds2x.h5": slice(None, -1),
    "N_425_ds2x.h5": slice(None, -1),
    "N_420_ds2x.h5": slice(None, -1),
}
def
get_plantseg_data( path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
def get_plantseg_data(path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> str:
    """Make sure that one split of the PlantSeg data is available and return its folder.

    The data is stored in the folder `<path>/<name>_<split>`. If that folder
    already exists it is returned right away.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder with the data.
    """
    source_url, expected_checksum = URLS[name][split], CHECKSUMS[name][split]
    os.makedirs(path, exist_ok=True)

    data_dir = os.path.join(path, f"{name}_{split}")
    if not os.path.exists(data_dir):
        # Fetch the zip archive, extract it (the archive is removed afterwards)
        # and fix the inconsistent volumes of the freshly extracted data.
        archive_path = os.path.join(path, f"{name}_{split}.zip")
        util.download_source(archive_path, source_url, download, expected_checksum)
        util.unzip(archive_path, data_dir, remove=True)
        _fix_inconsistent_volumes(data_dir, name, split)

    return data_dir
Download the PlantSeg training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
- split: The split to download. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the training data.
def
get_plantseg_paths( path: Union[os.PathLike, str], name: str, split: str, download: bool = False) -> List[str]:
def get_plantseg_paths(
    path: Union[os.PathLike, str], name: str, split: str, download: bool = False
) -> List[str]:
    """Collect the h5 files of one split of the PlantSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Sorted list of filepaths for the data.
    """
    split_folder = get_plantseg_data(path, name, split, download)
    volume_paths = glob(os.path.join(split_folder, "*.h5"))
    volume_paths.sort()
    return volume_paths
Get paths to the PlantSeg data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
- split: The split to download. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the data.
def
get_plantseg_dataset( path: Union[os.PathLike, str], name: str, split: str, patch_shape: Tuple[int, int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_plantseg_dataset(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Create the PlantSeg dataset for nucleus or cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training. Must have three entries.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    volume_paths = get_plantseg_paths(path, name, split, download)

    # Translate the target options into the matching entries of the dataset kwargs.
    target_options = dict(
        add_binary_target=binary,
        binary=binary,
        boundaries=boundaries,
        offsets=offsets,
        binary_is_exclusive=False,
    )
    kwargs, _ = util.add_instance_label_transform(kwargs, **target_options)

    # Raw data and labels are read from the same files, under different keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="label",
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
Get the PlantSeg dataset for segmenting nuclei or cells.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
- split: The split to download. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_plantseg_loader( path: Union[os.PathLike, str], name: str, split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_plantseg_loader(
    path: Union[os.PathLike, str],
    name: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Create the PlantSeg dataloader for nucleus or cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
        split: The split to download. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset from the remaining ones,
    # which are passed on to the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    plantseg_dataset = get_plantseg_dataset(
        path,
        name,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(plantseg_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the PlantSeg dataloader for segmenting nuclei or cells.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- name: The name of the data to load. Either 'root', 'nuclei' or 'ovules'.
- split: The split to download. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.