torch_em.data.datasets.light_microscopy.gonuclear
This dataset contains annotations for nucleus segmentation in 3d fluorescence microscopy.
This dataset is from the publication https://doi.org/10.1242/dev.202800. Please cite it if you use this dataset in your research.
1"""This dataset contains annotation for nucleus segmentation in 3d fluorescence microscopy. 2 3This dataset is from the publication https://doi.org/10.1242/dev.202800. 4Please cite it if you use this dataset in your research. 5""" 6 7import os 8from glob import glob 9from shutil import rmtree 10from typing import Optional, Tuple, Union, List 11 12import numpy as np 13import imageio.v3 as imageio 14 15from torch.utils.data import Dataset, DataLoader 16 17import torch_em 18 19from .. import util 20 21 22URL = "https://www.ebi.ac.uk/biostudies/files/S-BIAD1026/Nuclei_training_segmentation/Training%20image%20dataset_Tiff%20Files.zip" # noqa 23CHECKSUM = "b103388a4aed01c7aadb2d5f49392d2dd08dd7cbeb2357b0c56355384ebb93a9" 24 25 26def _load_tif(path): 27 vol = None 28 29 path_tif = path + ".tif" 30 if os.path.exists(path_tif): 31 vol = imageio.imread(path_tif) 32 33 path_tiff = path + ".tiff" 34 if os.path.exists(path_tiff): 35 vol = imageio.imread(path_tiff) 36 37 if vol is None: 38 raise RuntimeError("Can't find tif or tiff file for {path}.") 39 40 return vol 41 42 43def _clip_shape(raw, labels): 44 shape = raw.shape 45 labels = labels[:shape[0], :shape[1], :shape[2]] 46 47 shape = labels.shape 48 raw = raw[:shape[0], :shape[1], :shape[2]] 49 50 assert labels.shape == raw.shape, f"{labels.shape}, {raw.shape}" 51 return raw, labels 52 53 54def _process_data(in_folder, out_folder): 55 import h5py 56 57 os.makedirs(out_folder, exist_ok=True) 58 59 sample_folders = glob(os.path.join(in_folder, "*")) 60 for folder in sample_folders: 61 sample = os.path.basename(folder) 62 out_path = os.path.join(out_folder, f"{sample}.h5") 63 64 cell_raw = _load_tif(os.path.join(folder, f"{sample}_cellwall")) 65 cell_labels = _load_tif(os.path.join(folder, f"{sample}_cellseg")) 66 cell_labels = cell_labels[:, ::-1] 67 cell_raw, cell_labels = _clip_shape(cell_raw, cell_labels) 68 69 nucleus_raw = _load_tif(os.path.join(folder, f"{sample}_n_H2BtdTomato")) 70 nucleus_labels = _load_tif(os.path.join(folder, f"{sample}_n_stain_StarDist_goldGT")) 71 nucleus_labels = nucleus_labels[:, ::-1] 72 nucleus_raw, nucleus_labels = _clip_shape(nucleus_raw, nucleus_labels) 73 74 # Remove last frames with artifacts for two volumes (1137 and 1170). 75 if sample in ["1137", "1170"]: 76 nucleus_raw, nucleus_labels = nucleus_raw[:-1], nucleus_labels[:-1] 77 cell_raw, cell_labels = cell_raw[:-1], cell_labels[:-1] 78 79 # Fixing cell labels for one volume (1136) is misaligned. 80 if sample == "1136": 81 cell_labels = np.fliplr(cell_labels) 82 83 with h5py.File(out_path, "w") as f: 84 f.create_dataset("raw/cells", data=cell_raw, compression="gzip") 85 f.create_dataset("raw/nuclei", data=nucleus_raw, compression="gzip") 86 87 f.create_dataset("labels/cells", data=cell_labels, compression="gzip") 88 f.create_dataset("labels/nuclei", data=nucleus_labels, compression="gzip") 89 90 91def get_gonuclear_data(path: Union[os.PathLike, str], download: bool) -> str: 92 """Download the GoNuclear training data. 93 94 Args: 95 path: Filepath to a folder where the downloaded data will be saved. 96 download: Whether to download the data if it is not present. 97 98 Returns: 99 The filepath to the training data. 
100 """ 101 data_path = os.path.join(path, "gonuclear_datasets") 102 if os.path.exists(data_path): 103 return data_path 104 105 os.makedirs(path, exist_ok=True) 106 zip_path = os.path.join(path, "gonuclear.zip") 107 util.download_source(zip_path, URL, download, CHECKSUM) 108 util.unzip(zip_path, path, True) 109 110 extracted_path = os.path.join(path, "Training image dataset_Tiff Files") 111 assert os.path.exists(extracted_path), extracted_path 112 _process_data(extracted_path, data_path) 113 assert os.path.exists(data_path) 114 115 rmtree(extracted_path) 116 return data_path 117 118 119def get_gonuclear_paths( 120 path: Union[os.PathLike, str], 121 sample_ids: Optional[Union[int, Tuple[int, ...]]] = None, 122 download: bool = False 123) -> List[str]: 124 """Get paths to the GoNuclear data. 125 126 Args: 127 path: Filepath to a folder where the downloaded data will be saved. 128 sample_ids: The sample ids to load. The valid sample ids are: 129 1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded. 130 download: Whether to download the data if it is not present. 131 132 Returns: 133 List of filepaths for the stored data. 134 """ 135 data_root = get_gonuclear_data(path, download) 136 137 if sample_ids is None: 138 paths = sorted(glob(os.path.join(data_root, "*.h5"))) 139 else: 140 paths = [] 141 for sample_id in sample_ids: 142 sample_path = os.path.join(data_root, f"{sample_id}.h5") 143 if not os.path.exists(sample_path): 144 raise ValueError(f"Invalid sample id {sample_id}.") 145 paths.append(sample_path) 146 147 return paths 148 149 150def get_gonuclear_dataset( 151 path: Union[os.PathLike, str], 152 patch_shape: Tuple[int, int], 153 segmentation_task: str = "nuclei", 154 sample_ids: Optional[Union[int, Tuple[int, ...]]] = None, 155 offsets: Optional[List[List[int]]] = None, 156 boundaries: bool = False, 157 binary: bool = False, 158 download: bool = False, 159 **kwargs 160) -> Dataset: 161 """Get the GoNuclear dataset for segmenting nuclei in 3d fluorescence microscopy. 162 163 Args: 164 path: Filepath to a folder where the downloaded data will be saved. 165 patch_shape: The patch shape to use for training. 166 segmentation_task: The segmentation task. Either 'nuclei' or 'cells'. 167 sample_ids: The sample ids to load. The valid sample ids are: 168 1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded. 169 offsets: Offset values for affinity computation used as target. 170 boundaries: Whether to compute boundaries as the target. 171 binary: Whether to use a binary segmentation target. 172 download: Whether to download the data if it is not present. 173 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 174 175 Returns: 176 The segmentation dataset. 
177 """ 178 paths = get_gonuclear_paths(path, sample_ids, download) 179 180 if segmentation_task == "nuclei": 181 raw_key = "raw/nuclei" 182 label_key = "labels/nuclei" 183 elif segmentation_task == "cells": 184 raw_key = "raw/cells" 185 label_key = "labels/cells" 186 else: 187 raise ValueError(f"Invalid segmentation task {segmentation_task}, expect one of 'cells' or 'nuclei'.") 188 189 kwargs, _ = util.add_instance_label_transform( 190 kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets, 191 ) 192 193 return torch_em.default_segmentation_dataset( 194 raw_paths=paths, 195 raw_key=raw_key, 196 label_paths=paths, 197 label_key=label_key, 198 patch_shape=patch_shape, 199 **kwargs 200 ) 201 202 203def get_gonuclear_loader( 204 path: Union[os.PathLike, str], 205 patch_shape: Tuple[int, int], 206 batch_size: int, 207 segmentation_task: str = "nuclei", 208 sample_ids: Optional[Union[int, Tuple[int, ...]]] = None, 209 offsets: Optional[List[List[int]]] = None, 210 boundaries: bool = False, 211 binary: bool = False, 212 download: bool = False, 213 **kwargs 214) -> DataLoader: 215 """Get the GoNuclear dataloader for segmenting nuclei in 3d fluorescence microscopy. 216 217 Args: 218 path: Filepath to a folder where the downloaded data will be saved. 219 patch_shape: The patch shape to use for training. 220 batch_size: The batch size for training. 221 segmentation_task: The segmentation task. Either 'nuclei' or 'cells'. 222 sample_ids: The sample ids to load. The valid sample ids are: 223 1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded. 224 offsets: Offset values for affinity computation used as target. 225 boundaries: Whether to compute boundaries as the target. 226 binary: Whether to use a binary segmentation target. 227 download: Whether to download the data if it is not present. 228 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 229 230 Returns: 231 The DataLoader. 232 """ 233 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 234 dataset = get_gonuclear_dataset( 235 path=path, 236 patch_shape=patch_shape, 237 segmentation_task=segmentation_task, 238 sample_ids=sample_ids, 239 offsets=offsets, 240 boundaries=boundaries, 241 binary=binary, 242 download=download, 243 **ds_kwargs, 244 ) 245 return torch_em.get_data_loader(dataset, batch_size=batch_size, **loader_kwargs)
URL = 'https://www.ebi.ac.uk/biostudies/files/S-BIAD1026/Nuclei_training_segmentation/Training%20image%20dataset_Tiff%20Files.zip'
CHECKSUM = 'b103388a4aed01c7aadb2d5f49392d2dd08dd7cbeb2357b0c56355384ebb93a9'
def get_gonuclear_data(path: Union[os.PathLike, str], download: bool) -> str:
Download the GoNuclear training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the training data.
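A short usage sketch, assuming ./data/gonuclear is a writable local folder (the path is only an example):

from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_data

# Downloads and processes the data on the first call; later calls return the existing folder.
data_path = get_gonuclear_data("./data/gonuclear", download=True)
print(data_path)  # ./data/gonuclear/gonuclear_datasets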
def get_gonuclear_paths(path: Union[os.PathLike, str], sample_ids: Optional[Union[int, Tuple[int, ...]]] = None, download: bool = False) -> List[str]:
Get paths to the GoNuclear data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- sample_ids: The sample ids to load. The valid sample ids are: 1135, 1136, 1137, 1139, 1170. If None is given, all samples will be loaded.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the stored data.
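For example, the sample ids can be used to split the five volumes into training and validation data. A minimal sketch, with the folder path and the chosen split being example values only:

from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_paths

# All five samples.
all_paths = get_gonuclear_paths("./data/gonuclear", download=True)

# Only selected samples, e.g. to hold out one volume for validation.
train_paths = get_gonuclear_paths("./data/gonuclear", sample_ids=(1135, 1136, 1137, 1139))
val_paths = get_gonuclear_paths("./data/gonuclear", sample_ids=(1170,))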
def get_gonuclear_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], segmentation_task: str = 'nuclei', sample_ids: Optional[Union[int, Tuple[int, ...]]] = None, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
Get the GoNuclear dataset for segmenting nuclei in 3d fluorescence microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- segmentation_task: The segmentation task. Either 'nuclei' or 'cells'.
- sample_ids: The sample ids to load. The valid sample ids are: 1135, 1136, 1137, 1139, 1170. If None is given, all samples will be loaded.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
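A minimal sketch for creating a dataset with boundary targets for nucleus segmentation; the folder path and the 3d patch shape are example values only:

from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_dataset

dataset = get_gonuclear_dataset(
    path="./data/gonuclear",
    patch_shape=(32, 256, 256),  # (z, y, x) patches sampled from the 3d volumes
    segmentation_task="nuclei",
    boundaries=True,  # compute boundary targets from the instance labels
    download=True,
)
print(len(dataset))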
def get_gonuclear_loader(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], batch_size: int, segmentation_task: str = 'nuclei', sample_ids: Optional[Union[int, Tuple[int, ...]]] = None, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
Get the GoNuclear dataloader for segmenting nuclei in 3d fluorescence microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- segmentation_task: The segmentation task. Either 'nuclei' or 'cells'.
- sample_ids: The sample ids to load. The valid sample ids are: 1135, 1136, 1137, 1139, 1170. If None is given, all samples will be loaded.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
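A minimal sketch for creating the loader; the folder path, patch shape, and worker count are example values only:

from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_loader

loader = get_gonuclear_loader(
    path="./data/gonuclear",
    patch_shape=(32, 256, 256),
    batch_size=1,
    segmentation_task="nuclei",
    boundaries=True,
    download=True,
    num_workers=2,  # forwarded to the PyTorch DataLoader
)

for x, y in loader:
    print(x.shape, y.shape)
    break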