torch_em.data.datasets.light_microscopy.yeastms
The YeastMS dataset contains annotations for yeast cell instance segmentation in brightfield microscopy images of microfluidic trap structures.
The dataset provides 493 annotated images (256x256) with instance segmentation masks for both cells and trap microstructures across train/val/test splits.
The dataset is located at https://tudatalib.ulb.tu-darmstadt.de/handle/tudatalib/3799. This dataset is from the publication https://doi.org/10.48550/arXiv.2304.07597. Please cite it if you use this dataset in your research.
"""The YeastMS dataset contains annotations for yeast cell instance segmentation
in brightfield microscopy images of microfluidic trap structures.

The dataset provides 493 annotated images (256x256) with instance segmentation
masks for both cells and trap microstructures across train/val/test splits.

The dataset is located at https://tudatalib.ulb.tu-darmstadt.de/handle/tudatalib/3799.
This dataset is from the publication https://doi.org/10.48550/arXiv.2304.07597.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from typing import Union, Tuple, List, Literal

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://tudatalib.ulb.tu-darmstadt.de/bitstream/handle/tudatalib/3799/yeast_cell_in_microstructures_dataset.zip"
CHECKSUM = "80d9e34266895a030b5dfbb81c25f9bd41e7d8c3d57f2c5aaeafd7c7c3a2d6b5"

VALID_SPLITS = ["train", "val", "test"]


def _create_h5_data(path, split):
    """Create h5 files with raw images and cell instance labels from .pt tensors.

    For every `<path>/<split>/inputs/*.pt` file, the tensors with the same file name in
    `<path>/<split>/instances` and `<path>/<split>/classes` are loaded, and one h5 file
    with the datasets "raw" and "labels" is written to `<path>/h5_data/<split>`.
    Samples whose h5 file already exists are skipped, so the function can be re-run
    to resume an interrupted conversion.

    Args:
        path: Root folder of the extracted dataset.
        split: Name of the split sub-folder to convert.

    Returns:
        The directory that contains the h5 files of the split.
    """
    import h5py
    import torch
    from natsort import natsorted
    from tqdm import tqdm

    h5_dir = os.path.join(path, "h5_data", split)
    os.makedirs(h5_dir, exist_ok=True)

    input_dir = os.path.join(path, split, "inputs")
    instance_dir = os.path.join(path, split, "instances")
    class_dir = os.path.join(path, split, "classes")

    input_paths = natsorted(glob(os.path.join(input_dir, "*.pt")))

    for input_path in tqdm(input_paths, desc=f"Creating h5 files for '{split}'"):
        sample_id = os.path.basename(input_path)
        # Swap only the trailing extension; a plain str.replace would also rewrite
        # any other ".pt" occurrence inside the file name.
        h5_path = os.path.join(h5_dir, os.path.splitext(sample_id)[0] + ".h5")

        if os.path.exists(h5_path):
            continue

        instance_path = os.path.join(instance_dir, sample_id)
        class_path = os.path.join(class_dir, sample_id)

        # NOTE(review): weights_only=False unpickles arbitrary Python objects.
        # Only use this on the archive fetched via `get_yeastms_data`, never on untrusted files.
        raw = torch.load(input_path, weights_only=False).numpy()
        instances = torch.load(instance_path, weights_only=False).numpy()  # (N, H, W)
        classes = torch.load(class_path, weights_only=False).numpy()  # (N,)

        # Create cell instance labels (class 0 = cell, class 1 = trap).
        # Cells get consecutive ids starting at 1; traps stay background (0).
        labels = np.zeros(raw.shape, dtype="int64")
        cell_id = 1
        for i, mask in enumerate(instances):
            if classes[i] == 0:  # cell
                labels[mask > 0] = cell_id
                cell_id += 1

        # Write to a temporary name and rename afterwards, so that an interrupted run
        # never leaves a truncated "*.h5" file that later runs would skip as finished.
        tmp_path = h5_path + ".tmp"
        with h5py.File(tmp_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels", data=labels, compression="gzip")
        os.replace(tmp_path, h5_path)

    return h5_dir


def get_yeastms_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the YeastMS dataset.

    The download is skipped if `<path>/train` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    data_dir = os.path.join(path, "train")
    if os.path.exists(data_dir):
        return path

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, "yeast_cell_in_microstructures_dataset.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
    util.unzip(zip_path=zip_path, dst=path)

    return path


def get_yeastms_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the YeastMS data.

    The h5 files of the split are created on first use. The conversion is also
    triggered again when fewer h5 files than input tensors are present, so that
    an interrupted conversion is completed instead of silently yielding a partial split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.
    """
    from natsort import natsorted

    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."

    get_yeastms_data(path, download)

    h5_dir = os.path.join(path, "h5_data", split)
    h5_pattern = os.path.join(h5_dir, "*.h5")

    n_converted = len(glob(h5_pattern))
    n_inputs = len(glob(os.path.join(path, split, "inputs", "*.pt")))
    # n_converted == 0 also covers a missing h5 directory. If the .pt inputs were removed
    # after a complete conversion, n_inputs is 0 and the existing h5 files are used as-is.
    if n_converted == 0 or n_converted < n_inputs:
        _create_h5_data(path, split)

    h5_paths = natsorted(glob(h5_pattern))
    assert len(h5_paths) > 0, f"No data found for split '{split}'"

    return h5_paths


def get_yeastms_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the YeastMS dataset for yeast cell segmentation in microstructures.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_yeastms_paths(path, split, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True,
    )
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    # Raw data and labels are stored in the same h5 files under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        **kwargs
    )


def get_yeastms_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the YeastMS dataloader for yeast cell segmentation in microstructures.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from those meant for the data loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_yeastms_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
def get_yeastms_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the YeastMS data is available in the given folder.

    Nothing is downloaded when the folder already contains a "train" sub-folder.
    Otherwise the dataset archive is fetched into the folder and unpacked there.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    # The "train" sub-folder serves as the marker for an already extracted dataset.
    already_extracted = os.path.exists(os.path.join(path, "train"))

    if not already_extracted:
        os.makedirs(path, exist_ok=True)
        archive = os.path.join(path, "yeast_cell_in_microstructures_dataset.zip")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive, dst=path)

    return path
Download the YeastMS dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the directory with the data.
def get_yeastms_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
) -> List[str]:
    """Get paths to the YeastMS data.

    The h5 files of the split are created on first use. The conversion is also
    triggered again when fewer h5 files than input tensors are present, so that
    an interrupted conversion is completed instead of silently yielding a partial split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.
    """
    from natsort import natsorted

    assert split in VALID_SPLITS, f"'{split}' is not a valid split. Choose from {VALID_SPLITS}."

    get_yeastms_data(path, download)

    h5_dir = os.path.join(path, "h5_data", split)
    h5_pattern = os.path.join(h5_dir, "*.h5")

    n_converted = len(glob(h5_pattern))
    # The input tensors of a split are stored as "<path>/<split>/inputs/*.pt".
    n_inputs = len(glob(os.path.join(path, split, "inputs", "*.pt")))
    # n_converted == 0 also covers a missing h5 directory. If the .pt inputs were removed
    # after a complete conversion, n_inputs is 0 and the existing h5 files are used as-is.
    if n_converted == 0 or n_converted < n_inputs:
        _create_h5_data(path, split)

    h5_paths = natsorted(glob(h5_pattern))
    assert len(h5_paths) > 0, f"No data found for split '{split}'"

    return h5_paths
Get paths to the YeastMS data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. One of 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the h5 data.
def get_yeastms_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the YeastMS segmentation dataset for yeast cells in microstructures.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_yeastms_paths(path=path, split=split, download=download)

    # Only the updated kwargs are needed; the second return value is discarded.
    kwargs = util.add_instance_label_transform(kwargs, add_binary_target=True)[0]
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    # Raw data and labels are stored in the same h5 files under different keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        **kwargs
    )
    return dataset
Get the YeastMS dataset for yeast cell segmentation in microstructures.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_yeastms_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"] = "train",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the YeastMS data loader for yeast cell segmentation in microstructures.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. One of 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from those meant for the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    yeastms_dataset = get_yeastms_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(dataset=yeastms_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
Get the YeastMS dataloader for yeast cell segmentation in microstructures.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. One of 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.