torch_em.data.datasets.light_microscopy.aisegcell
The aiSEGcell dataset contains annotations for nucleus segmentation in paired brightfield and fluorescence images.
The dataset collection is located at https://www.research-collection.ethz.ch/handle/20.500.11850/679085. This dataset is from the publication https://doi.org/10.1371/journal.pcbi.1012361. Please cite it if you use this dataset in your research.
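A minimal end-to-end usage sketch; the target folder, batch size and patch shape below are placeholder example values, and the import uses the module path of this page:

from torch_em.data.datasets.light_microscopy.aisegcell import get_aisegcell_loader

# Create a training dataloader on the brightfield channel.
# On first use the data (>60GB) is downloaded and preprocessed below the given path.
loader = get_aisegcell_loader(
    path="./data/aisegcell",
    batch_size=4,
    patch_shape=(512, 512),
    split="train",
    raw_channel="brightfield",
    download=True,
)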
1"""The aiSEGcell dataset contains annotations for nucleus segmentation in 2paired brightfield and fluorescence images. 3 4The dataset collection is located at https://www.research-collection.ethz.ch/handle/20.500.11850/679085. 5This dataset is from the publication https://doi.org/10.1371/journal.pcbi.1012361. 6Please cite it if you use this dataset in your research. 7""" 8 9import os 10from glob import glob 11from tqdm import tqdm 12from pathlib import Path 13from natsort import natsorted 14from typing import List, Union, Tuple, Literal 15from concurrent.futures import ProcessPoolExecutor 16 17import numpy as np 18import imageio.v3 as imageio 19from skimage.measure import label as connected_components 20 21from torch.utils.data import Dataset, DataLoader 22 23import torch_em 24 25from .. import util 26 27 28URL = "https://libdrive.ethz.ch/index.php/s/VoF2SYkbLY8izjh/download" 29CHECKSUM = "f9115ee6b71e7c4364b83f7d7f8b66dce5b778344070bddb6a8f0e5086ca5de9" 30 31 32def _process_each_image(args): 33 import h5py 34 35 bpath, npath, gpath, data_dir = args 36 37 path_parents = Path(bpath).parents 38 split = path_parents[1].name.split("_")[-1] 39 dname = path_parents[2].name 40 41 neu_dir = os.path.join(data_dir, split, dname) 42 os.makedirs(neu_dir, exist_ok=True) 43 44 fpath = os.path.join(neu_dir, f"{Path(bpath).stem}.h5") 45 if os.path.exists(fpath): 46 return 47 48 bf = imageio.imread(bpath) 49 nuc = imageio.imread(npath) 50 gt = imageio.imread(gpath) 51 52 # Ensure all bf images have 3 channels. 53 if bf.ndim == 3: 54 bf = bf.transpose(2, 0, 1) 55 else: 56 bf = np.stack([bf] * 3, axis=0) 57 58 # Ensure all fluo images have 3 channels. 59 if nuc.ndim == 3: 60 nuc = nuc.transpose(2, 0, 1) 61 else: 62 nuc = np.stack([nuc] * 3, axis=0) 63 64 assert nuc.ndim == bf.ndim == 3 65 66 # Labels have 3 channels. Keep only one. 67 if gt.ndim == 3: 68 gt = gt[..., 0] 69 70 gt = connected_components(gt).astype("uint16") 71 72 with h5py.File(fpath, "w") as f: 73 f.create_dataset("raw/brightfield", data=bf, compression="gzip") 74 f.create_dataset("raw/fluorescence", data=nuc, compression="gzip") 75 f.create_dataset("labels", data=gt, compression="gzip") 76 77 78def _preprocess_data(data_dir, base_dir): 79 80 bf_paths = natsorted(glob(os.path.join(base_dir, "**", "brightfield", "*.png"), recursive=True)) 81 nucleus_paths = natsorted(glob(os.path.join(base_dir, "**", "nucleus", "*.png"), recursive=True)) 82 gt_paths = natsorted(glob(os.path.join(base_dir, "**", "masks", "*.png"), recursive=True)) 83 84 assert bf_paths and len(bf_paths) == len(nucleus_paths) == len(gt_paths) 85 86 tasks = [(b, n, g, data_dir) for b, n, g in zip(bf_paths, nucleus_paths, gt_paths)] 87 with ProcessPoolExecutor() as executor: 88 list(tqdm(executor.map(_process_each_image, tasks), total=len(tasks), desc="Processing data")) 89 90 91def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str: 92 """Download the aiSEGcell dataset. 93 94 Args: 95 path: Filepath to a folder where the downloaded data will be saved. 96 download: Whether to download the data if it is not present. 97 98 Returns: 99 Filepath where the dataset is stored. 100 """ 101 data_dir = os.path.join(path, "data") 102 if os.path.exists(data_dir): 103 return data_dir 104 105 os.makedirs(path, exist_ok=True) 106 zip_path = os.path.join(path, "data.zip") 107 util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM) 108 109 # We need to do multiple unzip and untar to get the data out. 
110 print( 111 "'aiSEGcell' is a very large dataset (>60GB). It might take a couple of hours to download, " 112 "unzip and preprocess the data. Please ensure that you have a stable internet connection." 113 ) 114 util.unzip(zip_path=zip_path, dst=path, remove=False) 115 util.unzip_tarfile(tar_path=os.path.join(path, "679085", "aisegcell_supplement.tar"), dst=path) 116 util.unzip_tarfile( 117 tar_path=os.path.join(path, "aiSEGcell_supplement", "data_sets", "aiSEGcell_nucleus.tar"), dst=path, 118 ) 119 120 # Now that we have the core 'aiSEGcell_nucleus' folder on top-level directory, we can take it for processing data. 121 _preprocess_data(data_dir=data_dir, base_dir=os.path.join(path, "aiSEGcell_nucleus")) 122 123 return data_dir 124 125 126def get_aisegcell_paths( 127 path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False, 128) -> List[str]: 129 """Get paths to the aiSEGcell dataset. 130 131 Args: 132 path: Filepath to a folder where the downloaded data will be saved. 133 split: The data split to use. Either 'train', 'val' or 'test'. 134 download: Whether to download the data if it is not present. 135 136 Returns: 137 List of filepaths for the input data. 138 """ 139 data_dir = get_aisegcell_data(path, download) 140 141 if split not in ["train", "val", "test"]: 142 raise ValueError(f"'{split}' is not a valid split choice.") 143 144 data_paths = glob(os.path.join(data_dir, split, "**", "*.h5"), recursive=True) 145 assert len(data_paths) > 0 146 return data_paths 147 148 149def get_aisegcell_dataset( 150 path: Union[os.PathLike, str], 151 patch_shape: Tuple[int, int], 152 split: Literal["train", "val", "test"], 153 raw_channel: Literal["brightfield", "fluorescence"] = "brightfield", 154 download: bool = False, 155 **kwargs 156) -> Dataset: 157 """Get the aiSEGcell dataset for nucleus segmentation. 158 159 Args: 160 path: Filepath to a folder where the downloaded data will be saved. 161 patch_shape: The patch shape to use for training. 162 split: The data split to use. Either 'train', 'val' or 'test'. 163 raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'. 164 download: Whether to download the data if it is not present. 165 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 166 167 Returns: 168 The segmentation dataset. 169 """ 170 data_paths = get_aisegcell_paths(path, split, download) 171 172 return torch_em.default_segmentation_dataset( 173 raw_paths=data_paths, 174 raw_key=f"raw/{raw_channel}", 175 label_paths=data_paths, 176 label_key="labels", 177 is_seg_dataset=True, 178 patch_shape=patch_shape, 179 ndim=2, 180 with_channels=True, 181 **kwargs 182 ) 183 184 185def get_aisegcell_loader( 186 path: Union[os.PathLike, str], 187 batch_size: int, 188 patch_shape: Tuple[int, int], 189 split: Literal["train", "val", "test"], 190 raw_channel: Literal["brightfield", "fluorescence"] = "brightfield", 191 download: bool = False, 192 **kwargs 193) -> DataLoader: 194 """Get the aiSEGcell dataloader for nucleus segmentation. 195 196 Args: 197 path: Filepath to a folder where the downloaded data will be saved. 198 batch_size: The batch size for training. 199 patch_shape: The patch shape to use for training. 200 split: The data split to use. Either 'train', 'val' or 'test'. 201 raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'. 202 download: Whether to download the data if it is not present. 
203 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 204 205 Returns: 206 The DataLoader. 207 """ 208 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 209 dataset = get_aisegcell_dataset(path, patch_shape, split, raw_channel, download, **ds_kwargs) 210 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL = 'https://libdrive.ethz.ch/index.php/s/VoF2SYkbLY8izjh/download'
CHECKSUM = 'f9115ee6b71e7c4364b83f7d7f8b66dce5b778344070bddb6a8f0e5086ca5de9'
def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the aiSEGcell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the dataset is stored.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, "data.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    # We need multiple unzip and untar steps to get the data out.
    print(
        "'aiSEGcell' is a very large dataset (>60GB). It might take a couple of hours to download, "
        "unzip and preprocess the data. Please ensure that you have a stable internet connection."
    )
    util.unzip(zip_path=zip_path, dst=path, remove=False)
    util.unzip_tarfile(tar_path=os.path.join(path, "679085", "aisegcell_supplement.tar"), dst=path)
    util.unzip_tarfile(
        tar_path=os.path.join(path, "aiSEGcell_supplement", "data_sets", "aiSEGcell_nucleus.tar"), dst=path,
    )

    # Now that the core 'aiSEGcell_nucleus' folder is in the top-level directory, we can preprocess the data.
    _preprocess_data(data_dir=data_dir, base_dir=os.path.join(path, "aiSEGcell_nucleus"))

    return data_dir
Download the aiSEGcell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the dataset is stored.
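A short sketch of triggering the download and preprocessing step on its own; the folder below is a placeholder:

from torch_em.data.datasets.light_microscopy.aisegcell import get_aisegcell_data

# Download, unpack and preprocess the data once; returns the '<path>/data' folder,
# which contains one subfolder per split ('train', 'val', 'test') with HDF5 files.
data_dir = get_aisegcell_data("./data/aisegcell", download=True)
print(data_dir)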
def get_aisegcell_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:
def get_aisegcell_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False,
) -> List[str]:
    """Get paths to the aiSEGcell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    data_dir = get_aisegcell_data(path, download)

    if split not in ["train", "val", "test"]:
        raise ValueError(f"'{split}' is not a valid split choice.")

    data_paths = glob(os.path.join(data_dir, split, "**", "*.h5"), recursive=True)
    assert len(data_paths) > 0
    return data_paths
Get paths to the aiSEGcell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
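Each returned path points to a preprocessed HDF5 file with the datasets 'raw/brightfield', 'raw/fluorescence' and 'labels' (see the preprocessing code above). A small inspection sketch, assuming the data has already been downloaded to the placeholder folder below:

import h5py
from torch_em.data.datasets.light_microscopy.aisegcell import get_aisegcell_paths

paths = get_aisegcell_paths("./data/aisegcell", split="val")
with h5py.File(paths[0], "r") as f:
    print(f["raw/brightfield"].shape)   # (3, H, W) brightfield image
    print(f["raw/fluorescence"].shape)  # (3, H, W) fluorescence image
    print(f["labels"].shape)            # (H, W) instance labels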
def get_aisegcell_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], raw_channel: Literal['brightfield', 'fluorescence'] = 'brightfield', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_aisegcell_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the aiSEGcell dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    data_paths = get_aisegcell_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key=f"raw/{raw_channel}",
        label_paths=data_paths,
        label_key="labels",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
Get the aiSEGcell dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train', 'val' or 'test'.
- raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
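A sketch of constructing the dataset on the fluorescence channel (placeholder path; this assumes the data is already downloaded and that the dataset yields a pair of raw and label patches, as torch_em segmentation datasets do):

from torch_em.data.datasets.light_microscopy.aisegcell import get_aisegcell_dataset

dataset = get_aisegcell_dataset(
    path="./data/aisegcell",
    patch_shape=(512, 512),
    split="train",
    raw_channel="fluorescence",
)
raw, labels = dataset[0]  # one raw patch and the matching label patch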
def get_aisegcell_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], raw_channel: Literal['brightfield', 'fluorescence'] = 'brightfield', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_aisegcell_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the aiSEGcell dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_aisegcell_dataset(path, patch_shape, split, raw_channel, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the aiSEGcell dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train', 'val' or 'test'.
- raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.
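A short iteration sketch with a placeholder path; `num_workers` is an example of a keyword argument that is forwarded to the PyTorch DataLoader via `util.split_kwargs`, and the printed raw shape assumes the default three-channel images stored by the preprocessing:

from torch_em.data.datasets.light_microscopy.aisegcell import get_aisegcell_loader

loader = get_aisegcell_loader(
    path="./data/aisegcell",
    batch_size=2,
    patch_shape=(512, 512),
    split="val",
    raw_channel="brightfield",
    num_workers=4,  # forwarded to the PyTorch DataLoader
)
raw, labels = next(iter(loader))
print(raw.shape, labels.shape)  # e.g. torch.Size([2, 3, 512, 512]) and the matching label batch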