torch_em.data.datasets.light_microscopy.aisegcell
The aiSEGcell dataset contains annotations for nucleus segmentation in paired brightfield and fluorescence images.
The dataset collection is located at https://www.research-collection.ethz.ch/handle/20.500.11850/679085. This dataset is from the publication https://doi.org/10.1371/journal.pcbi.1012361. Please cite it if you use this dataset in your research.
1"""The aiSEGcell dataset contains annotations for nucleus segmentation in 2paired brightfield and fluorescence images. 3 4The dataset collection is located at https://www.research-collection.ethz.ch/handle/20.500.11850/679085. 5This dataset is from the publication https://doi.org/10.1371/journal.pcbi.1012361. 6Please cite it if you use this dataset in your research. 7""" 8 9import os 10from glob import glob 11from tqdm import tqdm 12from pathlib import Path 13from natsort import natsorted 14from typing import List, Union, Tuple, Literal 15from concurrent.futures import ProcessPoolExecutor 16 17import numpy as np 18import imageio.v3 as imageio 19from skimage.measure import label as connected_components 20 21from torch.utils.data import Dataset, DataLoader 22 23import torch_em 24 25from .. import util 26 27 28URL = "https://libdrive.ethz.ch/index.php/s/VoF2SYkbLY8izjh/download" 29CHECKSUM = "f9115ee6b71e7c4364b83f7d7f8b66dce5b778344070bddb6a8f0e5086ca5de9" 30 31 32def _process_each_image(args): 33 import h5py 34 35 bpath, npath, gpath, data_dir = args 36 37 path_parents = Path(bpath).parents 38 split = path_parents[1].name.split("_")[-1] 39 dname = path_parents[2].name 40 41 neu_dir = os.path.join(data_dir, split, dname) 42 os.makedirs(neu_dir, exist_ok=True) 43 44 fpath = os.path.join(neu_dir, f"{Path(bpath).stem}.h5") 45 if os.path.exists(fpath): 46 return 47 48 bf = imageio.imread(bpath) 49 nuc = imageio.imread(npath) 50 gt = imageio.imread(gpath) 51 52 # Ensure all bf images have 3 channels. 53 if bf.ndim == 3: 54 bf = bf.transpose(2, 0, 1) 55 else: 56 bf = np.stack([bf] * 3, axis=0) 57 58 # Ensure all fluo images have 3 channels. 59 if nuc.ndim == 3: 60 nuc = nuc.transpose(2, 0, 1) 61 else: 62 nuc = np.stack([nuc] * 3, axis=0) 63 64 assert nuc.ndim == bf.ndim == 3 65 66 # Labels have 3 channels. Keep only one. 67 if gt.ndim == 3: 68 gt = gt[..., 0] 69 70 gt = connected_components(gt).astype("uint16") 71 72 with h5py.File(fpath, "w") as f: 73 f.create_dataset("raw/brightfield", data=bf, compression="gzip") 74 f.create_dataset("raw/fluorescence", data=nuc, compression="gzip") 75 f.create_dataset("labels", data=gt, compression="gzip") 76 77 78def _preprocess_data(data_dir, base_dir): 79 80 bf_paths = natsorted(glob(os.path.join(base_dir, "**", "brightfield", "*.png"), recursive=True)) 81 nucleus_paths = natsorted(glob(os.path.join(base_dir, "**", "nucleus", "*.png"), recursive=True)) 82 gt_paths = natsorted(glob(os.path.join(base_dir, "**", "masks", "*.png"), recursive=True)) 83 84 assert bf_paths and len(bf_paths) == len(nucleus_paths) == len(gt_paths) 85 86 tasks = [(b, n, g, data_dir) for b, n, g in zip(bf_paths, nucleus_paths, gt_paths)] 87 with ProcessPoolExecutor() as executor: 88 list(tqdm(executor.map(_process_each_image, tasks), total=len(tasks), desc="Processing data")) 89 90 91def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str: 92 """Download the aiSEGcell dataset. 93 94 Args: 95 path: Filepath to a folder where the downloaded data will be saved. 96 download: Whether to download the data if it is not present. 97 98 Returns: 99 Filepath where the dataset is stored. 100 """ 101 data_dir = os.path.join(path, "data") 102 if os.path.exists(data_dir): 103 return data_dir 104 105 zip_path = os.path.join(path, "data.zip") 106 util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM) 107 108 # We need to do multiple unzip and untar to get the data out. 109 print( 110 "'aiSEGcell' is a very large dataset (>60GB). 
It might take a couple of hours to download, " 111 "unzip and preprocess the data. Please ensure that you have a stable internet connection." 112 ) 113 util.unzip(zip_path=zip_path, dst=path, remove=False) 114 util.unzip_tarfile(tar_path=os.path.join(path, "679085", "aisegcell_supplement.tar"), dst=path) 115 util.unzip_tarfile( 116 tar_path=os.path.join(path, "679085", "aiSEGcell_supplement", "data_sets", "aiSEGcell_nucleus.tar"), dst=path, 117 ) 118 119 # Now that we have the core 'aiSEGcell_nucleus' folder on top-level directory, we can take it for processing data. 120 _preprocess_data(data_dir=data_dir, base_dir=os.path.join(path, "aiSEGcell_nucleus")) 121 122 return data_dir 123 124 125def get_aisegcell_paths( 126 path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False, 127) -> List[str]: 128 """Get paths to the aiSEGcell dataset. 129 130 Args: 131 path: Filepath to a folder where the downloaded data will be saved. 132 split: The data split to use. Either 'train', 'val' or 'test'. 133 download: Whether to download the data if it is not present. 134 135 Returns: 136 List of filepaths for the input data. 137 """ 138 data_dir = get_aisegcell_data(path, download) 139 140 if split not in ["train", "val", "test"]: 141 raise ValueError(f"'{split}' is not a valid split choice.") 142 143 data_paths = glob(os.path.join(data_dir, split, "**", "*.h5"), recursive=True) 144 assert len(data_paths) > 0 145 return data_paths 146 147 148def get_aisegcell_dataset( 149 path: Union[os.PathLike, str], 150 patch_shape: Tuple[int, int], 151 split: Literal["train", "val", "test"], 152 raw_channel: Literal["brightfield", "fluorescence"] = "brightfield", 153 download: bool = False, 154 **kwargs 155) -> Dataset: 156 """Get the aiSEGcell dataset for nucleus segmentation. 157 158 Args: 159 path: Filepath to a folder where the downloaded data will be saved. 160 patch_shape: The patch shape to use for training. 161 split: The data split to use. Either 'train', 'val' or 'test'. 162 raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'. 163 download: Whether to download the data if it is not present. 164 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 165 166 Returns: 167 The segmentation dataset. 168 """ 169 data_paths = get_aisegcell_paths(path, split, download) 170 171 return torch_em.default_segmentation_dataset( 172 raw_paths=data_paths, 173 raw_key=f"raw/{raw_channel}", 174 label_paths=data_paths, 175 label_key="labels", 176 is_seg_dataset=True, 177 patch_shape=patch_shape, 178 ndim=2, 179 with_channels=True, 180 **kwargs 181 ) 182 183 184def get_aisegcell_loader( 185 path: Union[os.PathLike, str], 186 batch_size: int, 187 patch_shape: Tuple[int, int], 188 split: Literal["train", "val", "test"], 189 raw_channel: Literal["brightfield", "fluorescence"] = "brightfield", 190 download: bool = False, 191 **kwargs 192) -> DataLoader: 193 """Get the aiSEGcell dataloader for nucleus segmentation. 194 195 Args: 196 path: Filepath to a folder where the downloaded data will be saved. 197 batch_size: The batch size for training. 198 patch_shape: The patch shape to use for training. 199 split: The data split to use. Either 'train', 'val' or 'test'. 200 raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'. 201 download: Whether to download the data if it is not present. 202 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 
203 204 Returns: 205 The DataLoader. 206 """ 207 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 208 dataset = get_aisegcell_dataset(path, patch_shape, split, raw_channel, download, **ds_kwargs) 209 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
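To make the on-disk layout produced by the preprocessing above concrete, here is a minimal usage sketch. The target folder "./data/aisegcell" is only an example path, and download=True assumes you have the required disk space and bandwidth (the raw download is >60GB).

import h5py

from torch_em.data.datasets.light_microscopy.aisegcell import get_aisegcell_data, get_aisegcell_paths

# Download, unpack and preprocess the data. This can take several hours.
data_dir = get_aisegcell_data("./data/aisegcell", download=True)

# Each preprocessed HDF5 file stores both raw channels and the instance labels.
paths = get_aisegcell_paths("./data/aisegcell", split="train", download=True)
with h5py.File(paths[0], "r") as f:
    print(f["raw/brightfield"].shape)   # (3, height, width)
    print(f["raw/fluorescence"].shape)  # (3, height, width)
    print(f["labels"].shape)            # (height, width)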
URL = 'https://libdrive.ethz.ch/index.php/s/VoF2SYkbLY8izjh/download'
CHECKSUM = 'f9115ee6b71e7c4364b83f7d7f8b66dce5b778344070bddb6a8f0e5086ca5de9'
def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the aiSEGcell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the dataset is stored.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    zip_path = os.path.join(path, "data.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    # We need several unzip and untar steps to get the data out.
    print(
        "'aiSEGcell' is a very large dataset (>60GB). It might take a couple of hours to download, "
        "unzip and preprocess the data. Please ensure that you have a stable internet connection."
    )
    util.unzip(zip_path=zip_path, dst=path, remove=False)
    util.unzip_tarfile(tar_path=os.path.join(path, "679085", "aisegcell_supplement.tar"), dst=path)
    util.unzip_tarfile(
        tar_path=os.path.join(path, "679085", "aiSEGcell_supplement", "data_sets", "aiSEGcell_nucleus.tar"), dst=path,
    )

    # Now that the core 'aiSEGcell_nucleus' folder is in the top-level directory, we can preprocess the data.
    _preprocess_data(data_dir=data_dir, base_dir=os.path.join(path, "aiSEGcell_nucleus"))

    return data_dir
Download the aiSEGcell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the dataset is stored.
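For example (the target folder below is an illustrative choice):

data_dir = get_aisegcell_data("./data/aisegcell", download=True)
# data_dir now points to "./data/aisegcell/data", which holds the preprocessed
# HDF5 files organized into train/val/test subfolders.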
def get_aisegcell_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:
def get_aisegcell_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False,
) -> List[str]:
    """Get paths to the aiSEGcell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    data_dir = get_aisegcell_data(path, download)

    if split not in ["train", "val", "test"]:
        raise ValueError(f"'{split}' is not a valid split choice.")

    data_paths = glob(os.path.join(data_dir, split, "**", "*.h5"), recursive=True)
    assert len(data_paths) > 0
    return data_paths
Get paths to the aiSEGcell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
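For example, to collect the preprocessed HDF5 files of the validation split (the folder path is again just an example):

val_paths = get_aisegcell_paths("./data/aisegcell", split="val", download=True)
print(len(val_paths), val_paths[0])
# Each file stores the inputs under "raw/brightfield" and "raw/fluorescence"
# and the instance segmentation under "labels".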
def get_aisegcell_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], raw_channel: Literal['brightfield', 'fluorescence'] = 'brightfield', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_aisegcell_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the aiSEGcell dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    data_paths = get_aisegcell_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key=f"raw/{raw_channel}",
        label_paths=data_paths,
        label_key="labels",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
Get the aiSEGcell dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train', 'val' or 'test'.
- raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
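A short sketch of typical use; the folder and patch shape are example values, and the comment assumes the default torch_em transforms:

dataset = get_aisegcell_dataset(
    path="./data/aisegcell",
    patch_shape=(512, 512),
    split="train",
    raw_channel="fluorescence",
    download=True,
)
x, y = dataset[0]
# With the default transforms, x is a 3-channel fluorescence patch and
# y the corresponding label patch.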
def get_aisegcell_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], raw_channel: Literal['brightfield', 'fluorescence'] = 'brightfield', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_aisegcell_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the aiSEGcell dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_aisegcell_dataset(path, patch_shape, split, raw_channel, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the aiSEGcell dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train', 'val' or 'test'.
- raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.
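A usage sketch with illustrative parameter values; keyword arguments that are not accepted by torch_em.default_segmentation_dataset are forwarded to the PyTorch DataLoader:

loader = get_aisegcell_loader(
    path="./data/aisegcell",
    batch_size=8,
    patch_shape=(256, 256),
    split="test",
    raw_channel="brightfield",
    download=True,
    num_workers=4,  # forwarded to the PyTorch DataLoader
    shuffle=True,   # forwarded to the PyTorch DataLoader
)

for images, labels in loader:
    print(images.shape, labels.shape)
    break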