torch_em.data.datasets.light_microscopy.arvidsson
This dataset contains annotations for nucleus segmentation in high-content fluorescence microscopy images.
The dataset is located at https://zenodo.org/records/6657260. This dataset is from the publication https://doi.org/10.1016/j.dib.2022.108769. Please cite it if you use this dataset in your research.
"""This dataset contains annotations for nucleus segmentation in
high-content fluorescence microscopy images.

The dataset is located at https://zenodo.org/records/6657260.
This dataset is from the publication https://doi.org/10.1016/j.dib.2022.108769.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from tqdm import tqdm
from natsort import natsorted
from typing import Union, Tuple, Literal, List

import numpy as np
import imageio.v3 as imageio
from skimage.measure import label as connected_components

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util


URLS = {
    "train": "https://zenodo.org/records/6657260/files/training_nuclei.zip",
    "val": "https://zenodo.org/records/6657260/files/development_nuclei.zip",
    "test": "https://zenodo.org/records/6657260/files/test_nuclei.zip",
}

CHECKSUMS = {
    "train": "df075941f4e561f9ef82d4c48d22cf97e3627a0b63fa136675197614813fff90",
    "val": "722530a93fd5b67f61d52964651c715be6227c1c0508c4c95ef2b04b52fc1dd1",
    "test": "377dc719c4eaf9bfa30273f7e3a4042d98dbbfc4a1c4af2a467879237bff592f",
}


def get_arvidsson_data(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
) -> str:
    """Download the Arvidsson dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder that holds the data of the requested split.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Folder expected under `path` after extraction; the archive is stored as "<name>.zip".
    split_folders = {"train": "training_nuclei", "val": "development_nuclei", "test": "test_nuclei"}
    try:
        dname = split_folders[split]
    except (KeyError, TypeError):
        raise ValueError(f"'{split}' is not a valid split.") from None

    data_dir = os.path.join(path, dname)
    if os.path.exists(data_dir):
        # The split folder is already there, so there is nothing to fetch.
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, f"{dname}.zip")
    util.download_source(path=zip_path, url=URLS[split], download=download, checksum=CHECKSUMS[split])
    util.unzip(zip_path=zip_path, dst=path)

    return data_dir


def get_arvidsson_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the Arvidsson data.

    If the single-channel instance labels ('*_preprocessed.tif') are not yet available for
    every image, they are created from the RGB-encoded annotation PNGs and written next to them.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If the number of images and annotations differ,
            or if an image and its converted labels have different shapes.
    """
    data_dir = get_arvidsson_data(path, split, download)

    raw_paths = natsorted(glob(os.path.join(data_dir, "images", "*.png")))
    label_paths = natsorted(glob(os.path.join(data_dir, "annotations", "*_preprocessed.tif")))
    if len(raw_paths) == len(label_paths):
        # Every image already has a converted label file.
        return raw_paths, label_paths

    channel_label_paths = natsorted(glob(os.path.join(data_dir, "annotations", "*.png")))
    # Images and annotations are paired purely by sorted order, so the counts have to agree.
    if len(raw_paths) != len(channel_label_paths):
        raise ValueError(
            f"Found {len(raw_paths)} images but {len(channel_label_paths)} annotations in '{data_dir}'."
        )

    instance_paths = []
    for rpath, lpath in tqdm(
        zip(raw_paths, channel_label_paths), desc=f"Preprocessing labels for '{split}' split", total=len(raw_paths)
    ):
        # Only swap the file extension; a plain string replace would also touch ".png" in folder names.
        instance_path = os.path.splitext(lpath)[0] + "_preprocessed.tif"
        instance_paths.append(instance_path)
        if os.path.exists(instance_path):
            continue

        raw = imageio.imread(rpath)
        labels = imageio.imread(lpath)

        # NOTE: Converting the RGB-style instance labels to single channel instance labels.
        # We do not operate over the background region (with known pixel values: [0, 0, 0])
        background_mask = np.all(labels == [0, 0, 0], axis=-1)
        _, indices = np.unique(labels[~background_mask].reshape(-1, 3), axis=0, return_inverse=True)

        instances = np.zeros(labels.shape[:2], dtype=np.int32)
        # Flatten the inverse: depending on the NumPy version it may not be 1-D when `axis` is given.
        instances[~background_mask] = indices.reshape(-1) + 1
        # Relabel so that spatially separate regions sharing one colour get distinct ids.
        instances = connected_components(instances)

        if raw.shape != instances.shape:
            raise ValueError(
                f"Shape mismatch for '{rpath}': image has shape {raw.shape}, labels have shape {instances.shape}."
            )

        imageio.imwrite(instance_path, instances, compression="zlib")

    return raw_paths, instance_paths


def get_arvidsson_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the Arvidsson dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_arvidsson_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )


def get_arvidsson_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the Arvidsson dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Keyword arguments are routed either to the dataset constructor or to the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_arvidsson_dataset(path, patch_shape, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
# Download location of the zip archive for each data split.
URLS = {
    'train': 'https://zenodo.org/records/6657260/files/training_nuclei.zip',
    'val': 'https://zenodo.org/records/6657260/files/development_nuclei.zip',
    'test': 'https://zenodo.org/records/6657260/files/test_nuclei.zip',
}
# Checksum passed to the download helper for each split's zip archive.
CHECKSUMS = {
    'train': 'df075941f4e561f9ef82d4c48d22cf97e3627a0b63fa136675197614813fff90',
    'val': '722530a93fd5b67f61d52964651c715be6227c1c0508c4c95ef2b04b52fc1dd1',
    'test': '377dc719c4eaf9bfa30273f7e3a4042d98dbbfc4a1c4af2a467879237bff592f',
}
def
get_arvidsson_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str:
def get_arvidsson_data(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False
) -> str:
    """Download the Arvidsson dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder that holds the data of the requested split.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Folder expected under `path` after extraction; the archive is stored as "<name>.zip".
    split_folders = {"train": "training_nuclei", "val": "development_nuclei", "test": "test_nuclei"}
    try:
        dname = split_folders[split]
    except (KeyError, TypeError):
        raise ValueError(f"'{split}' is not a valid split.") from None

    data_dir = os.path.join(path, dname)
    if os.path.exists(data_dir):
        # The split folder is already there, so there is nothing to fetch.
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, f"{dname}.zip")
    util.download_source(path=zip_path, url=URLS[split], download=download, checksum=CHECKSUMS[split])
    util.unzip(zip_path=zip_path, dst=path)

    return data_dir
Download the Arvidsson dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the folder that holds the data of the requested split.
def
get_arvidsson_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[int], List[int]]:
def get_arvidsson_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the Arvidsson data.

    If the single-channel instance labels ('*_preprocessed.tif') are not yet available for
    every image, they are created from the RGB-encoded annotation PNGs and written next to them.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If the number of images and annotations differ,
            or if an image and its converted labels have different shapes.
    """
    data_dir = get_arvidsson_data(path, split, download)

    raw_paths = natsorted(glob(os.path.join(data_dir, "images", "*.png")))
    label_paths = natsorted(glob(os.path.join(data_dir, "annotations", "*_preprocessed.tif")))
    if len(raw_paths) == len(label_paths):
        # Every image already has a converted label file.
        return raw_paths, label_paths

    channel_label_paths = natsorted(glob(os.path.join(data_dir, "annotations", "*.png")))
    # Images and annotations are paired purely by sorted order, so the counts have to agree.
    if len(raw_paths) != len(channel_label_paths):
        raise ValueError(
            f"Found {len(raw_paths)} images but {len(channel_label_paths)} annotations in '{data_dir}'."
        )

    instance_paths = []
    for rpath, lpath in tqdm(
        zip(raw_paths, channel_label_paths), desc=f"Preprocessing labels for '{split}' split", total=len(raw_paths)
    ):
        # Only swap the file extension; a plain string replace would also touch ".png" in folder names.
        instance_path = os.path.splitext(lpath)[0] + "_preprocessed.tif"
        instance_paths.append(instance_path)
        if os.path.exists(instance_path):
            continue

        raw = imageio.imread(rpath)
        labels = imageio.imread(lpath)

        # NOTE: Converting the RGB-style instance labels to single channel instance labels.
        # We do not operate over the background region (with known pixel values: [0, 0, 0])
        background_mask = np.all(labels == [0, 0, 0], axis=-1)
        _, indices = np.unique(labels[~background_mask].reshape(-1, 3), axis=0, return_inverse=True)

        instances = np.zeros(labels.shape[:2], dtype=np.int32)
        # Flatten the inverse: depending on the NumPy version it may not be 1-D when `axis` is given.
        instances[~background_mask] = indices.reshape(-1) + 1
        # Relabel so that spatially separate regions sharing one colour get distinct ids.
        instances = connected_components(instances)

        if raw.shape != instances.shape:
            raise ValueError(
                f"Shape mismatch for '{rpath}': image has shape {raw.shape}, labels have shape {instances.shape}."
            )

        imageio.imwrite(instance_path, instances, compression="zlib")

    return raw_paths, instance_paths
Get paths to the Arvidsson data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_arvidsson_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_arvidsson_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the Arvidsson nucleus segmentation dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Resolve (and, if needed, download and preprocess) the image / label file lists first.
    image_paths, instance_paths = get_arvidsson_paths(path=path, split=split, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths, raw_key=None,
        label_paths=instance_paths, label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset
Get the Arvidsson dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_arvidsson_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_arvidsson_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the Arvidsson dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Keyword arguments are routed either to the dataset constructor or to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    segmentation_dataset = get_arvidsson_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(segmentation_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the Arvidsson dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.