torch_em.data.datasets.light_microscopy.scaffold_a549
The Scaffold-A549 dataset contains 3D confocal fluorescence microscopy images of A549 human lung cancer cells grown in a scaffold matrix, with one fully annotated volume for nucleus instance segmentation evaluation.
NOTE: The dataset contains 20 unlabeled training volumes and 1 labeled test volume (sf_a549_21), each of shape 64 x 512 x 512. Be aware that the annotation quality of the labeled test volume is limited.
The dataset is located at https://github.com/Kaiseem/Scaffold-A549. This dataset is from the publication https://doi.org/10.1007/s12559-021-09944-4. Please cite it if you use this dataset in your research.
"""The Scaffold-A549 dataset contains 3D confocal fluorescence microscopy images
of A549 human lung cancer cells grown in a scaffold matrix, with one fully annotated
volume for nucleus instance segmentation evaluation.

NOTE: The dataset contains 20 unlabeled training volumes and
1 labeled test volume (sf_a549_21), each of shape 64 x 512 x 512.
Also, the labeled test volume isn't the best of annotation quality.

The dataset is located at https://github.com/Kaiseem/Scaffold-A549.
This dataset is from the publication https://doi.org/10.1007/s12559-021-09944-4.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from natsort import natsorted
from typing import List, Tuple, Union

import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://github.com/Kaiseem/Scaffold-A549/releases/download/v1.0/scaffold_a549.zip"
CHECKSUM = None


def get_scaffold_a549_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the Scaffold-A549 dataset.

    Nothing is fetched when the `scaffold_a549` folder already exists inside `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    data_dir = os.path.join(path, "scaffold_a549")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, "scaffold_a549.zip")
    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
    util.unzip(zip_path, path)

    return data_dir


def _convert_to_tif(data_dir):
    """Convert .npy volumes to .tif for compatibility with torch_em loaders.

    Every `*.npy` file in the `train` and `test` subfolders of `data_dir` gets a
    sibling `.tif` file. Files that already have a `.tif` counterpart are skipped.
    """
    import imageio.v3 as imageio

    for subdir in ("train", "test"):
        for npy_path in natsorted(glob(os.path.join(data_dir, subdir, "*.npy"))):
            # Swap only the file extension: `str.replace` would also rewrite
            # any ".npy" that occurs earlier in the path (e.g. in a folder name).
            tif_path = os.path.splitext(npy_path)[0] + ".tif"
            if not os.path.exists(tif_path):
                imageio.imwrite(tif_path, np.load(npy_path))


def get_scaffold_a549_paths(
    path: Union[os.PathLike, str],
    split: str = "test",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the Scaffold-A549 data.

    Note: Only the test split has ground truth labels. The train split contains
    unlabeled volumes only.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data (empty list for 'train' split).

    Raises:
        ValueError: If `split` is neither 'train' nor 'test'.
    """
    if split not in ("train", "test"):
        raise ValueError(f"'{split}' is not a valid split. Choose 'train' or 'test'.")

    data_dir = get_scaffold_a549_data(path, download)
    _convert_to_tif(data_dir)

    split_dir = os.path.join(data_dir, split)
    if split == "test":
        raw_paths = [os.path.join(split_dir, "sf_a549_21.tif")]
        label_paths = [os.path.join(split_dir, "sf_a549_21_Label.tif")]
    else:
        candidates = natsorted(glob(os.path.join(split_dir, "sf_a549_*.tif")))
        # Test only the file name: a parent folder containing "Label" must not
        # cause every raw volume to be filtered out.
        raw_paths = [p for p in candidates if "Label" not in os.path.basename(p)]
        label_paths = []

    return raw_paths, label_paths


def get_scaffold_a549_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: str = "test",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the Scaffold-A549 dataset for 3D nucleus instance segmentation.

    Note: Only the test split has ground truth labels. The train split contains
    20 unlabeled volumes that can be used for self-supervised learning.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_scaffold_a549_paths(path, split, download)

    # The train split is unlabeled, so no label paths are handed to the dataset.
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths if split == "test" else None,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )


def get_scaffold_a549_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: str = "test",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the Scaffold-A549 dataloader for 3D nucleus instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_scaffold_a549_dataset(path, patch_shape, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_scaffold_a549_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the Scaffold-A549 data is available locally.

    Nothing is fetched when the `scaffold_a549` folder already exists inside `path`.
    Otherwise the zip archive is requested through `util.download_source` and
    handed to `util.unzip`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    extracted_dir = os.path.join(path, "scaffold_a549")

    if not os.path.exists(extracted_dir):
        os.makedirs(path, exist_ok=True)
        archive = os.path.join(path, "scaffold_a549.zip")
        util.download_source(archive, URL, download, checksum=CHECKSUM)
        util.unzip(archive, path)

    return extracted_dir
Download the Scaffold-A549 dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the extracted data directory.
def get_scaffold_a549_paths(
    path: Union[os.PathLike, str],
    split: str = "test",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the Scaffold-A549 data.

    Note: Only the test split has ground truth labels. The train split contains
    unlabeled volumes only.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data (empty list for 'train' split).

    Raises:
        ValueError: If `split` is neither 'train' nor 'test'.
    """
    if split not in ("train", "test"):
        raise ValueError(f"'{split}' is not a valid split. Choose 'train' or 'test'.")

    data_dir = get_scaffold_a549_data(path, download)
    _convert_to_tif(data_dir)

    split_dir = os.path.join(data_dir, split)
    if split == "test":
        raw_paths = [os.path.join(split_dir, "sf_a549_21.tif")]
        label_paths = [os.path.join(split_dir, "sf_a549_21_Label.tif")]
    else:
        candidates = natsorted(glob(os.path.join(split_dir, "sf_a549_*.tif")))
        # Test only the file name: a parent folder containing "Label" must not
        # cause every raw volume to be filtered out.
        raw_paths = [p for p in candidates if "Label" not in os.path.basename(p)]
        label_paths = []

    return raw_paths, label_paths
Get paths to the Scaffold-A549 data.
Note: Only the test split has ground truth labels. The train split contains unlabeled volumes only.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
- download: Whether to download the data if it is not present.
Returns:
A tuple of two lists: the filepaths for the image data, and the filepaths for the label data (an empty list for the 'train' split).
def get_scaffold_a549_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: str = "test",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the Scaffold-A549 dataset for 3D nucleus instance segmentation.

    Note: Ground truth labels exist only for the test split. The train split holds
    20 unlabeled volumes that can be used for self-supervised learning.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, annotation_files = get_scaffold_a549_paths(path, split, download)

    # Label paths are only forwarded for the labeled test split; every other
    # split is passed to the dataset factory without labels.
    labels_for_split = annotation_files if split == "test" else None

    return torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=labels_for_split,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
Get the Scaffold-A549 dataset for 3D nucleus instance segmentation.
Note: Only the test split has ground truth labels. The train split contains 20 unlabeled volumes that can be used for self-supervised learning.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_scaffold_a549_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: str = "test",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the Scaffold-A549 dataloader for 3D nucleus instance segmentation.

    The extra keyword arguments are divided by `util.split_kwargs` into those for
    `torch_em.default_segmentation_dataset` and the rest, which are passed on to
    `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_scaffold_a549_dataset(path, patch_shape, split, download, **dataset_kwargs)
    loader = torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
    return loader
Get the Scaffold-A549 dataloader for 3D nucleus instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.