torch_em.data.datasets.light_microscopy.scaffold_a549

The Scaffold-A549 dataset contains 3D confocal fluorescence microscopy images of A549 human lung cancer cells grown in a scaffold matrix, with one fully annotated volume for nucleus instance segmentation evaluation.

NOTE: The dataset contains 20 unlabeled training volumes and 1 labeled test volume (sf_a549_21), each of shape 64 x 512 x 512. Also note that the annotation quality of the labeled test volume is not ideal.

The dataset is located at https://github.com/Kaiseem/Scaffold-A549. This dataset is from the publication https://doi.org/10.1007/s12559-021-09944-4. Please cite it if you use this dataset in your research.

View Source

  1"""The Scaffold-A549 dataset contains 3D confocal fluorescence microscopy images
  2of A549 human lung cancer cells grown in a scaffold matrix, with one fully annotated
  3volume for nucleus instance segmentation evaluation.
  4
  5NOTE: The dataset contains 20 unlabeled training volumes and
  61 labeled test volume (sf_a549_21), each of shape 64 x 512 x 512.
  7Also, the labeled test volume isn't the best of annotation quality.
  8
  9The dataset is located at https://github.com/Kaiseem/Scaffold-A549.
 10This dataset is from the publication https://doi.org/10.1007/s12559-021-09944-4.
 11Please cite it if you use this dataset in your research.
 12"""
 13
 14import os
 15from glob import glob
 16from natsort import natsorted
 17from typing import List, Tuple, Union
 18
 19import numpy as np
 20
 21from torch.utils.data import Dataset, DataLoader
 22
 23import torch_em
 24
 25from .. import util
 26
 27
 28URL = "https://github.com/Kaiseem/Scaffold-A549/releases/download/v1.0/scaffold_a549.zip"  # zip archive handed to util.download_source
 29CHECKSUM = None  # no checksum recorded; forwarded unchanged as `checksum=` to util.download_source
 30
 31
 32def get_scaffold_a549_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 33    """Download the Scaffold-A549 dataset.
 34
 35    Args:
 36        path: Filepath to a folder where the downloaded data will be saved.
 37        download: Whether to download the data if it is not present.
 38
 39    Returns:
 40        The filepath to the extracted data directory.
 41    """
 42    data_dir = os.path.join(path, "scaffold_a549")
 43    if os.path.exists(data_dir):
 44        return data_dir
 45
 46    os.makedirs(path, exist_ok=True)
 47    zip_path = os.path.join(path, "scaffold_a549.zip")
 48    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
 49    util.unzip(zip_path, path)
 50
 51    return data_dir
 52
 53
 54def _convert_to_tif(data_dir):
 55    """Convert .npy volumes to .tif for compatibility with torch_em loaders."""
 56    import imageio.v3 as imageio
 57
 58    for subdir in ("train", "test"):
 59        npy_files = natsorted(glob(os.path.join(data_dir, subdir, "*.npy")))
 60        for npy_path in npy_files:
 61            tif_path = npy_path.replace(".npy", ".tif")
 62            if not os.path.exists(tif_path):
 63                arr = np.load(npy_path)
 64                imageio.imwrite(tif_path, arr)
 65
 66
 67def get_scaffold_a549_paths(
 68    path: Union[os.PathLike, str],
 69    split: str = "test",
 70    download: bool = False,
 71) -> Tuple[List[str], List[str]]:
 72    """Get paths to the Scaffold-A549 data.
 73
 74    Note: Only the test split has ground truth labels. The train split contains
 75    unlabeled volumes only.
 76
 77    Args:
 78        path: Filepath to a folder where the downloaded data will be saved.
 79        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
 80        download: Whether to download the data if it is not present.
 81
 82    Returns:
 83        List of filepaths for the image data.
 84        List of filepaths for the label data (empty list for 'train' split).
 85    """
 86    if split not in ("train", "test"):
 87        raise ValueError(f"'{split}' is not a valid split. Choose 'train' or 'test'.")
 88
 89    data_dir = get_scaffold_a549_data(path, download)
 90    _convert_to_tif(data_dir)
 91
 92    split_dir = os.path.join(data_dir, split)
 93    if split == "test":
 94        raw_paths = [os.path.join(split_dir, "sf_a549_21.tif")]
 95        label_paths = [os.path.join(split_dir, "sf_a549_21_Label.tif")]
 96    else:
 97        raw_paths = natsorted(glob(os.path.join(split_dir, "sf_a549_*.tif")))
 98        raw_paths = [p for p in raw_paths if "Label" not in p]
 99        label_paths = []
100
101    return raw_paths, label_paths
102
103
def get_scaffold_a549_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: str = "test",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the Scaffold-A549 dataset for 3D nucleus instance segmentation.

    Note: Ground truth labels exist for the test split only. For the train
    split (20 unlabeled volumes, usable e.g. for self-supervised learning)
    the dataset is created with `label_paths=None`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_scaffold_a549_paths(path, split, download)

    # Only the test split carries labels; every other split gets None.
    labels = gt_paths if split == "test" else None

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=labels,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
146
147
def get_scaffold_a549_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: str = "test",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the Scaffold-A549 dataloader for 3D nucleus instance segmentation.

    The extra keyword arguments are divided with `util.split_kwargs` into those
    forwarded to `get_scaffold_a549_dataset` and those forwarded to
    `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_scaffold_a549_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)

URL = 'https://github.com/Kaiseem/Scaffold-A549/releases/download/v1.0/scaffold_a549.zip'

CHECKSUM = None

def get_scaffold_a549_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_scaffold_a549_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the Scaffold-A549 data is available locally and return its folder.

    If `<path>/scaffold_a549` already exists, it is returned right away and
    nothing is downloaded or extracted. Otherwise `path` is created if needed,
    the archive location `<path>/scaffold_a549.zip` is handed to
    `util.download_source` and the archive is extracted into `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    extracted_dir = os.path.join(path, "scaffold_a549")

    # Only touch the network / file system when the extracted folder is missing.
    if not os.path.exists(extracted_dir):
        os.makedirs(path, exist_ok=True)
        archive = os.path.join(path, "scaffold_a549.zip")
        util.download_source(archive, URL, download, checksum=CHECKSUM)
        util.unzip(archive, path)

    return extracted_dir

Download the Scaffold-A549 dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The filepath to the extracted data directory.

def get_scaffold_a549_paths( path: Union[os.PathLike, str], split: str = 'test', download: bool = False) -> Tuple[List[str], List[str]]: View Source

 68def get_scaffold_a549_paths(
 69    path: Union[os.PathLike, str],
 70    split: str = "test",
 71    download: bool = False,
 72) -> Tuple[List[str], List[str]]:
 73    """Get paths to the Scaffold-A549 data.
 74
 75    Note: Only the test split has ground truth labels. The train split contains
 76    unlabeled volumes only.
 77
 78    Args:
 79        path: Filepath to a folder where the downloaded data will be saved.
 80        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
 81        download: Whether to download the data if it is not present.
 82
 83    Returns:
 84        List of filepaths for the image data.
 85        List of filepaths for the label data (empty list for 'train' split).
 86    """
 87    if split not in ("train", "test"):
 88        raise ValueError(f"'{split}' is not a valid split. Choose 'train' or 'test'.")
 89
 90    data_dir = get_scaffold_a549_data(path, download)
 91    _convert_to_tif(data_dir)
 92
 93    split_dir = os.path.join(data_dir, split)
 94    if split == "test":
 95        raw_paths = [os.path.join(split_dir, "sf_a549_21.tif")]
 96        label_paths = [os.path.join(split_dir, "sf_a549_21_Label.tif")]
 97    else:
 98        raw_paths = natsorted(glob(os.path.join(split_dir, "sf_a549_*.tif")))
 99        raw_paths = [p for p in raw_paths if "Label" not in p]
100        label_paths = []
101
102    return raw_paths, label_paths

Get paths to the Scaffold-A549 data.

Note: Only the test split has ground truth labels. The train split contains unlabeled volumes only.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the image data. List of filepaths for the label data (empty list for 'train' split).

def get_scaffold_a549_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: str = 'test', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_scaffold_a549_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: str = "test",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the Scaffold-A549 dataset for 3D nucleus instance segmentation.

    Note: Ground truth labels exist for the test split only. For the train
    split (20 unlabeled volumes, usable e.g. for self-supervised learning)
    the dataset is created with `label_paths=None`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_scaffold_a549_paths(path, split, download)

    # Only the test split carries labels; every other split gets None.
    labels = gt_paths if split == "test" else None

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=labels,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )

Get the Scaffold-A549 dataset for 3D nucleus instance segmentation.

Note: Only the test split has ground truth labels. The train split contains 20 unlabeled volumes that can be used for self-supervised learning.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_scaffold_a549_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: str = 'test', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_scaffold_a549_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: str = "test",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the Scaffold-A549 dataloader for 3D nucleus instance segmentation.

    The extra keyword arguments are divided with `util.split_kwargs` into those
    forwarded to `get_scaffold_a549_dataset` and those forwarded to
    `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_scaffold_a549_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)

Get the Scaffold-A549 dataloader for 3D nucleus instance segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The split to use. Either 'train' (unlabeled) or 'test' (labeled).
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.