torch_em.data.datasets.light_microscopy.aisegcell
The aiSEGcell dataset contains annotations for nucleus segmentation in paired brightfield and fluorescence images.
The dataset collection is located at https://www.research-collection.ethz.ch/handle/20.500.11850/679085. This dataset is from the publication https://doi.org/10.1371/journal.pcbi.1012361. Please cite it if you use this dataset in your research.
1"""The aiSEGcell dataset contains annotations for nucleus segmentation in 2paired brightfield and fluorescence images. 3 4The dataset collection is located at https://www.research-collection.ethz.ch/handle/20.500.11850/679085. 5This dataset is from the publication https://doi.org/10.1371/journal.pcbi.1012361. 6Please cite it if you use this dataset in your research. 7""" 8 9import os 10from glob import glob 11from tqdm import tqdm 12from pathlib import Path 13from natsort import natsorted 14from typing import List, Union, Tuple, Literal 15from concurrent.futures import ProcessPoolExecutor 16 17import numpy as np 18import imageio.v3 as imageio 19from skimage.measure import label as connected_components 20 21from torch.utils.data import Dataset, DataLoader 22 23import torch_em 24 25from .. import util 26 27 28URL = "https://libdrive.ethz.ch/index.php/s/VoF2SYkbLY8izjh/download" 29CHECKSUM = "f9115ee6b71e7c4364b83f7d7f8b66dce5b778344070bddb6a8f0e5086ca5de9" 30 31 32def _process_each_image(args): 33 import h5py 34 35 bpath, npath, gpath, data_dir = args 36 37 path_parents = Path(bpath).parents 38 split = path_parents[1].name.split("_")[-1] 39 dname = path_parents[2].name 40 41 neu_dir = os.path.join(data_dir, split, dname) 42 os.makedirs(neu_dir, exist_ok=True) 43 44 fpath = os.path.join(neu_dir, f"{Path(bpath).stem}.h5") 45 if os.path.exists(fpath): 46 return 47 48 bf = imageio.imread(bpath) 49 nuc = imageio.imread(npath) 50 gt = imageio.imread(gpath) 51 52 # Ensure all bf images have 3 channels. 53 if bf.ndim == 3: 54 bf = bf.transpose(2, 0, 1) 55 else: 56 bf = np.stack([bf] * 3, axis=0) 57 58 # Ensure all fluo images have 3 channels. 59 if nuc.ndim == 3: 60 nuc = nuc.transpose(2, 0, 1) 61 else: 62 nuc = np.stack([nuc] * 3, axis=0) 63 64 assert nuc.ndim == bf.ndim == 3 65 66 # Labels have 3 channels. Keep only one. 67 if gt.ndim == 3: 68 gt = gt[..., 0] 69 70 gt = connected_components(gt).astype("uint16") 71 72 with h5py.File(fpath, "w") as f: 73 f.create_dataset("raw/brightfield", data=bf, compression="gzip") 74 f.create_dataset("raw/fluorescence", data=nuc, compression="gzip") 75 f.create_dataset("labels", data=gt, compression="gzip") 76 77 78def _preprocess_data(data_dir, base_dir): 79 80 bf_paths = natsorted(glob(os.path.join(base_dir, "**", "brightfield", "*.png"), recursive=True)) 81 nucleus_paths = natsorted(glob(os.path.join(base_dir, "**", "nucleus", "*.png"), recursive=True)) 82 gt_paths = natsorted(glob(os.path.join(base_dir, "**", "masks", "*.png"), recursive=True)) 83 84 assert bf_paths and len(bf_paths) == len(nucleus_paths) == len(gt_paths) 85 86 tasks = [(b, n, g, data_dir) for b, n, g in zip(bf_paths, nucleus_paths, gt_paths)] 87 with ProcessPoolExecutor() as executor: 88 list(tqdm(executor.map(_process_each_image, tasks), total=len(tasks), desc="Processing data")) 89 90 91def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str: 92 """Download the aiSEGcell dataset. 93 94 Args: 95 path: Filepath to a folder where the downloaded data will be saved. 96 download: Whether to download the data if it is not present. 97 98 Returns: 99 Filepath where the dataset is stored. 100 """ 101 data_dir = os.path.join(path, "data") 102 if os.path.exists(data_dir): 103 return data_dir 104 105 zip_path = os.path.join(path, "data.zip") 106 util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM) 107 108 # We need to do multiple unzip and untar to get the data out. 109 print( 110 "'aiSEGcell' is a very large dataset (>60GB). 
It might take a couple of hours to download, " 111 "unzip and preprocess the data. Please ensure that you have a stable internet connection." 112 ) 113 util.unzip(zip_path=zip_path, dst=path, remove=False) 114 util.unzip_tarfile(tar_path=os.path.join(path, "679085", "aisegcell_supplement.tar"), dst=path) 115 util.unzip_tarfile( 116 tar_path=os.path.join(path, "679085", "aiSEGcell_supplement", "data_sets", "aiSEGcell_nucleus.tar"), dst=path, 117 ) 118 119 # Now that we have the core 'aiSEGcell_nucleus' folder on top-level directory, we can take it for processing data. 120 _preprocess_data(data_dir=data_dir, base_dir=os.path.join(path, "aiSEGcell_nucleus")) 121 122 return data_dir 123 124 125def get_aisegcell_paths( 126 path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False, 127) -> List[str]: 128 """Get paths to the aiSEGcell dataset. 129 130 Args: 131 path: Filepath to a folder where the downloaded data will be saved. 132 split: The data split to use. Either 'train', 'val' or 'test'. 133 download: Whether to download the data if it is not present. 134 135 Returns: 136 List of filepaths for the input data. 137 """ 138 data_dir = get_aisegcell_data(path, download) 139 140 if split not in ["train", "val", "test"]: 141 raise ValueError(f"'{split}' is not a valid split choice.") 142 143 data_paths = glob(os.path.join(data_dir, split, "**", "*.h5"), recursive=True) 144 assert len(data_paths) > 0 145 return data_paths 146 147 148def get_aisegcell_dataset( 149 path: Union[os.PathLike, str], 150 patch_shape: Tuple[int, int], 151 split: Literal["train", "val", "test"], 152 raw_channel: Literal["brightfield", "fluorescence"] = "brightfield", 153 download: bool = False, 154 **kwargs 155) -> Dataset: 156 """Get the aiSEGcell dataset for nucleus segmentation. 157 158 Args: 159 path: Filepath to a folder where the downloaded data will be saved. 160 patch_shape: The patch shape to use for training. 161 split: The data split to use. Either 'train', 'val' or 'test'. 162 raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'. 163 download: Whether to download the data if it is not present. 164 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 165 166 Returns: 167 The segmentation dataset. 168 """ 169 data_paths = get_aisegcell_paths(path, split, download) 170 171 return torch_em.default_segmentation_dataset( 172 raw_paths=data_paths, 173 raw_key=f"raw/{raw_channel}", 174 label_paths=data_paths, 175 label_key="labels", 176 is_seg_dataset=True, 177 patch_shape=patch_shape, 178 ndim=2, 179 with_channels=True, 180 **kwargs 181 ) 182 183 184def get_aisegcell_loader( 185 path: Union[os.PathLike, str], 186 batch_size: int, 187 patch_shape: Tuple[int, int], 188 split: Literal["train", "val", "test"], 189 raw_channel: Literal["brightfield", "fluorescence"] = "brightfield", 190 download: bool = False, 191 **kwargs 192) -> DataLoader: 193 """Get the aiSEGcell dataloader for nucleus segmentation. 194 195 Args: 196 path: Filepath to a folder where the downloaded data will be saved. 197 batch_size: The batch size for training. 198 patch_shape: The patch shape to use for training. 199 split: The data split to use. Either 'train', 'val' or 'test'. 200 raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'. 201 download: Whether to download the data if it is not present. 202 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 
203 204 Returns: 205 The DataLoader. 206 """ 207 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 208 dataset = get_aisegcell_dataset(path, patch_shape, split, raw_channel, download, **ds_kwargs) 209 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
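To make the on-disk layout produced by the preprocessing above concrete, here is a minimal usage sketch. The target folder "./data/aisegcell" is only an example path, and download=True assumes you have the required disk space and bandwidth (the raw download is >60GB).

import h5py

from torch_em.data.datasets.light_microscopy.aisegcell import get_aisegcell_data, get_aisegcell_paths

# Download, unpack and preprocess the data. This can take several hours.
data_dir = get_aisegcell_data("./data/aisegcell", download=True)

# Each preprocessed HDF5 file stores both raw channels and the instance labels.
paths = get_aisegcell_paths("./data/aisegcell", split="train", download=True)
with h5py.File(paths[0], "r") as f:
    print(f["raw/brightfield"].shape)   # (3, height, width)
    print(f["raw/fluorescence"].shape)  # (3, height, width)
    print(f["labels"].shape)            # (height, width)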
URL = 'https://libdrive.ethz.ch/index.php/s/VoF2SYkbLY8izjh/download'
CHECKSUM = 'f9115ee6b71e7c4364b83f7d7f8b66dce5b778344070bddb6a8f0e5086ca5de9'
def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the aiSEGcell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the dataset is stored.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    zip_path = os.path.join(path, "data.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    # We need several unzip and untar steps to get the data out.
    print(
        "'aiSEGcell' is a very large dataset (>60GB). It might take a couple of hours to download, "
        "unzip and preprocess the data. Please ensure that you have a stable internet connection."
    )
    util.unzip(zip_path=zip_path, dst=path, remove=False)
    util.unzip_tarfile(tar_path=os.path.join(path, "679085", "aisegcell_supplement.tar"), dst=path)
    util.unzip_tarfile(
        tar_path=os.path.join(path, "679085", "aiSEGcell_supplement", "data_sets", "aiSEGcell_nucleus.tar"), dst=path,
    )

    # Now that the core 'aiSEGcell_nucleus' folder is in the top-level directory, we can preprocess the data.
    _preprocess_data(data_dir=data_dir, base_dir=os.path.join(path, "aiSEGcell_nucleus"))

    return data_dir
Download the aiSEGcell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the dataset is stored.
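For example (the target folder below is an illustrative choice):

data_dir = get_aisegcell_data("./data/aisegcell", download=True)
# data_dir now points to "./data/aisegcell/data", which holds the preprocessed
# HDF5 files organized into train/val/test subfolders.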
def get_aisegcell_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:
def get_aisegcell_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False,
) -> List[str]:
    """Get paths to the aiSEGcell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    data_dir = get_aisegcell_data(path, download)

    if split not in ["train", "val", "test"]:
        raise ValueError(f"'{split}' is not a valid split choice.")

    data_paths = glob(os.path.join(data_dir, split, "**", "*.h5"), recursive=True)
    assert len(data_paths) > 0
    return data_paths
Get paths to the aiSEGcell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
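For example, to collect the preprocessed HDF5 files of the validation split (the folder path is again just an example):

val_paths = get_aisegcell_paths("./data/aisegcell", split="val", download=True)
print(len(val_paths), val_paths[0])
# Each file stores the inputs under "raw/brightfield" and "raw/fluorescence"
# and the instance segmentation under "labels".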
def get_aisegcell_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], raw_channel: Literal['brightfield', 'fluorescence'] = 'brightfield', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_aisegcell_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the aiSEGcell dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    data_paths = get_aisegcell_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key=f"raw/{raw_channel}",
        label_paths=data_paths,
        label_key="labels",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
Get the aiSEGcell dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train', 'val' or 'test'.
- raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
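A short sketch of typical use; the folder and patch shape are example values, and the comment assumes the default torch_em transforms:

dataset = get_aisegcell_dataset(
    path="./data/aisegcell",
    patch_shape=(512, 512),
    split="train",
    raw_channel="fluorescence",
    download=True,
)
x, y = dataset[0]
# With the default transforms, x is a 3-channel fluorescence patch and
# y the corresponding label patch.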
def get_aisegcell_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], raw_channel: Literal['brightfield', 'fluorescence'] = 'brightfield', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_aisegcell_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the aiSEGcell dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_aisegcell_dataset(path, patch_shape, split, raw_channel, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the aiSEGcell dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train', 'val' or 'test'.
- raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.
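A usage sketch with illustrative parameter values; keyword arguments that are not accepted by torch_em.default_segmentation_dataset are forwarded to the PyTorch DataLoader:

loader = get_aisegcell_loader(
    path="./data/aisegcell",
    batch_size=8,
    patch_shape=(256, 256),
    split="test",
    raw_channel="brightfield",
    download=True,
    num_workers=4,  # forwarded to the PyTorch DataLoader
    shuffle=True,   # forwarded to the PyTorch DataLoader
)

for images, labels in loader:
    print(images.shape, labels.shape)
    break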