torch_em.data.datasets.light_microscopy.aisegcell

The aiSEGcell dataset contains annotations for nucleus segmentation in paired brightfield and fluorescence images.

The dataset collection is located at https://www.research-collection.ethz.ch/handle/20.500.11850/679085. This dataset is from the publication https://doi.org/10.1371/journal.pcbi.1012361. Please cite it if you use this dataset in your research.

  1"""The aiSEGcell dataset contains annotations for nucleus segmentation in
  2paired brightfield and fluorescence images.
  3
  4The dataset collection is located at https://www.research-collection.ethz.ch/handle/20.500.11850/679085.
  5This dataset is from the publication https://doi.org/10.1371/journal.pcbi.1012361.
  6Please cite it if you use this dataset in your research.
  7"""
  8
  9import os
 10from glob import glob
 11from tqdm import tqdm
 12from pathlib import Path
 13from natsort import natsorted
 14from typing import List, Union, Tuple, Literal
 15from concurrent.futures import ProcessPoolExecutor
 16
 17import numpy as np
 18import imageio.v3 as imageio
 19from skimage.measure import label as connected_components
 20
 21from torch.utils.data import Dataset, DataLoader
 22
 23import torch_em
 24
 25from .. import util
 26
 27
# Download link for the archive that `get_aisegcell_data` passes to `util.download_source`.
URL = "https://libdrive.ethz.ch/index.php/s/VoF2SYkbLY8izjh/download"
# Checksum handed to `util.download_source` together with the URL
# (presumably a SHA-256 hex digest given its 64 characters - TODO confirm).
CHECKSUM = "f9115ee6b71e7c4364b83f7d7f8b66dce5b778344070bddb6a8f0e5086ca5de9"
 30
 31
 32def _process_each_image(args):
 33    import h5py
 34
 35    bpath, npath, gpath, data_dir = args
 36
 37    path_parents = Path(bpath).parents
 38    split = path_parents[1].name.split("_")[-1]
 39    dname = path_parents[2].name
 40
 41    neu_dir = os.path.join(data_dir, split, dname)
 42    os.makedirs(neu_dir, exist_ok=True)
 43
 44    fpath = os.path.join(neu_dir, f"{Path(bpath).stem}.h5")
 45    if os.path.exists(fpath):
 46        return
 47
 48    bf = imageio.imread(bpath)
 49    nuc = imageio.imread(npath)
 50    gt = imageio.imread(gpath)
 51
 52    # Ensure all bf images have 3 channels.
 53    if bf.ndim == 3:
 54        bf = bf.transpose(2, 0, 1)
 55    else:
 56        bf = np.stack([bf] * 3, axis=0)
 57
 58    # Ensure all fluo images have 3 channels.
 59    if nuc.ndim == 3:
 60        nuc = nuc.transpose(2, 0, 1)
 61    else:
 62        nuc = np.stack([nuc] * 3, axis=0)
 63
 64    assert nuc.ndim == bf.ndim == 3
 65
 66    # Labels have 3 channels. Keep only one.
 67    if gt.ndim == 3:
 68        gt = gt[..., 0]
 69
 70    gt = connected_components(gt).astype("uint16")
 71
 72    with h5py.File(fpath, "w") as f:
 73        f.create_dataset("raw/brightfield", data=bf, compression="gzip")
 74        f.create_dataset("raw/fluorescence", data=nuc, compression="gzip")
 75        f.create_dataset("labels", data=gt, compression="gzip")
 76
 77
 78def _preprocess_data(data_dir, base_dir):
 79
 80    bf_paths = natsorted(glob(os.path.join(base_dir, "**", "brightfield", "*.png"), recursive=True))
 81    nucleus_paths = natsorted(glob(os.path.join(base_dir, "**", "nucleus", "*.png"), recursive=True))
 82    gt_paths = natsorted(glob(os.path.join(base_dir, "**", "masks", "*.png"), recursive=True))
 83
 84    assert bf_paths and len(bf_paths) == len(nucleus_paths) == len(gt_paths)
 85
 86    tasks = [(b, n, g, data_dir) for b, n, g in zip(bf_paths, nucleus_paths, gt_paths)]
 87    with ProcessPoolExecutor() as executor:
 88        list(tqdm(executor.map(_process_each_image, tasks), total=len(tasks), desc="Processing data"))
 89
 90
 91def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 92    """Download the aiSEGcell dataset.
 93
 94    Args:
 95        path: Filepath to a folder where the downloaded data will be saved.
 96        download: Whether to download the data if it is not present.
 97
 98    Returns:
 99        Filepath where the dataset is stored.
100    """
101    data_dir = os.path.join(path, "data")
102    if os.path.exists(data_dir):
103        return data_dir
104
105    os.makedirs(path, exist_ok=True)
106    zip_path = os.path.join(path, "data.zip")
107    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
108
109    # We need to do multiple unzip and untar to get the data out.
110    print(
111        "'aiSEGcell' is a very large dataset (>60GB). It might take a couple of hours to download, "
112        "unzip and preprocess the data. Please ensure that you have a stable internet connection."
113    )
114    util.unzip(zip_path=zip_path, dst=path, remove=False)
115    util.unzip_tarfile(tar_path=os.path.join(path, "679085", "aisegcell_supplement.tar"), dst=path)
116    util.unzip_tarfile(
117        tar_path=os.path.join(path, "aiSEGcell_supplement", "data_sets", "aiSEGcell_nucleus.tar"), dst=path,
118    )
119
120    # Now that we have the core 'aiSEGcell_nucleus' folder on top-level directory, we can take it for processing data.
121    _preprocess_data(data_dir=data_dir, base_dir=os.path.join(path, "aiSEGcell_nucleus"))
122
123    return data_dir
124
125
def get_aisegcell_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False,
) -> List[str]:
    """Get paths to the aiSEGcell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Naturally sorted list of filepaths for the input data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Validate the split first, so that an invalid choice fails immediately
    # instead of after the (very large) download and the preprocessing.
    if split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split choice.")

    data_dir = get_aisegcell_data(path, download)

    # Sort the paths, since the order returned by glob is arbitrary.
    data_paths = natsorted(glob(os.path.join(data_dir, split, "**", "*.h5"), recursive=True))
    assert len(data_paths) > 0, f"Could not find any data for the '{split}' split in '{data_dir}'."
    return data_paths
147
148
def get_aisegcell_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the aiSEGcell dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Each file holds both the raw data and the labels, so the same paths are used for both.
    h5_files = get_aisegcell_paths(path, split, download)
    raw_key = f"raw/{raw_channel}"

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_files,
        raw_key=raw_key,
        label_paths=h5_files,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        is_seg_dataset=True,
        **kwargs
    )
    return dataset
183
184
def get_aisegcell_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the aiSEGcell dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_aisegcell_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        raw_channel=raw_channel,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
# Download link for the archive that `get_aisegcell_data` passes to `util.download_source`.
URL = 'https://libdrive.ethz.ch/index.php/s/VoF2SYkbLY8izjh/download'
# Checksum handed to `util.download_source` together with the URL
# (presumably a SHA-256 hex digest given its 64 characters - TODO confirm).
CHECKSUM = 'f9115ee6b71e7c4364b83f7d7f8b66dce5b778344070bddb6a8f0e5086ca5de9'
def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 92def get_aisegcell_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 93    """Download the aiSEGcell dataset.
 94
 95    Args:
 96        path: Filepath to a folder where the downloaded data will be saved.
 97        download: Whether to download the data if it is not present.
 98
 99    Returns:
100        Filepath where the dataset is stored.
101    """
102    data_dir = os.path.join(path, "data")
103    if os.path.exists(data_dir):
104        return data_dir
105
106    os.makedirs(path, exist_ok=True)
107    zip_path = os.path.join(path, "data.zip")
108    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
109
110    # We need to do multiple unzip and untar to get the data out.
111    print(
112        "'aiSEGcell' is a very large dataset (>60GB). It might take a couple of hours to download, "
113        "unzip and preprocess the data. Please ensure that you have a stable internet connection."
114    )
115    util.unzip(zip_path=zip_path, dst=path, remove=False)
116    util.unzip_tarfile(tar_path=os.path.join(path, "679085", "aisegcell_supplement.tar"), dst=path)
117    util.unzip_tarfile(
118        tar_path=os.path.join(path, "aiSEGcell_supplement", "data_sets", "aiSEGcell_nucleus.tar"), dst=path,
119    )
120
121    # Now that we have the core 'aiSEGcell_nucleus' folder on top-level directory, we can take it for processing data.
122    _preprocess_data(data_dir=data_dir, base_dir=os.path.join(path, "aiSEGcell_nucleus"))
123
124    return data_dir

Download the aiSEGcell dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the dataset is stored.

def get_aisegcell_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:
def get_aisegcell_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False,
) -> List[str]:
    """Get paths to the aiSEGcell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Naturally sorted list of filepaths for the input data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Validate the split first, so that an invalid choice fails immediately
    # instead of after the (very large) download and the preprocessing.
    if split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split choice.")

    data_dir = get_aisegcell_data(path, download)

    # Sort the paths, since the order returned by glob is arbitrary.
    data_paths = natsorted(glob(os.path.join(data_dir, split, "**", "*.h5"), recursive=True))
    assert len(data_paths) > 0, f"Could not find any data for the '{split}' split in '{data_dir}'."
    return data_paths

Get paths to the aiSEGcell dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the input data.

def get_aisegcell_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], raw_channel: Literal['brightfield', 'fluorescence'] = 'brightfield', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_aisegcell_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the aiSEGcell dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Each file holds both the raw data and the labels, so the same paths are used for both.
    h5_files = get_aisegcell_paths(path, split, download)
    raw_key = f"raw/{raw_channel}"

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_files,
        raw_key=raw_key,
        label_paths=h5_files,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        is_seg_dataset=True,
        **kwargs
    )
    return dataset

Get the aiSEGcell dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_aisegcell_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], raw_channel: Literal['brightfield', 'fluorescence'] = 'brightfield', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_aisegcell_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    raw_channel: Literal["brightfield", "fluorescence"] = "brightfield",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the aiSEGcell dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_aisegcell_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        raw_channel=raw_channel,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)

Get the aiSEGcell dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • raw_channel: The input channel to use. Either 'brightfield' or 'fluorescence'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.