torch_em.data.datasets.light_microscopy.yeaz

The YeaZ dataset contains annotations for yeast cells in brightfield (2d) and phase-contrast (2d+t) microscopy images.

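NOTE: The data is located at:
- Phase-contrast: https://drive.google.com/file/d/14MUIN26ou0L12UC9UV_AC2S3isj1qBMY
- Brightfield: https://drive.google.com/file/d/1Sot3bau0F0dsBjRxoQzdGOeUy_wMezal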

The dataset is located at https://www.epfl.ch/labs/lpbs/data-and-software/. This dataset is from the publication https://doi.org/10.1038/s41467-020-19557-4. Please cite it if you use this dataset for your research.

  1"""The YeaZ dataset contains annotations for yeast cells in brightfield (2d)
  2and phase-contrast (2d+t) microscopy images.
  3
  4NOTE: The data is located at:
  5- Phase-contrast: https://drive.google.com/file/d/14MUIN26ou0L12UC9UV_AC2S3isj1qBMY.
  6- Brightfield: https://drive.google.com/file/d/1Sot3bau0F0dsBjRxoQzdGOeUy_wMezal
  7
  8The dataset is located at https://www.epfl.ch/labs/lpbs/data-and-software/.
  9This dataset is from the publication https://doi.org/10.1038/s41467-020-19557-4.
 10Please cite it if you use this dataset for your research.
 11"""
 12
 13import os
 14from glob import glob
 15from natsort import natsorted
 16from typing import Union, Tuple, Literal, List
 17
 18import json
 19from sklearn.model_selection import train_test_split
 20
 21from torch.utils.data import Dataset, DataLoader
 22
 23import torch_em
 24
 25from .. import util
 26
 27
# Google Drive links to the archives that have to be downloaded manually, keyed by modality:
# "phc" is the phase-contrast data and "bf" is the brightfield data.
URL = {
    "phc": "https://drive.google.com/file/d/14MUIN26ou0L12UC9UV_AC2S3isj1qBMY",
    "bf": "https://drive.google.com/file/d/1Sot3bau0F0dsBjRxoQzdGOeUy_wMezal"
}
 32
 33
 34def get_yeaz_data(path: Union[os.PathLike, str], choice: Literal['bf, phc'], download: bool = False) -> str:
 35    """Obtain the YeaZ dataset.
 36
 37    NOTE: Please download the dataset manually.
 38
 39    Args:
 40        path: Filepath to a folder where the data is expected to be downloaded for further processing.
 41        download: Whether to download the data if it is not present. Not implemented for this data.
 42
 43    Returns:
 44        Filepath where the data is expected to be downloaded.
 45    """
 46    if choice not in ['bf', 'phc']:
 47        raise ValueError(f"'{choice}' is not a valid choice of dataset.")
 48
 49    data_dir = os.path.join(path, "gold-standard-PhC-plus-2" if choice == "phc" else "gold-standard-BF-V-1")
 50    if os.path.exists(data_dir):
 51        return data_dir
 52
 53    os.makedirs(path, exist_ok=True)
 54
 55    tar_path = os.path.join(
 56        path, "gold-standard-PhC-plus-2.tar.gz" if choice == "phc" else "gold-standard-BF-V-1.tar.gz"
 57    )
 58
 59    if not os.path.exists(tar_path) or download:
 60        raise NotImplementedError(
 61            f"Automatic download is not supported. Please download the data manually from '{URL[choice]}'."
 62        )
 63
 64    util.unzip_tarfile(tar_path=tar_path, dst=path, remove=False)
 65
 66    return data_dir
 67
 68
 69def _create_data_splits(path, data_dir, choice, split, raw_paths):
 70    json_file = os.path.join(path, f"yeaz_{choice}_splits.json")
 71    if os.path.exists(json_file):
 72        with open(json_file, "r") as f:
 73            data = json.load(f)
 74    else:
 75        # Get the filenames
 76        names = [os.path.basename(p) for p in raw_paths]
 77
 78        # Create train / val / test splits
 79        train_split, test_split = train_test_split(names, test_size=0.2)
 80        train_split, val_split = train_test_split(train_split, test_size=0.15)
 81        data = {"train": train_split, "val": val_split, "test": test_split}
 82
 83        # Write the filenames with splits to a json file.
 84        with open(json_file, "w") as f:
 85            json.dump(data, f, indent=4)
 86
 87    _raw_paths = [os.path.join(data_dir, name) for name in data[split]]
 88    _label_paths = [p.replace("_im.tif", "_mask.tif") for p in _raw_paths]
 89
 90    return _raw_paths, _label_paths
 91
 92
 93def get_yeaz_paths(
 94    path: Union[os.PathLike, str],
 95    choice: Literal['bf, phc'],
 96    split: Literal['train', 'val', 'test'],
 97    download: bool = False
 98) -> Tuple[List[str], List[str]]:
 99    """Get the YeaZ data.
100
101    Args:
102        path: Filepath to a folder where the data is expected to be downloaded for further processing.
103        choice: The choice of modality for dataset.
104        split: The choice of data split.
105        download: Whether to download the data if it is not present. Not implemented for this data.
106
107    Returns:
108        List of filepaths for the image data.
109        List of filepaths for the label data.
110    """
111    data_dir = get_yeaz_data(path, choice, download)
112
113    raw_paths = natsorted(glob(os.path.join(data_dir, "*_im.tif")))
114
115    # Get the raw and label paths.
116    raw_paths, label_paths = _create_data_splits(path, data_dir, choice, split, raw_paths)
117
118    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0
119
120    return raw_paths, label_paths
121
122
def get_yeaz_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    choice: Literal['bf', 'phc'],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the YeaZ dataset for yeast cell segmentation.

    Args:
        path: Filepath to a folder where the data is expected to be downloaded for further processing.
        patch_shape: The patch shape to use for training.
        choice: The choice of modality for dataset, either 'bf' or 'phc'.
        split: The choice of data split.
        download: Whether to download the data if it is not present. Not implemented for this data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_yeaz_paths(path, choice, split, download)

    # Images and masks are passed as tif files, so no internal dataset key is given.
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs
    )
154
155
def get_yeaz_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    choice: Literal['bf', 'phc'],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the YeaZ dataloader for yeast cell segmentation.

    Args:
        path: Filepath to a folder where the data is expected to be downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        choice: The choice of modality for dataset, either 'bf' or 'phc'.
        split: The choice of data split.
        download: Whether to download the data if it is not present. Not implemented for this data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_yeaz_dataset(path, patch_shape, choice, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL = {'phc': 'https://drive.google.com/file/d/14MUIN26ou0L12UC9UV_AC2S3isj1qBMY', 'bf': 'https://drive.google.com/file/d/1Sot3bau0F0dsBjRxoQzdGOeUy_wMezal'}
def get_yeaz_data( path: Union[os.PathLike, str], choice: Literal['bf, phc'], download: bool = False) -> str:
def get_yeaz_data(path: Union[os.PathLike, str], choice: Literal['bf', 'phc'], download: bool = False) -> str:
    """Obtain the YeaZ dataset.

    NOTE: Please download the dataset manually.

    Args:
        path: Filepath to a folder where the data is expected to be downloaded for further processing.
        choice: The choice of modality for dataset, either 'bf' (brightfield) or 'phc' (phase-contrast).
        download: Whether to download the data if it is not present. Not implemented for this data.

    Returns:
        Filepath where the data is expected to be downloaded.

    Raises:
        ValueError: If `choice` is neither 'bf' nor 'phc'.
        NotImplementedError: If neither the extracted folder nor the archive is present in `path`.
    """
    if choice not in ['bf', 'phc']:
        raise ValueError(f"'{choice}' is not a valid choice of dataset.")

    # The extracted folder and the archive share the same stem.
    folder_name = "gold-standard-PhC-plus-2" if choice == "phc" else "gold-standard-BF-V-1"

    data_dir = os.path.join(path, folder_name)
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    tar_path = os.path.join(path, f"{folder_name}.tar.gz")

    # Only a missing archive is an error. An archive that is already on disk can be extracted
    # regardless of the 'download' flag, which matches how an already extracted folder is treated.
    if not os.path.exists(tar_path):
        raise NotImplementedError(
            f"Automatic download is not supported. Please download the data manually from '{URL[choice]}'."
        )

    util.unzip_tarfile(tar_path=tar_path, dst=path, remove=False)

    return data_dir

Obtain the YeaZ dataset.

NOTE: Please download the dataset manually.

Arguments:
  • path: Filepath to a folder where the data is expected to be downloaded for further processing.
  • choice: The choice of modality for dataset, either 'bf' (brightfield) or 'phc' (phase-contrast).
  • download: Whether to download the data if it is not present. Not implemented for this data.
Returns:

Filepath where the data is expected to be downloaded.

def get_yeaz_paths( path: Union[os.PathLike, str], choice: Literal['bf, phc'], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[str], List[str]]:
 94def get_yeaz_paths(
 95    path: Union[os.PathLike, str],
 96    choice: Literal['bf, phc'],
 97    split: Literal['train', 'val', 'test'],
 98    download: bool = False
 99) -> Tuple[List[str], List[str]]:
100    """Get the YeaZ data.
101
102    Args:
103        path: Filepath to a folder where the data is expected to be downloaded for further processing.
104        choice: The choice of modality for dataset.
105        split: The choice of data split.
106        download: Whether to download the data if it is not present. Not implemented for this data.
107
108    Returns:
109        List of filepaths for the image data.
110        List of filepaths for the label data.
111    """
112    data_dir = get_yeaz_data(path, choice, download)
113
114    raw_paths = natsorted(glob(os.path.join(data_dir, "*_im.tif")))
115
116    # Get the raw and label paths.
117    raw_paths, label_paths = _create_data_splits(path, data_dir, choice, split, raw_paths)
118
119    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0
120
121    return raw_paths, label_paths

Get the YeaZ data.

Arguments:
  • path: Filepath to a folder where the data is expected to be downloaded for further processing.
  • choice: The choice of modality for dataset.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present. Not implemented for this data.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_yeaz_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], choice: Literal['bf, phc'], split: Literal['train', 'val', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_yeaz_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    choice: Literal['bf', 'phc'],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the YeaZ dataset for yeast cell segmentation.

    Args:
        path: Filepath to a folder where the data is expected to be downloaded for further processing.
        patch_shape: The patch shape to use for training.
        choice: The choice of modality for dataset, either 'bf' or 'phc'.
        split: The choice of data split.
        download: Whether to download the data if it is not present. Not implemented for this data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_yeaz_paths(path, choice, split, download)

    # Images and masks are passed as tif files, so no internal dataset key is given.
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs
    )

Get the YeaZ dataset for yeast cell segmentation.

Arguments:
  • path: Filepath to a folder where the data is expected to be downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • choice: The choice of modality for dataset.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present. Not implemented for this data.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_yeaz_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], choice: Literal['bf, phc'], split: Literal['train', 'val', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_yeaz_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    choice: Literal['bf', 'phc'],
    split: Literal['train', 'val', 'test'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the YeaZ dataloader for yeast cell segmentation.

    Args:
        path: Filepath to a folder where the data is expected to be downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        choice: The choice of modality for dataset, either 'bf' or 'phc'.
        split: The choice of data split.
        download: Whether to download the data if it is not present. Not implemented for this data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_yeaz_dataset(path, patch_shape, choice, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the YeaZ dataloader for yeast cell segmentation.

Arguments:
  • path: Filepath to a folder where the data is expected to be downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • choice: The choice of modality for dataset.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present. Not implemented for this data.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.