torch_em.data.datasets.light_microscopy.organoid

The OrganoID dataset contains annotations for pancreatic organoids in brightfield images.

The dataset is from the publication https://doi.org/10.1371/journal.pcbi.1010584. Please cite it if you use this dataset for your research.

  1"""The OrganoID dataset contains annotations for pancreatic organoids in brightfield images.
  2
  3The dataset is from the publication https://doi.org/10.1371/journal.pcbi.1010584.
  4Please cite it if you use this dataset for your research.
  5"""
  6
  7import os
  8import shutil
  9from glob import glob
 10from pathlib import Path
 11from natsort import natsorted
 12from typing import Union, Tuple, List, Literal, Optional
 13
 14import numpy as np
 15import imageio.v3 as imageio
 16from skimage.measure import label as connected_components
 17
 18from torch.utils.data import DataLoader, Dataset
 19
 20import torch_em
 21
 22from .. import util
 23
 24
URL = "https://osf.io/download/69nr8/"
# CHECKSUM = "a399288524d12bbadeebb38d52711fa746402456257b0cc6531d8c3c5a0cb8f1"
# NOTE: Checksum validation is switched off (the value above is kept for reference only).
# The author recalls checksums failing for OSF downloads and expects the same to happen for
# this file - TODO confirm and re-enable the checksum if it turns out to be stable.
CHECKSUM = None
 28
 29
 30def _store_files_as_h5(data_dir, image_dir, image_pattern, label_dir, label_pattern):
 31
 32    import h5py
 33
 34    if os.path.exists(data_dir):
 35        return
 36
 37    os.makedirs(data_dir, exist_ok=True)
 38
 39    image_paths = natsorted(glob(os.path.join(image_dir, image_pattern)))
 40    gt_paths = natsorted(glob(os.path.join(label_dir, label_pattern)))
 41
 42    assert image_paths and len(image_paths) == len(gt_paths)
 43
 44    for image_path, gt_path in zip(image_paths, gt_paths):
 45        image = imageio.imread(image_path)
 46        gt = imageio.imread(gt_path)
 47
 48        if gt.ndim == 3:
 49            gt = gt[..., 0]  # Choose one label channel as all are same.
 50
 51        gt = connected_components(gt > 0).astype("uint16")  # Run connected components to get instances.
 52
 53        # Preprocess the image (ensure all images are 3-channel).
 54        if image.ndim == 3 and image.shape[-1] == 4:
 55            image = image[..., :-1]  # Remove alpha channel
 56        elif image.ndim == 2:
 57            image = np.stack([image] * 3, axis=-1)
 58
 59        assert image.ndim == 3 and image.shape[-1] == 3, image.shape
 60
 61        # Now, make channels first (to make this work with our dataset)
 62        image = image.transpose(2, 0, 1)
 63
 64        with h5py.File(os.path.join(data_dir, f"{Path(image_path).stem}.h5"), "w") as f:
 65            f.create_dataset(name="raw", data=image, compression="gzip")
 66            f.create_dataset(name="labels", data=gt, compression="gzip")
 67
 68
 69def _preprocess_per_species(data_dir, stype, dirname):
 70
 71    _store_files_as_h5(
 72        data_dir=os.path.join(data_dir, dirname, "train"),
 73        image_dir=os.path.join(data_dir, stype, "training", "pre_augmented", "images"),
 74        image_pattern="*",
 75        label_dir=os.path.join(data_dir, stype, "training", "pre_augmented", "segmentations"),
 76        label_pattern="*",
 77    )
 78
 79    _store_files_as_h5(
 80        data_dir=os.path.join(data_dir, dirname, "val"),
 81        image_dir=os.path.join(data_dir, stype, "validation", "images"), image_pattern="*",
 82        label_dir=os.path.join(data_dir, stype, "validation", "segmentations"), label_pattern="*",
 83    )
 84
 85    _store_files_as_h5(
 86        data_dir=os.path.join(data_dir, dirname, "test"),
 87        image_dir=os.path.join(data_dir, stype, "testing", "images"), image_pattern="*",
 88        label_dir=os.path.join(data_dir, stype, "testing", "segmentations"), label_pattern="*",
 89    )
 90
 91
 92def _preprocess_data(data_dir):
 93
 94    import h5py
 95
 96    # Let's start assorting the OG PDAC organoids data. We will call this the "original" data.
 97    print("Preprocessing 'original' data")
 98    _preprocess_per_species(data_dir, "OriginalData", "original")
 99
100    # Next, we go to the 'MouseOrganoids' data. We will call this the "mouse" data.
101    print("Preprocessing 'mouse' data")
102    _preprocess_per_species(data_dir, "MouseOrganoids", "mouse")
103
104    # And finally, the 'GemcitabineScreen' data. This is a cool data, as the inputs
105    # have two channels: BF and PI (propidium iodide), responsible for reporting cellular necrosis.
106    # We will call this data as "gemcitabine".
107    gdir = os.path.join(data_dir, "gemcitabine")
108    if not os.path.exists(gdir):
109        print("Preprocessing 'gemcitabine' data")
110        os.makedirs(os.path.join(data_dir, "gemcitabine"), exist_ok=True)
111
112        bf_paths = natsorted(glob(os.path.join(data_dir, "GemcitabineScreen", "BF", "*.tif")))
113        pi_paths = natsorted(glob(os.path.join(data_dir, "GemcitabineScreen", "PI", "*.tif")))
114        label_paths = natsorted(glob(os.path.join(data_dir, "GemcitabineScreen", "OrganoIDProcessed", "*_labeled.tif")))
115
116        assert label_paths and len(label_paths) == len(bf_paths) == len(pi_paths)
117
118        for bf_path, pi_path, label_path in zip(bf_paths, pi_paths, label_paths):
119            bf_image = imageio.imread(bf_path)
120            pi_image = imageio.imread(pi_path)
121            gt = imageio.imread(label_path)
122
123            assert bf_image.shape == pi_image.shape == gt.shape
124
125            with h5py.File(os.path.join(gdir, f"{Path(bf_path).stem}.h5"), "w") as f:
126                f.create_dataset(name="raw/bf", data=bf_image, compression="gzip")
127                f.create_dataset(name="raw/pi", data=pi_image, compression="gzip")
128                f.create_dataset(name="labels", data=gt, compression="gzip")
129
130    # Let's remove all other data folders.
131    shutil.rmtree(os.path.join(data_dir, "OriginalData"))
132    shutil.rmtree(os.path.join(data_dir, "MouseOrganoids"))
133    shutil.rmtree(os.path.join(data_dir, "GemcitabineScreen"))
134
135
def get_organoid_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the OrganoID dataset, unless the data folder exists already.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the "data" folder below `path`.
    """
    data_dir = os.path.join(path, "data")

    # An existing data folder is returned as it is, without any further checks.
    if not os.path.exists(data_dir):
        archive = os.path.join(path, "data.zip")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive, dst=data_dir, remove=False)

        _preprocess_data(data_dir)

    return data_dir
157
158
def get_organoid_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    download: bool = False,
) -> List[str]:
    """Get the paths to the HDF5 files of the OrganoID data.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use. Must be None for the 'gemcitabine' data
            and must be given for all other sources.
        source: The data source to use.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    without_splits = source == "gemcitabine"
    if without_splits:
        assert split is None, "The 'gemcitabine' data has no data splits."
    else:
        assert split is not None, f"The '{source}' data expects a data split to be chosen."

    data_dir = get_organoid_data(path, download)

    # The 'gemcitabine' files are located directly in the source folder, without a split folder.
    split_folder = "" if without_splits else split
    input_paths = natsorted(glob(os.path.join(data_dir, source, split_folder, "*.h5")))
    assert input_paths
    return input_paths
186
187
def get_organoid_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    source_channels: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get OrganoID dataset for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        source: The data source to use.
        source_channels: The channel(s) to load for the 'gemcitabine' data, either a single
            channel name or a list of channel names (read from 'raw/<name>').
            Must be None for all other data sources.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Check the channel selection first, so that an invalid call fails
    # before the download and preprocessing of the data is triggered.
    if source == "gemcitabine":
        assert source_channels is not None, "You must choose 'source_channels' for 'gemcitabine' data."
        ndim = 3
        if isinstance(source_channels, str):
            # A single channel is loaded without a channel axis.
            raw_key = f"raw/{source_channels}"
            with_channels = False
        else:
            # Several channels are loaded from one dataset per channel.
            raw_key = [f"raw/{per_rkey}" for per_rkey in source_channels]
            with_channels = True
    else:
        assert source_channels is None, f"You cannot choose 'source_channels' for '{source}' data."
        ndim = 2
        raw_key = "raw"
        with_channels = True

    input_paths = get_organoid_paths(path, split, source, download)

    # Images and labels are stored in the same files, hence the same paths are used for both.
    return torch_em.default_segmentation_dataset(
        raw_paths=input_paths,
        raw_key=raw_key,
        label_paths=input_paths,
        label_key="labels",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=ndim,
        with_channels=with_channels,
        **kwargs
    )
239
240
def get_organoid_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    source_channels: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get OrganoID dataloader for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        source: The data source to use.
        source_channels: The data source channel(s) to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the arguments for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_organoid_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        source=source,
        source_channels=source_channels,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL = 'https://osf.io/download/69nr8/'
CHECKSUM = None
def get_organoid_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_organoid_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download and preprocess the OrganoID dataset, unless the data folder exists already.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the "data" folder below `path`.
    """
    data_dir = os.path.join(path, "data")

    # An existing data folder is returned as it is, without any further checks.
    if not os.path.exists(data_dir):
        archive = os.path.join(path, "data.zip")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive, dst=data_dir, remove=False)

        _preprocess_data(data_dir)

    return data_dir

Download the OrganoID dataset.

Arguments:
  • path: Filepath to the folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath where the data is downloaded.

def get_organoid_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'val', 'test']] = None, source: Literal['gemcitabine', 'mouse', 'original'] = 'original', download: bool = False) -> List[str]:
def get_organoid_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    download: bool = False,
) -> List[str]:
    """Get the paths to the HDF5 files of the OrganoID data.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use. Must be None for the 'gemcitabine' data
            and must be given for all other sources.
        source: The data source to use.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    without_splits = source == "gemcitabine"
    if without_splits:
        assert split is None, "The 'gemcitabine' data has no data splits."
    else:
        assert split is not None, f"The '{source}' data expects a data split to be chosen."

    data_dir = get_organoid_data(path, download)

    # The 'gemcitabine' files are located directly in the source folder, without a split folder.
    split_folder = "" if without_splits else split
    input_paths = natsorted(glob(os.path.join(data_dir, source, split_folder, "*.h5")))
    assert input_paths
    return input_paths

Get paths to the OrganoID data.

Arguments:
  • path: Filepath to the folder where the downloaded data will be saved.
  • split: The data split to use.
  • source: The data source to use.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the input data.

def get_organoid_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, source: Literal['gemcitabine', 'mouse', 'original'] = 'original', source_channels: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_organoid_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    source_channels: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get OrganoID dataset for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        source: The data source to use.
        source_channels: The channel(s) to load for the 'gemcitabine' data, either a single
            channel name or a list of channel names (read from 'raw/<name>').
            Must be None for all other data sources.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Check the channel selection first, so that an invalid call fails
    # before the download and preprocessing of the data is triggered.
    if source == "gemcitabine":
        assert source_channels is not None, "You must choose 'source_channels' for 'gemcitabine' data."
        ndim = 3
        if isinstance(source_channels, str):
            # A single channel is loaded without a channel axis.
            raw_key = f"raw/{source_channels}"
            with_channels = False
        else:
            # Several channels are loaded from one dataset per channel.
            raw_key = [f"raw/{per_rkey}" for per_rkey in source_channels]
            with_channels = True
    else:
        assert source_channels is None, f"You cannot choose 'source_channels' for '{source}' data."
        ndim = 2
        raw_key = "raw"
        with_channels = True

    input_paths = get_organoid_paths(path, split, source, download)

    # Images and labels are stored in the same files, hence the same paths are used for both.
    return torch_em.default_segmentation_dataset(
        raw_paths=input_paths,
        raw_key=raw_key,
        label_paths=input_paths,
        label_key="labels",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=ndim,
        with_channels=with_channels,
        **kwargs
    )

Get OrganoID dataset for organoid segmentation in brightfield microscopy images.

Arguments:
  • path: Filepath to the folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use.
  • source: The data source to use.
  • source_channels: The data source channel(s) to use.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_organoid_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, source: Literal['gemcitabine', 'mouse', 'original'] = 'original', source_channels: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_organoid_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    source_channels: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get OrganoID dataloader for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        source: The data source to use.
        source_channels: The data source channel(s) to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the arguments for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_organoid_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        source=source,
        source_channels=source_channels,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get OrganoID dataloader for organoid segmentation in brightfield microscopy images.

Arguments:
  • path: Filepath to the folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use.
  • source: The data source to use.
  • source_channels: The data source channel(s) to use.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.