torch_em.data.datasets.light_microscopy.orgaextractor

The OrgaExtractor dataset contains annotations for colon organoids in brightfield images.

NOTE: This dataset is sparsely annotated (quite a few organoids per image were missing from the annotations when AA visualized them).

This dataset is from the publication https://www.nature.com/articles/s41598-023-46485-2. The dataset is located at https://github.com/tpark16/orgaextractor, which points to the Google Drive folder at https://drive.google.com/drive/folders/17K4N7gEZUqAcwf9N2-I5DPbywwPvzAvo.

Please cite the publication if you use this dataset for your research.

  1"""The OrgaExtractor dataset contains annotations for colon organoids in brightfield images.
  2
  3NOTE: This dataset is kind of sparsely annotated (quite some organoids per image were missing when AA visualized).
  4
  5This dataset is from the publication https://www.nature.com/articles/s41598-023-46485-2.
  6And the dataset is located at https://github.com/tpark16/orgaextractor, pointing to the
  7drive link at https://drive.google.com/drive/folders/17K4N7gEZUqAcwf9N2-I5DPbywwPvzAvo.
  8
  9Please cite the publication if you use this dataset for your research.
 10"""
 11
 12import os
 13from glob import glob
 14from natsort import natsorted
 15from typing import Union, Tuple, List, Literal
 16
 17import imageio.v3 as imageio
 18from skimage.measure import label as connected_components
 19
 20from torch.utils.data import Dataset, DataLoader
 21
 22import torch_em
 23
 24from .. import util
 25
 26
 27# NOTE: The odd thing is, 'val' has no labels, but 'test' has labels.
 28# So, users are allowed to only request for 'train' and 'test' splits.
 29URLS = {
 30    "train": "https://drive.google.com/uc?export=download&id=1u987UNcZxWkEwe5gjLoR3-M0lBNicXQ1",
 31    "val": "https://drive.google.com/uc?export=download&id=1UsBrHOYY0Orkb4vsRP8SaDj-CeYfGpFG",
 32    "test": "https://drive.google.com/uc?export=download&id=1IXqu1MqMZzfw1_GzZauUhg1As_abbk6N",
 33}
 34
 35CHECKSUMS = {
 36    "train": "279bcfbcbd2fba23bbdea362b23eedacc53193034f4d23eb94ef570896da4f60",
 37    "val": "3d2288a7be39a692af2eb86bea520e7db332191cd372a8c970679b5bede61b7e",
 38    "test": "8e110ad8543031ed61c61bee5e8b41492b746d0dc8c503b6f8d4869b29a308e6",
 39}
 40
 41
 42def _preprocess_data(data_dir):
 43    gt_paths = natsorted(glob(os.path.join(data_dir, "*.tif")))
 44    for gt_path in gt_paths:
 45        gt = imageio.imread(gt_path)[..., 0]  # labels are with 3 channels. choose one as all channels are same.
 46        gt = connected_components(gt).astype("uint16")  # convert semantic labels to instances
 47        imageio.imwrite(gt_path, gt, compression="zlib")
 48
 49
 50def get_orgaextractor_data(
 51    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False,
 52) -> str:
 53    """Download the OrgaExtractor dataset.
 54
 55    Args:
 56        path: Filepath to the folder where the downloaded data will be saved.
 57        split: The data split to use.
 58        download: Whether to download the data if it is not present.
 59
 60    Returns:
 61        The filepath where the data is downloaded.
 62    """
 63    data_dir = os.path.join(path, split)
 64    if os.path.exists(data_dir):
 65        return data_dir
 66
 67    os.makedirs(data_dir, exist_ok=True)
 68
 69    zip_path = os.path.join(data_dir, f"{split}.zip")
 70    util.download_source_gdrive(
 71        path=zip_path, url=URLS[split], download=download, checksum=CHECKSUMS[split], download_type="zip",
 72    )
 73    util.unzip(zip_path=zip_path, dst=data_dir)
 74
 75    _preprocess_data(data_dir)
 76
 77    return data_dir
 78
 79
 80def get_orgaextractor_paths(
 81    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False,
 82) -> Tuple[List[str], List[str]]:
 83    """Get paths to the OrgaExtractor data.
 84
 85    Args:
 86        path: Filepath to the folder where the downloaded data will be saved.
 87        split: The data split to use.
 88        download: Whether to download the data if it is not present.
 89
 90    Returns:
 91        List of filepaths for the image data.
 92        List of filepaths for the label data.
 93    """
 94    data_dir = get_orgaextractor_data(path, split, download)
 95
 96    image_paths = natsorted(glob(os.path.join(data_dir, "*.jpg")))
 97    gt_paths = natsorted(glob(os.path.join(data_dir, "*.tif")))
 98
 99    assert image_paths and len(image_paths) == len(gt_paths)
100
101    return image_paths, gt_paths
102
103
def get_orgaextractor_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the OrgaExtractor dataset for segmenting organoids in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_orgaextractor_paths(path, split, download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_files, raw_key=None,
        label_paths=label_files, label_key=None,
        patch_shape=patch_shape, is_seg_dataset=False, ndim=2,
        **kwargs
    )
    return dataset
135
136
def get_orgaextractor_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the OrgaExtractor dataloader for segmenting organoids in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset from the remaining ones, which go to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    orgaextractor_dataset = get_orgaextractor_dataset(path, patch_shape, split, download, **dataset_kwargs)
    loader = torch_em.get_data_loader(orgaextractor_dataset, batch_size, **dataloader_kwargs)
    return loader
# Google Drive download link for the zip file of each split.
URLS = {
    "train": "https://drive.google.com/uc?export=download&id=1u987UNcZxWkEwe5gjLoR3-M0lBNicXQ1",
    "val": "https://drive.google.com/uc?export=download&id=1UsBrHOYY0Orkb4vsRP8SaDj-CeYfGpFG",
    "test": "https://drive.google.com/uc?export=download&id=1IXqu1MqMZzfw1_GzZauUhg1As_abbk6N",
}
# Checksum passed along with the download of each split's zip file.
CHECKSUMS = {
    "train": "279bcfbcbd2fba23bbdea362b23eedacc53193034f4d23eb94ef570896da4f60",
    "val": "3d2288a7be39a692af2eb86bea520e7db332191cd372a8c970679b5bede61b7e",
    "test": "8e110ad8543031ed61c61bee5e8b41492b746d0dc8c503b6f8d4869b29a308e6",
}
def get_orgaextractor_data( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> str:
def get_orgaextractor_data(
    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False,
) -> str:
    """Download the OrgaExtractor dataset.

    The data of a split is stored in the subfolder `<path>/<split>`. If that folder
    already exists and has content, it is returned without downloading anything.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath where the data is downloaded.

    Raises:
        KeyError: If no download source is registered for `split`.
    """
    data_dir = os.path.join(path, split)
    # The folder is created before the download starts, so an empty folder can be left
    # behind when the download does not complete. Only a folder with content is
    # treated as already downloaded.
    if os.path.isdir(data_dir) and os.listdir(data_dir):
        return data_dir

    # Look up the source before touching the filesystem, so that an unknown split
    # does not leave an empty folder behind.
    url, checksum = URLS[split], CHECKSUMS[split]

    os.makedirs(data_dir, exist_ok=True)

    zip_path = os.path.join(data_dir, f"{split}.zip")
    util.download_source_gdrive(
        path=zip_path, url=url, download=download, checksum=checksum, download_type="zip",
    )
    util.unzip(zip_path=zip_path, dst=data_dir)

    # Convert the semantic labels to instance labels once, right after extraction.
    _preprocess_data(data_dir)

    return data_dir

Download the OrgaExtractor dataset.

Arguments:
  • path: Filepath to the folder where the downloaded data will be saved.
  • split: The data split to use.
  • download: Whether to download the data if it is not present.
Returns:

The filepath where the data is downloaded.

def get_orgaextractor_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> Tuple[List[str], List[str]]:
 81def get_orgaextractor_paths(
 82    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False,
 83) -> Tuple[List[str], List[str]]:
 84    """Get paths to the OrgaExtractor data.
 85
 86    Args:
 87        path: Filepath to the folder where the downloaded data will be saved.
 88        split: The data split to use.
 89        download: Whether to download the data if it is not present.
 90
 91    Returns:
 92        List of filepaths for the image data.
 93        List of filepaths for the label data.
 94    """
 95    data_dir = get_orgaextractor_data(path, split, download)
 96
 97    image_paths = natsorted(glob(os.path.join(data_dir, "*.jpg")))
 98    gt_paths = natsorted(glob(os.path.join(data_dir, "*.tif")))
 99
100    assert image_paths and len(image_paths) == len(gt_paths)
101
102    return image_paths, gt_paths

Get paths to the OrgaExtractor data.

Arguments:
  • path: Filepath to the folder where the downloaded data will be saved.
  • split: The data split to use.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_orgaextractor_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_orgaextractor_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the OrgaExtractor dataset for segmenting organoids in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_orgaextractor_paths(path, split, download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_files, raw_key=None,
        label_paths=label_files, label_key=None,
        patch_shape=patch_shape, is_seg_dataset=False, ndim=2,
        **kwargs
    )
    return dataset

Get the OrgaExtractor dataset for organoid segmentation in brightfield microscopy images.

Arguments:
  • path: Filepath to the folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_orgaextractor_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_orgaextractor_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the OrgaExtractor dataloader for segmenting organoids in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset from the remaining ones, which go to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    orgaextractor_dataset = get_orgaextractor_dataset(path, patch_shape, split, download, **dataset_kwargs)
    loader = torch_em.get_data_loader(orgaextractor_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the OrgaExtractor dataloader for organoid segmentation in brightfield microscopy images.

Arguments:
  • path: Filepath to the folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.