torch_em.data.datasets.light_microscopy.cartocell

The CartoCell dataset contains annotations of cell segmentation in whole epithelial cysts in high-content screening microscopy images.

The dataset is located at https://data.mendeley.com/datasets/7gbkxgngpm/2. This dataset is from the publication https://doi.org/10.1016/j.crmeth.2023.100597. Please cite it if you use this dataset for your research.

  1"""The CartoCell dataset contains annotations of cell segmentation in
  2whole epithelial cysts in high-content screening microscopy images.
  3
  4The dataset is located at https://data.mendeley.com/datasets/7gbkxgngpm/2.
  5This dataset is from the publication https://doi.org/10.1016/j.crmeth.2023.100597.
  6Please cite it if you use this dataset for your research.
  7"""
  8
  9import os
 10import shutil
 11from glob import glob
 12from natsort import natsorted
 13from typing import Union, Tuple, Optional, Literal, List
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20
 21
# Direct link to the zipped dataset; passed as `url` to `util.download_source`.
URL = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/7gbkxgngpm-2.zip"
# Expected checksum of the archive, passed as `checksum` to `util.download_source`
# (64 hex characters; presumably SHA-256 - TODO confirm).
CHECKSUM = "ca3fc289e7b67febfc03cdd55fd791078f7527820c8dbcee0b98d03d993bb6f5"
# Name of the folder expected below the download folder after unzipping;
# `get_cartocell_data` moves this folder to `<path>/data`.
DNAME = "CartoCell, a high-content pipeline for accurate 3D image analysis, unveils cell morphology patterns in epithelial cysts"  # noqa
 25
 26
 27def get_cartocell_data(path: Union[os.PathLike, str], download: bool = False):
 28    """Download the CartoCell dataset.
 29
 30    Args:
 31        path: Filepath to a folder where the downloaded data will be saved.
 32        download: Whether to download the data if it is not present.
 33    """
 34    data_dir = os.path.join(path, "data")
 35    if os.path.exists(data_dir):
 36        return
 37
 38    os.makedirs(path, exist_ok=True)
 39
 40    zip_path = os.path.join(path, "cartocell.zip")
 41    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
 42    util.unzip(zip_path=zip_path, dst=path)
 43    shutil.move(src=os.path.join(path, DNAME), dst=data_dir)
 44
 45
 46def get_cartocell_paths(
 47    path: Union[os.PathLike, str],
 48    split: Optional[Literal["train", "test"]] = None,
 49    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
 50    download: bool = False
 51) -> Tuple[List[str], List[str]]:
 52    """Get paths to the CartoCell data.
 53
 54    Args:
 55        path: Filepath to a folder where the downloaded data will be saved.
 56        split: The data split to use. Either 'train', or 'test'.
 57        name: The name of data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
 58        download: Whether to download the data if it is not present.
 59
 60    Returns:
 61        List of filepaths for the image data.
 62        List of filepaths for the label data.
 63    """
 64    get_cartocell_data(path, download)
 65
 66    if split is None:
 67        split = ""
 68    else:
 69        split = split + "_"
 70
 71    if name is None:
 72        name = "*"
 73    elif name == "MDCK-Hypoxia":
 74        raise ValueError(f"'{name}' has mismatching shapes for image and corresponding labels.")
 75
 76    raw_paths = natsorted(glob(os.path.join(path, "data", f"low-resolution_{name}_{split}raw_images", "*")))
 77
 78    # NOTE: The 'MDCK-Hypoxia' inputs have mismatching input-label shapes (and axes seem interchanged)
 79    raw_paths = [rpath for rpath in raw_paths if rpath.find("MDCK-Hypoxia") == -1]
 80    label_paths = [rpath.replace("raw", "label") for rpath in raw_paths]
 81
 82    assert len(raw_paths) > 0 and len(raw_paths) == len(label_paths)
 83
 84    return raw_paths, label_paths
 85
 86
 87def get_cartocell_dataset(
 88    path: Union[os.PathLike, str],
 89    patch_shape: Tuple[int, ...],
 90    split: Optional[Literal["train", "test"]] = None,
 91    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
 92    download: bool = False, **kwargs
 93) -> Dataset:
 94    """Get the CartoCell dataset for cell segmentation.
 95
 96    Args:
 97        path: Filepath to a folder where the downloaded data will be saved.
 98        patch_shape: The patch shape to use for training.
 99        split: The data split to use. Either 'train', or 'test'.
100        name: The name of data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
101        download: Whether to download the data if it is not present.
102        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
103
104    Returns:
105        The segmentation dataset.
106    """
107    raw_paths, label_paths = get_cartocell_paths(path, split, name, download)
108
109    return torch_em.default_segmentation_dataset(
110        raw_paths=raw_paths,
111        raw_key=None,
112        label_paths=label_paths,
113        label_key=None,
114        patch_shape=patch_shape,
115        is_seg_dataset=True,
116        **kwargs
117    )
118
119
def get_cartocell_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Optional[Literal["train", "test"]] = None,
    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the CartoCell dataloader for cell segmentation.

    The extra keyword arguments are divided with `util.split_kwargs` into those for
    `torch_em.default_segmentation_dataset` and the remaining ones, which are passed
    to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        name: The name of the data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    cartocell_dataset = get_cartocell_dataset(
        path=path, patch_shape=patch_shape, split=split, name=name, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(cartocell_dataset, batch_size, **dataloader_kwargs)
# Direct link to the zipped dataset; passed as `url` to `util.download_source`.
URL = 'https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/7gbkxgngpm-2.zip'
# Expected checksum of the archive, passed as `checksum` to `util.download_source`
# (64 hex characters; presumably SHA-256 - TODO confirm).
CHECKSUM = 'ca3fc289e7b67febfc03cdd55fd791078f7527820c8dbcee0b98d03d993bb6f5'
# Name of the folder expected below the download folder after unzipping;
# `get_cartocell_data` moves this folder to `<path>/data`.
DNAME = 'CartoCell, a high-content pipeline for accurate 3D image analysis, unveils cell morphology patterns in epithelial cysts'
def get_cartocell_data(path: Union[os.PathLike, str], download: bool = False):
def get_cartocell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download and unpack the CartoCell dataset, unless it is already available.

    The archive is unzipped into `path` and the extracted folder (named `DNAME`)
    is moved to `<path>/data`. If `<path>/data` already exists, nothing is done.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    target_dir = os.path.join(path, "data")
    if not os.path.exists(target_dir):
        os.makedirs(path, exist_ok=True)

        archive = os.path.join(path, "cartocell.zip")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive, dst=path)

        # Give the extracted folder a short, fixed name, which also marks the data as available.
        shutil.move(src=os.path.join(path, DNAME), dst=target_dir)

Download the CartoCell dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
def get_cartocell_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'test']] = None, name: Optional[Literal['eggChambers', 'embryoids', 'MDCK-Normoxia', 'MDCK-Hypoxia']] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_cartocell_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "test"]] = None,
    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CartoCell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train' or 'test'.
            If None, no split name is inserted into the folder pattern.
        name: The name of the data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
            If None, all subset folders are matched and the 'MDCK-Hypoxia' images are dropped.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If the 'MDCK-Hypoxia' subset is requested explicitly.
        AssertionError: If no images are found for the requested split and subset.
    """
    get_cartocell_data(path, download)

    split_tag = "" if split is None else split + "_"

    if name is None:
        name = "*"
    elif name == "MDCK-Hypoxia":
        raise ValueError(f"'{name}' has mismatching shapes for image and corresponding labels.")

    data_root = os.path.join(path, "data")
    found = natsorted(glob(os.path.join(data_root, f"low-resolution_{name}_{split_tag}raw_images", "*")))

    # Work on the part of each path below 'data_root', so that a user-supplied 'path' which happens
    # to contain "raw" or "MDCK-Hypoxia" can neither corrupt the label paths nor empty the selection.
    candidates = [(rpath, os.path.relpath(rpath, data_root)) for rpath in found]

    # NOTE: The 'MDCK-Hypoxia' inputs have mismatching input-label shapes (and axes seem interchanged)
    candidates = [(rpath, rel) for rpath, rel in candidates if "MDCK-Hypoxia" not in rel]

    raw_paths = [rpath for rpath, _ in candidates]
    label_paths = [os.path.join(data_root, rel.replace("raw", "label")) for _, rel in candidates]

    assert len(raw_paths) > 0 and len(raw_paths) == len(label_paths), \
        f"Could not find any CartoCell images in '{data_root}' for the requested split and subset."

    return raw_paths, label_paths

Get paths to the CartoCell data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train' or 'test'.
  • name: The name of the data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_cartocell_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Optional[Literal['train', 'test']] = None, name: Optional[Literal['eggChambers', 'embryoids', 'MDCK-Normoxia', 'MDCK-Hypoxia']] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 88def get_cartocell_dataset(
 89    path: Union[os.PathLike, str],
 90    patch_shape: Tuple[int, ...],
 91    split: Optional[Literal["train", "test"]] = None,
 92    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
 93    download: bool = False, **kwargs
 94) -> Dataset:
 95    """Get the CartoCell dataset for cell segmentation.
 96
 97    Args:
 98        path: Filepath to a folder where the downloaded data will be saved.
 99        patch_shape: The patch shape to use for training.
100        split: The data split to use. Either 'train', or 'test'.
101        name: The name of data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
102        download: Whether to download the data if it is not present.
103        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
104
105    Returns:
106        The segmentation dataset.
107    """
108    raw_paths, label_paths = get_cartocell_paths(path, split, name, download)
109
110    return torch_em.default_segmentation_dataset(
111        raw_paths=raw_paths,
112        raw_key=None,
113        label_paths=label_paths,
114        label_key=None,
115        patch_shape=patch_shape,
116        is_seg_dataset=True,
117        **kwargs
118    )

Get the CartoCell dataset for cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train' or 'test'.
  • name: The name of the data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cartocell_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Optional[Literal['train', 'test']] = None, name: Optional[Literal['eggChambers', 'embryoids', 'MDCK-Normoxia', 'MDCK-Hypoxia']] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cartocell_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Optional[Literal["train", "test"]] = None,
    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the CartoCell dataloader for cell segmentation.

    The extra keyword arguments are divided with `util.split_kwargs` into those for
    `torch_em.default_segmentation_dataset` and the remaining ones, which are passed
    to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        name: The name of the data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    cartocell_dataset = get_cartocell_dataset(
        path=path, patch_shape=patch_shape, split=split, name=name, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(cartocell_dataset, batch_size, **dataloader_kwargs)

Get the CartoCell dataloader for cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train' or 'test'.
  • name: The name of the data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.