torch_em.data.datasets.histopathology.cytodark0

The cytoDArk0 dataset contains cell annotations for Nissl-stained histological images of mammalian brain.

NOTE: The dataset contains instance segmentation annotations of all types of neuron and glia cells. In addition, it contains semantic segmentation annotations for foreground (cells) vs. background vs. boundary between touching and closely positioned cells (four classes in total).

The original dataset is located at https://zenodo.org/records/13694738. The dataset is from the publication https://www.sciencedirect.com/science/article/pii/S0010482525013708. Please cite it if you use this dataset for your research.

  1"""The cytoDArk0 dataset contains cell annotations for Nissl-stained histological images of mammalian brain.
  2
  3NOTE: The dataset contains instance segmentation annotations of all types of neuron and glia cells.
  4In addition, it contains semantic segmentation annotations for foreground (cells) vs background vs boundary between
  5touching and closely positioned cells (four-classes in total).
  6
  7The original dataset is located at https://zenodo.org/records/13694738.
  8The dataset is from the publication https://www.sciencedirect.com/science/article/pii/S0010482525013708.
  9Please cite it if you use this dataset for your research.
 10"""
 11
 12import os
 13import shutil
 14from glob import glob
 15from tqdm import tqdm
 16from pathlib import Path
 17from typing import Union, Tuple, Literal, List, Optional
 18
 19import pandas as pd
 20import imageio.v3 as imageio
 21
 22from torch.utils.data import Dataset, DataLoader
 23
 24import torch_em
 25
 26from .. import util
 27
 28
# Download link of the zipped cytoDArk0 dataset hosted on Zenodo.
URL = "https://zenodo.org/records/13694738/files/cytoDArk0.zip"
# Expected checksum of the zip archive; it is handed to `util.download_source` together with the URL.
CHECKSUM = "ce4b05675aa5057e277c8d4ab74524307e2402a3703f6bd80643b93ca9b70ff8"
 31
 32
 33def _preprocess_images(path, data_dir):
 34    import h5py
 35
 36    def _process_per_magnification(mag):
 37        # Let's sort one magnification images first.
 38        if mag == "20x":
 39            base_dir = os.path.join(data_dir, "20x", "1024x1024")
 40        elif mag == "40x":
 41            base_dir = os.path.join(data_dir, "40x", "2048x2048")
 42        else:
 43            raise ValueError
 44
 45        preprocessed_dir = os.path.join(path, "preprocessed", mag)
 46        os.makedirs(preprocessed_dir, exist_ok=True)
 47
 48        # 1. Load each image and corresponding labels
 49        for image_path in tqdm(glob(os.path.join(base_dir, "image", "*.png")), desc=f"Preprocess {mag} images"):
 50            image_name = Path(image_path).stem
 51
 52            image = imageio.imread(image_path)
 53            instances = imageio.imread(os.path.join(base_dir, "label", f"{image_name}.tiff"))
 54            semantics = imageio.imread(os.path.join(base_dir, "graymask4", f"{image_name}.png"))
 55
 56            assert image.ndim == 3 and image.shape[-1] == 3, image.shape
 57            image = image.transpose(2, 0, 1)
 58
 59            with h5py.File(os.path.join(preprocessed_dir, f"{image_name}.h5"), "w") as f:
 60                f.create_dataset("raw", data=image, compression="gzip")
 61                f.create_dataset("labels/instances", data=instances, compression="gzip")
 62                f.create_dataset("labels/semantic/pixels_classification", data=semantics, compression="gzip")
 63
 64        # Next, let's sort them in split folders.
 65        # 1. Load the file with fold information.
 66        fold = pd.read_csv(os.path.join(base_dir, "folds.csv"))
 67
 68        # 2. Make split folders, find files and drop them.
 69        train_paths, val_paths, test_paths = (fold.loc[fold["fold"] == i, "img_id"].tolist() for i in range(3))
 70
 71        train_paths = [os.path.join(preprocessed_dir, f"{p}.h5") for p in train_paths]
 72        val_paths = [os.path.join(preprocessed_dir, f"{p}.h5") for p in val_paths]
 73        test_paths = [os.path.join(preprocessed_dir, f"{p}.h5") for p in test_paths]
 74
 75        # Move them to their own split folders.
 76        def _move_files(split, paths):
 77            assert split in ["train", "val", "test"]
 78
 79            trg_dir = os.path.join(preprocessed_dir, split)
 80            os.makedirs(trg_dir, exist_ok=True)
 81            [shutil.move(p, os.path.join(trg_dir, os.path.basename(p))) for p in paths]
 82
 83        _move_files("train", train_paths)
 84        _move_files("val", val_paths)
 85        _move_files("test", test_paths)
 86
 87    _process_per_magnification("20x")
 88    _process_per_magnification("40x")
 89
 90    # Finally, remove all other files because we don't care about them anymore.
 91    shutil.rmtree(data_dir)
 92
 93
 94def get_cytodark0_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 95    """Download the cytoDArk0 dataset.
 96
 97    Args:
 98        path: Filepath to a folder where the downloaded data is saved.
 99        download: Whether to download the data if it is not present.
100
101    Returns:
102        Filepath where dataset is downloaded for further processing.
103    """
104    data_dir = os.path.join(path, "preprocessed")
105    if os.path.exists(data_dir):
106        return data_dir
107
108    os.makedirs(path, exist_ok=True)
109
110    zip_path = os.path.join(path, "cytoDArk0.zip")
111    util.download_source(zip_path, url=URL, download=download, checksum=CHECKSUM)
112    util.unzip(zip_path, path)
113
114    _preprocess_images(path, os.path.join(path, "cytoDArk0"))
115
116    return data_dir
117
118
def get_cytodark0_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the cytoDArk0 data.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification. By default, the images of all magnifications
            are returned, i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.

    Returns:
        Sorted list of filepaths for the input data.

    Raises:
        ValueError: If `split` or `magnification` is not one of the supported choices.
    """
    # Validate the arguments first, so that an invalid choice fails before any download or preprocessing.
    if split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split. Choose from 'train', 'val' or 'test'.")
    if magnification is not None and magnification not in ("20x", "40x"):
        raise ValueError(f"'{magnification}' is not a valid magnification. Choose from '20x' or '40x'.")

    data_dir = get_cytodark0_data(path, download)

    # The wildcard matches the folders of all magnifications.
    magnification_pattern = "*" if magnification is None else magnification

    # glob returns the paths in arbitrary order, sort them to get a reproducible result.
    return sorted(glob(os.path.join(data_dir, magnification_pattern, split, "*.h5")))
147
148
def get_cytodark0_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the cytoDArk0 dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification. By default, the images of all magnifications
            are returned, i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Each HDF5 file holds both the image and its labels, so the same paths serve as raw and label paths.
    volume_paths = get_cytodark0_paths(path=path, split=split, magnification=magnification, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        label_paths=volume_paths,
        raw_key="raw",
        label_key="labels/instances",
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )
    return dataset
183
184
def get_cytodark0_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the cytoDArk0 dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification. By default, the images of all magnifications
            are returned, i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the remaining ones, which are passed on to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_cytodark0_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        magnification=magnification,
        download=download,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader
# Download link of the zipped cytoDArk0 dataset hosted on Zenodo.
URL = 'https://zenodo.org/records/13694738/files/cytoDArk0.zip'
# Expected checksum of the zip archive; it is handed to `util.download_source` together with the URL.
CHECKSUM = 'ce4b05675aa5057e277c8d4ab74524307e2402a3703f6bd80643b93ca9b70ff8'
def get_cytodark0_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 95def get_cytodark0_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 96    """Download the cytoDArk0 dataset.
 97
 98    Args:
 99        path: Filepath to a folder where the downloaded data is saved.
100        download: Whether to download the data if it is not present.
101
102    Returns:
103        Filepath where dataset is downloaded for further processing.
104    """
105    data_dir = os.path.join(path, "preprocessed")
106    if os.path.exists(data_dir):
107        return data_dir
108
109    os.makedirs(path, exist_ok=True)
110
111    zip_path = os.path.join(path, "cytoDArk0.zip")
112    util.download_source(zip_path, url=URL, download=download, checksum=CHECKSUM)
113    util.unzip(zip_path, path)
114
115    _preprocess_images(path, os.path.join(path, "cytoDArk0"))
116
117    return data_dir

Download the cytoDArk0 dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data is saved.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where dataset is downloaded for further processing.

def get_cytodark0_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], magnification: Optional[Literal['20x', '40x']] = None, download: bool = False) -> List[str]:
def get_cytodark0_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the cytoDArk0 data.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification. By default, the images of all magnifications
            are returned, i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.

    Returns:
        Sorted list of filepaths for the input data.

    Raises:
        ValueError: If `split` or `magnification` is not one of the supported choices.
    """
    # Validate the arguments first, so that an invalid choice fails before any download or preprocessing.
    if split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split. Choose from 'train', 'val' or 'test'.")
    if magnification is not None and magnification not in ("20x", "40x"):
        raise ValueError(f"'{magnification}' is not a valid magnification. Choose from '20x' or '40x'.")

    data_dir = get_cytodark0_data(path, download)

    # The wildcard matches the folders of all magnifications.
    magnification_pattern = "*" if magnification is None else magnification

    # glob returns the paths in arbitrary order, sort them to get a reproducible result.
    return sorted(glob(os.path.join(data_dir, magnification_pattern, split, "*.h5")))

Get paths to the cytoDArk0 data.

Arguments:
  • path: Filepath to a folder where the downloaded data is saved.
  • split: The choice of data split. Either 'train', 'val' or 'test'.
  • magnification: The choice of magnification. By default, the images of all magnifications are returned, i.e. '20x' and '40x'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the input data.

def get_cytodark0_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], magnification: Optional[Literal['20x', '40x']] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cytodark0_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the cytoDArk0 dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification. By default, the images of all magnifications
            are returned, i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Each HDF5 file holds both the image and its labels, so the same paths serve as raw and label paths.
    volume_paths = get_cytodark0_paths(path=path, split=split, magnification=magnification, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        label_paths=volume_paths,
        raw_key="raw",
        label_key="labels/instances",
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )
    return dataset

Get the cytoDArk0 dataset for cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data is saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split. Either 'train', 'val' or 'test'.
  • magnification: The choice of magnification. By default, the images of all magnifications are returned, i.e. '20x' and '40x'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cytodark0_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], magnification: Optional[Literal['20x', '40x']] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cytodark0_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the cytoDArk0 dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification. By default, the images of all magnifications
            are returned, i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the remaining ones, which are passed on to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_cytodark0_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        magnification=magnification,
        download=download,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader

Get the cytoDArk0 dataloader for cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data is saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split. Either 'train', 'val' or 'test'.
  • magnification: The choice of magnification. By default, the images of all magnifications are returned, i.e. '20x' and '40x'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.