torch_em.data.datasets.histopathology.cytodark0
The cytoDArk0 dataset contains cell annotations for Nissl-stained histological images of mammalian brain.
NOTE: The dataset contains instance segmentation annotations of all types of neuron and glia cells. In addition, it contains semantic segmentation annotations for foreground (cells) vs background vs boundary between touching and closely positioned cells (four classes in total).
The original dataset is located at https://zenodo.org/records/13694738. The dataset is from the publication https://www.sciencedirect.com/science/article/pii/S0010482525013708. Please cite it if you use this dataset for your research.
"""The cytoDArk0 dataset contains cell annotations for Nissl-stained histological images of mammalian brain.

NOTE: The dataset contains instance segmentation annotations of all types of neuron and glia cells.
In addition, it contains semantic segmentation annotations for foreground (cells) vs background vs boundary between
touching and closely positioned cells (four classes in total).

The original dataset is located at https://zenodo.org/records/13694738.
The dataset is from the publication https://www.sciencedirect.com/science/article/pii/S0010482525013708.
Please cite it if you use this dataset for your research.
"""

import os
import shutil
from glob import glob
from tqdm import tqdm
from pathlib import Path
from typing import Union, Tuple, Literal, List, Optional

import pandas as pd
import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://zenodo.org/records/13694738/files/cytoDArk0.zip"
CHECKSUM = "ce4b05675aa5057e277c8d4ab74524307e2402a3703f6bd80643b93ca9b70ff8"


def _preprocess_images(path, data_dir):
    """Convert the extracted cytoDArk0 files to HDF5 and sort them into split folders.

    For each magnification ('20x' and '40x') every PNG image is stored together with its instance labels
    and its semantic labels in one HDF5 file under `<path>/preprocessed/<magnification>/`. The files are then
    moved into 'train', 'val' and 'test' subfolders according to the 'folds.csv' file of that magnification.
    Finally, `data_dir` is deleted.

    Args:
        path: Filepath to the folder under which the 'preprocessed' folder is created.
        data_dir: Filepath to the folder with the extracted dataset. It is removed at the end.

    Raises:
        ValueError: If an unsupported magnification is requested internally.
    """
    import h5py

    def _process_per_magnification(mag):
        # Each magnification keeps its data in a different tile-size subfolder.
        if mag == "20x":
            base_dir = os.path.join(data_dir, "20x", "1024x1024")
        elif mag == "40x":
            base_dir = os.path.join(data_dir, "40x", "2048x2048")
        else:
            raise ValueError(f"'{mag}' is not a supported magnification. Choose from '20x' or '40x'.")

        preprocessed_dir = os.path.join(path, "preprocessed", mag)
        os.makedirs(preprocessed_dir, exist_ok=True)

        # 1. Load each image and the corresponding labels, and store them together in one HDF5 file.
        for image_path in tqdm(glob(os.path.join(base_dir, "image", "*.png")), desc=f"Preprocess {mag} images"):
            image_name = Path(image_path).stem

            image = imageio.imread(image_path)
            instances = imageio.imread(os.path.join(base_dir, "label", f"{image_name}.tiff"))
            semantics = imageio.imread(os.path.join(base_dir, "graymask4", f"{image_name}.png"))

            # The images are expected with three channels in the last axis; move the channels to the front.
            assert image.ndim == 3 and image.shape[-1] == 3, image.shape
            image = image.transpose(2, 0, 1)

            with h5py.File(os.path.join(preprocessed_dir, f"{image_name}.h5"), "w") as f:
                f.create_dataset("raw", data=image, compression="gzip")
                f.create_dataset("labels/instances", data=instances, compression="gzip")
                f.create_dataset("labels/semantic/pixels_classification", data=semantics, compression="gzip")

        # 2. Load the file with the fold information.
        fold = pd.read_csv(os.path.join(base_dir, "folds.csv"))

        # 3. Map the folds to the splits: fold 0 is used as 'train', fold 1 as 'val' and fold 2 as 'test'.
        train_ids, val_ids, test_ids = (fold.loc[fold["fold"] == i, "img_id"].tolist() for i in range(3))

        def _move_files(split, image_ids):
            """Move the HDF5 files of the given image ids into the folder of the split."""
            assert split in ["train", "val", "test"]

            trg_dir = os.path.join(preprocessed_dir, split)
            os.makedirs(trg_dir, exist_ok=True)
            for image_id in image_ids:
                fname = f"{image_id}.h5"
                shutil.move(os.path.join(preprocessed_dir, fname), os.path.join(trg_dir, fname))

        _move_files("train", train_ids)
        _move_files("val", val_ids)
        _move_files("test", test_ids)

    _process_per_magnification("20x")
    _process_per_magnification("40x")

    # Finally, remove all other files because we don't care about them anymore.
    shutil.rmtree(data_dir)


def get_cytodark0_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the cytoDArk0 dataset.

    NOTE: If the 'preprocessed' folder already exists in `path`, it is returned as is,
    without downloading or preprocessing anything.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the folder where the preprocessed data is stored.
    """
    data_dir = os.path.join(path, "preprocessed")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "cytoDArk0.zip")
    util.download_source(zip_path, url=URL, download=download, checksum=CHECKSUM)
    util.unzip(zip_path, path)

    _preprocess_images(path, os.path.join(path, "cytoDArk0"))

    return data_dir


def get_cytodark0_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the cytoDArk0 data.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification, by default returns all images across all magnifications,
            i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data, in sorted order.
    """
    # Validate the arguments first, so that invalid choices fail before a download is triggered.
    assert split in ["train", "val", "test"], split
    if magnification is not None:
        assert magnification in ["20x", "40x"], magnification

    data_dir = get_cytodark0_data(path, download)

    # Without an explicit choice, the wildcard matches the folders of all magnifications.
    mag_pattern = "*" if magnification is None else magnification

    # Sort the paths, because the order returned by glob depends on the file system.
    input_paths = sorted(glob(os.path.join(data_dir, mag_pattern, split, "*.h5")))
    return input_paths


def get_cytodark0_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the cytoDArk0 dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification, by default returns all images across all magnifications,
            i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    input_paths = get_cytodark0_paths(path, split, magnification, download)

    # The images and the labels are stored in the same HDF5 files, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=input_paths,
        raw_key="raw",
        label_paths=input_paths,
        label_key="labels/instances",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )


def get_cytodark0_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the cytoDArk0 dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification, by default returns all images across all magnifications,
            i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cytodark0_dataset(path, patch_shape, split, magnification, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_cytodark0_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Fetch the cytoDArk0 dataset and bring it into the preprocessed layout.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the folder where the preprocessed data is stored.
    """
    preprocessed_dir = os.path.join(path, "preprocessed")

    # Download and preprocess only if there is no preprocessed folder yet.
    if not os.path.exists(preprocessed_dir):
        os.makedirs(path, exist_ok=True)

        archive_path = os.path.join(path, "cytoDArk0.zip")
        util.download_source(archive_path, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(archive_path, path)

        extracted_dir = os.path.join(path, "cytoDArk0")
        _preprocess_images(path, extracted_dir)

    return preprocessed_dir
Download the cytoDArk0 dataset.
Arguments:
- path: Filepath to a folder where the downloaded data is saved.
- download: Whether to download the data if it is not present.
Returns:
Filepath to the folder where the preprocessed dataset is stored for further processing.
def get_cytodark0_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the cytoDArk0 data.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification, by default returns all images across all magnifications,
            i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data, in sorted order.
    """
    # Validate the arguments first, so that invalid choices fail before a download is triggered.
    assert split in ["train", "val", "test"], split
    if magnification is not None:
        assert magnification in ["20x", "40x"], magnification

    data_dir = get_cytodark0_data(path, download)

    # Without an explicit choice, the wildcard matches the folders of all magnifications.
    mag_pattern = "*" if magnification is None else magnification

    # Sort the paths, because the order returned by glob depends on the file system.
    input_paths = sorted(glob(os.path.join(data_dir, mag_pattern, split, "*.h5")))
    return input_paths
Get paths to the cytoDArk0 data.
Arguments:
- path: Filepath to a folder where the downloaded data is saved.
- split: The choice of data split. Either 'train', 'val' or 'test'.
- magnification: The choice of magnification, by default returns all images across all magnifications, i.e. '20x' and '40x'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
def get_cytodark0_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the cytoDArk0 dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification, by default returns all images across all magnifications,
            i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # The images and the instance labels are read from the same files, under different keys.
    volume_paths = get_cytodark0_paths(path=path, split=split, magnification=magnification, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels/instances",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )
    return dataset
Get the cytoDArk0 dataset for cell segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data is saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split. Either 'train', 'val' or 'test'.
- magnification: The choice of magnification, by default returns all images across all magnifications, i.e. '20x' and '40x'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_cytodark0_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    magnification: Optional[Literal["20x", "40x"]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the cytoDArk0 dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data is saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split. Either 'train', 'val' or 'test'.
        magnification: The choice of magnification, by default returns all images across all magnifications,
            i.e. '20x' and '40x'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the remaining ones, which are passed on to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_cytodark0_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        magnification=magnification,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
Get the cytoDArk0 dataloader for cell segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data is saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split. Either 'train', 'val' or 'test'.
- magnification: The choice of magnification, by default returns all images across all magnifications, i.e. '20x' and '40x'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.