torch_em.data.datasets.light_microscopy.organoid
The OrganoID dataset contains annotations for pancreatic organoids in brightfield images.
The dataset is from the publication https://doi.org/10.1371/journal.pcbi.1010584. Please cite it if you use this dataset for your research.
"""The OrganoID dataset contains annotations for pancreatic organoids in brightfield images.

The dataset is from the publication https://doi.org/10.1371/journal.pcbi.1010584.
Please cite it if you use this dataset for your research.
"""

import os
import shutil
from glob import glob
from pathlib import Path
from natsort import natsorted
from typing import Union, Tuple, List, Literal, Optional

import numpy as np
import imageio.v3 as imageio
from skimage.measure import label as connected_components

from torch.utils.data import DataLoader, Dataset

import torch_em

from .. import util


URL = "https://osf.io/download/69nr8/"
# CHECKSUM = "a399288524d12bbadeebb38d52711fa746402456257b0cc6531d8c3c5a0cb8f1"
CHECKSUM = None  # NOTE: I remember osf checksums fail for some reason. I am sure this might as well.


def _store_files_as_h5(data_dir, image_dir, image_pattern, label_dir, label_pattern):
    """Convert matching image / label files into one HDF5 file per image.

    Each output file is named after the stem of the image file and holds two
    datasets: "raw" (3-channel image, channels first) and "labels" (instance
    labels as uint16). Nothing is done if `data_dir` already exists.

    Args:
        data_dir: Output folder for the HDF5 files.
        image_dir: Folder that contains the input images.
        image_pattern: Glob pattern selecting the images inside `image_dir`.
        label_dir: Folder that contains the label images.
        label_pattern: Glob pattern selecting the labels inside `label_dir`.
    """
    import h5py

    if os.path.exists(data_dir):
        return

    image_paths = natsorted(glob(os.path.join(image_dir, image_pattern)))
    gt_paths = natsorted(glob(os.path.join(label_dir, label_pattern)))

    # Validate the inputs BEFORE creating the output folder: an existing (even empty)
    # folder is treated as "already converted" by the check above.
    assert image_paths and len(image_paths) == len(gt_paths), \
        f"Expected matching, non-empty inputs; found {len(image_paths)} images and {len(gt_paths)} labels."

    os.makedirs(data_dir, exist_ok=True)

    for image_path, gt_path in zip(image_paths, gt_paths):
        image = imageio.imread(image_path)
        gt = imageio.imread(gt_path)

        if gt.ndim == 3:
            gt = gt[..., 0]  # Choose one label channel as all are same.

        gt = connected_components(gt > 0).astype("uint16")  # Run connected components to get instances.

        # Preprocess the image (ensure all images are 3-channel).
        if image.ndim == 3 and image.shape[-1] == 4:
            image = image[..., :-1]  # Remove alpha channel
        elif image.ndim == 2:
            image = np.stack([image] * 3, axis=-1)

        assert image.ndim == 3 and image.shape[-1] == 3, image.shape

        # Now, make channels first (to make this work with our dataset)
        image = image.transpose(2, 0, 1)

        with h5py.File(os.path.join(data_dir, f"{Path(image_path).stem}.h5"), "w") as f:
            f.create_dataset(name="raw", data=image, compression="gzip")
            f.create_dataset(name="labels", data=gt, compression="gzip")


def _preprocess_per_species(data_dir, stype, dirname):
    """Convert the train / val / test folders of one data source to HDF5.

    Args:
        data_dir: Root folder of the extracted data.
        stype: Name of the extracted source folder (e.g. "OriginalData").
        dirname: Name of the output folder created inside `data_dir`.
    """
    _store_files_as_h5(
        data_dir=os.path.join(data_dir, dirname, "train"),
        image_dir=os.path.join(data_dir, stype, "training", "pre_augmented", "images"),
        image_pattern="*",
        label_dir=os.path.join(data_dir, stype, "training", "pre_augmented", "segmentations"),
        label_pattern="*",
    )

    _store_files_as_h5(
        data_dir=os.path.join(data_dir, dirname, "val"),
        image_dir=os.path.join(data_dir, stype, "validation", "images"), image_pattern="*",
        label_dir=os.path.join(data_dir, stype, "validation", "segmentations"), label_pattern="*",
    )

    _store_files_as_h5(
        data_dir=os.path.join(data_dir, dirname, "test"),
        image_dir=os.path.join(data_dir, stype, "testing", "images"), image_pattern="*",
        label_dir=os.path.join(data_dir, stype, "testing", "segmentations"), label_pattern="*",
    )


def _preprocess_data(data_dir):
    """Convert all extracted data sources to HDF5 and remove the extracted folders.

    Creates the folders "original", "mouse" and "gemcitabine" inside `data_dir` and
    afterwards deletes "OriginalData", "MouseOrganoids" and "GemcitabineScreen".

    Args:
        data_dir: Root folder of the extracted data.
    """
    import h5py

    # Let's start assorting the OG PDAC organoids data. We will call this the "original" data.
    print("Preprocessing 'original' data")
    _preprocess_per_species(data_dir, "OriginalData", "original")

    # Next, we go to the 'MouseOrganoids' data. We will call this the "mouse" data.
    print("Preprocessing 'mouse' data")
    _preprocess_per_species(data_dir, "MouseOrganoids", "mouse")

    # And finally, the 'GemcitabineScreen' data. This is a cool data, as the inputs
    # have two channels: BF and PI (propidium iodide), responsible for reporting cellular necrosis.
    # We will call this data as "gemcitabine".
    gdir = os.path.join(data_dir, "gemcitabine")
    if not os.path.exists(gdir):
        print("Preprocessing 'gemcitabine' data")

        bf_paths = natsorted(glob(os.path.join(data_dir, "GemcitabineScreen", "BF", "*.tif")))
        pi_paths = natsorted(glob(os.path.join(data_dir, "GemcitabineScreen", "PI", "*.tif")))
        label_paths = natsorted(glob(os.path.join(data_dir, "GemcitabineScreen", "OrganoIDProcessed", "*_labeled.tif")))

        assert label_paths and len(label_paths) == len(bf_paths) == len(pi_paths)

        # Only create the output folder once the inputs are known to be consistent,
        # because its existence is what marks this step as done.
        os.makedirs(gdir, exist_ok=True)

        for bf_path, pi_path, label_path in zip(bf_paths, pi_paths, label_paths):
            bf_image = imageio.imread(bf_path)
            pi_image = imageio.imread(pi_path)
            gt = imageio.imread(label_path)

            assert bf_image.shape == pi_image.shape == gt.shape

            with h5py.File(os.path.join(gdir, f"{Path(bf_path).stem}.h5"), "w") as f:
                f.create_dataset(name="raw/bf", data=bf_image, compression="gzip")
                f.create_dataset(name="raw/pi", data=pi_image, compression="gzip")
                f.create_dataset(name="labels", data=gt, compression="gzip")

    # Let's remove all other data folders.
    shutil.rmtree(os.path.join(data_dir, "OriginalData"))
    shutil.rmtree(os.path.join(data_dir, "MouseOrganoids"))
    shutil.rmtree(os.path.join(data_dir, "GemcitabineScreen"))


def get_organoid_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the OrganoID dataset.

    If the folder `<path>/data` already exists it is returned as-is. Otherwise the
    archive is fetched to `<path>/data.zip`, extracted to `<path>/data` and preprocessed.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    # Make sure the target folder exists before the archive is written into it;
    # the download helper is not guaranteed to create it.
    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "data.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
    util.unzip(zip_path=zip_path, dst=data_dir, remove=False)

    _preprocess_data(data_dir)

    return data_dir


def get_organoid_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    download: bool = False,
) -> List[str]:
    """Get paths to the OrganoID data.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use. Must be None for the 'gemcitabine' source
            and must be given for all other sources.
        source: The data source to use.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    if source == "gemcitabine":
        assert split is None, "The 'gemcitabine' data has no data splits."
        split = ""
    else:
        assert split is not None, f"The '{source}' data expects a data split to be chosen."

    data_dir = get_organoid_data(path, download)
    pattern = os.path.join(data_dir, source, split, "*.h5")
    input_paths = natsorted(glob(pattern))
    assert input_paths, f"Could not find any input files matching '{pattern}'."
    return input_paths


def get_organoid_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    source_channels: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get OrganoID dataset for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        source: The data source to use.
        source_channels: The data source channel(s) to use. Required for the 'gemcitabine'
            source (a single name or a list of names, each read from `raw/<name>`; the
            preprocessing stores 'bf' and 'pi') and must be None for all other sources.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Validate the channel selection first, so that an invalid call fails
    # before any download or preprocessing is triggered.
    if source == "gemcitabine":
        assert source_channels is not None, "You must choose 'source_channels' for 'gemcitabine' data."
        ndim = 3
        if isinstance(source_channels, str):
            raw_key = f"raw/{source_channels}"
            with_channels = False
        else:
            raw_key = [f"raw/{channel}" for channel in source_channels]
            with_channels = True
    else:
        assert source_channels is None, f"You cannot choose 'source_channels' for '{source}' data."
        ndim = 2
        raw_key = "raw"
        with_channels = True

    input_paths = get_organoid_paths(path, split, source, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=input_paths,
        raw_key=raw_key,
        label_paths=input_paths,
        label_key="labels",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=ndim,
        with_channels=with_channels,
        **kwargs
    )


def get_organoid_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    source_channels: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get OrganoID dataloader for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        source: The data source to use.
        source_channels: The data source channel(s) to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_organoid_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        source=source,
        source_channels=source_channels,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
'https://osf.io/download/69nr8/'
CHECKSUM =
None
def
get_organoid_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_organoid_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the OrganoID dataset.

    If the folder `<path>/data` already exists it is returned as-is. Otherwise the
    archive is fetched to `<path>/data.zip`, extracted to `<path>/data` and preprocessed.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    # Make sure the target folder exists before the archive is written into it;
    # the download helper is not guaranteed to create it.
    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "data.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
    util.unzip(zip_path=zip_path, dst=data_dir, remove=False)

    _preprocess_data(data_dir)

    return data_dir
Download the OrganoID dataset.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath where the data is downloaded.
def
get_organoid_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'val', 'test']] = None, source: Literal['gemcitabine', 'mouse', 'original'] = 'original', download: bool = False) -> List[str]:
def get_organoid_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    download: bool = False,
) -> List[str]:
    """Get paths to the OrganoID data.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use. Must be None for the 'gemcitabine' source
            and must be given for all other sources.
        source: The data source to use.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    if source == "gemcitabine":
        assert split is None, "The 'gemcitabine' data has no data splits."
        split = ""  # The gemcitabine files are looked up directly in the source folder.
    else:
        assert split is not None, f"The '{source}' data expects a data split to be chosen."

    data_dir = get_organoid_data(path, download)
    pattern = os.path.join(data_dir, source, split, "*.h5")
    input_paths = natsorted(glob(pattern))
    assert input_paths, f"Could not find any input files matching '{pattern}'."
    return input_paths
Get paths to the OrganoID data.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- split: The data split to use.
- source: The data source to use.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
def
get_organoid_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, source: Literal['gemcitabine', 'mouse', 'original'] = 'original', source_channels: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_organoid_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    source_channels: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get OrganoID dataset for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        source: The data source to use.
        source_channels: The data source channel(s) to use. Required for the 'gemcitabine'
            source (a single name or a list of names, each read from `raw/<name>`)
            and must be None for all other sources.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Validate the channel selection first, so that an invalid call fails
    # before any download or preprocessing is triggered.
    if source == "gemcitabine":
        assert source_channels is not None, "You must choose 'source_channels' for 'gemcitabine' data."
        ndim = 3
        if isinstance(source_channels, str):
            # A single channel is read as one dataset without a channel axis.
            raw_key = f"raw/{source_channels}"
            with_channels = False
        else:
            # Several channels are passed on as a list of dataset keys.
            raw_key = [f"raw/{channel}" for channel in source_channels]
            with_channels = True
    else:
        assert source_channels is None, f"You cannot choose 'source_channels' for '{source}' data."
        ndim = 2
        raw_key = "raw"
        with_channels = True

    input_paths = get_organoid_paths(path, split, source, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=input_paths,
        raw_key=raw_key,
        label_paths=input_paths,
        label_key="labels",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=ndim,
        with_channels=with_channels,
        **kwargs
    )
Get OrganoID dataset for organoid segmentation in brightfield microscopy images.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use.
- source: The data source to use.
- source_channels: The data source channel(s) to use.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_organoid_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, source: Literal['gemcitabine', 'mouse', 'original'] = 'original', source_channels: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_organoid_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    source: Literal["gemcitabine", "mouse", "original"] = "original",
    source_channels: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get OrganoID dataloader for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        source: The data source to use.
        source_channels: The data source channel(s) to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the remaining ones,
    # which are forwarded to the data loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    # Pass everything by keyword so the call cannot silently mis-bind arguments.
    dataset = get_organoid_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        source=source,
        source_channels=source_channels,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get OrganoID dataloader for organoid segmentation in brightfield microscopy images.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use.
- source: The data source to use.
- source_channels: The data source channel(s) to use.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.