torch_em.data.datasets.light_microscopy.orgline
The OrgLine dataset contains organoid images and associated segmentation masks.
The organoids come from different organs and were assembled from different prior publications. Specifically:
- Intestine: from OrgaQuant (https://doi.org/10.1038/s41598-019-48874-y) and from OrgaSegment (https://doi.org/10.1038/s42003-024-05966-4)
- Brain: from https://doi.org/10.1038/s41597-024-03330-z
- Colon: from OrgaExtractor (https://doi.org/10.1038/s41598-023-46485-2)
- PDAC: from OrganoID (https://doi.org/10.1371/journal.pcbi.1010584) and from OrganoidNet (https://doi.org/10.1007/s13402-024-00958-2)
- Stomach: from https://zenodo.org/records/18447547
- Breast: from https://zenodo.org/records/18447547
Please cite the associated zenodo entry (https://zenodo.org/records/16355179) and the relevant original publications if you use this dataset for your research.
"""The OrgLine dataset contains organoid images and associated segmentation masks.

The organoids come from different organs and were assembled from different prior publications.
Specifically:
- Intestine: from OrgaQuant (https://doi.org/10.1038/s41598-019-48874-y)
  and from OrgaSegment (https://doi.org/10.1038/s42003-024-05966-4)
- Brain: from https://doi.org/10.1038/s41597-024-03330-z
- Colon: from OrgaExtractor (https://doi.org/10.1038/s41598-023-46485-2)
- PDAC: from OrganoID (https://doi.org/10.1371/journal.pcbi.1010584)
  and from OrganoidNet (https://doi.org/10.1007/s13402-024-00958-2)
- Stomach: from https://zenodo.org/records/18447547
- Breast: from https://zenodo.org/records/18447547

Please cite the associated zenodo entry (https://zenodo.org/records/16355179) and the relevant original publications
if you use this dataset for your research.
"""

import os
import shutil
from glob import glob
from typing import Union, Tuple, List, Literal, Optional, Sequence

import h5py
import imageio.v3 as imageio
import numpy as np

from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

import torch_em

from .. import util

# pycocotools is an optional dependency; it is only needed to rasterize the
# COCO annotations of the stomach / breast data in '_prepare_data'.
try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None


URL1 = "https://zenodo.org/records/16355179/files/InstanceSeg.zip?download=1"
URL2 = "https://zenodo.org/records/18447547/files/data.zip?download=1"

CHECKSUM1 = "6787dc47ee5f800e7ecf4a51d958fc88591c877ca7f8f03c2aa3e7fa7c4aca50"
CHECKSUM2 = "8b5984ee19232c06cdf5366080a3f3b27fb2109f38a2a345316e22dd2bb9a1c2"

# Organs shipped in the first zenodo record (already split into train/val/test)
# and in the second one (COCO annotations, split on the fly).
ORGANS1 = ("PDAC", "colon", "Intestine", "brain")
ORGANS2 = ("stomach", "breast")


def _annotations_to_instances(coco, image_metadata):
    """Rasterize the COCO annotations of a single image into an instance segmentation.

    Annotations are painted in order of increasing area, so larger organoids
    overwrite smaller ones where they overlap. Small left-over fragments from
    this overwriting are then filtered out.
    """
    from skimage.measure import label
    from skimage.segmentation import relabel_sequential

    # Load all annotations for this image; the instance ids must fit into uint16.
    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"])
    annotations = coco.loadAnns(annotation_ids)
    assert len(annotations) <= np.iinfo("uint16").max
    shape = (image_metadata["height"], image_metadata["width"])
    seg = np.zeros(shape, dtype="uint32")

    # Sort the annotations by ascending area.
    sizes = [ann["area"] for ann in annotations]
    sorting = np.argsort(sizes)
    annotations = [annotations[i] for i in sorting]

    for seg_id, annotation in enumerate(annotations, 1):
        mask = coco.annToMask(annotation).astype("bool")
        assert mask.shape == seg.shape
        seg[mask] = seg_id

    # Filter out small pieces from pasting organoids on top of each other.
    min_size = 25
    seg = label(seg)
    seg_ids, sizes = np.unique(seg, return_counts=True)
    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0
    seg, _, _ = relabel_sequential(seg)

    return seg.astype("uint16")


def _convert_presplit_data(data_dir):
    """Convert the pre-split image/mask folders of ORGANS1 to per-sample h5 files."""
    for org in ORGANS1:
        input_root, output_root = os.path.join(data_dir, "InstanceSeg", org), os.path.join(data_dir, org)
        for split in ("train", "val", "test"):
            images = sorted(glob(os.path.join(input_root, split, "images", "*")))
            masks = sorted(glob(os.path.join(input_root, split, "masks", "*")))
            # Skip a split with mismatching images and masks (best effort,
            # this should not happen for an intact download).
            if len(images) != len(masks):
                continue
            output_folder = os.path.join(output_root, split)
            os.makedirs(output_folder, exist_ok=True)
            for im_path, mask_path in tqdm(
                zip(images, masks), total=len(images), desc=f"Converting {org}, {split}-split"
            ):
                im = imageio.imread(im_path)
                mask = np.load(mask_path) if mask_path.endswith(".npy") else imageio.imread(mask_path)
                if im.ndim == 3:  # Keep only the first channel of RGB images.
                    im = im[..., 0]
                assert im.shape == mask.shape
                out_path = os.path.join(output_folder, f"{os.path.basename(im_path)}.h5")
                with h5py.File(out_path, mode="w") as f:
                    f.create_dataset("image", data=im, compression="gzip")
                    f.create_dataset("masks", data=mask, compression="gzip")
    shutil.rmtree(os.path.join(data_dir, "InstanceSeg"))


def _convert_coco_data(data_dir):
    """Convert the COCO-annotated data of ORGANS2 to per-sample h5 files with splits."""
    if COCO is None:
        raise ModuleNotFoundError(
            "'pycocotools' is required for processing the OrgLine ground-truth. "
            "Install it with 'conda install -c conda-forge pycocotools'."
        )
    for org in ORGANS2:
        input_root, output_root = os.path.join(data_dir, org), os.path.join(data_dir, org)
        coco_file = os.path.join(input_root, "coco.json")
        coco = COCO(coco_file)

        image_ids = coco.getImgIds()
        # Create the splits: 80% train, then 40% / 60% of the remainder for test / val.
        train_ids, test_ids = train_test_split(image_ids, test_size=0.2, random_state=42)
        test_ids, val_ids = train_test_split(test_ids, test_size=0.6, random_state=42)
        # Sets for O(1) membership tests in the conversion loop below.
        train_ids, val_ids = set(train_ids), set(val_ids)
        train_out, val_out = os.path.join(output_root, "train"), os.path.join(output_root, "val")
        test_out = os.path.join(output_root, "test")
        os.makedirs(train_out, exist_ok=True)
        os.makedirs(val_out, exist_ok=True)
        os.makedirs(test_out, exist_ok=True)

        for image_id in tqdm(image_ids, desc=f"Converting {org}"):
            image_metadata = coco.loadImgs(image_id)[0]
            file_name = image_metadata["file_name"]
            image_path = os.path.join(input_root, file_name)
            im = imageio.imread(image_path)
            if im.ndim == 3:  # Convert RGB(A) images to grayscale by averaging RGB.
                im = np.mean(im[..., :3], axis=-1)
            mask = _annotations_to_instances(coco, image_metadata)
            assert im.shape == mask.shape
            if image_id in train_ids:
                output_folder = train_out
            elif image_id in val_ids:
                output_folder = val_out
            else:
                output_folder = test_out
            out_path = os.path.join(output_folder, f"{os.path.basename(image_path)}.h5")
            with h5py.File(out_path, mode="w") as f:
                f.create_dataset("image", data=im, compression="gzip")
                f.create_dataset("masks", data=mask, compression="gzip")

        # Clean up the raw images and annotation files.
        shutil.rmtree(os.path.join(input_root, "images"))
        json_files = glob(os.path.join(input_root, "*.json"))
        for json_file in json_files:
            os.remove(json_file)


def _prepare_data(data_dir, organ):
    """Convert the downloaded raw data into the per-organ h5 layout expected by the dataset."""
    if organ in ORGANS1:
        _convert_presplit_data(data_dir)
    else:
        _convert_coco_data(data_dir)


def get_orgline_data(path: Union[os.PathLike, str], organ: str, download: bool = False) -> str:
    """Download the OrgLine dataset.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        organ: The organ from which the organoids are derived.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath where the data is downloaded.

    Raises:
        ValueError: If `organ` is not one of the supported organs.
    """
    if organ in ORGANS1:
        url, checksum = URL1, CHECKSUM1
        data_folder = "data1"
    elif organ in ORGANS2:
        url, checksum = URL2, CHECKSUM2
        data_folder = "data2"
    else:
        raise ValueError(f"Invalid organ: {organ}. Must be one of {ORGANS1 + ORGANS2}.")

    data_dir = os.path.join(path, data_folder)
    if os.path.exists(data_dir):  # Data was already downloaded and prepared.
        return data_dir

    os.makedirs(data_dir, exist_ok=True)
    zip_path = os.path.join(data_dir, "data.zip")
    util.download_source(path=zip_path, url=url, download=download, checksum=checksum)
    util.unzip(zip_path=zip_path, dst=data_dir, remove=True)
    _prepare_data(data_dir, organ)
    return data_dir


def get_orgline_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    organs: Optional[Union[str, Sequence[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the OrgLine data.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use.
        organs: The organ(s) from which the organoids are derived.
            By default, data for all organs is returned.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    if isinstance(organs, str):
        organs = [organs]
    elif organs is None:
        organs = ORGANS1 + ORGANS2
    paths = []
    for organ in organs:
        data_dir = get_orgline_data(path, organ, download)
        this_paths = sorted(glob(os.path.join(data_dir, organ, split, "*.h5")))
        paths.extend(this_paths)
    return paths


def get_orgline_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    organs: Optional[Union[str, Sequence[str]]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get OrgLine dataset for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        organs: The organ(s) from which the organoids are derived.
            By default, data for all organs is used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    paths = get_orgline_paths(path, split, organs, download)
    return torch_em.default_segmentation_dataset(
        raw_paths=paths,
        raw_key="image",
        label_paths=paths,
        label_key="masks",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=2,
        **kwargs
    )


def get_orgline_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    organs: Optional[Union[str, Sequence[str]]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get OrgLine dataloader for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        organs: The organ(s) from which the organoids are derived.
            By default, data for all organs is used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_orgline_dataset(path, patch_shape, split=split, organs=organs, download=download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL1 =
'https://zenodo.org/records/16355179/files/InstanceSeg.zip?download=1'
URL2 =
'https://zenodo.org/records/18447547/files/data.zip?download=1'
CHECKSUM1 =
'6787dc47ee5f800e7ecf4a51d958fc88591c877ca7f8f03c2aa3e7fa7c4aca50'
CHECKSUM2 =
'8b5984ee19232c06cdf5366080a3f3b27fb2109f38a2a345316e22dd2bb9a1c2'
ORGANS1 =
('PDAC', 'colon', 'Intestine', 'brain')
ORGANS2 =
('stomach', 'breast')
def
get_orgline_data(path: Union[os.PathLike, str], organ: str, download: bool = False) -> str:
162def get_orgline_data(path: Union[os.PathLike, str], organ: str, download: bool = False) -> str: 163 """Download the OrgLine dataset. 164 165 Args: 166 path: Filepath to the folder where the downloaded data will be saved. 167 organ: The organ from which the organoids are derived. 168 download: Whether to download the data if it is not present. 169 170 Returns: 171 The filepath where the data is downloaded. 172 """ 173 if organ in ORGANS1: 174 url, checksum = URL1, CHECKSUM1 175 data_folder = "data1" 176 elif organ in ORGANS2: 177 url, checksum = URL2, CHECKSUM2 178 data_folder = "data2" 179 else: 180 raise ValueError(f"Invalid organ: {organ}. Must be one of {ORGANS1 + ORGANS2}.") 181 182 data_dir = os.path.join(path, data_folder) 183 if os.path.exists(data_dir): 184 return data_dir 185 186 os.makedirs(data_dir, exist_ok=True) 187 zip_path = os.path.join(data_dir, "data.zip") 188 util.download_source(path=zip_path, url=url, download=download, checksum=checksum) 189 util.unzip(zip_path=zip_path, dst=data_dir, remove=True) 190 _prepare_data(data_dir, organ) 191 return data_dir
Download the OrgLine dataset.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- organ: The organ from which the organoids are derived.
- download: Whether to download the data if it is not present.
Returns:
The filepath where the data is downloaded.
def
get_orgline_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], organs: Union[str, Sequence[str], NoneType] = None, download: bool = False) -> List[str]:
194def get_orgline_paths( 195 path: Union[os.PathLike, str], 196 split: Literal["train", "val", "test"], 197 organs: Optional[Union[str, Sequence[str]]] = None, 198 download: bool = False, 199) -> List[str]: 200 """Get paths to the OrgLine data. 201 202 Args: 203 path: Filepath to the folder where the downloaded data will be saved. 204 organ: . 205 split: The data split to use. 206 download: Whether to download the data if it is not present. 207 208 Returns: 209 List of filepaths for the input data. 210 """ 211 if isinstance(organs, str): 212 organs = [organs] 213 elif organs is None: 214 organs = ORGANS1 + ORGANS2 215 paths = [] 216 for organ in organs: 217 data_dir = get_orgline_data(path, organ, download) 218 this_paths = sorted(glob(os.path.join(data_dir, organ, split, "*.h5"))) 219 paths.extend(this_paths) 220 return paths
Get paths to the OrgLine data.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- organs: The organ(s) from which the organoids are derived; by default, all organs are used.
- split: The data split to use.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
def
get_orgline_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], organs: Union[str, Sequence[str], NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
223def get_orgline_dataset( 224 path: Union[os.PathLike, str], 225 patch_shape: Tuple[int, int], 226 split: Literal["train", "val", "test"], 227 organs: Optional[Union[str, Sequence[str]]] = None, 228 download: bool = False, 229 **kwargs, 230) -> Dataset: 231 """Get OrgLine dataset for organoid segmentation in brightfield microscopy images. 232 233 Args: 234 path: Filepath to the folder where the downloaded data will be saved. 235 patch_shape: The patch shape to use for training. 236 split: The data split to use. 237 organ: 238 download: Whether to download the data if it is not present. 239 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 240 241 Returns: 242 The segmentation dataset. 243 """ 244 paths = get_orgline_paths(path, split, organs, download) 245 return torch_em.default_segmentation_dataset( 246 raw_paths=paths, 247 raw_key="image", 248 label_paths=paths, 249 label_key="masks", 250 is_seg_dataset=True, 251 patch_shape=patch_shape, 252 ndim=2, 253 **kwargs 254 )
Get OrgLine dataset for organoid segmentation in brightfield microscopy images.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use.
- organs: The organ(s) from which the organoids are derived; by default, all organs are used.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_orgline_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], organs: Union[str, Sequence[str], NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
257def get_orgline_loader( 258 path: Union[os.PathLike, str], 259 batch_size: int, 260 patch_shape: Tuple[int, int], 261 split: Literal["train", "val", "test"], 262 organs: Optional[Union[str, Sequence[str]]] = None, 263 download: bool = False, 264 **kwargs, 265) -> DataLoader: 266 """Get OrgLine dataloader for organoid segmentation in brightfield microscopy images. 267 268 Args: 269 path: Filepath to the folder where the downloaded data will be saved. 270 batch_size: The batch size for training. 271 patch_shape: The patch shape to use for training. 272 273 split: The data split to use. 274 275 download: Whether to download the data if it is not present. 276 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 277 278 Returns: 279 The DataLoader. 280 """ 281 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 282 dataset = get_orgline_dataset(path, patch_shape, split=split, organs=organs, download=download, **ds_kwargs) 283 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get OrgLine dataloader for organoid segmentation in brightfield microscopy images.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use.
- organs: The organ(s) from which the organoids are derived; by default, all organs are used.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.