torch_em.data.datasets.light_microscopy.orgaextractor
The OrgaExtractor dataset contains annotations for colon organoids in brightfield images.
NOTE: This dataset is rather sparsely annotated (quite a few organoids per image were missing when AA visualized the annotations).
This dataset is from the publication https://www.nature.com/articles/s41598-023-46485-2. The data is hosted at https://github.com/tpark16/orgaextractor, which points to the drive link at https://drive.google.com/drive/folders/17K4N7gEZUqAcwf9N2-I5DPbywwPvzAvo.
Please cite the publication if you use this dataset for your research.
"""The OrgaExtractor dataset contains annotations for colon organoids in brightfield images.

NOTE: This dataset is rather sparsely annotated (quite a few organoids per image were missing
when AA visualized the annotations).

This dataset is from the publication https://www.nature.com/articles/s41598-023-46485-2.
The data is hosted at https://github.com/tpark16/orgaextractor, which points to the
drive link at https://drive.google.com/drive/folders/17K4N7gEZUqAcwf9N2-I5DPbywwPvzAvo.

Please cite the publication if you use this dataset for your research.
"""

import os
from glob import glob
from natsort import natsorted
from typing import Union, Tuple, List, Literal

import imageio.v3 as imageio
from skimage.measure import label as connected_components

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# NOTE: The odd thing is, 'val' has no labels, but 'test' has labels.
# So, users are allowed to only request for 'train' and 'test' splits.
URLS = {
    "train": "https://drive.google.com/uc?export=download&id=1u987UNcZxWkEwe5gjLoR3-M0lBNicXQ1",
    "val": "https://drive.google.com/uc?export=download&id=1UsBrHOYY0Orkb4vsRP8SaDj-CeYfGpFG",
    "test": "https://drive.google.com/uc?export=download&id=1IXqu1MqMZzfw1_GzZauUhg1As_abbk6N",
}

CHECKSUMS = {
    "train": "279bcfbcbd2fba23bbdea362b23eedacc53193034f4d23eb94ef570896da4f60",
    "val": "3d2288a7be39a692af2eb86bea520e7db332191cd372a8c970679b5bede61b7e",
    "test": "8e110ad8543031ed61c61bee5e8b41492b746d0dc8c503b6f8d4869b29a308e6",
}


def _preprocess_data(data_dir):
    """Convert the label images in `data_dir` to instance labels, overwriting them in place.

    Args:
        data_dir: Folder that contains the extracted `*.tif` label images.
    """
    gt_paths = natsorted(glob(os.path.join(data_dir, "*.tif")))
    for gt_path in gt_paths:
        gt = imageio.imread(gt_path)
        # Labels are stored with 3 identical channels, so a single channel is enough.
        # The dimensionality guard keeps a label file that is already 2D (e.g. converted
        # by an earlier run) from being reduced to a single column.
        if gt.ndim == 3:
            gt = gt[..., 0]
        gt = connected_components(gt).astype("uint16")  # convert semantic labels to instances
        imageio.imwrite(gt_path, gt, compression="zlib")


def get_orgaextractor_data(
    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False,
) -> str:
    """Download the OrgaExtractor dataset.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, split)

    # The split folder is created before the download starts, so a failed or refused
    # download leaves an empty folder behind. Only treat the data as present when the
    # folder actually contains extracted images.
    if glob(os.path.join(data_dir, "*.jpg")):
        return data_dir

    os.makedirs(data_dir, exist_ok=True)

    zip_path = os.path.join(data_dir, f"{split}.zip")
    util.download_source_gdrive(
        path=zip_path, url=URLS[split], download=download, checksum=CHECKSUMS[split], download_type="zip",
    )
    util.unzip(zip_path=zip_path, dst=data_dir)

    _preprocess_data(data_dir)

    return data_dir


def get_orgaextractor_paths(
    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the OrgaExtractor data.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If no images are found, or if the number of images and labels differs.
    """
    data_dir = get_orgaextractor_data(path, split, download)

    image_paths = natsorted(glob(os.path.join(data_dir, "*.jpg")))
    gt_paths = natsorted(glob(os.path.join(data_dir, "*.tif")))

    # Raised explicitly instead of via a bare `assert` so the check also runs under `python -O`.
    # The exception type is kept as AssertionError for backward compatibility.
    if not image_paths:
        raise AssertionError(f"No '*.jpg' images found in '{data_dir}'.")
    if len(image_paths) != len(gt_paths):
        raise AssertionError(
            f"Found {len(image_paths)} images but {len(gt_paths)} label files in '{data_dir}'."
        )

    return image_paths, gt_paths


def get_orgaextractor_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the OrgaExtractor dataset for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = get_orgaextractor_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=gt_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        ndim=2,
        **kwargs
    )


def get_orgaextractor_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the OrgaExtractor dataloader for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_orgaextractor_dataset(path, patch_shape, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_orgaextractor_data(
    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False,
) -> str:
    """Download the OrgaExtractor dataset.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, split)

    # The split folder is created before the download starts, so a failed or refused
    # download leaves an empty folder behind. Only treat the data as present when the
    # folder actually contains extracted images.
    if glob(os.path.join(data_dir, "*.jpg")):
        return data_dir

    os.makedirs(data_dir, exist_ok=True)

    zip_path = os.path.join(data_dir, f"{split}.zip")
    util.download_source_gdrive(
        path=zip_path, url=URLS[split], download=download, checksum=CHECKSUMS[split], download_type="zip",
    )
    util.unzip(zip_path=zip_path, dst=data_dir)

    # Turn the extracted label images into instance labels once, right after extraction.
    _preprocess_data(data_dir)

    return data_dir
Download the OrgaExtractor dataset.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- split: The data split to use.
- download: Whether to download the data if it is not present.
Returns:
The filepath where the data is downloaded.
def get_orgaextractor_paths(
    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the OrgaExtractor data.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If no images are found, or if the number of images and labels differs.
    """
    data_dir = get_orgaextractor_data(path, split, download)

    # Natural sorting of both lists is what pairs each image with its label file.
    image_paths = natsorted(glob(os.path.join(data_dir, "*.jpg")))
    gt_paths = natsorted(glob(os.path.join(data_dir, "*.tif")))

    # Raised explicitly instead of via a bare `assert` so the check also runs under `python -O`.
    # The exception type is kept as AssertionError for backward compatibility.
    if not image_paths:
        raise AssertionError(f"No '*.jpg' images found in '{data_dir}'.")
    if len(image_paths) != len(gt_paths):
        raise AssertionError(
            f"Found {len(image_paths)} images but {len(gt_paths)} label files in '{data_dir}'."
        )

    return image_paths, gt_paths
Get paths to the OrgaExtractor data.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- split: The data split to use.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def get_orgaextractor_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the OrgaExtractor dataset for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_orgaextractor_paths(path, split, download)

    # Settings fixed by this dataset: images and labels are plain 2D image files,
    # so no internal keys are needed and the data is not treated as a "seg dataset".
    fixed_settings = {
        "raw_paths": raw_files,
        "raw_key": None,
        "label_paths": label_files,
        "label_key": None,
        "patch_shape": patch_shape,
        "is_seg_dataset": False,
        "ndim": 2,
    }
    dataset = torch_em.default_segmentation_dataset(**fixed_settings, **kwargs)
    return dataset
Get the OrgaExtractor dataset for organoid segmentation in brightfield microscopy images.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_orgaextractor_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the OrgaExtractor dataloader for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    return torch_em.get_data_loader(
        get_orgaextractor_dataset(path, patch_shape, split, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs,
    )
Get the OrgaExtractor dataloader for organoid segmentation in brightfield microscopy images.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.