torch_em.data.datasets.light_microscopy.dsb
This dataset was used in the 2018 Kaggle Data Science Bowl. It contains light microscopy images with annotations for nucleus segmentation.
NOTE:
- The 'full' dataset has been taken from https://github.com/ibmua/data-science-bowl-2018-train-set, as recommended on the BBBC website: https://bbbc.broadinstitute.org/BBBC038.
- The 'reduced' dataset is the fluorescence image set from StarDist.
The dataset is described in the publication https://doi.org/10.1038/s41592-019-0612-7. Please cite it if you use this dataset in your research.
1"""This Dataset was used in a Kaggle Data Science Bowl. It contains light microscopy 2images with annotations for nucleus segmentation. 3 4NOTE: 5- The 'full' dataset has been taken from https://github.com/ibmua/data-science-bowl-2018-train-set, 6as recommended in BBBC website: https://bbbc.broadinstitute.org/BBBC038. 7- The 'reduced' dataset is the fluorescence image set from StarDist. 8 9The dataset is described in the publication https://doi.org/10.1038/s41592-019-0612-7. 10Please cite it if you use this dataset in your research. 11""" 12 13import os 14import shutil 15from glob import glob 16from tqdm import tqdm 17from natsort import natsorted 18from typing import List, Optional, Tuple, Union, Literal 19 20import numpy as np 21import imageio.v3 as imageio 22 23from torch.utils.data import Dataset, DataLoader 24 25import torch_em 26 27from .. import util 28from .neurips_cell_seg import to_rgb 29 30 31DSB_URLS = { 32 "full": "https://github.com/ibmua/data-science-bowl-2018-train-set/raw/master/train-hand.zip", 33 "reduced": "https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip" 34} 35CHECKSUMS = { 36 "full": "d218b8706cd7b9a2d7171268a6e99c7b0e94605af46521ff2ffd5a17708b1af6", 37 "reduced": "e44921950edce378063aa4457e625581ba35b4c2dbd9a07c19d48900129f386f" 38} 39 40 41def _merge_instances(path): 42 for id_path in tqdm(glob(os.path.join(path, "full", "*")), desc="Preprocessing labels"): 43 id = os.path.basename(id_path) 44 45 # Let's preprocess the image: remove alpha channel and make distinction of histopatho vs fluo images. 46 image = imageio.imread(os.path.join(id_path, "images", f"{id}.png")) 47 assert image.ndim == 3 and image.shape[-1] == 4, image.shape 48 49 image = image[..., :-1] # Remove alpha channel 50 r, g, b = image.transpose(2, 0, 1) 51 if np.array_equal(r, g) and np.array_equal(g, b): 52 dname = "fluo" 53 # Store only one channel for fluorescence images. 54 imageio.imwrite(os.path.join(id_path, "images", f"{dname}_{id}.png"), image[..., -1], compression="zlib") 55 else: 56 dname = "histopatho" 57 # Store all three channels for histopathology images. 58 imageio.imwrite(os.path.join(id_path, "images", f"{dname}_{id}.png"), image, compression="zlib") 59 60 os.remove(os.path.join(id_path, "images", f"{id}.png")) 61 62 # Next, let's merge the instances. 63 label_paths = glob(os.path.join(id_path, "masks", "*")) 64 shape = imageio.imread(label_paths[0]).shape 65 66 instances = np.zeros(shape) 67 for i, lpath in enumerate(label_paths, start=1): 68 instances[imageio.imread(lpath) > 0] = i 69 70 os.makedirs(os.path.join(id_path, "preprocessed_labels")) 71 imageio.imwrite( 72 os.path.join(id_path, "preprocessed_labels", f"{dname}_{id}.tif"), 73 instances.astype("uint32"), 74 compression="zlib" 75 ) 76 shutil.rmtree(os.path.join(id_path, "masks")) # Removing per-object masks after storing merged instances. 77 78 79def get_dsb_data(path: Union[os.PathLike, str], source: Literal["full", "reduced"], download: bool): 80 """Download the DSB training data. 81 82 Args: 83 path: Filepath to a folder where the downloaded data will be saved. 84 source: The source of the dataset. Can either be 'full' for the complete dataset, 85 or 'reduced' for the dataset excluding histopathology images. 86 download: Whether to download the data if it is not present. 
87 """ 88 if source not in DSB_URLS.keys(): 89 raise ValueError(f"'{source}' is not a valid data source.") 90 91 train_out_path = os.path.join(path, "train") 92 test_out_path = os.path.join(path, "test") 93 if source == "reduced" and os.path.exists(train_out_path) and os.path.exists(test_out_path): 94 return 95 96 full_out_path = os.path.join(path, "full") 97 if source == "full" and os.path.exists(full_out_path): 98 return 99 100 os.makedirs(path, exist_ok=True) 101 102 zip_path = os.path.join(path, "dsb.zip" if source == "reduced" else "train-hand.zip") 103 util.download_source(zip_path, DSB_URLS[source], download, CHECKSUMS[source]) 104 util.unzip(zip_path, path, True) 105 106 if source == "reduced": 107 shutil.move(os.path.join(path, "dsb2018", "train"), train_out_path) 108 shutil.move(os.path.join(path, "dsb2018", "test"), test_out_path) 109 else: 110 shutil.move(os.path.join(path, "train-hand"), os.path.join(path, "full")) 111 _merge_instances(path) 112 113 114def get_dsb_paths( 115 path: Union[os.PathLike, str], 116 source: Literal["full", "reduced"], 117 split: Optional[Literal["train", "test"]] = None, 118 domain: Optional[Literal["fluo", "histopatho"]] = None, 119 download: bool = False, 120) -> Tuple[List[str], List[str]]: 121 """Get paths to the DSB data. 122 123 Args: 124 path: Filepath to a folder where the downloaded data will be saved. 125 source: The source of the dataset. Can either be 'full' for the complete dataset, 126 or 'reduced' for the dataset excluding histopathology images. 127 split: The split to use for the dataset. Either 'train' or 'test'. 128 domain: The choice of modality in dataset. 129 download: Whether to download the data if it is not present. 130 131 Returns: 132 List of filepaths for the folder where the images are stored. 133 List of filepaths for the folder where the labels are stored. 134 """ 135 get_dsb_data(path, source, download) 136 137 if source == "reduced": 138 if domain is not None: 139 assert domain in "fluo", "The reduced set only has 'fluo' images." 140 141 if split is None: 142 split = "t*" # reduced set returns all "train" and "test" sets if split is None. 143 144 raw_paths = natsorted(glob(os.path.join(path, split, "images", "*.tif"))) 145 label_paths = natsorted(glob(os.path.join(path, split, "masks", "*.tif"))) 146 else: 147 if domain is None: 148 domain = "*" 149 150 assert split is None, "There are no splits available for this data." 151 152 raw_paths = natsorted(glob(os.path.join(path, "full", "*", "images", f"{domain}_*.png"))) 153 label_paths = natsorted(glob(os.path.join(path, "full", "*", "preprocessed_labels", f"{domain}_*.tif"))) 154 155 assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0 156 157 return raw_paths, label_paths 158 159 160def get_dsb_dataset( 161 path: Union[os.PathLike, str], 162 patch_shape: Tuple[int, int], 163 source: Literal["full", "reduced"] = "reduced", 164 split: Optional[Literal["train", "test"]] = None, 165 domain: Optional[Literal["fluo", "histopatho"]] = None, 166 binary: bool = False, 167 boundaries: bool = False, 168 offsets: Optional[List[List[int]]] = None, 169 download: bool = False, 170 **kwargs 171) -> Dataset: 172 """Get the DSB dataset for nucleus segmentation. 173 174 Args: 175 path: Filepath to a folder where the downloaded data will be saved. 176 patch_shape: The patch shape to use for training. 177 source: The source of the dataset. Can either be 'full' for the complete dataset, 178 or 'reduced' for the dataset excluding histopathology images. 
179 split: The split to use for the dataset. Either 'train' or 'test'. 180 domain: The choice of modality in dataset. 181 binary: Whether to use a binary segmentation target. 182 boundaries: Whether to compute boundaries as the target. 183 offsets: Offset values for affinity computation used as target. 184 download: Whether to download the data if it is not present. 185 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 186 187 Returns: 188 The segmentation dataset. 189 """ 190 raw_paths, label_paths = get_dsb_paths(path, source, split, domain, download) 191 192 kwargs, _ = util.add_instance_label_transform( 193 kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets 194 ) 195 kwargs = util.update_kwargs(kwargs, "ndim", 2) 196 197 # This is done for when user requests all images in "full" dataset. 198 if "raw_transform" not in kwargs and domain is None: 199 kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb) 200 201 return torch_em.default_segmentation_dataset( 202 raw_paths=raw_paths, 203 raw_key=None, 204 label_paths=label_paths, 205 label_key=None, 206 patch_shape=patch_shape, 207 is_seg_dataset=False, 208 **kwargs 209 ) 210 211 212def get_dsb_loader( 213 path: Union[os.PathLike, str], 214 batch_size: int, 215 patch_shape: Tuple[int, int], 216 source: Literal["full", "reduced"] = "reduced", 217 split: Optional[Literal["train", "test"]] = None, 218 domain: Optional[Literal["fluo", "histopatho"]] = None, 219 binary: bool = False, 220 boundaries: bool = False, 221 offsets: Optional[List[List[int]]] = None, 222 download: bool = False, 223 **kwargs 224) -> DataLoader: 225 """Get the DSB dataloader for nucleus segmentation. 226 227 Args: 228 path: Filepath to a folder where the downloaded data will be saved. 229 batch_size: The batch size for training. 230 patch_shape: The patch shape to use for training. 231 source: The source of the dataset. Can either be 'full' for the complete dataset, 232 or 'reduced' for the dataset excluding histopathology images. 233 split: The split to use for the dataset. Either 'train' or 'test'. 234 domain: The choice of modality in dataset. 235 binary: Whether to use a binary segmentation target. 236 boundaries: Whether to compute boundaries as the target. 237 offsets: Offset values for affinity computation used as target. 238 download: Whether to download the data if it is not present. 239 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 240 241 Returns: 242 The DataLoader. 243 """ 244 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 245 dataset = get_dsb_dataset( 246 path, patch_shape, source, split, domain, binary, boundaries, offsets, download, **ds_kwargs 247 ) 248 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
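The label preprocessing in _merge_instances collapses the per-nucleus binary masks shipped with the 'full' DSB data into a single instance segmentation, in which each nucleus receives a unique positive id and the background stays 0. A minimal sketch of the same merging logic, using two hypothetical toy masks:

import numpy as np

# Two toy binary masks, each containing one object.
mask_a = np.array([[1, 1, 0], [0, 0, 0], [0, 0, 0]], dtype=bool)
mask_b = np.array([[0, 0, 0], [0, 1, 1], [0, 1, 1]], dtype=bool)

# Merge into one instance label image: each mask is written with its own id.
instances = np.zeros(mask_a.shape, dtype="uint32")
for i, mask in enumerate([mask_a, mask_b], start=1):
    instances[mask] = i

print(instances)  # 0 = background, 1 and 2 = the two instances

Note that if two masks overlapped, the later id would overwrite the earlier one, matching the behavior of the loop in _merge_instances.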
DSB_URLS = {
    "full": "https://github.com/ibmua/data-science-bowl-2018-train-set/raw/master/train-hand.zip",
    "reduced": "https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip"
}

CHECKSUMS = {
    "full": "d218b8706cd7b9a2d7171268a6e99c7b0e94605af46521ff2ffd5a17708b1af6",
    "reduced": "e44921950edce378063aa4457e625581ba35b4c2dbd9a07c19d48900129f386f"
}
def get_dsb_data(path: Union[os.PathLike, str], source: Literal['full', 'reduced'], download: bool):
Download the DSB training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
- download: Whether to download the data if it is not present.
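A minimal usage sketch; the target folder "./data/dsb" is a hypothetical choice, any writable directory works:

from torch_em.data.datasets.light_microscopy.dsb import get_dsb_data

# Download and preprocess the reduced (fluorescence-only) DSB data.
get_dsb_data("./data/dsb", source="reduced", download=True)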
def get_dsb_paths(
    path: Union[os.PathLike, str],
    source: Literal['full', 'reduced'],
    split: Optional[Literal['train', 'test']] = None,
    domain: Optional[Literal['fluo', 'histopatho']] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
Get paths to the DSB data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
- split: The split to use for the dataset. Either 'train' or 'test'.
- domain: The choice of imaging modality. Either 'fluo' or 'histopatho'; the 'reduced' source only contains 'fluo' images.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the image data.
List of filepaths to the label data.
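For example, to collect the training image and label paths from the reduced set (the folder path is a hypothetical choice):

from torch_em.data.datasets.light_microscopy.dsb import get_dsb_paths

raw_paths, label_paths = get_dsb_paths("./data/dsb", source="reduced", split="train", download=True)
print(len(raw_paths), "image/label pairs, e.g.", raw_paths[0])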
def get_dsb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    source: Literal['full', 'reduced'] = 'reduced',
    split: Optional[Literal['train', 'test']] = None,
    domain: Optional[Literal['fluo', 'histopatho']] = None,
    binary: bool = False,
    boundaries: bool = False,
    offsets: Optional[List[List[int]]] = None,
    download: bool = False,
    **kwargs
) -> torch.utils.data.dataset.Dataset:
Get the DSB dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
- split: The split to use for the dataset. Either 'train' or 'test'.
- domain: The choice of imaging modality. Either 'fluo' or 'histopatho'; the 'reduced' source only contains 'fluo' images.
- binary: Whether to use a binary segmentation target.
- boundaries: Whether to compute boundaries as the target.
- offsets: Offset values for affinity computation used as target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
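A sketch of creating a dataset from the reduced training split with boundary targets; the patch shape and folder are hypothetical choices:

from torch_em.data.datasets.light_microscopy.dsb import get_dsb_dataset

# 256x256 patches from the reduced (fluorescence) training split.
dataset = get_dsb_dataset(
    "./data/dsb", patch_shape=(256, 256), source="reduced",
    split="train", boundaries=True, download=True,
)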
def get_dsb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    source: Literal['full', 'reduced'] = 'reduced',
    split: Optional[Literal['train', 'test']] = None,
    domain: Optional[Literal['fluo', 'histopatho']] = None,
    binary: bool = False,
    boundaries: bool = False,
    offsets: Optional[List[List[int]]] = None,
    download: bool = False,
    **kwargs
) -> torch.utils.data.dataloader.DataLoader:
Get the DSB dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
- split: The split to use for the dataset. Either 'train' or 'test'.
- domain: The choice of imaging modality. Either 'fluo' or 'histopatho'; the 'reduced' source only contains 'fluo' images.
- binary: Whether to use a binary segmentation target.
- boundaries: Whether to compute boundaries as the target.
- offsets: Offset values for affinity computation used as target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
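A sketch of the typical entry point, assuming a hypothetical data folder and training configuration:

from torch_em.data.datasets.light_microscopy.dsb import get_dsb_loader

loader = get_dsb_loader(
    "./data/dsb", batch_size=4, patch_shape=(256, 256),
    source="reduced", split="train", download=True,
)
x, y = next(iter(loader))  # x: raw image batch, y: segmentation target batch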