torch_em.data.datasets.light_microscopy.dsb

This Dataset was used in a Kaggle Data Science Bowl. It contains light microscopy images with annotations for nucleus segmentation.

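NOTE:

  • The 'full' dataset has been taken from https://github.com/ibmua/data-science-bowl-2018-train-set, as recommended on the BBBC website: https://bbbc.broadinstitute.org/BBBC038.
  • The 'reduced' dataset is the fluorescence image set from StarDist.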

The dataset is described in the publication https://doi.org/10.1038/s41592-019-0612-7. Please cite it if you use this dataset in your research.

  1"""This Dataset was used in a Kaggle Data Science Bowl. It contains light microscopy
  2images with annotations for nucleus segmentation.
  3
  4NOTE:
  5- The 'full' dataset has been taken from https://github.com/ibmua/data-science-bowl-2018-train-set,
  6as recommended in BBBC website: https://bbbc.broadinstitute.org/BBBC038.
  7- The 'reduced' dataset is the fluorescence image set from StarDist.
  8
  9The dataset is described in the publication https://doi.org/10.1038/s41592-019-0612-7.
 10Please cite it if you use this dataset in your research.
 11"""
 12
 13import os
 14import shutil
 15from glob import glob
 16from tqdm import tqdm
 17from natsort import natsorted
 18from typing import List, Optional, Tuple, Union, Literal
 19
 20import numpy as np
 21import imageio.v3 as imageio
 22
 23from torch.utils.data import Dataset, DataLoader
 24
 25import torch_em
 26
 27from .. import util
 28from .neurips_cell_seg import to_rgb
 29
 30
# Download URLs of the zipped data, keyed by the `source` argument of `get_dsb_data`.
# 'full' is the complete training set, 'reduced' is the fluorescence image set from StarDist.
DSB_URLS = {
    "full": "https://github.com/ibmua/data-science-bowl-2018-train-set/raw/master/train-hand.zip",
    "reduced": "https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip"
}
# Checksums passed to `util.download_source` together with the matching URL.
# NOTE(review): 64 hex characters, presumably SHA256 -- confirm against `util.download_source`.
CHECKSUMS = {
    "full": "d218b8706cd7b9a2d7171268a6e99c7b0e94605af46521ff2ffd5a17708b1af6",
    "reduced": "e44921950edce378063aa4457e625581ba35b4c2dbd9a07c19d48900129f386f"
}
 39
 40
 41def _merge_instances(path):
 42    for id_path in tqdm(glob(os.path.join(path, "full", "*")), desc="Preprocessing labels"):
 43        id = os.path.basename(id_path)
 44
 45        # Let's preprocess the image: remove alpha channel and make distinction of histopatho vs fluo images.
 46        image = imageio.imread(os.path.join(id_path, "images", f"{id}.png"))
 47        assert image.ndim == 3 and image.shape[-1] == 4, image.shape
 48
 49        image = image[..., :-1]  # Remove alpha channel
 50        r, g, b = image.transpose(2, 0, 1)
 51        if np.array_equal(r, g) and np.array_equal(g, b):
 52            dname = "fluo"
 53            # Store only one channel for fluorescence images.
 54            imageio.imwrite(os.path.join(id_path, "images", f"{dname}_{id}.png"), image[..., -1], compression="zlib")
 55        else:
 56            dname = "histopatho"
 57            # Store all three channels for histopathology images.
 58            imageio.imwrite(os.path.join(id_path, "images", f"{dname}_{id}.png"), image, compression="zlib")
 59
 60        os.remove(os.path.join(id_path, "images", f"{id}.png"))
 61
 62        # Next, let's merge the instances.
 63        label_paths = glob(os.path.join(id_path, "masks", "*"))
 64        shape = imageio.imread(label_paths[0]).shape
 65
 66        instances = np.zeros(shape)
 67        for i, lpath in enumerate(label_paths, start=1):
 68            instances[imageio.imread(lpath) > 0] = i
 69
 70        os.makedirs(os.path.join(id_path, "preprocessed_labels"))
 71        imageio.imwrite(
 72            os.path.join(id_path, "preprocessed_labels", f"{dname}_{id}.tif"),
 73            instances.astype("uint32"),
 74            compression="zlib"
 75        )
 76        shutil.rmtree(os.path.join(id_path, "masks"))  # Removing per-object masks after storing merged instances.
 77
 78
 79def get_dsb_data(path: Union[os.PathLike, str], source: Literal["full", "reduced"], download: bool):
 80    """Download the DSB training data.
 81
 82    Args:
 83        path: Filepath to a folder where the downloaded data will be saved.
 84        source: The source of the dataset. Can either be 'full' for the complete dataset,
 85            or 'reduced' for the dataset excluding histopathology images.
 86        download: Whether to download the data if it is not present.
 87    """
 88    if source not in DSB_URLS.keys():
 89        raise ValueError(f"'{source}' is not a valid data source.")
 90
 91    train_out_path = os.path.join(path, "train")
 92    test_out_path = os.path.join(path, "test")
 93    if source == "reduced" and os.path.exists(train_out_path) and os.path.exists(test_out_path):
 94        return
 95
 96    full_out_path = os.path.join(path, "full")
 97    if source == "full" and os.path.exists(full_out_path):
 98        return
 99
100    os.makedirs(path, exist_ok=True)
101
102    zip_path = os.path.join(path, "dsb.zip" if source == "reduced" else "train-hand.zip")
103    util.download_source(zip_path, DSB_URLS[source], download, CHECKSUMS[source])
104    util.unzip(zip_path, path, True)
105
106    if source == "reduced":
107        shutil.move(os.path.join(path, "dsb2018", "train"), train_out_path)
108        shutil.move(os.path.join(path, "dsb2018", "test"), test_out_path)
109    else:
110        shutil.move(os.path.join(path, "train-hand"), os.path.join(path, "full"))
111        _merge_instances(path)
112
113
def get_dsb_paths(
    path: Union[os.PathLike, str],
    source: Literal["full", "reduced"],
    split: Optional[Literal["train", "test"]] = None,
    domain: Optional[Literal["fluo", "histopatho"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the DSB data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        source: The source of the dataset. Can either be 'full' for the complete dataset,
            or 'reduced' for the dataset excluding histopathology images.
        split: The split to use for the dataset. Either 'train' or 'test'.
            Only available for the 'reduced' source, where None selects both splits.
            Must be None for the 'full' source.
        domain: The choice of modality in dataset. The 'reduced' source only contains 'fluo' images.
            For the 'full' source, None selects the images of all domains.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image files.
        List of filepaths to the label files.

    Raises:
        AssertionError: If `domain` or `split` is not valid for the chosen source,
            or if no data or a different number of images and labels is found.
    """
    get_dsb_data(path, source, download)

    if source == "reduced":
        # Compare for equality: the substring test `domain in "fluo"` also accepted values like "" or "f".
        assert domain is None or domain == "fluo", "The reduced set only has 'fluo' images."

        if split is None:
            split = "t*"  # reduced set returns all "train" and "test" sets if split is None.

        raw_paths = natsorted(glob(os.path.join(path, split, "images", "*.tif")))
        label_paths = natsorted(glob(os.path.join(path, split, "masks", "*.tif")))
    else:
        if domain is None:
            domain = "*"  # The wildcard matches the file prefixes of all domains.

        assert split is None, "There are no splits available for this data."

        raw_paths = natsorted(glob(os.path.join(path, "full", "*", "images", f"{domain}_*.png")))
        label_paths = natsorted(glob(os.path.join(path, "full", "*", "preprocessed_labels", f"{domain}_*.tif")))

    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0

    return raw_paths, label_paths
158
159
def get_dsb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    source: Literal["full", "reduced"] = "reduced",
    split: Optional[Literal["train", "test"]] = None,
    domain: Optional[Literal["fluo", "histopatho"]] = None,
    binary: bool = False,
    boundaries: bool = False,
    offsets: Optional[List[List[int]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the DSB dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        source: The source of the dataset. Can either be 'full' for the complete dataset,
            or 'reduced' for the dataset excluding histopathology images.
        split: The split to use for the dataset. Either 'train' or 'test'.
        domain: The choice of modality in dataset.
        binary: Whether to use a binary segmentation target.
        boundaries: Whether to compute boundaries as the target.
        offsets: Offset values for affinity computation used as target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    image_files, label_files = get_dsb_paths(path, source, split, domain, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )
    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    # Without a domain restriction and without a user-provided raw transform, apply `to_rgb` to the raw data.
    # NOTE(review): presumably this gives single-channel and RGB images a common channel layout -- confirm.
    if domain is None and "raw_transform" not in kwargs:
        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)

    # Images and labels are individual image files, hence no internal dataset keys are needed.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=label_files,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset
210
211
def get_dsb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    source: Literal["full", "reduced"] = "reduced",
    split: Optional[Literal["train", "test"]] = None,
    domain: Optional[Literal["fluo", "histopatho"]] = None,
    binary: bool = False,
    boundaries: bool = False,
    offsets: Optional[List[List[int]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the DSB dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        source: The source of the dataset. Can either be 'full' for the complete dataset,
            or 'reduced' for the dataset excluding histopathology images.
        split: The split to use for the dataset. Either 'train' or 'test'.
        domain: The choice of modality in dataset.
        binary: Whether to use a binary segmentation target.
        boundaries: Whether to compute boundaries as the target.
        offsets: Offset values for affinity computation used as target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones that are passed on to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_dsb_dataset(
        path=path,
        patch_shape=patch_shape,
        source=source,
        split=split,
        domain=domain,
        binary=binary,
        boundaries=boundaries,
        offsets=offsets,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
# Download URLs of the zipped data, keyed by the `source` argument of `get_dsb_data`.
DSB_URLS = {'full': 'https://github.com/ibmua/data-science-bowl-2018-train-set/raw/master/train-hand.zip', 'reduced': 'https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip'}
# Checksums passed to `util.download_source` together with the matching URL.
CHECKSUMS = {'full': 'd218b8706cd7b9a2d7171268a6e99c7b0e94605af46521ff2ffd5a17708b1af6', 'reduced': 'e44921950edce378063aa4457e625581ba35b4c2dbd9a07c19d48900129f386f'}
def get_dsb_data( path: Union[os.PathLike, str], source: Literal['full', 'reduced'], download: bool):
 80def get_dsb_data(path: Union[os.PathLike, str], source: Literal["full", "reduced"], download: bool):
 81    """Download the DSB training data.
 82
 83    Args:
 84        path: Filepath to a folder where the downloaded data will be saved.
 85        source: The source of the dataset. Can either be 'full' for the complete dataset,
 86            or 'reduced' for the dataset excluding histopathology images.
 87        download: Whether to download the data if it is not present.
 88    """
 89    if source not in DSB_URLS.keys():
 90        raise ValueError(f"'{source}' is not a valid data source.")
 91
 92    train_out_path = os.path.join(path, "train")
 93    test_out_path = os.path.join(path, "test")
 94    if source == "reduced" and os.path.exists(train_out_path) and os.path.exists(test_out_path):
 95        return
 96
 97    full_out_path = os.path.join(path, "full")
 98    if source == "full" and os.path.exists(full_out_path):
 99        return
100
101    os.makedirs(path, exist_ok=True)
102
103    zip_path = os.path.join(path, "dsb.zip" if source == "reduced" else "train-hand.zip")
104    util.download_source(zip_path, DSB_URLS[source], download, CHECKSUMS[source])
105    util.unzip(zip_path, path, True)
106
107    if source == "reduced":
108        shutil.move(os.path.join(path, "dsb2018", "train"), train_out_path)
109        shutil.move(os.path.join(path, "dsb2018", "test"), test_out_path)
110    else:
111        shutil.move(os.path.join(path, "train-hand"), os.path.join(path, "full"))
112        _merge_instances(path)

Download the DSB training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
  • download: Whether to download the data if it is not present.
def get_dsb_paths( path: Union[os.PathLike, str], source: Literal['full', 'reduced'], split: Optional[Literal['train', 'test']] = None, domain: Optional[Literal['fluo', 'histopatho']] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_dsb_paths(
    path: Union[os.PathLike, str],
    source: Literal["full", "reduced"],
    split: Optional[Literal["train", "test"]] = None,
    domain: Optional[Literal["fluo", "histopatho"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the DSB data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        source: The source of the dataset. Can either be 'full' for the complete dataset,
            or 'reduced' for the dataset excluding histopathology images.
        split: The split to use for the dataset. Either 'train' or 'test'.
            Only available for the 'reduced' source, where None selects both splits.
            Must be None for the 'full' source.
        domain: The choice of modality in dataset. The 'reduced' source only contains 'fluo' images.
            For the 'full' source, None selects the images of all domains.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image files.
        List of filepaths to the label files.

    Raises:
        AssertionError: If `domain` or `split` is not valid for the chosen source,
            or if no data or a different number of images and labels is found.
    """
    get_dsb_data(path, source, download)

    if source == "reduced":
        # Compare for equality: the substring test `domain in "fluo"` also accepted values like "" or "f".
        assert domain is None or domain == "fluo", "The reduced set only has 'fluo' images."

        if split is None:
            split = "t*"  # reduced set returns all "train" and "test" sets if split is None.

        raw_paths = natsorted(glob(os.path.join(path, split, "images", "*.tif")))
        label_paths = natsorted(glob(os.path.join(path, split, "masks", "*.tif")))
    else:
        if domain is None:
            domain = "*"  # The wildcard matches the file prefixes of all domains.

        assert split is None, "There are no splits available for this data."

        raw_paths = natsorted(glob(os.path.join(path, "full", "*", "images", f"{domain}_*.png")))
        label_paths = natsorted(glob(os.path.join(path, "full", "*", "preprocessed_labels", f"{domain}_*.tif")))

    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0

    return raw_paths, label_paths

Get paths to the DSB data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
  • split: The split to use for the dataset. Either 'train' or 'test'.
  • domain: The choice of modality in dataset.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths to the image files. List of filepaths to the label files.

def get_dsb_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], source: Literal['full', 'reduced'] = 'reduced', split: Optional[Literal['train', 'test']] = None, domain: Optional[Literal['fluo', 'histopatho']] = None, binary: bool = False, boundaries: bool = False, offsets: Optional[List[List[int]]] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_dsb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    source: Literal["full", "reduced"] = "reduced",
    split: Optional[Literal["train", "test"]] = None,
    domain: Optional[Literal["fluo", "histopatho"]] = None,
    binary: bool = False,
    boundaries: bool = False,
    offsets: Optional[List[List[int]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the DSB dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        source: The source of the dataset. Can either be 'full' for the complete dataset,
            or 'reduced' for the dataset excluding histopathology images.
        split: The split to use for the dataset. Either 'train' or 'test'.
        domain: The choice of modality in dataset.
        binary: Whether to use a binary segmentation target.
        boundaries: Whether to compute boundaries as the target.
        offsets: Offset values for affinity computation used as target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    image_files, label_files = get_dsb_paths(path, source, split, domain, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )
    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    # Without a domain restriction and without a user-provided raw transform, apply `to_rgb` to the raw data.
    # NOTE(review): presumably this gives single-channel and RGB images a common channel layout -- confirm.
    if domain is None and "raw_transform" not in kwargs:
        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)

    # Images and labels are individual image files, hence no internal dataset keys are needed.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=label_files,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset

Get the DSB dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
  • split: The split to use for the dataset. Either 'train' or 'test'.
  • domain: The choice of modality in dataset.
  • binary: Whether to use a binary segmentation target.
  • boundaries: Whether to compute boundaries as the target.
  • offsets: Offset values for affinity computation used as target.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_dsb_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], source: Literal['full', 'reduced'] = 'reduced', split: Optional[Literal['train', 'test']] = None, domain: Optional[Literal['fluo', 'histopatho']] = None, binary: bool = False, boundaries: bool = False, offsets: Optional[List[List[int]]] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_dsb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    source: Literal["full", "reduced"] = "reduced",
    split: Optional[Literal["train", "test"]] = None,
    domain: Optional[Literal["fluo", "histopatho"]] = None,
    binary: bool = False,
    boundaries: bool = False,
    offsets: Optional[List[List[int]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the DSB dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        source: The source of the dataset. Can either be 'full' for the complete dataset,
            or 'reduced' for the dataset excluding histopathology images.
        split: The split to use for the dataset. Either 'train' or 'test'.
        domain: The choice of modality in dataset.
        binary: Whether to use a binary segmentation target.
        boundaries: Whether to compute boundaries as the target.
        offsets: Offset values for affinity computation used as target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones that are passed on to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_dsb_dataset(
        path=path,
        patch_shape=patch_shape,
        source=source,
        split=split,
        domain=domain,
        binary=binary,
        boundaries=boundaries,
        offsets=offsets,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)

Get the DSB dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
  • split: The split to use for the dataset. Either 'train' or 'test'.
  • domain: The choice of modality in dataset.
  • binary: Whether to use a binary segmentation target.
  • boundaries: Whether to compute boundaries as the target.
  • offsets: Offset values for affinity computation used as target.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.