torch_em.data.datasets.light_microscopy.dsb

This dataset was used in the 2018 Kaggle Data Science Bowl. It contains light microscopy images with annotations for nucleus segmentation.

The dataset is described in the publication https://doi.org/10.1038/s41592-019-0612-7. Please cite it if you use this dataset in your research.

  1"""This Dataset was used in a Kaggle Data Science Bowl. It contains light microscopy
  2images with annotations for nucleus segmentation.
  3
  4The dataset is described in the publication https://doi.org/10.1038/s41592-019-0612-7.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9from shutil import move
 10from typing import List, Optional, Tuple, Union
 11
 12import torch_em
 13from torch.utils.data import Dataset, DataLoader
 14from .. import util
 15
 16DSB_URLS = {
 17    "full": "",  # TODO
 18    "reduced": "https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip"
 19}
 20CHECKSUMS = {
 21    "full": None,
 22    "reduced": "e44921950edce378063aa4457e625581ba35b4c2dbd9a07c19d48900129f386f"
 23}
 24
 25
 26def get_dsb_data(path: Union[os.PathLike, str], source: str, download: bool) -> str:
 27    """Download the DeepBacs training data.
 28
 29    Args:
 30        path: Filepath to a folder where the downloaded data will be saved.
 31        source: The source of the dataset. Can either be 'full' for the complete dataset,
 32            or 'reduced' for the dataset excluding histopathology images.
 33        download: Whether to download the data if it is not present.
 34
 35    Returns:
 36        The filepath to the training data.
 37    """
 38    os.makedirs(path, exist_ok=True)
 39    url = DSB_URLS[source]
 40    checksum = CHECKSUMS[source]
 41
 42    train_out_path = os.path.join(path, "train")
 43    test_out_path = os.path.join(path, "test")
 44
 45    if os.path.exists(train_out_path) and os.path.exists(test_out_path):
 46        return path
 47
 48    zip_path = os.path.join(path, "dsb.zip")
 49    util.download_source(zip_path, url, download, checksum)
 50    util.unzip(zip_path, path, True)
 51
 52    move(os.path.join(path, "dsb2018", "train"), train_out_path)
 53    move(os.path.join(path, "dsb2018", "test"), test_out_path)
 54    return path
 55
 56
 57def get_dsb_dataset(
 58    path: Union[os.PathLike, str],
 59    split: str,
 60    patch_shape: Tuple[int, int],
 61    download: bool = False,
 62    offsets: Optional[List[List[int]]] = None,
 63    boundaries: bool = False,
 64    binary: bool = False,
 65    source: str = "reduced",
 66    **kwargs
 67) -> Dataset:
 68    """Get the DSB dataset for nucleus segmentation.
 69
 70    Args:
 71        path: Filepath to a folder where the downloaded data will be saved.
 72        split: The split to use for the dataset. Either 'train' or 'test'.
 73        patch_shape: The patch shape to use for training.
 74        download: Whether to download the data if it is not present.
 75        offsets: Offset values for affinity computation used as target.
 76        boundaries: Whether to compute boundaries as the target.
 77        binary: Whether to use a binary segmentation target.
 78        source: The source of the dataset. Can either be 'full' for the complete dataset,
 79            or 'reduced' for the dataset excluding histopathology images.
 80        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
 81
 82    Returns:
 83       The segmentation dataset.
 84    """
 85    assert split in ("test", "train"), split
 86    get_dsb_data(path, source, download)
 87
 88    image_path = os.path.join(path, split, "images")
 89    label_path = os.path.join(path, split, "masks")
 90
 91    kwargs, _ = util.add_instance_label_transform(
 92        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
 93    )
 94    kwargs = util.update_kwargs(kwargs, "ndim", 2)
 95    return torch_em.default_segmentation_dataset(
 96        image_path, "*.tif", label_path, "*.tif", patch_shape, **kwargs
 97    )
 98
 99
100def get_dsb_loader(
101    path: Union[os.PathLike, str],
102    split: str,
103    patch_shape: Tuple[int, int],
104    batch_size: int,
105    download: bool = False,
106    offsets: Optional[List[List[int]]] = None,
107    boundaries: bool = False,
108    binary: bool = False,
109    source: str = "reduced",
110    **kwargs
111) -> DataLoader:
112    """Get the DSB dataloader for nucleus segmentation.
113
114    Args:
115        path: Filepath to a folder where the downloaded data will be saved.
116        split: The split to use for the dataset. Either 'train' or 'test'.
117        patch_shape: The patch shape to use for training.
118        batch_size: The batch size for training.
119        download: Whether to download the data if it is not present.
120        offsets: Offset values for affinity computation used as target.
121        boundaries: Whether to compute boundaries as the target.
122        binary: Whether to use a binary segmentation target.
123        source: The source of the dataset. Can either be 'full' for the complete dataset,
124            or 'reduced' for the dataset excluding histopathology images.
125        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
126
127    Returns:
128        The DataLoader.
129    """
130    ds_kwargs, loader_kwargs = util.split_kwargs(
131        torch_em.default_segmentation_dataset, **kwargs
132    )
133    dataset = get_dsb_dataset(
134        path, split, patch_shape, download=download,
135        offsets=offsets, boundaries=boundaries, binary=binary,
136        source=source, **ds_kwargs,
137    )
138    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
139    return loader
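
For orientation, a minimal end-to-end sketch built on the functions above; the data folder and patch shape are placeholder example values, not part of the module:

from torch_em.data.datasets.light_microscopy.dsb import get_dsb_loader

# Downloads the reduced DSB data on first use and yields batches of
# image patches together with nucleus segmentation targets.
loader = get_dsb_loader("./data/dsb", split="train", patch_shape=(256, 256), batch_size=4, download=True)
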
DSB_URLS = {'full': '', 'reduced': 'https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip'}
CHECKSUMS = {'full': None, 'reduced': 'e44921950edce378063aa4457e625581ba35b4c2dbd9a07c19d48900129f386f'}
def get_dsb_data(path: Union[os.PathLike, str], source: str, download: bool) -> str:

Download the DSB training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
  • download: Whether to download the data if it is not present.
Returns:
  • The filepath to the training data.
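
For illustration, a hedged usage sketch; the data folder "./data/dsb" is an arbitrary example path:

from torch_em.data.datasets.light_microscopy.dsb import get_dsb_data

# Download the reduced dataset (the download is skipped if the
# "train" and "test" folders already exist under the given path).
data_root = get_dsb_data("./data/dsb", source="reduced", download=True)
# data_root now contains "train" and "test" folders, each with
# "images" and "masks" subfolders holding *.tif files.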

def get_dsb_dataset(path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, source: str = 'reduced', **kwargs) -> torch.utils.data.dataset.Dataset:

Get the DSB dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split to use for the dataset. Either 'train' or 'test'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  • The segmentation dataset.
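
A short sketch of the different training targets; the path and patch shape are example values, and the offsets are just one plausible choice for affinity targets:

from torch_em.data.datasets.light_microscopy.dsb import get_dsb_dataset

# Boundary target derived from the instance labels.
ds_boundaries = get_dsb_dataset(
    "./data/dsb", split="train", patch_shape=(256, 256),
    download=True, boundaries=True,
)

# Affinity target for the given pixel offsets (example choice).
ds_affinities = get_dsb_dataset(
    "./data/dsb", split="train", patch_shape=(256, 256),
    download=True, offsets=[[-1, 0], [0, -1]],
)

image, target = ds_boundaries[0]  # one randomly sampled patch of (image, target)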

def get_dsb_loader(path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, source: str = 'reduced', **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the DSB dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split to use for the dataset. Either 'train' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • source: The source of the dataset. Can either be 'full' for the complete dataset, or 'reduced' for the dataset excluding histopathology images.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  • The DataLoader.
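
A usage sketch with example settings; keyword arguments that torch_em.default_segmentation_dataset does not accept (such as num_workers or shuffle below) are routed to the PyTorch DataLoader via util.split_kwargs:

from torch_em.data.datasets.light_microscopy.dsb import get_dsb_loader

loader = get_dsb_loader(
    "./data/dsb", split="train", patch_shape=(256, 256), batch_size=4,
    download=True, binary=True,   # foreground/background target
    num_workers=2, shuffle=True,  # forwarded to the DataLoader
)

# Iterate as with any PyTorch DataLoader.
images, targets = next(iter(loader))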