torch_em.data.datasets.light_microscopy.cellbindb

CellBinDB contains annotations for cell segmentation in multi-modal images.

  • Consists of DAPI, ssDNA, H&E, and mIF staining.
  • Covers more than 30 normal and diseased tissue types from human and mouse samples.

The dataset is located at https://db.cngb.org/search/project/CNP0006370/. This dataset is from the publication https://doi.org/10.1101/2024.11.20.619750. Please cite it if you use this dataset for your research.

  1"""CellBinDB contains annotations for cell segmentation in multi-modal images.
  2- Consists of DAPI, ssDNA, H&E, and mIF staining.
  3- Covers more than 30 normal and diseased tissue types from human and mouse samples.
  4
  5The dataset is located at https://db.cngb.org/search/project/CNP0006370/.
  6This dataset is from the publication https://doi.org/10.1101/2024.11.20.619750.
  7Please cite it if you use this dataset for your research.
  8"""
  9
 10import os
 11import subprocess
 12from glob import glob
 13from natsort import natsorted
 14from typing import Union, Tuple, List, Optional
 15
 16import torch_em
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20from .. import util
 21from .neurips_cell_seg import to_rgb
 22
 23
# wget command used to mirror the CellBinDB "Other" folder from the CNGB FTP server.
# -c resumes partial downloads, -r / -np recurse without ascending to the parent directory,
# -nH and --cut-dirs 4 drop the host name and the first four remote directories from the local path.
DOWNLOAD_SCRIPT = 'wget -c -nH -np -r -R "index.html*" --cut-dirs 4 ftp://ftp.cngb.org/pub/CNSA/data5/CNP0006370/Other/'

# Valid values for `data_choice`. Each name is used as a sub-folder of the downloaded data.
CHOICES = ["10×Genomics_DAPI", "10×Genomics_HE", "DAPI", "HE", "mIF", "ssDNA"]
 27
 28
 29def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 30    """Download the CellBinDB dataset.
 31
 32    Args:
 33        path: Filepath to a folder where the data is downloaded.
 34        download: Whether to download the data if it is not present.
 35
 36    Returns:
 37        The filepath to the data.
 38    """
 39    data_dir = os.path.join(path, "Other")
 40    if os.path.exists(data_dir):
 41        return data_dir
 42
 43    os.makedirs(path, exist_ok=True)
 44
 45    if not download:
 46        raise AssertionError("The dataset is not found and download is set to 'False'.")
 47
 48    print(
 49        "Downloading the dataset takes several hours and is extremely (like very very) slow. "
 50        "Make sure you have consistent internet connection or run it in background over a cluster."
 51    )
 52    splits = DOWNLOAD_SCRIPT.split(" ")
 53    subprocess.run([*splits[:-1], "-P", os.path.abspath(path), splits[-1]])
 54    return data_dir
 55
 56
 57def get_cellbindb_paths(
 58    path: Union[os.PathLike, str], data_choice: Optional[Union[str, List[str]]] = None, download: bool = False
 59) -> Tuple[List[str], List[str]]:
 60    """Get paths to the CellBinDB data.
 61
 62    Args:
 63        path: Filepath to a folder where the data is downloaded.
 64        data_choice: The choice of datasets.
 65        download: Whether to download the data if it is not present.
 66
 67    Returns:
 68        List of filepaths for the image data.
 69        List of filepaths for the label data.
 70    """
 71    data_dir = get_cellbindb_data(path, download)
 72
 73    if data_choice is None:
 74        data_choice = CHOICES
 75    else:
 76        if isinstance(data_choice, str):
 77            data_choice = [data_choice]
 78
 79    raw_paths, label_paths = [], []
 80    for dchoice in data_choice:
 81        assert dchoice in CHOICES, f"'{dchoice}' is not a valid data choice."
 82        raw_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-img.tif"))))
 83        label_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-instancemask.tif"))))
 84
 85    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0
 86
 87    return raw_paths, label_paths
 88
 89
 90def get_cellbindb_dataset(
 91    path: Union[os.PathLike, str],
 92    patch_shape: Tuple[int, int],
 93    data_choice: Optional[Union[str, List[str]]] = None,
 94    download: bool = False,
 95    **kwargs
 96) -> Dataset:
 97    """Get the CellBinDB dataset for cell segmentation.
 98
 99    Args:
100        path: Filepath to a folder where the data is downloaded.
101        patch_shape: The patch shape to use for training.
102        data_choice: The choice of datasets.
103        download: Whether to download the data if it is not present.
104        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
105
106    Returns:
107        The segmentation dataset.
108    """
109    raw_paths, label_paths = get_cellbindb_paths(path, data_choice, download)
110
111    if "raw_transform" not in kwargs:
112        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)
113
114    return torch_em.default_segmentation_dataset(
115        raw_paths=raw_paths,
116        raw_key=None,
117        label_paths=label_paths,
118        label_key=None,
119        is_seg_dataset=False,
120        ndim=2,
121        patch_shape=patch_shape,
122        **kwargs
123    )
124
125
def get_cellbindb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the CellBinDB dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        batch_size: The batch size passed to the data loader.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets, i.e. one name or a list of names from `CHOICES`.
            By default, all of them are used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    cellbindb_dataset = get_cellbindb_dataset(
        path=path, patch_shape=patch_shape, data_choice=data_choice, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(cellbindb_dataset, batch_size, **dataloader_kwargs)
    return loader
# wget command used to mirror the CellBinDB "Other" folder from the CNGB FTP server.
# -c resumes partial downloads, -r / -np recurse without ascending to the parent directory,
# -nH and --cut-dirs 4 drop the host name and the first four remote directories from the local path.
DOWNLOAD_SCRIPT = 'wget -c -nH -np -r -R "index.html*" --cut-dirs 4 ftp://ftp.cngb.org/pub/CNSA/data5/CNP0006370/Other/'
# Valid values for `data_choice`. Each name is used as a sub-folder of the downloaded data.
CHOICES = ['10×Genomics_DAPI', '10×Genomics_HE', 'DAPI', 'HE', 'mIF', 'ssDNA']
def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CellBinDB dataset.

    The data is expected in the folder `<path>/Other`. If this folder already exists it is returned
    right away, otherwise the data is fetched with `wget` (see `DOWNLOAD_SCRIPT`).

    Args:
        path: Filepath to a folder where the data is downloaded.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data.

    Raises:
        AssertionError: If the data is not found and `download` is False.
        RuntimeError: If the download finished without creating the data folder.
    """
    import shlex  # Local import: only required for parsing the download command.

    data_dir = os.path.join(path, "Other")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    if not download:
        raise AssertionError("The dataset is not found and download is set to 'False'.")

    print(
        "Downloading the dataset takes several hours and is extremely (like very very) slow. "
        "Make sure you have consistent internet connection or run it in background over a cluster."
    )

    # Parse the command with shell quoting rules. Splitting on plain spaces would keep the literal
    # quote characters of the reject pattern ("index.html*"), because no shell is involved here.
    *wget_args, url = shlex.split(DOWNLOAD_SCRIPT)
    result = subprocess.run([*wget_args, "-P", os.path.abspath(path), url])

    # Fail here with a clear message instead of returning a folder that does not exist.
    if not os.path.exists(data_dir):
        raise RuntimeError(
            f"The download finished (wget exit code {result.returncode}) "
            f"but the data was not found at '{data_dir}'."
        )

    return data_dir

Download the CellBinDB dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the data.

def get_cellbindb_paths( path: Union[os.PathLike, str], data_choice: Union[List[str], str, NoneType] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_cellbindb_paths(
    path: Union[os.PathLike, str], data_choice: Optional[Union[str, List[str]]] = None, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CellBinDB data.

    Args:
        path: Filepath to a folder where the data is downloaded.
        data_choice: The choice of datasets, i.e. one name or a list of names from `CHOICES`.
            By default, all of them are used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If a data choice is invalid, or if no data is found or the number of
            images and instance masks does not match.
    """
    if data_choice is None:
        data_choice = CHOICES
    elif isinstance(data_choice, str):
        data_choice = [data_choice]
    else:
        data_choice = list(data_choice)

    # Validate the choices before touching the data, so that a typo fails immediately
    # rather than after the (very slow) download. The checks are raised explicitly instead of
    # using `assert`, which would be stripped when running with `python -O`.
    for dchoice in data_choice:
        if dchoice not in CHOICES:
            raise AssertionError(f"'{dchoice}' is not a valid data choice.")

    data_dir = get_cellbindb_data(path, download)

    raw_paths, label_paths = [], []
    for dchoice in data_choice:
        raw_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-img.tif"))))
        label_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-instancemask.tif"))))

    if len(raw_paths) == 0 or len(raw_paths) != len(label_paths):
        raise AssertionError(
            f"Found {len(raw_paths)} images and {len(label_paths)} instance masks in '{data_dir}', "
            "expected a matching, non-zero number of both."
        )

    return raw_paths, label_paths

Get paths to the CellBinDB data.

Arguments:
  • path: Filepath to a folder where the data is downloaded.
  • data_choice: The choice of datasets, i.e. one name or a list of names from CHOICES. By default, all of them are used.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_cellbindb_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], data_choice: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 91def get_cellbindb_dataset(
 92    path: Union[os.PathLike, str],
 93    patch_shape: Tuple[int, int],
 94    data_choice: Optional[Union[str, List[str]]] = None,
 95    download: bool = False,
 96    **kwargs
 97) -> Dataset:
 98    """Get the CellBinDB dataset for cell segmentation.
 99
100    Args:
101        path: Filepath to a folder where the data is downloaded.
102        patch_shape: The patch shape to use for training.
103        data_choice: The choice of datasets.
104        download: Whether to download the data if it is not present.
105        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
106
107    Returns:
108        The segmentation dataset.
109    """
110    raw_paths, label_paths = get_cellbindb_paths(path, data_choice, download)
111
112    if "raw_transform" not in kwargs:
113        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)
114
115    return torch_em.default_segmentation_dataset(
116        raw_paths=raw_paths,
117        raw_key=None,
118        label_paths=label_paths,
119        label_key=None,
120        is_seg_dataset=False,
121        ndim=2,
122        patch_shape=patch_shape,
123        **kwargs
124    )

Get the CellBinDB dataset for cell segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded.
  • patch_shape: The patch shape to use for training.
  • data_choice: The choice of datasets, i.e. one name or a list of names from CHOICES. By default, all of them are used.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cellbindb_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], data_choice: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cellbindb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the CellBinDB dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        batch_size: The batch size passed to the data loader.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets, i.e. one name or a list of names from `CHOICES`.
            By default, all of them are used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    cellbindb_dataset = get_cellbindb_dataset(
        path=path, patch_shape=patch_shape, data_choice=data_choice, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(cellbindb_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the CellBinDB dataloader for cell segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded.
  • batch_size: The batch size passed to the data loader.
  • patch_shape: The patch shape to use for training.
  • data_choice: The choice of datasets, i.e. one name or a list of names from CHOICES. By default, all of them are used.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.