torch_em.data.datasets.light_microscopy.cellbindb

CellBinDB contains annotations for cell segmentation in multi-modal images.

  • Consists of DAPI, ssDNA, H&E, and mIF staining.
  • Covers more than 30 normal and diseased tissue types from human and mouse samples.

The dataset is located at https://db.cngb.org/search/project/CNP0006370/. This dataset is from the publication https://doi.org/10.1101/2024.11.20.619750. Please cite it if you use this dataset for your research.

  1"""CellBinDB contains annotations for cell segmentation in multi-modal images.
  2- Consists of DAPI, ssDNA, H&E, and mIF staining.
  3- Covers more than 30 normal and diseased tissue types from human and mouse samples.
  4
  5The dataset is located at https://db.cngb.org/search/project/CNP0006370/.
  6This dataset is from the publication https://doi.org/10.1101/2024.11.20.619750.
  7Please cite it if you use this dataset for your research.
  8"""
  9
 10import os
 11import subprocess
 12from glob import glob
 13from natsort import natsorted
 14from typing import Union, Tuple, List, Optional
 15
 16import torch_em
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20from .. import util
 21from .neurips_cell_seg import to_rgb
 22
 23
 # wget command line used to fetch the dataset from the CNGB FTP server:
 #   -c            continue partially downloaded files
 #   -nH           do not create a host-name directory
 #   -np           never ascend to the parent directory
 #   -r            retrieve recursively
 #   -R PATTERN    reject files matching PATTERN
 #   --cut-dirs 4  skip the first four remote directory components
 # `get_cellbindb_data` tokenizes this string and inserts `-P <path>` before the final URL argument.
 24DOWNLOAD_SCRIPT = 'wget -c -nH -np -r -R "index.html*" --cut-dirs 4 ftp://ftp.cngb.org/pub/CNSA/data5/CNP0006370/Other/'
 25
 # Valid values for `data_choice`; each one is also the name of a sub-folder
 # that `get_cellbindb_paths` searches below the data folder.
 26CHOICES = ["10×Genomics_DAPI", "10×Genomics_HE", "DAPI", "HE", "mIF", "ssDNA"]
 27
 28
 29def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 30    """Download the CellBinDB dataset.
 31
 32    Args:
 33        path: Filepath to a folder where the data is downloaded.
 34        download: Whether to download the data if it is not present.
 35
 36    Returns:
 37        The filepath to the data.
 38    """
 39    data_dir = os.path.join(path, "Other")
 40    if os.path.exists(data_dir):
 41        return data_dir
 42
 43    os.makedirs(path, exist_ok=True)
 44
 45    if not download:
 46        raise AssertionError("The dataset is not found and download is set to 'False'.")
 47
 48    print(
 49        "Downloading the dataset takes several hours and is extremely (like very very) slow. "
 50        "Make sure you have consistent internet connection or run it in background over a cluster."
 51    )
 52    splits = DOWNLOAD_SCRIPT.split(" ")
 53    subprocess.run([*splits[:-1], "-P", os.path.abspath(path), splits[-1]])
 54    return data_dir
 55
 56
 57def get_cellbindb_paths(
 58    path: Union[os.PathLike, str], data_choice: Optional[Union[str, List[str]]] = None, download: bool = False
 59) -> Tuple[List[str], List[str]]:
 60    """Get paths to the CellBinDB data.
 61
 62    Args:
 63        path: Filepath to a folder where the data is downloaded.
 64        data_choice: The choice of datasets.
 65        download: Whether to download the data if it is not present.
 66
 67    Returns:
 68        List of filepaths for the image data.
 69        List of filepaths for the label data.
 70    """
 71    data_dir = get_cellbindb_data(path, download)
 72
 73    if data_choice is None:
 74        data_choice = CHOICES
 75    else:
 76        if isinstance(data_choice, str):
 77            data_choice = [data_choice]
 78
 79    raw_paths, label_paths = [], []
 80    for dchoice in data_choice:
 81        assert dchoice in CHOICES, f"'{dchoice}' is not a valid data choice."
 82        raw_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-img.tif"))))
 83        label_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-instancemask.tif"))))
 84
 85    # NOTE: Some files are corrupted from source. Since it's just a few of them, let's bump them out.
 86    valid_paired_images = [
 87        (rp, lp) for rp, lp in zip(raw_paths, label_paths) if _is_valid_image(rp) and _is_valid_image(lp)
 88    ]
 89    raw_paths, label_paths = zip(*valid_paired_images)
 90    raw_paths, label_paths = list(raw_paths), list(label_paths)
 91
 92    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0
 93
 94    return raw_paths, label_paths
 95
 96
 97def _is_valid_image(im_path):
 98    import tifffile
 99
100    try:
101        _ = tifffile.imread(im_path)
102        return True
103    except Exception as e:
104        print(f"'{im_path}' throwing '{type(e).__name__}': '{e}'")
105        return False
106
107
def get_cellbindb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CellBinDB dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
            If no `raw_transform` is given, one is created via
            `torch_em.transform.get_raw_transform` with `to_rgb` as `augmentation2`.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_cellbindb_paths(path, data_choice, download)

    # A raw transform supplied by the caller takes precedence over the default one.
    if "raw_transform" not in kwargs:
        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)

    # Settings that are fixed for this dataset: plain 2d image files without internal keys.
    fixed_settings = {
        "raw_paths": image_files,
        "raw_key": None,
        "label_paths": mask_files,
        "label_key": None,
        "is_seg_dataset": False,
        "ndim": 2,
        "patch_shape": patch_shape,
    }
    return torch_em.default_segmentation_dataset(**fixed_settings, **kwargs)
142
143
def get_cellbindb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CellBinDB dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        batch_size: The batch size handed to `torch_em.get_data_loader`.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from the remaining (loader) arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    cellbindb_dataset = get_cellbindb_dataset(
        path, patch_shape, data_choice, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(cellbindb_dataset, batch_size, **dataloader_kwargs)
    return loader
# wget command line used to fetch the dataset from the CNGB FTP server:
#   -c            continue partially downloaded files
#   -nH           do not create a host-name directory
#   -np           never ascend to the parent directory
#   -r            retrieve recursively
#   -R PATTERN    reject files matching PATTERN
#   --cut-dirs 4  skip the first four remote directory components
# `get_cellbindb_data` tokenizes this string and inserts `-P <path>` before the final URL argument.
DOWNLOAD_SCRIPT = 'wget -c -nH -np -r -R "index.html*" --cut-dirs 4 ftp://ftp.cngb.org/pub/CNSA/data5/CNP0006370/Other/'
# Valid values for `data_choice`; each one is also the name of a sub-folder
# that `get_cellbindb_paths` searches below the data folder.
CHOICES = ['10×Genomics_DAPI', '10×Genomics_HE', 'DAPI', 'HE', 'mIF', 'ssDNA']
def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CellBinDB dataset.

    The data is fetched with `wget` into `<path>/Other`. If that folder already exists,
    nothing is downloaded and its path is returned right away.

    Args:
        path: Filepath to a folder where the data is downloaded.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data.

    Raises:
        AssertionError: If the data is not present and `download` is False.
        RuntimeError: If the data folder is still missing after the download command has run.
    """
    import shlex

    data_dir = os.path.join(path, "Other")
    # NOTE: Only the existence of the folder is checked, so an interrupted download
    # is treated as complete on the next call.
    if os.path.exists(data_dir):
        return data_dir

    if not download:
        raise AssertionError("The dataset is not found and download is set to 'False'.")

    # Create the target folder only once we know that we are going to download into it.
    os.makedirs(path, exist_ok=True)

    print(
        "Downloading the dataset takes several hours and is extremely (like very very) slow. "
        "Make sure you have consistent internet connection or run it in background over a cluster."
    )
    # `shlex.split` honours the shell quoting in the command string. A plain `str.split(" ")`
    # would hand the literal double quotes of the reject pattern to wget.
    command = shlex.split(DOWNLOAD_SCRIPT)
    # The download URL is the last token; the target directory option goes right before it.
    subprocess.run([*command[:-1], "-P", os.path.abspath(path), command[-1]])

    if not os.path.exists(data_dir):
        raise RuntimeError(
            f"The download command finished, but the expected data folder '{data_dir}' does not exist."
        )

    return data_dir

Download the CellBinDB dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the data.

def get_cellbindb_paths( path: Union[os.PathLike, str], data_choice: Union[List[str], str, NoneType] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_cellbindb_paths(
    path: Union[os.PathLike, str], data_choice: Optional[Union[str, List[str]]] = None, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CellBinDB data.

    Images (`*-img.tif`) and instance masks (`*-instancemask.tif`) are collected from
    `<data_dir>/<choice>/*/` for every requested choice. Pairs in which either file
    cannot be read are dropped.

    Args:
        path: Filepath to a folder where the data is downloaded.
        data_choice: The choice of datasets. Either a single name or a list of names from `CHOICES`.
            By default, all choices are used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If a data choice is invalid, if the number of images and instance masks
            found for a choice differs, or if no valid image / label pair is left.
    """
    if data_choice is None:
        data_choice = CHOICES
    elif isinstance(data_choice, str):
        data_choice = [data_choice]

    # Validate the request before touching the data, so that a typo does not surface
    # only after a download. Raised explicitly so the check also runs under `python -O`.
    for dchoice in data_choice:
        if dchoice not in CHOICES:
            raise AssertionError(f"'{dchoice}' is not a valid data choice.")

    data_dir = get_cellbindb_data(path, download)

    raw_paths, label_paths = [], []
    for dchoice in data_choice:
        choice_raw = natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-img.tif")))
        choice_labels = natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-instancemask.tif")))

        # Images and masks are paired by position below. Differing counts would silently
        # shift or truncate the pairing, so refuse to continue instead.
        if len(choice_raw) != len(choice_labels):
            raise AssertionError(
                f"Found {len(choice_raw)} images but {len(choice_labels)} instance masks for '{dchoice}'; "
                "cannot pair them."
            )

        raw_paths.extend(choice_raw)
        label_paths.extend(choice_labels)

    # NOTE: Some files are corrupted from source. Since it's just a few of them, let's bump them out.
    valid_pairs = [
        (rp, lp) for rp, lp in zip(raw_paths, label_paths) if _is_valid_image(rp) and _is_valid_image(lp)
    ]
    if not valid_pairs:
        raise AssertionError(f"No valid image / label pairs were found in '{data_dir}' for {list(data_choice)}.")

    raw_paths = [rp for rp, _ in valid_pairs]
    label_paths = [lp for _, lp in valid_pairs]

    return raw_paths, label_paths

Get paths to the CellBinDB data.

Arguments:
  • path: Filepath to a folder where the data is downloaded.
  • data_choice: The choice of datasets.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_cellbindb_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], data_choice: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cellbindb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CellBinDB dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
            If no `raw_transform` is given, one is created via
            `torch_em.transform.get_raw_transform` with `to_rgb` as `augmentation2`.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_cellbindb_paths(path, data_choice, download)

    # A raw transform supplied by the caller takes precedence over the default one.
    if "raw_transform" not in kwargs:
        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)

    # Settings that are fixed for this dataset: plain 2d image files without internal keys.
    fixed_settings = {
        "raw_paths": image_files,
        "raw_key": None,
        "label_paths": mask_files,
        "label_key": None,
        "is_seg_dataset": False,
        "ndim": 2,
        "patch_shape": patch_shape,
    }
    return torch_em.default_segmentation_dataset(**fixed_settings, **kwargs)

Get the CellBinDB dataset for cell segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded.
  • patch_shape: The patch shape to use for training.
  • data_choice: The choice of datasets.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cellbindb_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], data_choice: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cellbindb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CellBinDB dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        batch_size: The batch size handed to `torch_em.get_data_loader`.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from the remaining (loader) arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    cellbindb_dataset = get_cellbindb_dataset(
        path, patch_shape, data_choice, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(cellbindb_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the CellBinDB dataloader for cell segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded.
  • batch_size: The batch size passed to the data loader.
  • patch_shape: The patch shape to use for training.
  • data_choice: The choice of datasets.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.