torch_em.data.datasets.light_microscopy.cellbindb
CellBinDB contains annotations for cell segmentation in multi-modal images.
- Consists of DAPI, ssDNA, H&E, and mIF staining.
- Covers more than 30 normal and diseased tissue types from human and mouse samples.
The dataset is located at https://db.cngb.org/search/project/CNP0006370/. This dataset is from the publication https://doi.org/10.1101/2024.11.20.619750. Please cite it if you use this dataset for your research.
"""CellBinDB contains annotations for cell segmentation in multi-modal images.
- Consists of DAPI, ssDNA, H&E, and mIF staining.
- Covers more than 30 normal and diseased tissue types from human and mouse samples.

The dataset is located at https://db.cngb.org/search/project/CNP0006370/.
This dataset is from the publication https://doi.org/10.1101/2024.11.20.619750.
Please cite it if you use this dataset for your research.
"""

import os
import shlex
import subprocess
from glob import glob
from natsort import natsorted
from typing import Union, Tuple, List, Optional

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util
from .neurips_cell_seg import to_rgb


DOWNLOAD_SCRIPT = 'wget -c -nH -np -r -R "index.html*" --cut-dirs 4 ftp://ftp.cngb.org/pub/CNSA/data5/CNP0006370/Other/'

CHOICES = ["10×Genomics_DAPI", "10×Genomics_HE", "DAPI", "HE", "mIF", "ssDNA"]


def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CellBinDB dataset.

    Args:
        path: Filepath to a folder where the data is downloaded.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data.

    Raises:
        AssertionError: If the data is not found and `download` is False.
        RuntimeError: If the download command finished without creating the data folder.
    """
    data_dir = os.path.join(path, "Other")
    if os.path.exists(data_dir):
        return data_dir

    if not download:
        raise AssertionError("The dataset is not found and download is set to 'False'.")

    # Only create the target folder once we know we are actually going to download.
    os.makedirs(path, exist_ok=True)

    print(
        "Downloading the dataset takes several hours and is extremely (like very very) slow. "
        "Make sure you have consistent internet connection or run it in background over a cluster."
    )
    # The command is run without a shell, so the quoting inside DOWNLOAD_SCRIPT must be resolved here:
    # a plain `split(" ")` would hand wget the reject pattern with literal double quotes around it.
    *wget_args, url = shlex.split(DOWNLOAD_SCRIPT)
    subprocess.run([*wget_args, "-P", os.path.abspath(path), url])

    if not os.path.exists(data_dir):
        raise RuntimeError(f"Downloading CellBinDB failed: the expected folder '{data_dir}' was not created.")

    return data_dir


def get_cellbindb_paths(
    path: Union[os.PathLike, str], data_choice: Optional[Union[str, List[str]]] = None, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CellBinDB data.

    Args:
        path: Filepath to a folder where the data is downloaded.
        data_choice: The choice of datasets. Either a single name or a list of names from `CHOICES`.
            By default, all datasets in `CHOICES` are used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If a data choice is invalid, if no images are found,
            or if the number of images and labels differs.
    """
    data_dir = get_cellbindb_data(path, download)

    if data_choice is None:
        data_choice = CHOICES
    elif isinstance(data_choice, str):
        data_choice = [data_choice]

    raw_paths, label_paths = [], []
    for dchoice in data_choice:
        # Raised explicitly (instead of via `assert`) so that the check survives `python -O`.
        if dchoice not in CHOICES:
            raise AssertionError(f"'{dchoice}' is not a valid data choice.")
        raw_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-img.tif"))))
        label_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-instancemask.tif"))))

    if len(raw_paths) != len(label_paths):
        raise AssertionError(
            f"Found {len(raw_paths)} images but {len(label_paths)} label files in '{data_dir}'."
        )
    if not raw_paths:
        raise AssertionError(f"No images were found in '{data_dir}' for the data choice(s) {list(data_choice)}.")

    return raw_paths, label_paths


def get_cellbindb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CellBinDB dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_cellbindb_paths(path, data_choice, download)

    # Unless the caller provides a raw transform, use one built with `to_rgb` as second augmentation.
    if "raw_transform" not in kwargs:
        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        ndim=2,
        patch_shape=patch_shape,
        **kwargs
    )


def get_cellbindb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CellBinDB dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        batch_size: The batch size, forwarded to `torch_em.get_data_loader`.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cellbindb_dataset(path, patch_shape, data_choice, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
DOWNLOAD_SCRIPT =
'wget -c -nH -np -r -R "index.html*" --cut-dirs 4 ftp://ftp.cngb.org/pub/CNSA/data5/CNP0006370/Other/'
CHOICES =
['10×Genomics_DAPI', '10×Genomics_HE', 'DAPI', 'HE', 'mIF', 'ssDNA']
def
get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CellBinDB dataset.

    Args:
        path: Filepath to a folder where the data is downloaded.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data.

    Raises:
        AssertionError: If the data is not found and `download` is False.
        RuntimeError: If the download command finished without creating the data folder.
    """
    import shlex  # Imported locally so that this function does not depend on a new module-level import.

    data_dir = os.path.join(path, "Other")
    if os.path.exists(data_dir):
        return data_dir

    if not download:
        raise AssertionError("The dataset is not found and download is set to 'False'.")

    # Only create the target folder once we know we are actually going to download.
    os.makedirs(path, exist_ok=True)

    print(
        "Downloading the dataset takes several hours and is extremely (like very very) slow. "
        "Make sure you have consistent internet connection or run it in background over a cluster."
    )
    # The command is run without a shell, so the quoting inside DOWNLOAD_SCRIPT must be resolved here:
    # a plain `split(" ")` would hand wget the reject pattern with literal double quotes around it.
    *wget_args, url = shlex.split(DOWNLOAD_SCRIPT)
    subprocess.run([*wget_args, "-P", os.path.abspath(path), url])

    if not os.path.exists(data_dir):
        raise RuntimeError(f"Downloading CellBinDB failed: the expected folder '{data_dir}' was not created.")

    return data_dir
Download the CellBinDB dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the data.
def
get_cellbindb_paths( path: Union[os.PathLike, str], data_choice: Union[List[str], str, NoneType] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_cellbindb_paths(
    path: Union[os.PathLike, str], data_choice: Optional[Union[str, List[str]]] = None, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CellBinDB data.

    Args:
        path: Filepath to a folder where the data is downloaded.
        data_choice: The choice of datasets. Either a single name or a list of names from `CHOICES`.
            By default, all datasets in `CHOICES` are used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If a data choice is invalid, if no images are found,
            or if the number of images and labels differs.
    """
    data_dir = get_cellbindb_data(path, download)

    if data_choice is None:
        data_choice = CHOICES
    elif isinstance(data_choice, str):
        data_choice = [data_choice]

    raw_paths, label_paths = [], []
    for dchoice in data_choice:
        # Raised explicitly (instead of via `assert`) so that the check survives `python -O`.
        if dchoice not in CHOICES:
            raise AssertionError(f"'{dchoice}' is not a valid data choice.")
        raw_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-img.tif"))))
        label_paths.extend(natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-instancemask.tif"))))

    if len(raw_paths) != len(label_paths):
        raise AssertionError(
            f"Found {len(raw_paths)} images but {len(label_paths)} label files in '{data_dir}'."
        )
    if not raw_paths:
        raise AssertionError(f"No images were found in '{data_dir}' for the data choice(s) {list(data_choice)}.")

    return raw_paths, label_paths
Get paths to the CellBinDB data.
Arguments:
- path: Filepath to a folder where the data is downloaded.
- data_choice: The choice of datasets.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_cellbindb_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], data_choice: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cellbindb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the CellBinDB dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
            If `raw_transform` is not given, a raw transform built with `to_rgb`
            as `augmentation2` is used.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_cellbindb_paths(path=path, data_choice=data_choice, download=download)

    caller_set_transform = "raw_transform" in kwargs
    if not caller_set_transform:
        # Fall back to the default raw transform, extended by `to_rgb`.
        default_transform = torch_em.transform.get_raw_transform(augmentation2=to_rgb)
        kwargs["raw_transform"] = default_transform

    # The files are opened without an internal key and treated as 2d, non-"seg-dataset" inputs.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        is_seg_dataset=False,
        ndim=2,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
Get the CellBinDB dataset for cell segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded.
- patch_shape: The patch shape to use for training.
- data_choice: The choice of datasets.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
.
Returns:
The segmentation dataset.
def
get_cellbindb_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], data_choice: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cellbindb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the CellBinDB dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        batch_size: The batch size, forwarded to `torch_em.get_data_loader`.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the remaining (loader) ones.
    split = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset_kwargs, dataloader_kwargs = split

    cellbindb_dataset = get_cellbindb_dataset(
        path=path,
        patch_shape=patch_shape,
        data_choice=data_choice,
        download=download,
        **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(cellbindb_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the CellBinDB dataloader for cell segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded.
- batch_size: The batch size passed to the data loader.
- patch_shape: The patch shape to use for training.
- data_choice: The choice of datasets.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
or for the PyTorch DataLoader.
Returns:
The DataLoader.