torch_em.data.datasets.light_microscopy.cellbindb
CellBinDB contains annotations for cell segmentation in multi-modal images.
- Consists of DAPI, ssDNA, H&E, and mIF staining.
- Covers more than 30 normal and diseased tissue types from human and mouse samples.
The dataset is located at https://db.cngb.org/search/project/CNP0006370/. This dataset is from the publication https://doi.org/10.1101/2024.11.20.619750. Please cite it if you use this dataset for your research.
"""CellBinDB contains annotations for cell segmentation in multi-modal images.
- Consists of DAPI, ssDNA, H&E, and mIF staining.
- Covers more than 30 normal and diseased tissue types from human and mouse samples.

The dataset is located at https://db.cngb.org/search/project/CNP0006370/.
This dataset is from the publication https://doi.org/10.1101/2024.11.20.619750.
Please cite it if you use this dataset for your research.
"""

import os
import shlex
import subprocess
from glob import glob
from natsort import natsorted
from typing import Union, Tuple, List, Optional

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util
from .neurips_cell_seg import to_rgb


# Command used to mirror the dataset; the target URL is deliberately the last token,
# so that `-P <folder>` can be inserted right in front of it.
DOWNLOAD_SCRIPT = 'wget -c -nH -np -r -R "index.html*" --cut-dirs 4 ftp://ftp.cngb.org/pub/CNSA/data5/CNP0006370/Other/'

# Names of the sub-folders (one per data choice) expected below the downloaded 'Other' folder.
CHOICES = ["10×Genomics_DAPI", "10×Genomics_HE", "DAPI", "HE", "mIF", "ssDNA"]


def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CellBinDB dataset.

    Args:
        path: Filepath to a folder where the data is downloaded.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data.

    Raises:
        AssertionError: If the data is not present and `download` is False.
        RuntimeError: If the download command finished without creating the data folder.
    """
    data_dir = os.path.join(path, "Other")
    if os.path.exists(data_dir):
        return data_dir

    # Fail before touching the filesystem, so that a lookup without download leaves no empty folder behind.
    if not download:
        raise AssertionError("The dataset is not found and download is set to 'False'.")

    os.makedirs(path, exist_ok=True)

    print(
        "Downloading the dataset takes several hours and is extremely (like very very) slow. "
        "Make sure you have consistent internet connection or run it in background over a cluster."
    )
    # 'shlex.split' honours the quoting in the command. A plain split on spaces would hand
    # the quote characters of "index.html*" to wget literally, as no shell is involved.
    *wget_args, url = shlex.split(DOWNLOAD_SCRIPT)
    # NOTE: The exit code is intentionally not enforced here; only the presence of the data folder is verified.
    subprocess.run([*wget_args, "-P", os.path.abspath(path), url])

    if not os.path.isdir(data_dir):
        raise RuntimeError(f"The download did not create the expected data folder at '{data_dir}'.")

    return data_dir


def get_cellbindb_paths(
    path: Union[os.PathLike, str], data_choice: Optional[Union[str, List[str]]] = None, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CellBinDB data.

    Args:
        path: Filepath to a folder where the data is downloaded.
        data_choice: The choice of datasets. Either a single name or a list of names from `CHOICES`.
            By default, all choices are used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If a data choice is invalid, if the number of images and labels differs
            for a data choice, or if no readable image / label pair is found.
    """
    if data_choice is None:
        data_choice = CHOICES
    elif isinstance(data_choice, str):
        data_choice = [data_choice]
    else:
        data_choice = list(data_choice)

    # Validate the choices up-front, so that a typo is reported before a potentially very long download starts.
    # Explicit raises are used instead of 'assert' so that the checks survive 'python -O'.
    for dchoice in data_choice:
        if dchoice not in CHOICES:
            raise AssertionError(f"'{dchoice}' is not a valid data choice.")

    data_dir = get_cellbindb_data(path, download)

    raw_paths, label_paths = [], []
    for dchoice in data_choice:
        choice_raw_paths = natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-img.tif")))
        choice_label_paths = natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-instancemask.tif")))

        # Images and labels are paired by position below. The counts must be compared before zipping,
        # as 'zip' silently truncates to the shorter list.
        if len(choice_raw_paths) != len(choice_label_paths):
            raise AssertionError(
                f"Found {len(choice_raw_paths)} images but {len(choice_label_paths)} labels for '{dchoice}'."
            )

        raw_paths.extend(choice_raw_paths)
        label_paths.extend(choice_label_paths)

    # NOTE: Some files are corrupted from source. Since it's just a few of them, let's bump them out.
    valid_paired_images = [
        (rp, lp) for rp, lp in zip(raw_paths, label_paths) if _is_valid_image(rp) and _is_valid_image(lp)
    ]

    # Unpacking 'zip(*[])' would fail with an unrelated ValueError, hence the explicit check.
    if not valid_paired_images:
        raise AssertionError(f"No valid image and label pairs were found in '{data_dir}'.")

    raw_paths = [rp for rp, _ in valid_paired_images]
    label_paths = [lp for _, lp in valid_paired_images]

    return raw_paths, label_paths


def _is_valid_image(im_path: Union[os.PathLike, str]) -> bool:
    """Check whether a tif file can be read with `tifffile`.

    Args:
        im_path: Filepath to the tif file.

    Returns:
        True if the file could be read, False otherwise. The error of unreadable files is printed.
    """
    import tifffile

    # The broad 'except' is deliberate best-effort: any read failure marks the file as invalid.
    try:
        _ = tifffile.imread(im_path)
        return True
    except Exception as e:
        print(f"'{im_path}' throwing '{type(e).__name__}': '{e}'")
        return False


def get_cellbindb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CellBinDB dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_cellbindb_paths(path, data_choice, download)

    # Only set the default raw transform (with `to_rgb` as second augmentation) if the caller did not pass one.
    if "raw_transform" not in kwargs:
        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        ndim=2,
        patch_shape=patch_shape,
        **kwargs
    )


def get_cellbindb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CellBinDB dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        batch_size: The batch size passed to the DataLoader.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cellbindb_dataset(path, patch_shape, data_choice, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
DOWNLOAD_SCRIPT =
'wget -c -nH -np -r -R "index.html*" --cut-dirs 4 ftp://ftp.cngb.org/pub/CNSA/data5/CNP0006370/Other/'
CHOICES =
['10×Genomics_DAPI', '10×Genomics_HE', 'DAPI', 'HE', 'mIF', 'ssDNA']
def
get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_cellbindb_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CellBinDB dataset.

    Args:
        path: Filepath to a folder where the data is downloaded.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data.

    Raises:
        AssertionError: If the data is not present and `download` is False.
        RuntimeError: If the download command finished without creating the data folder.
    """
    import shlex

    data_dir = os.path.join(path, "Other")
    if os.path.exists(data_dir):
        return data_dir

    # Fail before touching the filesystem, so that a lookup without download leaves no empty folder behind.
    if not download:
        raise AssertionError("The dataset is not found and download is set to 'False'.")

    os.makedirs(path, exist_ok=True)

    print(
        "Downloading the dataset takes several hours and is extremely (like very very) slow. "
        "Make sure you have consistent internet connection or run it in background over a cluster."
    )
    # 'shlex.split' honours the quoting in the command. A plain split on spaces would hand
    # the quote characters of "index.html*" to wget literally, as no shell is involved.
    *wget_args, url = shlex.split(DOWNLOAD_SCRIPT)
    # NOTE: The exit code is intentionally not enforced here; only the presence of the data folder is verified.
    subprocess.run([*wget_args, "-P", os.path.abspath(path), url])

    if not os.path.isdir(data_dir):
        raise RuntimeError(f"The download did not create the expected data folder at '{data_dir}'.")

    return data_dir
Download the CellBinDB dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the data.
def
get_cellbindb_paths( path: Union[os.PathLike, str], data_choice: Union[List[str], str, NoneType] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_cellbindb_paths(
    path: Union[os.PathLike, str], data_choice: Optional[Union[str, List[str]]] = None, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CellBinDB data.

    Args:
        path: Filepath to a folder where the data is downloaded.
        data_choice: The choice of datasets. Either a single name or a list of names from `CHOICES`.
            By default, all choices are used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If a data choice is invalid, if the number of images and labels differs
            for a data choice, or if no readable image / label pair is found.
    """
    if data_choice is None:
        data_choice = CHOICES
    elif isinstance(data_choice, str):
        data_choice = [data_choice]
    else:
        data_choice = list(data_choice)

    # Validate the choices up-front, so that a typo is reported before a potentially very long download starts.
    # Explicit raises are used instead of 'assert' so that the checks survive 'python -O'.
    for dchoice in data_choice:
        if dchoice not in CHOICES:
            raise AssertionError(f"'{dchoice}' is not a valid data choice.")

    data_dir = get_cellbindb_data(path, download)

    raw_paths, label_paths = [], []
    for dchoice in data_choice:
        choice_raw_paths = natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-img.tif")))
        choice_label_paths = natsorted(glob(os.path.join(data_dir, dchoice, "*", "*-instancemask.tif")))

        # Images and labels are paired by position below. The counts must be compared before zipping,
        # as 'zip' silently truncates to the shorter list.
        if len(choice_raw_paths) != len(choice_label_paths):
            raise AssertionError(
                f"Found {len(choice_raw_paths)} images but {len(choice_label_paths)} labels for '{dchoice}'."
            )

        raw_paths.extend(choice_raw_paths)
        label_paths.extend(choice_label_paths)

    # NOTE: Some files are corrupted from source. Since it's just a few of them, let's bump them out.
    valid_paired_images = [
        (rp, lp) for rp, lp in zip(raw_paths, label_paths) if _is_valid_image(rp) and _is_valid_image(lp)
    ]

    # Unpacking 'zip(*[])' would fail with an unrelated ValueError, hence the explicit check.
    if not valid_paired_images:
        raise AssertionError(f"No valid image and label pairs were found in '{data_dir}'.")

    raw_paths = [rp for rp, _ in valid_paired_images]
    label_paths = [lp for _, lp in valid_paired_images]

    return raw_paths, label_paths
Get paths to the CellBinDB data.
Arguments:
- path: Filepath to a folder where the data is downloaded.
- data_choice: The choice of datasets.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_cellbindb_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], data_choice: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cellbindb_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the CellBinDB segmentation dataset.

    The image and label filepaths are resolved via `get_cellbindb_paths` and handed over
    to `torch_em.default_segmentation_dataset` as a 2d, non-segmentation-style dataset.

    Args:
        path: Filepath to a folder where the data is downloaded.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
            If 'raw_transform' is not given, a raw transform with `to_rgb` as second augmentation is used.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_cellbindb_paths(path, data_choice, download)

    # A raw transform passed by the caller always takes precedence over the default one.
    if "raw_transform" not in kwargs:
        kwargs["raw_transform"] = torch_em.transform.get_raw_transform(augmentation2=to_rgb)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        patch_shape=patch_shape,
        ndim=2,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset
Get the CellBinDB dataset for cell segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded.
- patch_shape: The patch shape to use for training.
- data_choice: The choice of datasets.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_cellbindb_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], data_choice: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cellbindb_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the CellBinDB dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded.
        batch_size: The batch size passed to the DataLoader.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of datasets.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    return torch_em.get_data_loader(
        get_cellbindb_dataset(path, patch_shape, data_choice, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )
Get the CellBinDB dataloader for cell segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded.
- batch_size: The batch size passed to the DataLoader.
- patch_shape: The patch shape to use for training.
- data_choice: The choice of datasets.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.