torch_em.data.datasets.electron_microscopy.densecell
The DenseCell dataset contains annotations for semantic segmentation of densely-packed cellular organelles in serial block-face scanning electron microscopy (SBF-SEM) images of platelet tissue.
The dataset was published in https://doi.org/10.1038/s41598-021-81590-0. Please cite this publication if you use the dataset in your research.
"""The DenseCell dataset contains annotations for semantic segmentation of densely-packed cellular organelles
in serial block-face scanning electron microscopy (SBF-SEM) images of platelet tissue.

The dataset was published in https://doi.org/10.1038/s41598-021-81590-0.
Please cite this publication if you use the dataset in your research.
"""

import os
from shutil import rmtree
from typing import Tuple, Union, Literal, Optional

import numpy as np

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util


URL = "https://www.dropbox.com/s/68yclbraqq1diza/platelet_data_1219.zip?dl=1"
CHECKSUM = None

# Maps the integer label ids of the semantic annotation to organelle names.
ORGANELLES = {
    1: "cell",
    2: "mitochondrion",
    3: "alpha_granule",
    4: "canalicular_vessel",
    5: "dense_granule",
    6: "dense_core",
}

# Source TIFF file names per split. Note that the 'val' split is stored as 'eval-*' in the archive.
SPLIT_FILES = {
    "train": {"images": "train-images.tif", "labels": "train-labels.tif"},
    "val": {"images": "eval-images.tif", "labels": "eval-labels.tif"},
    "test": {"images": "test-images.tif", "labels": "test-labels.tif"},
}


def _densecell_file_is_current(file_path: str) -> bool:
    """Check whether a converted HDF5 file exists and contains the 'labels/original' dataset."""
    import h5py

    if not os.path.exists(file_path):
        return False

    with h5py.File(file_path, "r") as f:
        return "labels/original" in f


def _write_densecell_file(out_path: str, raw: np.ndarray, labels: np.ndarray) -> None:
    """Write the raw data, the semantic labels and one binary mask per organelle to an HDF5 file.

    The data is written to a temporary file first and only moved to `out_path` once it is complete,
    so that an interrupted conversion never leaves a partial file that looks valid.
    """
    import h5py

    tmp_path = f"{out_path}.tmp"
    try:
        with h5py.File(tmp_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels/original", data=labels, compression="gzip")
            for label_id, name in ORGANELLES.items():
                # For cells, use all non-background labels to avoid holes from internal organelles.
                if name == "cell":
                    binary_mask = (labels >= 1).astype(np.uint8)
                else:
                    binary_mask = (labels == label_id).astype(np.uint8)

                f.create_dataset(f"labels/{name}", data=binary_mask, compression="gzip")

        os.replace(tmp_path, out_path)
    finally:
        # Only present here if writing failed before the file was moved into place.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def get_densecell_data(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Download the DenseCell dataset.

    The source TIFF stacks of all splits are converted to one HDF5 file per split
    ('densecell_<split>.h5'), and the extracted source folder is removed afterwards.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to download. Either 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath for the downloaded data.
    """
    import tifffile

    data_path = os.path.join(path, f"densecell_{split}.h5")
    if _densecell_file_is_current(data_path):
        return data_path

    # Fail before downloading anything: an unknown split can never be produced by the conversion below.
    assert split in SPLIT_FILES, f"'{split}' is not a valid split. Choose from {list(SPLIT_FILES)}."

    # Remove old file with outdated structure.
    if os.path.exists(data_path):
        os.remove(data_path)

    os.makedirs(path, exist_ok=True)

    # Download and extract the ZIP if the source TIFFs are not available.
    platelet_dir = os.path.join(path, "platelet_data")
    if not os.path.exists(platelet_dir):
        zip_path = os.path.join(path, "platelet_data_1219.zip")
        util.download_source(zip_path, URL, download, checksum=CHECKSUM)
        util.unzip(zip_path, path, remove=True)

    assert os.path.exists(platelet_dir), f"Expected extracted directory at {platelet_dir}"

    # Convert all splits at once, because the extracted sources are deleted afterwards.
    for _split, files in SPLIT_FILES.items():
        out_path = os.path.join(path, f"densecell_{_split}.h5")
        if _densecell_file_is_current(out_path):
            continue

        if os.path.exists(out_path):
            os.remove(out_path)

        raw = tifffile.imread(os.path.join(platelet_dir, files["images"]))
        labels = tifffile.imread(os.path.join(platelet_dir, files["labels"]))
        assert raw.shape == labels.shape, f"Shape mismatch for {_split}: {raw.shape} vs {labels.shape}"

        _write_densecell_file(out_path, raw, labels.astype(np.uint8))

    rmtree(platelet_dir)

    assert os.path.exists(data_path), data_path
    return data_path


def get_densecell_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Get paths to the DenseCell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath for the stored data.
    """
    return get_densecell_data(path, split, download)


def get_densecell_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int, int],
    label_choice: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get dataset for segmentation of organelles in SBF-SEM platelet images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        patch_shape: The patch shape to use for training.
        label_choice: The organelle to segment. Available choices are:
            'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'.
            If None, uses 'original' which contains all semantic labels (0-6).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")

    if label_choice is not None:
        valid_choices = list(ORGANELLES.values())
        assert label_choice in valid_choices, f"'{label_choice}' is not valid. Choose from {valid_choices}."

    label_key = "labels/original" if label_choice is None else f"labels/{label_choice}"

    # Raw data and labels are stored in the same HDF5 file.
    volume_path = get_densecell_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_path,
        raw_key="raw",
        label_paths=volume_path,
        label_key=label_key,
        patch_shape=patch_shape,
        **kwargs
    )


def get_densecell_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    label_choice: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get dataloader for segmentation of organelles in SBF-SEM platelet images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        label_choice: The organelle to segment. Available choices are:
            'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'.
            If None, uses 'original' which contains all semantic labels (0-6).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The PyTorch DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_densecell_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        label_choice=label_choice,
        download=download,
        **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
'https://www.dropbox.com/s/68yclbraqq1diza/platelet_data_1219.zip?dl=1'
CHECKSUM =
None
ORGANELLES =
{1: 'cell', 2: 'mitochondrion', 3: 'alpha_granule', 4: 'canalicular_vessel', 5: 'dense_granule', 6: 'dense_core'}
SPLIT_FILES =
{'train': {'images': 'train-images.tif', 'labels': 'train-labels.tif'}, 'val': {'images': 'eval-images.tif', 'labels': 'eval-labels.tif'}, 'test': {'images': 'test-images.tif', 'labels': 'test-labels.tif'}}
def
get_densecell_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str:
def _densecell_file_is_current(file_path: str) -> bool:
    """Check whether a converted HDF5 file exists and contains the 'labels/original' dataset."""
    import h5py

    if not os.path.exists(file_path):
        return False

    with h5py.File(file_path, "r") as f:
        return "labels/original" in f


def _write_densecell_file(out_path: str, raw: np.ndarray, labels: np.ndarray) -> None:
    """Write the raw data, the semantic labels and one binary mask per organelle to an HDF5 file.

    The data is written to a temporary file first and only moved to `out_path` once it is complete,
    so that an interrupted conversion never leaves a partial file that looks valid.
    """
    import h5py

    tmp_path = f"{out_path}.tmp"
    try:
        with h5py.File(tmp_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels/original", data=labels, compression="gzip")
            for label_id, name in ORGANELLES.items():
                # For cells, use all non-background labels to avoid holes from internal organelles.
                if name == "cell":
                    binary_mask = (labels >= 1).astype(np.uint8)
                else:
                    binary_mask = (labels == label_id).astype(np.uint8)

                f.create_dataset(f"labels/{name}", data=binary_mask, compression="gzip")

        os.replace(tmp_path, out_path)
    finally:
        # Only present here if writing failed before the file was moved into place.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def get_densecell_data(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Download the DenseCell dataset.

    The source TIFF stacks of all splits are converted to one HDF5 file per split
    ('densecell_<split>.h5'), and the extracted source folder is removed afterwards.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to download. Either 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath for the downloaded data.
    """
    import tifffile

    data_path = os.path.join(path, f"densecell_{split}.h5")
    if _densecell_file_is_current(data_path):
        return data_path

    # Fail before downloading anything: an unknown split can never be produced by the conversion below.
    assert split in SPLIT_FILES, f"'{split}' is not a valid split. Choose from {list(SPLIT_FILES)}."

    # Remove old file with outdated structure.
    if os.path.exists(data_path):
        os.remove(data_path)

    os.makedirs(path, exist_ok=True)

    # Download and extract the ZIP if the source TIFFs are not available.
    platelet_dir = os.path.join(path, "platelet_data")
    if not os.path.exists(platelet_dir):
        zip_path = os.path.join(path, "platelet_data_1219.zip")
        util.download_source(zip_path, URL, download, checksum=CHECKSUM)
        util.unzip(zip_path, path, remove=True)

    assert os.path.exists(platelet_dir), f"Expected extracted directory at {platelet_dir}"

    # Convert all splits at once, because the extracted sources are deleted afterwards.
    for _split, files in SPLIT_FILES.items():
        out_path = os.path.join(path, f"densecell_{_split}.h5")
        if _densecell_file_is_current(out_path):
            continue

        if os.path.exists(out_path):
            os.remove(out_path)

        raw = tifffile.imread(os.path.join(platelet_dir, files["images"]))
        labels = tifffile.imread(os.path.join(platelet_dir, files["labels"]))
        assert raw.shape == labels.shape, f"Shape mismatch for {_split}: {raw.shape} vs {labels.shape}"

        _write_densecell_file(out_path, raw, labels.astype(np.uint8))

    rmtree(platelet_dir)

    assert os.path.exists(data_path), data_path
    return data_path
Download the DenseCell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to download. Either 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
Returns:
The filepath for the downloaded data.
def
get_densecell_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str:
def get_densecell_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Get paths to the DenseCell data.

    Makes sure the data for the requested split is available (see `get_densecell_data`)
    and returns the location of its HDF5 file.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath for the stored data.
    """
    # `get_densecell_data` returns the path of 'densecell_<split>.h5' inside `path`.
    return get_densecell_data(path, split, download)
Get paths to the DenseCell data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split. Either 'train', 'val', or 'test'.
- download: Whether to download the data if it is not present.
Returns:
The filepath for the stored data.
def
get_densecell_dataset( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], patch_shape: Tuple[int, int, int], label_choice: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_densecell_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int, int],
    label_choice: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get dataset for segmentation of organelles in SBF-SEM platelet images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        patch_shape: The patch shape to use for training.
        label_choice: The organelle to segment. Available choices are:
            'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'.
            If None, uses 'original' which contains all semantic labels (0-6).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")

    # Validate the requested organelle before touching the data on disk.
    if label_choice is not None:
        valid_choices = list(ORGANELLES.values())
        assert label_choice in valid_choices, f"'{label_choice}' is not valid. Choose from {valid_choices}."

    label_key = "labels/original" if label_choice is None else f"labels/{label_choice}"

    # Raw data and labels are stored in the same HDF5 file.
    volume_path = get_densecell_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_path,
        raw_key="raw",
        label_paths=volume_path,
        label_key=label_key,
        patch_shape=patch_shape,
        **kwargs
    )
Get dataset for segmentation of organelles in SBF-SEM platelet images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split. Either 'train', 'val', or 'test'.
- patch_shape: The patch shape to use for training.
- label_choice: The organelle to segment. Available choices are: 'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'. If None, uses 'original' which contains all semantic labels (0-6).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_densecell_loader( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], patch_shape: Tuple[int, int, int], batch_size: int, label_choice: Optional[str] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_densecell_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    label_choice: Optional[str] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get dataloader for segmentation of organelles in SBF-SEM platelet images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train', 'val', or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        label_choice: The organelle to segment. Available choices are:
            'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'.
            If None, uses 'original' which contains all semantic labels (0-6).
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The PyTorch DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_densecell_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        label_choice=label_choice,
        download=download,
        **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get dataloader for segmentation of organelles in SBF-SEM platelet images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split. Either 'train', 'val', or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- label_choice: The organelle to segment. Available choices are: 'cell', 'mitochondrion', 'alpha_granule', 'canalicular_vessel', 'dense_granule', 'dense_core'. If None, uses 'original' which contains all semantic labels (0-6).
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The PyTorch DataLoader.