torch_em.data.datasets.electron_microscopy.nuc_mm
NucMM is a dataset for the segmentation of nuclei in EM and X-Ray.
This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16. Please cite it if you use this dataset for a publication.
"""NucMM is a dataset for the segmentation of nuclei in EM and X-Ray.

This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16.
Please cite it if you use this dataset for a publication.
"""

import os
from glob import glob
from typing import Tuple, Union, Literal, List

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util


URL = "https://drive.google.com/drive/folders/1_4CrlYvzx0ITnGlJOHdgcTRgeSkm9wT8"


def _extract_split(image_folder, label_folder, output_folder):
    """Merge the image and label h5 files of one split into combined h5 files.

    The h5 files in both folders are sorted and paired by position. For each pair
    the "main" datasets are read and written to a file in `output_folder` that is
    named like the image file, as the datasets "raw" and "labels".

    Args:
        image_folder: Folder with the image h5 files.
        label_folder: Folder with the label h5 files.
        output_folder: Folder for the combined files. It is created if needed.
    """
    import h5py

    os.makedirs(output_folder, exist_ok=True)
    image_files = sorted(glob(os.path.join(image_folder, "*.h5")))
    label_files = sorted(glob(os.path.join(label_folder, "*.h5")))
    # The pairing below is purely positional, so the counts have to match.
    assert len(image_files) == len(label_files)
    for image, label in zip(image_files, label_files):
        with h5py.File(image, "r") as f:
            vol = f["main"][:]
        with h5py.File(label, "r") as f:
            seg = f["main"][:]
        assert vol.shape == seg.shape
        out_path = os.path.join(output_folder, os.path.basename(image))
        with h5py.File(out_path, "a") as f:
            f.create_dataset("raw", data=vol, compression="gzip")
            f.create_dataset("labels", data=seg, compression="gzip")


def get_nuc_mm_data(path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], download: bool) -> str:
    """Download the NucMM training data.

    If the folder `<path>/<sample>` already exists it is returned right away.
    Otherwise the data is downloaded and the 'train' and 'val' splits are
    extracted into that folder.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the training data.
    """
    assert sample in ("mouse", "zebrafish")

    sample_folder = os.path.join(path, sample)
    if os.path.exists(sample_folder):
        return sample_folder

    # Downloading the dataset
    util.download_source_gdrive(path, URL, download, download_type="folder")

    if sample == "mouse":
        input_folder = os.path.join(path, "Mouse (NucMM-M)")
    else:
        input_folder = os.path.join(path, "Zebrafish (NucMM-Z)")
    assert os.path.exists(input_folder), input_folder

    for split in ("train", "val"):
        _extract_split(
            os.path.join(input_folder, "Image", split), os.path.join(input_folder, "Label", split),
            os.path.join(sample_folder, split)
        )
    return sample_folder


def get_nuc_mm_paths(
    path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], split: str, download: bool = False,
) -> List[str]:
    """Get paths to the NucMM data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the stored data.
    """
    sample_folder = get_nuc_mm_data(path, sample, download)
    return sorted(glob(os.path.join(sample_folder, split, "*.h5")))


def get_nuc_mm_dataset(
    path: Union[os.PathLike, str],
    sample: Literal['mouse', 'zebrafish'],
    split: str,
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val")

    paths = get_nuc_mm_paths(path, sample, split, download)

    # The same files are passed for raw data and labels; only the keys differ.
    return torch_em.default_segmentation_dataset(
        raw_paths=paths,
        raw_key="raw",
        label_paths=paths,
        label_key="labels",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )


def get_nuc_mm_loader(
    path: Union[os.PathLike, str],
    sample: Literal['mouse', 'zebrafish'],
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_nuc_mm_dataset(path, sample, split, patch_shape, download, **ds_kwargs)
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
URL =
'https://drive.google.com/drive/folders/1_4CrlYvzx0ITnGlJOHdgcTRgeSkm9wT8'
def
get_nuc_mm_data( path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], download: bool) -> str:
def get_nuc_mm_data(path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], download: bool) -> str:
    """Download and prepare the NucMM data for one sample.

    If the folder `<path>/<sample>` already exists it is returned right away.
    Otherwise the data is downloaded and the 'train' and 'val' splits are
    extracted into that folder.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder with the data for the chosen sample.
    """
    assert sample in ("mouse", "zebrafish")

    sample_folder = os.path.join(path, sample)
    if not os.path.exists(sample_folder):
        # Downloading the dataset
        util.download_source_gdrive(path, URL, download, download_type="folder")

        # The downloaded folder names differ from the short sample names.
        source_name = "Mouse (NucMM-M)" if sample == "mouse" else "Zebrafish (NucMM-Z)"
        input_folder = os.path.join(path, source_name)
        assert os.path.exists(input_folder), input_folder

        for split_name in ("train", "val"):
            _extract_split(
                os.path.join(input_folder, "Image", split_name),
                os.path.join(input_folder, "Label", split_name),
                os.path.join(sample_folder, split_name),
            )

    return sample_folder
Download the NucMM training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the training data.
def
get_nuc_mm_paths( path: os.PathLike, sample: Literal['mouse', 'zebrafish'], split: str, download: bool = False) -> List[str]:
def get_nuc_mm_paths(
    path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], split: str, download: bool = False,
) -> List[str]:
    """Get paths to the NucMM data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        download: Whether to download the data if it is not present.

    Returns:
        The sorted filepaths to the h5 files stored for the requested split.
    """
    # Ensures the data is present and returns the folder of the chosen sample,
    # so the folder does not have to be rebuilt from `path` and `sample` here.
    sample_folder = get_nuc_mm_data(path, sample, download)
    return sorted(glob(os.path.join(sample_folder, split, "*.h5")))
Get paths to the NucMM data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
- split: The split for the dataset, either 'train' or 'val'.
- download: Whether to download the data if it is not present.
Returns:
The filepaths to the stored data.
def
get_nuc_mm_dataset( path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], split: str, patch_shape: Tuple[int, int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_nuc_mm_dataset(
    path: Union[os.PathLike, str],
    sample: Literal['mouse', 'zebrafish'],
    split: str,
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the NucMM dataset for segmenting nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val")

    volume_paths = get_nuc_mm_paths(path, sample, split, download)

    # The same files are passed for raw data and labels; only the keys differ.
    dataset_kwargs = dict(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        is_seg_dataset=True,
    )
    return torch_em.default_segmentation_dataset(**dataset_kwargs, **kwargs)
Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
- split: The split for the dataset, either 'train' or 'val'.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_nuc_mm_loader( path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_nuc_mm_loader(
    path: Union[os.PathLike, str],
    sample: Literal['mouse', 'zebrafish'],
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_nuc_mm_dataset(path, sample, split, patch_shape, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size=batch_size, **loader_kwargs)
Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
- split: The split for the dataset, either 'train' or 'val'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.