torch_em.data.datasets.electron_microscopy.nuc_mm
NucMM is a dataset for the segmentation of nuclei in EM and X-Ray.
This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16. Please cite it if you use this dataset for a publication.
"""NucMM is a dataset for the segmentation of nuclei in EM and X-Ray.

This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16.
Please cite it if you use this dataset for a publication.
"""


import os
import shutil
from glob import glob
from typing import Tuple, Union

import h5py
import torch_em
from torch.utils.data import Dataset, DataLoader

from .. import util

URL = "https://drive.google.com/drive/folders/1_4CrlYvzx0ITnGlJOHdgcTRgeSkm9wT8"


def _extract_split(image_folder, label_folder, output_folder):
    """Merge paired image / label volumes of one split into combined HDF5 files.

    Every `*.h5` file in `image_folder` is paired (by sorted order) with a `*.h5` file
    in `label_folder`. The `main` dataset of each is read and written to a new file in
    `output_folder` (named like the image file) as the datasets `raw` and `labels`.

    Args:
        image_folder: Folder with the image volumes.
        label_folder: Folder with the label volumes.
        output_folder: Folder where the combined files are written. It is created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)
    image_files = sorted(glob(os.path.join(image_folder, "*.h5")))
    label_files = sorted(glob(os.path.join(label_folder, "*.h5")))
    assert len(image_files) == len(label_files), (
        f"Found {len(image_files)} image files but {len(label_files)} label files."
    )
    for image, label in zip(image_files, label_files):
        with h5py.File(image, "r") as f:
            vol = f["main"][:]
        with h5py.File(label, "r") as f:
            seg = f["main"][:]
        assert vol.shape == seg.shape, f"Shape mismatch for {image}: {vol.shape} != {seg.shape}"
        out_path = os.path.join(output_folder, os.path.basename(image))
        # Open with "w" instead of "a": the file is written from scratch, so a left-over
        # file from an interrupted run must be replaced rather than appended to
        # (appending would fail because the datasets already exist).
        with h5py.File(out_path, "w") as f:
            f.create_dataset("raw", data=vol, compression="gzip")
            f.create_dataset("labels", data=seg, compression="gzip")


def get_nuc_mm_data(path: Union[os.PathLike, str], sample: str, download: bool) -> str:
    """Download the NucMM training data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the training data.
    """
    assert sample in ("mouse", "zebrafish"), f"Invalid sample {sample!r}, choose 'mouse' or 'zebrafish'."

    sample_folder = os.path.join(path, sample)
    if os.path.exists(sample_folder):
        return sample_folder

    # Downloading the dataset
    util.download_source_gdrive(path, URL, download, download_type="folder")

    folder_names = {"mouse": "Mouse (NucMM-M)", "zebrafish": "Zebrafish (NucMM-Z)"}
    input_folder = os.path.join(path, folder_names[sample])
    assert os.path.exists(input_folder), input_folder

    # Extract into a staging folder and rename it once everything is written.
    # Otherwise an interrupted extraction would leave a partial `sample_folder`
    # behind, which the existence check above would treat as complete.
    staging_folder = f"{sample_folder}.partial"
    if os.path.exists(staging_folder):
        shutil.rmtree(staging_folder)
    for split in ("train", "val"):
        _extract_split(
            os.path.join(input_folder, "Image", split),
            os.path.join(input_folder, "Label", split),
            os.path.join(staging_folder, split),
        )
    os.rename(staging_folder, sample_folder)
    return sample_folder


def get_nuc_mm_dataset(
    path: Union[os.PathLike, str],
    sample: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val"), f"Invalid split {split!r}, choose 'train' or 'val'."

    sample_folder = get_nuc_mm_data(path, sample, download)
    split_folder = os.path.join(sample_folder, split)
    paths = sorted(glob(os.path.join(split_folder, "*.h5")))

    # Raw data and labels are stored in the same files, under different keys.
    raw_key, label_key = "raw", "labels"
    return torch_em.default_segmentation_dataset(
        paths, raw_key, paths, label_key, patch_shape, is_seg_dataset=True, **kwargs
    )


def get_nuc_mm_loader(
    path: Union[os.PathLike, str],
    sample: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    ds = get_nuc_mm_dataset(path, sample, split, patch_shape, download, **ds_kwargs)
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
URL =
'https://drive.google.com/drive/folders/1_4CrlYvzx0ITnGlJOHdgcTRgeSkm9wT8'
def
get_nuc_mm_data(path: Union[os.PathLike, str], sample: str, download: bool) -> str:
def get_nuc_mm_data(path: Union[os.PathLike, str], sample: str, download: bool) -> str:
    """Download the NucMM training data.

    If the folder for the requested sample already exists below `path`, it is returned
    as is. Otherwise the data is fetched via `util.download_source_gdrive` and the
    'train' and 'val' splits are converted with `_extract_split`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the training data.
    """
    import shutil  # Local import: only needed for cleaning up a stale staging folder.

    assert sample in ("mouse", "zebrafish"), f"Invalid sample {sample!r}, choose 'mouse' or 'zebrafish'."

    sample_folder = os.path.join(path, sample)
    if os.path.exists(sample_folder):
        return sample_folder

    # Downloading the dataset
    util.download_source_gdrive(path, URL, download, download_type="folder")

    folder_names = {"mouse": "Mouse (NucMM-M)", "zebrafish": "Zebrafish (NucMM-Z)"}
    input_folder = os.path.join(path, folder_names[sample])
    assert os.path.exists(input_folder), input_folder

    # Extract into a staging folder and rename it once everything is written.
    # Otherwise an interrupted extraction would leave a partial `sample_folder`
    # behind, which the existence check above would treat as complete.
    staging_folder = f"{sample_folder}.partial"
    if os.path.exists(staging_folder):
        shutil.rmtree(staging_folder)
    for split in ("train", "val"):
        _extract_split(
            os.path.join(input_folder, "Image", split),
            os.path.join(input_folder, "Label", split),
            os.path.join(staging_folder, split),
        )
    os.rename(staging_folder, sample_folder)
    return sample_folder
Download the NucMM training data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the training data.
def
get_nuc_mm_dataset( path: Union[os.PathLike, str], sample: str, split: str, patch_shape: Tuple[int, int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_nuc_mm_dataset(
    path: Union[os.PathLike, str],
    sample: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val"), f"Invalid split {split!r}, choose 'train' or 'val'."

    sample_folder = get_nuc_mm_data(path, sample, download)
    split_folder = os.path.join(sample_folder, split)
    paths = sorted(glob(os.path.join(split_folder, "*.h5")))

    # Raw data and labels are stored in the same files, under different keys.
    raw_key, label_key = "raw", "labels"
    return torch_em.default_segmentation_dataset(
        paths, raw_key, paths, label_key, patch_shape, is_seg_dataset=True, **kwargs
    )
Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
- split: The split for the dataset, either 'train' or 'val'.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_nuc_mm_loader( path: Union[os.PathLike, str], sample: str, split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_nuc_mm_loader(
    path: Union[os.PathLike, str],
    sample: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments understood by the dataset factory from the
    # remaining ones, which are forwarded to the data loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    ds = get_nuc_mm_dataset(path, sample, split, patch_shape, download, **ds_kwargs)
    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
- split: The split for the dataset, either 'train' or 'val'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.