torch_em.data.datasets.electron_microscopy.nuc_mm

NucMM is a dataset for the segmentation of nuclei in EM and X-Ray.

This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16. Please cite it if you use this dataset for a publication.

  1"""NucMM is a dataset for the segmentation of nuclei in EM and X-Ray.
  2
  3This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16.
  4Please cite it if you use this dataset for a publication.
  5"""
  6
  7
  8import os
  9from glob import glob
 10from typing import Tuple, Union
 11
 12import h5py
 13import torch_em
 14from torch.utils.data import Dataset, DataLoader
 15
 16from .. import util
 17
 18URL = "https://drive.google.com/drive/folders/1_4CrlYvzx0ITnGlJOHdgcTRgeSkm9wT8"
 19
 20
 21def _extract_split(image_folder, label_folder, output_folder):
 22    os.makedirs(output_folder, exist_ok=True)
 23    image_files = sorted(glob(os.path.join(image_folder, "*.h5")))
 24    label_files = sorted(glob(os.path.join(label_folder, "*.h5")))
 25    assert len(image_files) == len(label_files)
 26    for image, label in zip(image_files, label_files):
 27        with h5py.File(image, "r") as f:
 28            vol = f["main"][:]
 29        with h5py.File(label, "r") as f:
 30            seg = f["main"][:]
 31        assert vol.shape == seg.shape
 32        out_path = os.path.join(output_folder, os.path.basename(image))
 33        with h5py.File(out_path, "a") as f:
 34            f.create_dataset("raw", data=vol, compression="gzip")
 35            f.create_dataset("labels", data=seg, compression="gzip")
 36
 37
 38def get_nuc_mm_data(path: Union[os.PathLike, str], sample: str, download: bool) -> str:
 39    """Download the NucMM training data.
 40
 41    Args:
 42        path: Filepath to a folder where the downloaded data will be saved.
 43        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
 44        download: Whether to download the data if it is not present.
 45
 46    Returns:
 47        The filepath to the training data.
 48    """
 49    assert sample in ("mouse", "zebrafish")
 50
 51    sample_folder = os.path.join(path, sample)
 52    if os.path.exists(sample_folder):
 53        return sample_folder
 54
 55    # Downloading the dataset
 56    util.download_source_gdrive(path, URL, download, download_type="folder")
 57
 58    if sample == "mouse":
 59        input_folder = os.path.join(path, "Mouse (NucMM-M)")
 60    else:
 61        input_folder = os.path.join(path, "Zebrafish (NucMM-Z)")
 62    assert os.path.exists(input_folder), input_folder
 63
 64    sample_folder = os.path.join(path, sample)
 65    _extract_split(
 66        os.path.join(input_folder, "Image", "train"), os.path.join(input_folder, "Label", "train"),
 67        os.path.join(sample_folder, "train")
 68    )
 69    _extract_split(
 70        os.path.join(input_folder, "Image", "val"), os.path.join(input_folder, "Label", "val"),
 71        os.path.join(sample_folder, "val")
 72    )
 73    return sample_folder
 74
 75
 76def get_nuc_mm_dataset(
 77    path: Union[os.PathLike, str],
 78    sample: str,
 79    split: str,
 80    patch_shape: Tuple[int, int, int],
 81    download: bool = False,
 82    **kwargs
 83) -> Dataset:
 84    """Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.
 85
 86    Args:
 87        path: Filepath to a folder where the downloaded data will be saved.
 88        sample: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 89        split: The split for the dataset, either 'train' or 'val'.
 90        patch_shape: The patch shape to use for training.
 91        download: Whether to download the data if it is not present.
 92        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
 93
 94    Returns:
 95       The segmentation dataset.
 96    """
 97    assert split in ("train", "val")
 98
 99    sample_folder = get_nuc_mm_data(path, sample, download)
100    split_folder = os.path.join(sample_folder, split)
101    paths = sorted(glob(os.path.join(split_folder, "*.h5")))
102
103    raw_key, label_key = "raw", "labels"
104    return torch_em.default_segmentation_dataset(
105        paths, raw_key, paths, label_key, patch_shape, is_seg_dataset=True, **kwargs
106    )
107
108
def get_nuc_mm_loader(
    path: Union[os.PathLike, str],
    sample: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_nuc_mm_dataset(path, sample, split, patch_shape, download=download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size=batch_size, **loader_kwargs)
URL = 'https://drive.google.com/drive/folders/1_4CrlYvzx0ITnGlJOHdgcTRgeSkm9wT8'
def get_nuc_mm_data(path: Union[os.PathLike, str], sample: str, download: bool) -> str:
def get_nuc_mm_data(path: Union[os.PathLike, str], sample: str, download: bool) -> str:
    """Download the NucMM data and convert it into per-split HDF5 files.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder of the chosen sample. It contains the
        subfolders 'train' and 'val' with the converted data.

    Raises:
        ValueError: If `sample` is not one of 'mouse' or 'zebrafish'.
        RuntimeError: If the expected sample folder is missing after the download step.
    """
    if sample not in ("mouse", "zebrafish"):
        raise ValueError(f"Invalid sample '{sample}', choose one of 'mouse' or 'zebrafish'.")

    # NOTE: the mere existence of the sample folder is treated as a finished
    # conversion; its content is not checked.
    sample_folder = os.path.join(path, sample)
    if os.path.exists(sample_folder):
        return sample_folder

    # Downloading the dataset
    util.download_source_gdrive(path, URL, download, download_type="folder")

    folder_name = "Mouse (NucMM-M)" if sample == "mouse" else "Zebrafish (NucMM-Z)"
    input_folder = os.path.join(path, folder_name)
    if not os.path.exists(input_folder):
        raise RuntimeError(f"Could not find the downloaded data at '{input_folder}'.")

    # Combine the separate image and label files of each split.
    for split in ("train", "val"):
        _extract_split(
            os.path.join(input_folder, "Image", split), os.path.join(input_folder, "Label", split),
            os.path.join(sample_folder, split)
        )
    return sample_folder

Download the NucMM training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the training data.

def get_nuc_mm_dataset( path: Union[os.PathLike, str], sample: str, split: str, patch_shape: Tuple[int, int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 77def get_nuc_mm_dataset(
 78    path: Union[os.PathLike, str],
 79    sample: str,
 80    split: str,
 81    patch_shape: Tuple[int, int, int],
 82    download: bool = False,
 83    **kwargs
 84) -> Dataset:
 85    """Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.
 86
 87    Args:
 88        path: Filepath to a folder where the downloaded data will be saved.
 89        sample: The CREMI samples to use. The available samples are 'A', 'B', 'C'.
 90        split: The split for the dataset, either 'train' or 'val'.
 91        patch_shape: The patch shape to use for training.
 92        download: Whether to download the data if it is not present.
 93        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
 94
 95    Returns:
 96       The segmentation dataset.
 97    """
 98    assert split in ("train", "val")
 99
100    sample_folder = get_nuc_mm_data(path, sample, download)
101    split_folder = os.path.join(sample_folder, split)
102    paths = sorted(glob(os.path.join(split_folder, "*.h5")))
103
104    raw_key, label_key = "raw", "labels"
105    return torch_em.default_segmentation_dataset(
106        paths, raw_key, paths, label_key, patch_shape, is_seg_dataset=True, **kwargs
107    )

Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
  • split: The split for the dataset, either 'train' or 'val'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_nuc_mm_loader( path: Union[os.PathLike, str], sample: str, split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_nuc_mm_loader(
    path: Union[os.PathLike, str],
    sample: str,
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_nuc_mm_dataset(path, sample, split, patch_shape, download=download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size=batch_size, **loader_kwargs)

Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • sample: The NucMM sample to use. The available samples are 'mouse' and 'zebrafish'.
  • split: The split for the dataset, either 'train' or 'val'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.