torch_em.data.datasets.electron_microscopy.nuc_mm

NucMM is a dataset for the segmentation of nuclei in EM and X-Ray.

This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16. Please cite it if you use this dataset for a publication.

  1"""NucMM is a dataset for the segmentation of nuclei in EM and X-Ray.
  2
  3This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16.
  4Please cite it if you use this dataset for a publication.
  5"""
  6
  7import os
  8from glob import glob
  9from typing import Tuple, Union, Literal, List
 10
 11import torch_em
 12
 13from torch.utils.data import Dataset, DataLoader
 14
 15from .. import util
 16
 17
# Google Drive folder link that `get_nuc_mm_data` hands to `util.download_source_gdrive`
# (with `download_type="folder"`) to fetch the NucMM data.
URL = "https://drive.google.com/drive/folders/1_4CrlYvzx0ITnGlJOHdgcTRgeSkm9wT8"
 19
 20
 21def _extract_split(image_folder, label_folder, output_folder):
 22    import h5py
 23
 24    os.makedirs(output_folder, exist_ok=True)
 25    image_files = sorted(glob(os.path.join(image_folder, "*.h5")))
 26    label_files = sorted(glob(os.path.join(label_folder, "*.h5")))
 27    assert len(image_files) == len(label_files)
 28    for image, label in zip(image_files, label_files):
 29        with h5py.File(image, "r") as f:
 30            vol = f["main"][:]
 31        with h5py.File(label, "r") as f:
 32            seg = f["main"][:]
 33        assert vol.shape == seg.shape
 34        out_path = os.path.join(output_folder, os.path.basename(image))
 35        with h5py.File(out_path, "a") as f:
 36            f.create_dataset("raw", data=vol, compression="gzip")
 37            f.create_dataset("labels", data=seg, compression="gzip")
 38
 39
 40def get_nuc_mm_data(path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], download: bool) -> str:
 41    """Download the NucMM training data.
 42
 43    Args:
 44        path: Filepath to a folder where the downloaded data will be saved.
 45        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
 46        download: Whether to download the data if it is not present.
 47
 48    Returns:
 49        The filepath to the training data.
 50    """
 51    assert sample in ("mouse", "zebrafish")
 52
 53    sample_folder = os.path.join(path, sample)
 54    if os.path.exists(sample_folder):
 55        return sample_folder
 56
 57    # Downloading the dataset
 58    util.download_source_gdrive(path, URL, download, download_type="folder")
 59
 60    if sample == "mouse":
 61        input_folder = os.path.join(path, "Mouse (NucMM-M)")
 62    else:
 63        input_folder = os.path.join(path, "Zebrafish (NucMM-Z)")
 64    assert os.path.exists(input_folder), input_folder
 65
 66    sample_folder = os.path.join(path, sample)
 67    _extract_split(
 68        os.path.join(input_folder, "Image", "train"), os.path.join(input_folder, "Label", "train"),
 69        os.path.join(sample_folder, "train")
 70    )
 71    _extract_split(
 72        os.path.join(input_folder, "Image", "val"), os.path.join(input_folder, "Label", "val"),
 73        os.path.join(sample_folder, "val")
 74    )
 75    return sample_folder
 76
 77
 78def get_nuc_mm_paths(
 79    path: Union[os.PathLike], sample: Literal['mouse', 'zebrafish'], split: str, download: bool = False,
 80) -> List[str]:
 81    """Get paths to the NucMM data.
 82
 83    Args:
 84        path: Filepath to a folder where the downloaded data will be saved.
 85        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
 86        split: The split for the dataset, either 'train' or 'val'.
 87        download: Whether to download the data if it is not present.
 88
 89    Returns:
 90        The filepaths to the stored data.
 91    """
 92    get_nuc_mm_data(path, sample, download)
 93    split_folder = os.path.join(path, sample, split)
 94    paths = sorted(glob(os.path.join(split_folder, "*.h5")))
 95    return paths
 96
 97
 98def get_nuc_mm_dataset(
 99    path: Union[os.PathLike, str],
100    sample: Literal['mouse', 'zebrafish'],
101    split: str,
102    patch_shape: Tuple[int, int, int],
103    download: bool = False,
104    **kwargs
105) -> Dataset:
106    """Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.
107
108    Args:
109        path: Filepath to a folder where the downloaded data will be saved.
110        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
111        split: The split for the dataset, either 'train' or 'val'.
112        patch_shape: The patch shape to use for training.
113        download: Whether to download the data if it is not present.
114        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
115
116    Returns:
117       The segmentation dataset.
118    """
119    assert split in ("train", "val")
120
121    paths = get_nuc_mm_paths(path, sample, split, download)
122
123    return torch_em.default_segmentation_dataset(
124        raw_paths=paths,
125        raw_key="raw",
126        label_paths=paths,
127        label_key="labels",
128        patch_shape=patch_shape,
129        is_seg_dataset=True,
130        **kwargs
131    )
132
133
def get_nuc_mm_loader(
    path: Union[os.PathLike, str],
    sample: Literal['mouse', 'zebrafish'],
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use, either 'mouse' or 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_nuc_mm_dataset(
        path=path, sample=sample, split=split, patch_shape=patch_shape, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
URL = 'https://drive.google.com/drive/folders/1_4CrlYvzx0ITnGlJOHdgcTRgeSkm9wT8'
def get_nuc_mm_data( path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], download: bool) -> str:
def get_nuc_mm_data(path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], download: bool) -> str:
    """Download the NucMM data and convert the requested sample into per-split HDF5 files.

    If the folder `<path>/<sample>` already exists, it is returned as-is without
    downloading or converting anything.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use, either 'mouse' or 'zebrafish'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder with the converted data of the sample.
    """
    assert sample in ("mouse", "zebrafish")

    sample_folder = os.path.join(path, sample)
    if not os.path.exists(sample_folder):
        # Fetch the whole Google Drive folder; both samples are part of it.
        util.download_source_gdrive(path, URL, download, download_type="folder")

        source_name = "Mouse (NucMM-M)" if sample == "mouse" else "Zebrafish (NucMM-Z)"
        input_folder = os.path.join(path, source_name)
        assert os.path.exists(input_folder), input_folder

        # Images and labels are stored in separate trees; merge them split by split.
        for split_name in ("train", "val"):
            _extract_split(
                os.path.join(input_folder, "Image", split_name),
                os.path.join(input_folder, "Label", split_name),
                os.path.join(sample_folder, split_name),
            )

    return sample_folder

Download the NucMM training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the training data.

def get_nuc_mm_paths( path: os.PathLike, sample: Literal['mouse', 'zebrafish'], split: str, download: bool = False) -> List[str]:
def get_nuc_mm_paths(
    path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], split: str, download: bool = False,
) -> List[str]:
    """Get paths to the NucMM data.

    The data is downloaded and converted via `get_nuc_mm_data` if necessary.
    The split name is not validated here; a split without a matching folder
    yields an empty list.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use, either 'mouse' or 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        download: Whether to download the data if it is not present.

    Returns:
        The sorted filepaths to the stored data of the requested split.
    """
    # `get_nuc_mm_data` returns `os.path.join(path, sample)`, the root of the converted data.
    sample_folder = get_nuc_mm_data(path, sample, download)
    split_folder = os.path.join(sample_folder, split)
    return sorted(glob(os.path.join(split_folder, "*.h5")))

Get paths to the NucMM data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
  • split: The split for the dataset, either 'train' or 'val'.
  • download: Whether to download the data if it is not present.
Returns:

The filepaths to the stored data.

def get_nuc_mm_dataset( path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], split: str, patch_shape: Tuple[int, int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 99def get_nuc_mm_dataset(
100    path: Union[os.PathLike, str],
101    sample: Literal['mouse', 'zebrafish'],
102    split: str,
103    patch_shape: Tuple[int, int, int],
104    download: bool = False,
105    **kwargs
106) -> Dataset:
107    """Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.
108
109    Args:
110        path: Filepath to a folder where the downloaded data will be saved.
111        sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
112        split: The split for the dataset, either 'train' or 'val'.
113        patch_shape: The patch shape to use for training.
114        download: Whether to download the data if it is not present.
115        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
116
117    Returns:
118       The segmentation dataset.
119    """
120    assert split in ("train", "val")
121
122    paths = get_nuc_mm_paths(path, sample, split, download)
123
124    return torch_em.default_segmentation_dataset(
125        raw_paths=paths,
126        raw_key="raw",
127        label_paths=paths,
128        label_key="labels",
129        patch_shape=patch_shape,
130        is_seg_dataset=True,
131        **kwargs
132    )

Get the NucMM dataset for the segmentation of nuclei in X-Ray and EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
  • split: The split for the dataset, either 'train' or 'val'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_nuc_mm_loader( path: Union[os.PathLike, str], sample: Literal['mouse', 'zebrafish'], split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_nuc_mm_loader(
    path: Union[os.PathLike, str],
    sample: Literal['mouse', 'zebrafish'],
    split: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        sample: The NucMM sample to use, either 'mouse' or 'zebrafish'.
        split: The split for the dataset, either 'train' or 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_nuc_mm_dataset(
        path=path, sample=sample, split=split, patch_shape=patch_shape, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)

Get the NucMM dataloader for the segmentation of nuclei in X-Ray and EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • sample: The NucMM samples to use. The available samples are 'mouse' and 'zebrafish'.
  • split: The split for the dataset, either 'train' or 'val'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.