torch_em.data.datasets.electron_microscopy.emps

The EMPS dataset contains electron microscopy images of nanoparticles with pixel-level instance segmentation annotations.

It contains 465 TEM/SEM images of nanoparticles sourced from scientific publications, each paired with a 32-bit integer instance segmentation map where each unique value identifies an individual particle (0 = background).

The dataset is available at https://github.com/by256/emps. The dataset was published in https://doi.org/10.1021/acs.jcim.0c01455. Please cite this publication if you use the dataset in your research.

  1"""The EMPS dataset contains electron microscopy images of nanoparticles with
  2pixel-level instance segmentation annotations.
  3
  4It contains 465 TEM/SEM images of nanoparticles sourced from scientific publications,
  5each paired with a 32-bit integer instance segmentation map where each unique value
  6identifies an individual particle (0 = background).
  7
  8The dataset is available at https://github.com/by256/emps.
  9The dataset was published in https://doi.org/10.1021/acs.jcim.0c01455.
 10Please cite this publication if you use the dataset in your research.
 11"""
 12
import os
from glob import glob
from shutil import rmtree
from typing import List, Literal, Tuple, Union

from torch.utils.data import DataLoader, Dataset

import torch_em

from .. import util
 23
 24
 25URL = "https://github.com/by256/emps/archive/refs/heads/main.zip"
 26CHECKSUM = None
 27
 28
 29def _create_h5_files(data_root, split, out_dir):
 30    """Convert PNG image/segmap pairs for the given split into HDF5 files."""
 31    import h5py
 32    import imageio.v3 as imageio
 33
 34    split_csv = os.path.join(data_root, f"{split}.csv")
 35    with open(split_csv) as f:
 36        filenames = [line.strip() for line in f if line.strip()]
 37
 38    # The CSV may or may not include the .png extension.
 39    filenames = [fn if fn.endswith(".png") else f"{fn}.png" for fn in filenames]
 40
 41    os.makedirs(out_dir, exist_ok=True)
 42
 43    for fname in filenames:
 44        img_path = os.path.join(data_root, "images", fname)
 45        seg_path = os.path.join(data_root, "segmaps", fname)
 46
 47        assert os.path.exists(img_path), f"Image not found: {img_path}"
 48        assert os.path.exists(seg_path), f"Segmap not found: {seg_path}"
 49
 50        raw = imageio.imread(img_path)
 51        if raw.ndim == 3:
 52            raw = raw[..., 0]
 53
 54        labels = imageio.imread(seg_path)
 55        if labels.ndim == 3:
 56            labels = labels[..., 0]
 57
 58        stem = os.path.splitext(fname)[0]
 59        out_path = os.path.join(out_dir, f"{stem}.h5")
 60
 61        with h5py.File(out_path, "w") as f:
 62            f.create_dataset("raw", data=raw.astype("uint8"), compression="gzip")
 63            f.create_dataset("labels", data=labels.astype("int32"), compression="gzip")
 64
 65
 66def get_emps_data(
 67    path: Union[os.PathLike, str],
 68    split: Literal["train", "test"],
 69    download: bool = False,
 70) -> str:
 71    """Download and preprocess the EMPS dataset.
 72
 73    Args:
 74        path: Filepath to a folder where the downloaded data will be saved.
 75        split: The data split, either 'train' or 'test'.
 76        download: Whether to download the data if it is not present.
 77
 78    Returns:
 79        The path to the directory containing the HDF5 files for the given split.
 80    """
 81    assert split in ("train", "test"), f"split must be 'train' or 'test', got {split!r}"
 82
 83    out_dir = os.path.join(path, split)
 84    if os.path.exists(out_dir) and len(glob(os.path.join(out_dir, "*.h5"))) > 0:
 85        return out_dir
 86
 87    os.makedirs(path, exist_ok=True)
 88
 89    zip_path = os.path.join(path, "emps.zip")
 90    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
 91
 92    extract_dir = os.path.join(path, "_extracted")
 93    util.unzip(zip_path, extract_dir, remove=True)
 94
 95    # The zip extracts to a single root folder (e.g. "emps-main/").
 96    subdirs = [d for d in os.listdir(extract_dir) if os.path.isdir(os.path.join(extract_dir, d))]
 97    data_root = os.path.join(extract_dir, subdirs[0]) if subdirs else extract_dir
 98
 99    for s in ("train", "test"):
100        _create_h5_files(data_root, s, os.path.join(path, s))
101
102    rmtree(extract_dir)
103
104    return out_dir
105
106
def get_emps_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    download: bool = False,
) -> List[str]:
    """Get paths to the EMPS HDF5 files.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the HDF5 files, sorted by filename.

    Raises:
        RuntimeError: If no HDF5 files are found for the requested split.
    """
    data_dir = get_emps_data(path, split, download)
    paths = sorted(glob(os.path.join(data_dir, "*.h5")))
    # Raise instead of assert so the sanity check survives 'python -O'.
    if len(paths) == 0:
        raise RuntimeError(f"No HDF5 files found in '{data_dir}'")
    return paths
126
127
def get_emps_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the EMPS dataset for nanoparticle instance segmentation in electron microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Treat the HDF5 files as segmentation data and add a binary target
    # channel derived from the instance labels.
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)

    data_paths = get_emps_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths, raw_key="raw",
        label_paths=data_paths, label_key="labels",
        patch_shape=patch_shape, **kwargs,
    )
160
161
def get_emps_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for nanoparticle instance segmentation in the EMPS dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from the loader arguments.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_emps_dataset(path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs)
    return torch_em.get_data_loader(ds, batch_size, **loader_kwargs)
URL = 'https://github.com/by256/emps/archive/refs/heads/main.zip'
CHECKSUM = None
def get_emps_data( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> str:
 67def get_emps_data(
 68    path: Union[os.PathLike, str],
 69    split: Literal["train", "test"],
 70    download: bool = False,
 71) -> str:
 72    """Download and preprocess the EMPS dataset.
 73
 74    Args:
 75        path: Filepath to a folder where the downloaded data will be saved.
 76        split: The data split, either 'train' or 'test'.
 77        download: Whether to download the data if it is not present.
 78
 79    Returns:
 80        The path to the directory containing the HDF5 files for the given split.
 81    """
 82    assert split in ("train", "test"), f"split must be 'train' or 'test', got {split!r}"
 83
 84    out_dir = os.path.join(path, split)
 85    if os.path.exists(out_dir) and len(glob(os.path.join(out_dir, "*.h5"))) > 0:
 86        return out_dir
 87
 88    os.makedirs(path, exist_ok=True)
 89
 90    zip_path = os.path.join(path, "emps.zip")
 91    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
 92
 93    extract_dir = os.path.join(path, "_extracted")
 94    util.unzip(zip_path, extract_dir, remove=True)
 95
 96    # The zip extracts to a single root folder (e.g. "emps-main/").
 97    subdirs = [d for d in os.listdir(extract_dir) if os.path.isdir(os.path.join(extract_dir, d))]
 98    data_root = os.path.join(extract_dir, subdirs[0]) if subdirs else extract_dir
 99
100    for s in ("train", "test"):
101        _create_h5_files(data_root, s, os.path.join(path, s))
102
103    rmtree(extract_dir)
104
105    return out_dir

Download and preprocess the EMPS dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split, either 'train' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

The path to the directory containing the HDF5 files for the given split.

def get_emps_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> List[str]:
108def get_emps_paths(
109    path: Union[os.PathLike, str],
110    split: Literal["train", "test"],
111    download: bool = False,
112) -> List[str]:
113    """Get paths to the EMPS HDF5 files.
114
115    Args:
116        path: Filepath to a folder where the downloaded data will be saved.
117        split: The data split, either 'train' or 'test'.
118        download: Whether to download the data if it is not present.
119
120    Returns:
121        List of filepaths to the HDF5 files.
122    """
123    data_dir = get_emps_data(path, split, download)
124    paths = sorted(glob(os.path.join(data_dir, "*.h5")))
125    assert len(paths) > 0, f"No HDF5 files found in '{data_dir}'"
126    return paths

Get paths to the EMPS HDF5 files.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split, either 'train' or 'test'.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths to the HDF5 files.

def get_emps_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
129def get_emps_dataset(
130    path: Union[os.PathLike, str],
131    patch_shape: Tuple[int, int],
132    split: Literal["train", "test"],
133    download: bool = False,
134    **kwargs,
135) -> Dataset:
136    """Get the EMPS dataset for nanoparticle instance segmentation in electron microscopy.
137
138    Args:
139        path: Filepath to a folder where the downloaded data will be saved.
140        patch_shape: The patch shape to use for training.
141        split: The data split, either 'train' or 'test'.
142        download: Whether to download the data if it is not present.
143        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
144
145    Returns:
146        The segmentation dataset.
147    """
148    paths = get_emps_paths(path, split, download)
149
150    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
151    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
152
153    return torch_em.default_segmentation_dataset(
154        raw_paths=paths,
155        raw_key="raw",
156        label_paths=paths,
157        label_key="labels",
158        patch_shape=patch_shape,
159        **kwargs,
160    )

Get the EMPS dataset for nanoparticle instance segmentation in electron microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The data split, either 'train' or 'test'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_emps_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], batch_size: int, split: Literal['train', 'test'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
163def get_emps_loader(
164    path: Union[os.PathLike, str],
165    patch_shape: Tuple[int, int],
166    batch_size: int,
167    split: Literal["train", "test"],
168    download: bool = False,
169    **kwargs,
170) -> DataLoader:
171    """Get the DataLoader for nanoparticle instance segmentation in the EMPS dataset.
172
173    Args:
174        path: Filepath to a folder where the downloaded data will be saved.
175        patch_shape: The patch shape to use for training.
176        batch_size: The batch size for training.
177        split: The data split, either 'train' or 'test'.
178        download: Whether to download the data if it is not present.
179        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
180
181    Returns:
182        The DataLoader.
183    """
184    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
185    dataset = get_emps_dataset(path, patch_shape, split, download, **ds_kwargs)
186    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the DataLoader for nanoparticle instance segmentation in the EMPS dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • split: The data split, either 'train' or 'test'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.