torch_em.data.datasets.electron_microscopy.emps
The EMPS dataset contains electron microscopy images of nanoparticles with pixel-level instance segmentation annotations.
It contains 465 TEM/SEM images of nanoparticles sourced from scientific publications, each paired with a 32-bit integer instance segmentation map where each unique value identifies an individual particle (0 = background).
The dataset is available at https://github.com/by256/emps. The dataset was published in https://doi.org/10.1021/acs.jcim.0c01455. Please cite this publication if you use the dataset in your research.
"""The EMPS dataset contains electron microscopy images of nanoparticles with
pixel-level instance segmentation annotations.

It contains 465 TEM/SEM images of nanoparticles sourced from scientific publications,
each paired with a 32-bit integer instance segmentation map where each unique value
identifies an individual particle (0 = background).

The dataset is available at https://github.com/by256/emps.
The dataset was published in https://doi.org/10.1021/acs.jcim.0c01455.
Please cite this publication if you use the dataset in your research.
"""

import os
from glob import glob
from shutil import rmtree
from typing import List, Literal, Tuple, Union

from torch.utils.data import DataLoader, Dataset

import torch_em

from .. import util


URL = "https://github.com/by256/emps/archive/refs/heads/main.zip"
CHECKSUM = None


def _create_h5_files(data_root, split, out_dir):
    """Convert PNG image/segmap pairs for the given split into HDF5 files.

    Reads the filenames for the split from `<split>.csv` in `data_root`,
    loads each image from `images/` and its instance map from `segmaps/`,
    and writes one HDF5 file per pair with datasets 'raw' (uint8) and
    'labels' (int32) into `out_dir`.
    """
    # Imported lazily so the module can be imported without these optional deps.
    import h5py
    import imageio.v3 as imageio

    split_csv = os.path.join(data_root, f"{split}.csv")
    with open(split_csv) as f:
        filenames = [line.strip() for line in f if line.strip()]

    # The CSV may or may not include the .png extension.
    filenames = [fn if fn.endswith(".png") else f"{fn}.png" for fn in filenames]

    os.makedirs(out_dir, exist_ok=True)

    for fname in filenames:
        img_path = os.path.join(data_root, "images", fname)
        seg_path = os.path.join(data_root, "segmaps", fname)

        # Raise real exceptions instead of 'assert', so the checks
        # are not stripped when running under 'python -O'.
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image not found: {img_path}")
        if not os.path.exists(seg_path):
            raise FileNotFoundError(f"Segmap not found: {seg_path}")

        raw = imageio.imread(img_path)
        # Keep only the first channel of multi-channel inputs.
        if raw.ndim == 3:
            raw = raw[..., 0]

        labels = imageio.imread(seg_path)
        if labels.ndim == 3:
            labels = labels[..., 0]

        stem = os.path.splitext(fname)[0]
        out_path = os.path.join(out_dir, f"{stem}.h5")

        with h5py.File(out_path, "w") as f:
            f.create_dataset("raw", data=raw.astype("uint8"), compression="gzip")
            f.create_dataset("labels", data=labels.astype("int32"), compression="gzip")


def get_emps_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    download: bool = False,
) -> str:
    """Download and preprocess the EMPS dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The path to the directory containing the HDF5 files for the given split.

    Raises:
        ValueError: If `split` is not 'train' or 'test'.
    """
    # Explicit raise instead of 'assert' so validation survives 'python -O'.
    if split not in ("train", "test"):
        raise ValueError(f"split must be 'train' or 'test', got {split!r}")

    out_dir = os.path.join(path, split)
    # Short-circuit if this split has already been converted.
    if os.path.exists(out_dir) and len(glob(os.path.join(out_dir, "*.h5"))) > 0:
        return out_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "emps.zip")
    util.download_source(zip_path, URL, download, checksum=CHECKSUM)

    extract_dir = os.path.join(path, "_extracted")
    util.unzip(zip_path, extract_dir, remove=True)

    # The zip extracts to a single root folder (e.g. "emps-main/").
    subdirs = [d for d in os.listdir(extract_dir) if os.path.isdir(os.path.join(extract_dir, d))]
    data_root = os.path.join(extract_dir, subdirs[0]) if subdirs else extract_dir

    # Convert both splits in one go, so the extracted data is only needed once.
    for s in ("train", "test"):
        _create_h5_files(data_root, s, os.path.join(path, s))

    rmtree(extract_dir)

    return out_dir


def get_emps_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    download: bool = False,
) -> List[str]:
    """Get paths to the EMPS HDF5 files.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the HDF5 files.

    Raises:
        RuntimeError: If no HDF5 files are found for the split.
    """
    data_dir = get_emps_data(path, split, download)
    paths = sorted(glob(os.path.join(data_dir, "*.h5")))
    # Explicit raise instead of 'assert' so the check survives 'python -O'.
    if not paths:
        raise RuntimeError(f"No HDF5 files found in '{data_dir}'")
    return paths


def get_emps_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the EMPS dataset for nanoparticle instance segmentation in electron microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    paths = get_emps_paths(path, split, download)

    # Treat the HDF5 files as a segmentation dataset and add a binary target
    # channel on top of the instance labels.
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)

    return torch_em.default_segmentation_dataset(
        raw_paths=paths,
        raw_key="raw",
        label_paths=paths,
        label_key="labels",
        patch_shape=patch_shape,
        **kwargs,
    )


def get_emps_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for nanoparticle instance segmentation in the EMPS dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Route kwargs to either the dataset constructor or the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_emps_dataset(path, patch_shape, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_emps_data(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    download: bool = False,
) -> str:
    """Download and preprocess the EMPS dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The path to the directory containing the HDF5 files for the given split.

    Raises:
        ValueError: If `split` is not 'train' or 'test'.
    """
    # Explicit raise instead of 'assert' so validation survives 'python -O'.
    if split not in ("train", "test"):
        raise ValueError(f"split must be 'train' or 'test', got {split!r}")

    out_dir = os.path.join(path, split)
    # Short-circuit if this split has already been converted.
    if os.path.exists(out_dir) and len(glob(os.path.join(out_dir, "*.h5"))) > 0:
        return out_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "emps.zip")
    util.download_source(zip_path, URL, download, checksum=CHECKSUM)

    extract_dir = os.path.join(path, "_extracted")
    util.unzip(zip_path, extract_dir, remove=True)

    # The zip extracts to a single root folder (e.g. "emps-main/").
    subdirs = [d for d in os.listdir(extract_dir) if os.path.isdir(os.path.join(extract_dir, d))]
    data_root = os.path.join(extract_dir, subdirs[0]) if subdirs else extract_dir

    # Convert both splits in one go, so the extracted data is only needed once.
    for s in ("train", "test"):
        _create_h5_files(data_root, s, os.path.join(path, s))

    rmtree(extract_dir)

    return out_dir
Download and preprocess the EMPS dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split, either 'train' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
The path to the directory containing the HDF5 files for the given split.
def get_emps_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    download: bool = False,
) -> List[str]:
    """Get paths to the EMPS HDF5 files.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the HDF5 files.
    """
    # Ensure the data is present, then collect the per-image HDF5 files.
    data_dir = get_emps_data(path, split, download)
    h5_pattern = os.path.join(data_dir, "*.h5")
    file_paths = sorted(glob(h5_pattern))
    assert len(file_paths) > 0, f"No HDF5 files found in '{data_dir}'"
    return file_paths
Get paths to the EMPS HDF5 files.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split, either 'train' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the HDF5 files.
def get_emps_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the EMPS dataset for nanoparticle instance segmentation in electron microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    file_paths = get_emps_paths(path, split, download)

    # Mark the inputs as a segmentation dataset and add a binary target
    # on top of the instance labels.
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)

    dataset_kwargs = dict(
        raw_paths=file_paths,
        raw_key="raw",
        label_paths=file_paths,
        label_key="labels",
        patch_shape=patch_shape,
    )
    return torch_em.default_segmentation_dataset(**dataset_kwargs, **kwargs)
Get the EMPS dataset for nanoparticle instance segmentation in electron microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split, either 'train' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_emps_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    split: Literal["train", "test"],
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for nanoparticle instance segmentation in the EMPS dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The data split, either 'train' or 'test'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split kwargs between the dataset constructor and the DataLoader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    emps_ds = get_emps_dataset(path, patch_shape, split, download, **dataset_kwargs)
    return torch_em.get_data_loader(emps_ds, batch_size, **dataloader_kwargs)
Get the DataLoader for nanoparticle instance segmentation in the EMPS dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- split: The data split, either 'train' or 'test'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
  torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.