torch_em.data.datasets.light_microscopy.synthmt
The SynthMT dataset contains synthetic interference reflection microscopy (IRM) images of microtubules with instance segmentation annotations.
The dataset provides 6,600 synthetically generated 512x512 RGB images with per-instance binary masks for microtubule segmentation. It was designed to train foundation models (e.g. SAM) for automated in vitro microtubule analysis.
The dataset is located at https://huggingface.co/datasets/HTW-KI-Werkstatt/SynthMT. This dataset is from the publication https://doi.org/10.64898/2026.01.09.698597. Please cite it if you use this dataset in your research.
1"""The SynthMT dataset contains synthetic interference reflection microscopy (IRM) images 2of microtubules with instance segmentation annotations. 3 4The dataset provides 6,600 synthetically generated 512x512 RGB images with per-instance 5binary masks for microtubule segmentation. It was designed to train foundation models 6(e.g. SAM) for automated in vitro microtubule analysis. 7 8The dataset is located at https://huggingface.co/datasets/HTW-KI-Werkstatt/SynthMT. 9This dataset is from the publication https://doi.org/10.64898/2026.01.09.698597. 10Please cite it if you use this dataset in your research. 11""" 12 13import os 14from glob import glob 15from typing import Union, Tuple, List 16 17import numpy as np 18 19from torch.utils.data import Dataset, DataLoader 20 21import torch_em 22 23from .. import util 24 25 26URL = "https://huggingface.co/datasets/HTW-KI-Werkstatt/SynthMT/resolve/main/data/{FILENAME}" 27NUM_PARQUET_FILES = 8 28 29 30def _download_parquets(path, download): 31 """Download all parquet files for the dataset.""" 32 parquet_dir = os.path.join(path, "parquets") 33 os.makedirs(parquet_dir, exist_ok=True) 34 35 for i in range(NUM_PARQUET_FILES): 36 fname = f"train-{i:05d}-of-{NUM_PARQUET_FILES:05d}.parquet" 37 fpath = os.path.join(parquet_dir, fname) 38 if not os.path.exists(fpath): 39 url = URL.format(FILENAME=fname) 40 util.download_source(path=fpath, url=url, download=download, checksum=None) 41 42 return parquet_dir 43 44 45def _create_images_from_parquets(path): 46 """Extract images and instance labels from parquet files and save as TIF.""" 47 import imageio.v3 as imageio 48 import pandas as pd 49 from io import BytesIO 50 from PIL import Image 51 from tqdm import tqdm 52 53 image_dir = os.path.join(path, "images") 54 label_dir = os.path.join(path, "labels") 55 os.makedirs(image_dir, exist_ok=True) 56 os.makedirs(label_dir, exist_ok=True) 57 58 parquet_dir = os.path.join(path, "parquets") 59 parquet_files = sorted(glob(os.path.join(parquet_dir, 
"*.parquet"))) 60 61 for pfile in tqdm(parquet_files, desc="Processing parquet files"): 62 df = pd.read_parquet(pfile) 63 for _, row in df.iterrows(): 64 sample_id = row["id"] 65 img_path = os.path.join(image_dir, f"{sample_id}.tif") 66 lbl_path = os.path.join(label_dir, f"{sample_id}.tif") 67 68 if os.path.exists(img_path) and os.path.exists(lbl_path): 69 continue 70 71 # Decode the image. 72 img = Image.open(BytesIO(row["image"]["bytes"])).convert("RGB") 73 img_arr = np.array(img) 74 75 # Decode instance masks and merge into a single label map. 76 masks = row["mask"] 77 instances = np.zeros(img_arr.shape[:2], dtype="uint32") 78 for i, mask_entry in enumerate(masks, start=1): 79 mask = np.array(Image.open(BytesIO(mask_entry["bytes"])).convert("L")) 80 instances[mask > 0] = i 81 82 imageio.imwrite(img_path, img_arr, compression="zlib") 83 imageio.imwrite(lbl_path, instances, compression="zlib") 84 85 86def get_synthmt_data( 87 path: Union[os.PathLike, str], 88 download: bool = False, 89) -> str: 90 """Download the SynthMT dataset. 91 92 Args: 93 path: Filepath to a folder where the downloaded data will be saved. 94 download: Whether to download the data if it is not present. 95 96 Returns: 97 The filepath to the directory with the data. 98 """ 99 _download_parquets(path, download) 100 101 image_dir = os.path.join(path, "images") 102 label_dir = os.path.join(path, "labels") 103 if not os.path.exists(image_dir) or not os.path.exists(label_dir): 104 _create_images_from_parquets(path) 105 106 return path 107 108 109def get_synthmt_paths( 110 path: Union[os.PathLike, str], 111 download: bool = False, 112) -> Tuple[List[str], List[str]]: 113 """Get paths to the SynthMT data. 114 115 Args: 116 path: Filepath to a folder where the downloaded data will be saved. 117 download: Whether to download the data if it is not present. 118 119 Returns: 120 List of filepaths for the image data. 121 List of filepaths for the label data. 
122 """ 123 from natsort import natsorted 124 125 get_synthmt_data(path, download) 126 127 image_paths = natsorted(glob(os.path.join(path, "images", "*.tif"))) 128 label_paths = natsorted(glob(os.path.join(path, "labels", "*.tif"))) 129 130 assert len(image_paths) == len(label_paths) and len(image_paths) > 0 131 132 return image_paths, label_paths 133 134 135def get_synthmt_dataset( 136 path: Union[os.PathLike, str], 137 patch_shape: Tuple[int, int], 138 download: bool = False, 139 **kwargs, 140) -> Dataset: 141 """Get the SynthMT dataset for microtubule instance segmentation. 142 143 Args: 144 path: Filepath to a folder where the downloaded data will be saved. 145 patch_shape: The patch shape to use for training. 146 download: Whether to download the data if it is not present. 147 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 148 149 Returns: 150 The segmentation dataset. 151 """ 152 image_paths, label_paths = get_synthmt_paths(path, download) 153 154 kwargs, _ = util.add_instance_label_transform( 155 kwargs, add_binary_target=True, 156 ) 157 kwargs = util.update_kwargs(kwargs, "ndim", 2) 158 159 return torch_em.default_segmentation_dataset( 160 raw_paths=image_paths, 161 raw_key=None, 162 label_paths=label_paths, 163 label_key=None, 164 patch_shape=patch_shape, 165 is_seg_dataset=False, 166 **kwargs, 167 ) 168 169 170def get_synthmt_loader( 171 path: Union[os.PathLike, str], 172 batch_size: int, 173 patch_shape: Tuple[int, int], 174 download: bool = False, 175 **kwargs, 176) -> DataLoader: 177 """Get the SynthMT dataloader for microtubule instance segmentation. 178 179 Args: 180 path: Filepath to a folder where the downloaded data will be saved. 181 batch_size: The batch size for training. 182 patch_shape: The patch shape to use for training. 183 download: Whether to download the data if it is not present. 184 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 
185 186 Returns: 187 The DataLoader. 188 """ 189 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 190 dataset = get_synthmt_dataset(path, patch_shape, download, **ds_kwargs) 191 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_synthmt_data(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> str:
    """Download the SynthMT dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    _download_parquets(path, download)

    # Run the (expensive) parquet extraction only when one of the output
    # folders does not exist yet.
    have_images = os.path.exists(os.path.join(path, "images"))
    have_labels = os.path.exists(os.path.join(path, "labels"))
    if not (have_images and have_labels):
        _create_images_from_parquets(path)

    return path
Download the SynthMT dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the directory with the data.
def get_synthmt_paths(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the SynthMT data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    from natsort import natsorted

    data_dir = get_synthmt_data(path, download)

    image_paths = natsorted(glob(os.path.join(data_dir, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(data_dir, "labels", "*.tif")))

    # Images and labels must pair up one-to-one and the dataset must be non-empty.
    assert image_paths and len(image_paths) == len(label_paths)

    return image_paths, label_paths
Get paths to the SynthMT data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def get_synthmt_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the SynthMT dataset for microtubule instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, label_paths = get_synthmt_paths(path, download)

    # Add a binary target derived from the instance labels and fix 2d sampling.
    kwargs = util.update_kwargs(kwargs, "ndim", 2)
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs,
    )
Get the SynthMT dataset for microtubule instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_synthmt_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the SynthMT dataloader for microtubule instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from the DataLoader arguments.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_synthmt_dataset(path, patch_shape, download, **ds_kwargs)
    return torch_em.get_data_loader(ds, batch_size, **loader_kwargs)
Get the SynthMT dataloader for microtubule instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.