torch_em.data.datasets.light_microscopy.synthmt

The SynthMT dataset contains synthetic interference reflection microscopy (IRM) images of microtubules with instance segmentation annotations.

The dataset provides 6,600 synthetically generated 512x512 RGB images with per-instance binary masks for microtubule segmentation. It was designed to train foundation models (e.g. SAM) for automated in vitro microtubule analysis.

The dataset is located at https://huggingface.co/datasets/HTW-KI-Werkstatt/SynthMT. This dataset is from the publication https://doi.org/10.64898/2026.01.09.698597. Please cite it if you use this dataset in your research.

  1"""The SynthMT dataset contains synthetic interference reflection microscopy (IRM) images
  2of microtubules with instance segmentation annotations.
  3
  4The dataset provides 6,600 synthetically generated 512x512 RGB images with per-instance
  5binary masks for microtubule segmentation. It was designed to train foundation models
  6(e.g. SAM) for automated in vitro microtubule analysis.
  7
  8The dataset is located at https://huggingface.co/datasets/HTW-KI-Werkstatt/SynthMT.
  9This dataset is from the publication https://doi.org/10.64898/2026.01.09.698597.
 10Please cite it if you use this dataset in your research.
 11"""
 12
 13import os
 14from glob import glob
 15from typing import Union, Tuple, List
 16
 17import numpy as np
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21import torch_em
 22
 23from .. import util
 24
 25
# URL template for the dataset's parquet shards hosted on Hugging Face;
# FILENAME is filled in with the shard name per download.
URL = "https://huggingface.co/datasets/HTW-KI-Werkstatt/SynthMT/resolve/main/data/{FILENAME}"
# The dataset is published as 8 parquet shards named 'train-0000i-of-00008.parquet'.
NUM_PARQUET_FILES = 8
 28
 29
 30def _download_parquets(path, download):
 31    """Download all parquet files for the dataset."""
 32    parquet_dir = os.path.join(path, "parquets")
 33    os.makedirs(parquet_dir, exist_ok=True)
 34
 35    for i in range(NUM_PARQUET_FILES):
 36        fname = f"train-{i:05d}-of-{NUM_PARQUET_FILES:05d}.parquet"
 37        fpath = os.path.join(parquet_dir, fname)
 38        if not os.path.exists(fpath):
 39            url = URL.format(FILENAME=fname)
 40            util.download_source(path=fpath, url=url, download=download, checksum=None)
 41
 42    return parquet_dir
 43
 44
 45def _create_images_from_parquets(path):
 46    """Extract images and instance labels from parquet files and save as TIF."""
 47    import imageio.v3 as imageio
 48    import pandas as pd
 49    from io import BytesIO
 50    from PIL import Image
 51    from tqdm import tqdm
 52
 53    image_dir = os.path.join(path, "images")
 54    label_dir = os.path.join(path, "labels")
 55    os.makedirs(image_dir, exist_ok=True)
 56    os.makedirs(label_dir, exist_ok=True)
 57
 58    parquet_dir = os.path.join(path, "parquets")
 59    parquet_files = sorted(glob(os.path.join(parquet_dir, "*.parquet")))
 60
 61    for pfile in tqdm(parquet_files, desc="Processing parquet files"):
 62        df = pd.read_parquet(pfile)
 63        for _, row in df.iterrows():
 64            sample_id = row["id"]
 65            img_path = os.path.join(image_dir, f"{sample_id}.tif")
 66            lbl_path = os.path.join(label_dir, f"{sample_id}.tif")
 67
 68            if os.path.exists(img_path) and os.path.exists(lbl_path):
 69                continue
 70
 71            # Decode the image.
 72            img = Image.open(BytesIO(row["image"]["bytes"])).convert("RGB")
 73            img_arr = np.array(img)
 74
 75            # Decode instance masks and merge into a single label map.
 76            masks = row["mask"]
 77            instances = np.zeros(img_arr.shape[:2], dtype="uint32")
 78            for i, mask_entry in enumerate(masks, start=1):
 79                mask = np.array(Image.open(BytesIO(mask_entry["bytes"])).convert("L"))
 80                instances[mask > 0] = i
 81
 82            imageio.imwrite(img_path, img_arr, compression="zlib")
 83            imageio.imwrite(lbl_path, instances, compression="zlib")
 84
 85
 86def get_synthmt_data(
 87    path: Union[os.PathLike, str],
 88    download: bool = False,
 89) -> str:
 90    """Download the SynthMT dataset.
 91
 92    Args:
 93        path: Filepath to a folder where the downloaded data will be saved.
 94        download: Whether to download the data if it is not present.
 95
 96    Returns:
 97        The filepath to the directory with the data.
 98    """
 99    _download_parquets(path, download)
100
101    image_dir = os.path.join(path, "images")
102    label_dir = os.path.join(path, "labels")
103    if not os.path.exists(image_dir) or not os.path.exists(label_dir):
104        _create_images_from_parquets(path)
105
106    return path
107
108
def get_synthmt_paths(
    path: Union[os.PathLike, str],
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the SynthMT data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    from natsort import natsorted

    get_synthmt_data(path, download)

    raw_paths = natsorted(glob(os.path.join(path, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(path, "labels", "*.tif")))

    # Sanity check: the data must not be empty and images must pair with labels.
    assert len(raw_paths) > 0 and len(raw_paths) == len(label_paths)

    return raw_paths, label_paths
133
134
def get_synthmt_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the SynthMT dataset for microtubule instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_synthmt_paths(path, download)

    # Derive a binary target alongside the instance labels for training.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    # The data is 2d; default ndim to 2 unless the caller overrides it.
    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs,
    )
168
169
def get_synthmt_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the SynthMT dataloader for microtubule instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the kwargs into those consumed by the dataset and those by the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_synthmt_dataset(path, patch_shape, download, **dataset_kwargs)
    return torch_em.get_data_loader(ds, batch_size, **loader_kwargs)
URL = 'https://huggingface.co/datasets/HTW-KI-Werkstatt/SynthMT/resolve/main/data/{FILENAME}'
NUM_PARQUET_FILES = 8
def get_synthmt_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 87def get_synthmt_data(
 88    path: Union[os.PathLike, str],
 89    download: bool = False,
 90) -> str:
 91    """Download the SynthMT dataset.
 92
 93    Args:
 94        path: Filepath to a folder where the downloaded data will be saved.
 95        download: Whether to download the data if it is not present.
 96
 97    Returns:
 98        The filepath to the directory with the data.
 99    """
100    _download_parquets(path, download)
101
102    image_dir = os.path.join(path, "images")
103    label_dir = os.path.join(path, "labels")
104    if not os.path.exists(image_dir) or not os.path.exists(label_dir):
105        _create_images_from_parquets(path)
106
107    return path

Download the SynthMT dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:
  The filepath to the directory with the data.

def get_synthmt_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
110def get_synthmt_paths(
111    path: Union[os.PathLike, str],
112    download: bool = False,
113) -> Tuple[List[str], List[str]]:
114    """Get paths to the SynthMT data.
115
116    Args:
117        path: Filepath to a folder where the downloaded data will be saved.
118        download: Whether to download the data if it is not present.
119
120    Returns:
121        List of filepaths for the image data.
122        List of filepaths for the label data.
123    """
124    from natsort import natsorted
125
126    get_synthmt_data(path, download)
127
128    image_paths = natsorted(glob(os.path.join(path, "images", "*.tif")))
129    label_paths = natsorted(glob(os.path.join(path, "labels", "*.tif")))
130
131    assert len(image_paths) == len(label_paths) and len(image_paths) > 0
132
133    return image_paths, label_paths

Get paths to the SynthMT data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:
  List of filepaths for the image data.
  List of filepaths for the label data.

def get_synthmt_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
136def get_synthmt_dataset(
137    path: Union[os.PathLike, str],
138    patch_shape: Tuple[int, int],
139    download: bool = False,
140    **kwargs,
141) -> Dataset:
142    """Get the SynthMT dataset for microtubule instance segmentation.
143
144    Args:
145        path: Filepath to a folder where the downloaded data will be saved.
146        patch_shape: The patch shape to use for training.
147        download: Whether to download the data if it is not present.
148        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
149
150    Returns:
151        The segmentation dataset.
152    """
153    image_paths, label_paths = get_synthmt_paths(path, download)
154
155    kwargs, _ = util.add_instance_label_transform(
156        kwargs, add_binary_target=True,
157    )
158    kwargs = util.update_kwargs(kwargs, "ndim", 2)
159
160    return torch_em.default_segmentation_dataset(
161        raw_paths=image_paths,
162        raw_key=None,
163        label_paths=label_paths,
164        label_key=None,
165        patch_shape=patch_shape,
166        is_seg_dataset=False,
167        **kwargs,
168    )

Get the SynthMT dataset for microtubule instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_synthmt_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
171def get_synthmt_loader(
172    path: Union[os.PathLike, str],
173    batch_size: int,
174    patch_shape: Tuple[int, int],
175    download: bool = False,
176    **kwargs,
177) -> DataLoader:
178    """Get the SynthMT dataloader for microtubule instance segmentation.
179
180    Args:
181        path: Filepath to a folder where the downloaded data will be saved.
182        batch_size: The batch size for training.
183        patch_shape: The patch shape to use for training.
184        download: Whether to download the data if it is not present.
185        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
186
187    Returns:
188        The DataLoader.
189    """
190    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
191    dataset = get_synthmt_dataset(path, patch_shape, download, **ds_kwargs)
192    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the SynthMT dataloader for microtubule instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.