torch_em.data.datasets.light_microscopy.mcellseg

The mCellSeg dataset contains expert-annotated microscopy images for cell instance segmentation.

It contains 200 annotated 2D images from two human cell lines (HEK-293T and HUVEC), acquired with differential interference contrast (DIC) and fluorescence microscopy. Each image has a paired instance segmentation mask (0 = background, unique integer per cell). A further 100 unannotated images are included for semi-supervised learning (not used here).

This dataset is from the publication https://doi.org/10.1016/j.cmpb.2026.108919. Please cite it if you use this dataset for a publication.

The data is available at https://doi.org/10.5281/zenodo.20174259.

View Source

  1"""The mCellSeg dataset contains expert-annotated microscopy images for cell instance segmentation.
  2
  3It contains 200 annotated 2D images from two human cell lines (HEK-293T and HUVEC),
  4acquired with differential interference contrast (DIC) and fluorescence microscopy.
  5Each image has a paired instance segmentation mask (0 = background, unique integer per cell).
  6A further 100 unannotated images are included for semi-supervised learning (not used here).
  7
  8This dataset is from the publication:
  9https://doi.org/10.1016/j.cmpb.2026.108919
 10Please cite it if you use this dataset for a publication.
 11
 12The data is available at https://doi.org/10.5281/zenodo.20174259.
 13"""
 14
 15import os
 16from glob import glob
 17from natsort import natsorted
 18from typing import List, Optional, Tuple, Union
 19
 20from torch.utils.data import DataLoader, Dataset
 21
 22import torch_em
 23from .. import util
 24
 25
# Download link of the mCellSeg zip archive (Zenodo record 20174259).
URL = "https://zenodo.org/records/20174259/files/mCellSeg.zip?download=1"
# Expected checksum of the archive; handed to `util.download_source` together with URL.
# NOTE(review): 64 hex digits, so presumably SHA-256 -- confirm against `util.download_source`.
CHECKSUM = "55fec21acab10a78837718431f21f74e87e0777ebd5907ea9ef8a57a8a197217"
 28
 29
 30def get_mcellseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 31    """Download the mCellSeg dataset.
 32
 33    Args:
 34        path: Filepath to a folder where the downloaded data will be saved.
 35        download: Whether to download the data if it is not present.
 36
 37    Returns:
 38        Path to the folder containing the downloaded data.
 39    """
 40    data_dir = os.path.join(str(path), "mCellSeg")
 41    if os.path.exists(data_dir):
 42        return data_dir
 43
 44    os.makedirs(str(path), exist_ok=True)
 45    zip_path = os.path.join(str(path), "mCellSeg.zip")
 46    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
 47    util.unzip(zip_path, str(path), remove=True)
 48
 49    return data_dir
 50
 51
 52def get_mcellseg_paths(
 53    path: Union[os.PathLike, str],
 54    val_fraction: Optional[float] = None,
 55    split: Optional[str] = None,
 56    download: bool = False,
 57) -> Tuple[List[str], List[str]]:
 58    """Get paths to the mCellSeg image and mask files.
 59
 60    Only the 200 images that have corresponding instance masks are returned.
 61
 62    Args:
 63        path: Filepath to a folder where the downloaded data will be saved.
 64        val_fraction: The fraction of data to use for validation. If None, all data is returned.
 65        split: The split to use, either "train" or "val". Required if val_fraction is set.
 66        download: Whether to download the data if it is not present.
 67
 68    Returns:
 69        Tuple of (raw image paths, label mask paths).
 70    """
 71    data_dir = get_mcellseg_data(path, download)
 72
 73    mask_paths = natsorted(glob(os.path.join(data_dir, "labeled", "masks", "*.tif")))
 74    raw_paths = []
 75    valid_mask_paths = []
 76    for mask_path in mask_paths:
 77        mask_name = os.path.basename(mask_path)
 78        img_name = mask_name.replace("_mask.tif", ".tif")
 79        img_path = os.path.join(data_dir, "labeled", "images", img_name)
 80        if os.path.exists(img_path):
 81            raw_paths.append(img_path)
 82            valid_mask_paths.append(mask_path)
 83
 84    if val_fraction is not None:
 85        assert split in ("train", "val"), f"'split' must be 'train' or 'val', got '{split}'."
 86        n_val = max(1, int(len(raw_paths) * val_fraction))
 87        if split == "train":
 88            raw_paths = raw_paths[n_val:]
 89            valid_mask_paths = valid_mask_paths[n_val:]
 90        else:
 91            raw_paths = raw_paths[:n_val]
 92            valid_mask_paths = valid_mask_paths[:n_val]
 93
 94    return raw_paths, valid_mask_paths
 95
 96
 97def get_mcellseg_dataset(
 98    path: Union[os.PathLike, str],
 99    patch_shape: Tuple[int, int],
100    val_fraction: Optional[float] = None,
101    split: Optional[str] = None,
102    download: bool = False,
103    offsets: Optional[List[List[int]]] = None,
104    boundaries: bool = False,
105    binary: bool = False,
106    **kwargs,
107) -> Dataset:
108    """Get the mCellSeg dataset for cell instance segmentation.
109
110    Args:
111        path: Filepath to a folder where the downloaded data will be saved.
112        patch_shape: The patch shape (H, W) to use for training.
113        val_fraction: The fraction of data to use for validation.
114        split: The split to use, either "train" or "val". Required if val_fraction is set.
115        download: Whether to download the data if it is not present.
116        offsets: Offset values for affinity computation used as target.
117        boundaries: Whether to compute boundaries as the target.
118        binary: Whether to return a binary segmentation target.
119        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
120
121    Returns:
122        The segmentation dataset.
123    """
124    assert sum((offsets is not None, boundaries, binary)) <= 1, f"{offsets}, {boundaries}, {binary}"
125
126    raw_paths, label_paths = get_mcellseg_paths(path, val_fraction, split, download)
127
128    if offsets is not None:
129        label_transform = torch_em.transform.label.AffinityTransform(
130            offsets=offsets, ignore_label=None, add_binary_target=True, add_mask=True
131        )
132        msg = "Offsets are passed, but 'label_transform2' is in the kwargs. It will be over-ridden."
133        kwargs = util.update_kwargs(kwargs, "label_transform2", label_transform, msg=msg)
134    elif boundaries:
135        label_transform = torch_em.transform.label.BoundaryTransform(add_binary_target=True)
136        msg = "Boundaries is set to True, but 'label_transform' is in the kwargs. It will be over-ridden."
137        kwargs = util.update_kwargs(kwargs, "label_transform", label_transform, msg=msg)
138    elif binary:
139        label_transform = torch_em.transform.label.labels_to_binary
140        msg = "Binary is set to True, but 'label_transform' is in the kwargs. It will be over-ridden."
141        kwargs = util.update_kwargs(kwargs, "label_transform", label_transform, msg=msg)
142
143    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", False)
144
145    return torch_em.default_segmentation_dataset(
146        raw_paths=raw_paths,
147        raw_key=None,
148        label_paths=label_paths,
149        label_key=None,
150        patch_shape=patch_shape,
151        **kwargs,
152    )
153
154
def get_mcellseg_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Build a DataLoader on top of the mCellSeg segmentation dataset.

    Args:
        path: Folder in which the data is stored (and downloaded to, if requested).
        patch_shape: The patch shape (H, W) to use for training.
        batch_size: The batch size for training.
        val_fraction: Fraction of the data used for validation.
        split: Either "train" or "val". Must be given whenever `val_fraction` is set.
        download: Whether the data may be downloaded if it is not present.
        offsets: Offsets for an affinity target.
        boundaries: Whether to use a boundary target.
        binary: Whether to use a binary target.
        kwargs: Further keyword arguments, either for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from everything else, which goes to the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_mcellseg_dataset(
        path,
        patch_shape,
        val_fraction=val_fraction,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **ds_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader

URL = 'https://zenodo.org/records/20174259/files/mCellSeg.zip?download=1'

CHECKSUM = '55fec21acab10a78837718431f21f74e87e0777ebd5907ea9ef8a57a8a197217'

def get_mcellseg_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

def get_mcellseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Fetch and unpack the mCellSeg archive unless it has been extracted already.

    Args:
        path: Folder in which the archive is stored and extracted.
        download: Whether the archive may be downloaded if it is not present.

    Returns:
        The "mCellSeg" folder located inside `path`.
    """
    root = str(path)
    data_dir = os.path.join(root, "mCellSeg")

    # Downloading and unzipping is only needed while the extracted folder is missing.
    if not os.path.exists(data_dir):
        os.makedirs(root, exist_ok=True)
        zip_path = os.path.join(root, "mCellSeg.zip")
        util.download_source(zip_path, URL, download, checksum=CHECKSUM)
        util.unzip(zip_path, root, remove=True)

    return data_dir

Download the mCellSeg dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

Path to the folder containing the downloaded data.

def get_mcellseg_paths( path: Union[os.PathLike, str], val_fraction: Optional[float] = None, split: Optional[str] = None, download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_mcellseg_paths(
    path: Union[os.PathLike, str],
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Collect the matching image and mask file paths of the labeled mCellSeg data.

    Masks are read from "labeled/masks/*.tif" in natural sort order. A mask is only kept
    if its image exists in "labeled/images", where the image name is the mask name with
    "_mask.tif" replaced by ".tif".

    Args:
        path: Folder in which the data is stored (and downloaded to, if requested).
        val_fraction: Fraction of the image / mask pairs assigned to the validation split.
            If None, all pairs are returned and `split` is not used.
        split: Either "train" or "val". Must be given whenever `val_fraction` is set.
        download: Whether the data may be downloaded if it is not present.

    Returns:
        Two lists of equal length: the raw image paths and the corresponding mask paths.
    """
    data_dir = get_mcellseg_data(path, download)
    image_dir = os.path.join(data_dir, "labeled", "images")

    # Pair every mask with the image path derived from its filename.
    candidates = [
        (os.path.join(image_dir, os.path.basename(mask).replace("_mask.tif", ".tif")), mask)
        for mask in natsorted(glob(os.path.join(data_dir, "labeled", "masks", "*.tif")))
    ]
    # Drop the masks whose image file does not exist.
    pairs = [(image, mask) for image, mask in candidates if os.path.exists(image)]
    raw_paths = [image for image, _ in pairs]
    valid_mask_paths = [mask for _, mask in pairs]

    if val_fraction is None:
        return raw_paths, valid_mask_paths

    assert split in ("train", "val"), f"'split' must be 'train' or 'val', got '{split}'."
    # The first `n_val` pairs (at least one) form the validation split, the rest is for training.
    n_val = max(1, int(len(raw_paths) * val_fraction))
    selection = slice(n_val, None) if split == "train" else slice(None, n_val)
    return raw_paths[selection], valid_mask_paths[selection]

Get paths to the mCellSeg image and mask files.

Only the 200 images that have corresponding instance masks are returned.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
val_fraction: The fraction of data to use for validation. If None, all data is returned.
split: The split to use, either "train" or "val". Required if val_fraction is set.
download: Whether to download the data if it is not present.

Returns:

Tuple of (raw image paths, label mask paths).

def get_mcellseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], val_fraction: Optional[float] = None, split: Optional[str] = None, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

 98def get_mcellseg_dataset(
 99    path: Union[os.PathLike, str],
100    patch_shape: Tuple[int, int],
101    val_fraction: Optional[float] = None,
102    split: Optional[str] = None,
103    download: bool = False,
104    offsets: Optional[List[List[int]]] = None,
105    boundaries: bool = False,
106    binary: bool = False,
107    **kwargs,
108) -> Dataset:
109    """Get the mCellSeg dataset for cell instance segmentation.
110
111    Args:
112        path: Filepath to a folder where the downloaded data will be saved.
113        patch_shape: The patch shape (H, W) to use for training.
114        val_fraction: The fraction of data to use for validation.
115        split: The split to use, either "train" or "val". Required if val_fraction is set.
116        download: Whether to download the data if it is not present.
117        offsets: Offset values for affinity computation used as target.
118        boundaries: Whether to compute boundaries as the target.
119        binary: Whether to return a binary segmentation target.
120        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
121
122    Returns:
123        The segmentation dataset.
124    """
125    assert sum((offsets is not None, boundaries, binary)) <= 1, f"{offsets}, {boundaries}, {binary}"
126
127    raw_paths, label_paths = get_mcellseg_paths(path, val_fraction, split, download)
128
129    if offsets is not None:
130        label_transform = torch_em.transform.label.AffinityTransform(
131            offsets=offsets, ignore_label=None, add_binary_target=True, add_mask=True
132        )
133        msg = "Offsets are passed, but 'label_transform2' is in the kwargs. It will be over-ridden."
134        kwargs = util.update_kwargs(kwargs, "label_transform2", label_transform, msg=msg)
135    elif boundaries:
136        label_transform = torch_em.transform.label.BoundaryTransform(add_binary_target=True)
137        msg = "Boundaries is set to True, but 'label_transform' is in the kwargs. It will be over-ridden."
138        kwargs = util.update_kwargs(kwargs, "label_transform", label_transform, msg=msg)
139    elif binary:
140        label_transform = torch_em.transform.label.labels_to_binary
141        msg = "Binary is set to True, but 'label_transform' is in the kwargs. It will be over-ridden."
142        kwargs = util.update_kwargs(kwargs, "label_transform", label_transform, msg=msg)
143
144    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", False)
145
146    return torch_em.default_segmentation_dataset(
147        raw_paths=raw_paths,
148        raw_key=None,
149        label_paths=label_paths,
150        label_key=None,
151        patch_shape=patch_shape,
152        **kwargs,
153    )

Get the mCellSeg dataset for cell instance segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape (H, W) to use for training.
val_fraction: The fraction of data to use for validation.
split: The split to use, either "train" or "val". Required if val_fraction is set.
download: Whether to download the data if it is not present.
offsets: Offset values for affinity computation used as target.
boundaries: Whether to compute boundaries as the target.
binary: Whether to return a binary segmentation target.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_mcellseg_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], batch_size: int, val_fraction: Optional[float] = None, split: Optional[str] = None, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_mcellseg_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Build a DataLoader on top of the mCellSeg segmentation dataset.

    Args:
        path: Folder in which the data is stored (and downloaded to, if requested).
        patch_shape: The patch shape (H, W) to use for training.
        batch_size: The batch size for training.
        val_fraction: Fraction of the data used for validation.
        split: Either "train" or "val". Must be given whenever `val_fraction` is set.
        download: Whether the data may be downloaded if it is not present.
        offsets: Offsets for an affinity target.
        boundaries: Whether to use a boundary target.
        binary: Whether to use a binary target.
        kwargs: Further keyword arguments, either for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from everything else, which goes to the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_mcellseg_dataset(
        path,
        patch_shape,
        val_fraction=val_fraction,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **ds_kwargs,
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader

Get the DataLoader for cell instance segmentation in mCellSeg.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape (H, W) to use for training.
batch_size: The batch size for training.
val_fraction: The fraction of data to use for validation.
split: The split to use, either "train" or "val". Required if val_fraction is set.
download: Whether to download the data if it is not present.
offsets: Offset values for affinity computation used as target.
boundaries: Whether to compute boundaries as the target.
binary: Whether to return a binary segmentation target.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.