torch_em.data.datasets.light_microscopy.mcellseg
The mCellSeg dataset contains expert-annotated microscopy images for cell instance segmentation.
It contains 200 annotated 2D images from two human cell lines (HEK-293T and HUVEC), acquired with differential interference contrast (DIC) and fluorescence microscopy. Each image has a paired instance segmentation mask (0 = background, unique integer per cell). A further 100 unannotated images are included for semi-supervised learning (not used here).
This dataset is from the publication https://doi.org/10.1016/j.cmpb.2026.108919. Please cite it if you use this dataset for a publication.
The data is available at https://doi.org/10.5281/zenodo.20174259.
"""The mCellSeg dataset contains expert-annotated microscopy images for cell instance segmentation.

It contains 200 annotated 2D images from two human cell lines (HEK-293T and HUVEC),
acquired with differential interference contrast (DIC) and fluorescence microscopy.
Each image has a paired instance segmentation mask (0 = background, unique integer per cell).
A further 100 unannotated images are included for semi-supervised learning (not used here).

This dataset is from the publication:
https://doi.org/10.1016/j.cmpb.2026.108919
Please cite it if you use this dataset for a publication.

The data is available at https://doi.org/10.5281/zenodo.20174259.
"""

import os
from glob import glob
from natsort import natsorted
from typing import List, Optional, Tuple, Union

from torch.utils.data import DataLoader, Dataset

import torch_em
from .. import util


# Location of the zipped dataset on Zenodo and the checksum passed to `util.download_source`.
URL = "https://zenodo.org/records/20174259/files/mCellSeg.zip?download=1"
CHECKSUM = "55fec21acab10a78837718431f21f74e87e0777ebd5907ea9ef8a57a8a197217"


def get_mcellseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the mCellSeg dataset.

    If the folder "mCellSeg" already exists inside `path`, it is returned as is and
    nothing is downloaded.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        Path to the folder containing the downloaded data.
    """
    data_dir = os.path.join(str(path), "mCellSeg")
    # The mere existence of the folder counts as "already downloaded"; its contents are not checked.
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(str(path), exist_ok=True)
    zip_path = os.path.join(str(path), "mCellSeg.zip")
    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
    # NOTE(review): remove=True presumably deletes the archive after extraction - confirm in util.unzip.
    util.unzip(zip_path, str(path), remove=True)

    return data_dir


def get_mcellseg_paths(
    path: Union[os.PathLike, str],
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the mCellSeg image and mask files.

    Only the 200 images that have corresponding instance masks are returned.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        val_fraction: The fraction of data to use for validation. If None, all data is returned.
        split: The split to use, either "train" or "val". Required if val_fraction is set.
            It is ignored if val_fraction is None.
        download: Whether to download the data if it is not present.

    Returns:
        Tuple of (raw image paths, label mask paths).

    Raises:
        AssertionError: If val_fraction is set and split is neither "train" nor "val".
    """
    data_dir = get_mcellseg_data(path, download)

    # The masks drive the pairing: the image name is derived from each mask name by replacing
    # "_mask.tif" with ".tif", and masks without an existing image file are skipped.
    mask_paths = natsorted(glob(os.path.join(data_dir, "labeled", "masks", "*.tif")))
    raw_paths = []
    valid_mask_paths = []
    for mask_path in mask_paths:
        mask_name = os.path.basename(mask_path)
        img_name = mask_name.replace("_mask.tif", ".tif")
        img_path = os.path.join(data_dir, "labeled", "images", img_name)
        if os.path.exists(img_path):
            raw_paths.append(img_path)
            valid_mask_paths.append(mask_path)

    if val_fraction is not None:
        assert split in ("train", "val"), f"'split' must be 'train' or 'val', got '{split}'."
        # The first n_val pairs (in natural sort order) form the validation split and the
        # remaining pairs the training split. max(1, ...) makes n_val at least 1.
        n_val = max(1, int(len(raw_paths) * val_fraction))
        if split == "train":
            raw_paths = raw_paths[n_val:]
            valid_mask_paths = valid_mask_paths[n_val:]
        else:
            raw_paths = raw_paths[:n_val]
            valid_mask_paths = valid_mask_paths[:n_val]

    return raw_paths, valid_mask_paths


def get_mcellseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the mCellSeg dataset for cell instance segmentation.

    At most one of `offsets`, `boundaries` and `binary` may be used. If none of them is
    given, no label transform is added here.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape (H, W) to use for training.
        val_fraction: The fraction of data to use for validation.
        split: The split to use, either "train" or "val". Required if val_fraction is set.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        AssertionError: If more than one of `offsets`, `boundaries` and `binary` is requested.
    """
    # The three target options are mutually exclusive.
    assert sum((offsets is not None, boundaries, binary)) <= 1, f"{offsets}, {boundaries}, {binary}"

    raw_paths, label_paths = get_mcellseg_paths(path, val_fraction, split, download)

    # Note the different keys: affinities are registered as 'label_transform2', whereas the
    # boundary and binary targets are registered as 'label_transform'.
    if offsets is not None:
        label_transform = torch_em.transform.label.AffinityTransform(
            offsets=offsets, ignore_label=None, add_binary_target=True, add_mask=True
        )
        msg = "Offsets are passed, but 'label_transform2' is in the kwargs. It will be over-ridden."
        kwargs = util.update_kwargs(kwargs, "label_transform2", label_transform, msg=msg)
    elif boundaries:
        label_transform = torch_em.transform.label.BoundaryTransform(add_binary_target=True)
        msg = "Boundaries is set to True, but 'label_transform' is in the kwargs. It will be over-ridden."
        kwargs = util.update_kwargs(kwargs, "label_transform", label_transform, msg=msg)
    elif binary:
        label_transform = torch_em.transform.label.labels_to_binary
        msg = "Binary is set to True, but 'label_transform' is in the kwargs. It will be over-ridden."
        kwargs = util.update_kwargs(kwargs, "label_transform", label_transform, msg=msg)

    # NOTE(review): presumably selects the per-image dataset type since every sample is its own
    # 2D tif file - confirm against torch_em.default_segmentation_dataset.
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", False)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )


def get_mcellseg_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the DataLoader for cell instance segmentation in mCellSeg.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape (H, W) to use for training.
        batch_size: The batch size for training.
        val_fraction: The fraction of data to use for validation.
        split: The split to use, either "train" or "val". Required if val_fraction is set.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the extra keyword arguments into those for the dataset and those for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_mcellseg_dataset(
        path, patch_shape, val_fraction=val_fraction, split=split, download=download,
        offsets=offsets, boundaries=boundaries, binary=binary, **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_mcellseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the mCellSeg data is available locally and return its location.

    If a folder named "mCellSeg" already exists inside `path`, it is returned directly
    and nothing is downloaded. Otherwise the zip archive is fetched from `URL` and
    unpacked into `path`.

    Args:
        path: Folder under which the data is (or will be) stored.
        download: Whether the data may be downloaded if it is not present.

    Returns:
        The path of the "mCellSeg" folder inside `path`.
    """
    root = str(path)
    data_dir = os.path.join(root, "mCellSeg")

    # Fetch and unpack the archive only when the dataset folder is missing.
    if not os.path.exists(data_dir):
        os.makedirs(root, exist_ok=True)
        archive = os.path.join(root, "mCellSeg.zip")
        util.download_source(archive, URL, download, checksum=CHECKSUM)
        util.unzip(archive, root, remove=True)

    return data_dir
Download the mCellSeg dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
Path to the folder containing the downloaded data.
def get_mcellseg_paths(
    path: Union[os.PathLike, str],
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Collect the matching image and mask file paths of mCellSeg.

    Every mask in "labeled/masks" is matched to the image in "labeled/images" whose
    name is obtained by replacing "_mask.tif" with ".tif". Masks without an existing
    image are left out.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        val_fraction: The fraction of data to use for validation. If None, all data is returned.
        split: The split to use, either "train" or "val". Required if val_fraction is set.
        download: Whether to download the data if it is not present.

    Returns:
        Tuple of (raw image paths, label mask paths).

    Raises:
        AssertionError: If val_fraction is set and split is neither "train" nor "val".
    """
    labeled_dir = os.path.join(get_mcellseg_data(path, download), "labeled")
    image_dir = os.path.join(labeled_dir, "images")

    # Walk over the masks in natural sort order and keep those with an image on disk.
    pairs = []
    for mask_path in natsorted(glob(os.path.join(labeled_dir, "masks", "*.tif"))):
        image_name = os.path.basename(mask_path).replace("_mask.tif", ".tif")
        image_path = os.path.join(image_dir, image_name)
        if os.path.exists(image_path):
            pairs.append((image_path, mask_path))

    raw_paths = [image_path for image_path, _ in pairs]
    valid_mask_paths = [mask_path for _, mask_path in pairs]

    # Without a validation fraction the complete list is handed back.
    if val_fraction is None:
        return raw_paths, valid_mask_paths

    assert split in ("train", "val"), f"'split' must be 'train' or 'val', got '{split}'."

    # The leading n_val pairs are the validation data, everything after is training data.
    n_val = max(1, int(len(raw_paths) * val_fraction))
    selection = slice(n_val, None) if split == "train" else slice(None, n_val)
    return raw_paths[selection], valid_mask_paths[selection]
Get paths to the mCellSeg image and mask files.
Only the 200 images that have corresponding instance masks are returned.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- val_fraction: The fraction of data to use for validation. If None, all data is returned.
- split: The split to use, either "train" or "val". Required if val_fraction is set.
- download: Whether to download the data if it is not present.
Returns:
Tuple of (raw image paths, label mask paths).
def get_mcellseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Build the mCellSeg dataset for cell instance segmentation.

    At most one of `offsets`, `boundaries` and `binary` may be requested. If none is
    requested, no label transform is added by this function.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape (H, W) to use for training.
        val_fraction: The fraction of data to use for validation.
        split: The split to use, either "train" or "val". Required if val_fraction is set.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        AssertionError: If more than one of `offsets`, `boundaries` and `binary` is requested.
    """
    n_targets = sum((offsets is not None, boundaries, binary))
    assert n_targets <= 1, f"{offsets}, {boundaries}, {binary}"

    raw_paths, label_paths = get_mcellseg_paths(path, val_fraction, split, download)

    # Select the kwargs key, the transform and the override message for the requested target.
    # Affinities are registered under 'label_transform2', the other targets under 'label_transform'.
    transform_key = None
    if offsets is not None:
        transform_key = "label_transform2"
        transform = torch_em.transform.label.AffinityTransform(
            offsets=offsets, ignore_label=None, add_binary_target=True, add_mask=True
        )
        msg = "Offsets are passed, but 'label_transform2' is in the kwargs. It will be over-ridden."
    elif boundaries:
        transform_key = "label_transform"
        transform = torch_em.transform.label.BoundaryTransform(add_binary_target=True)
        msg = "Boundaries is set to True, but 'label_transform' is in the kwargs. It will be over-ridden."
    elif binary:
        transform_key = "label_transform"
        transform = torch_em.transform.label.labels_to_binary
        msg = "Binary is set to True, but 'label_transform' is in the kwargs. It will be over-ridden."

    if transform_key is not None:
        kwargs = util.update_kwargs(kwargs, transform_key, transform, msg=msg)

    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", False)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
Get the mCellSeg dataset for cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape (H, W) to use for training.
- val_fraction: The fraction of data to use for validation.
- split: The split to use, either "train" or "val". Required if val_fraction is set.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_mcellseg_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    val_fraction: Optional[float] = None,
    split: Optional[str] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the DataLoader for cell instance segmentation in mCellSeg.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape (H, W) to use for training.
        batch_size: The batch size for training.
        val_fraction: The fraction of data to use for validation.
        split: The split to use, either "train" or "val". Required if val_fraction is set.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Sort the extra keyword arguments into dataset arguments and loader arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )

    dataset = get_mcellseg_dataset(
        path,
        patch_shape,
        val_fraction=val_fraction,
        split=split,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs,
    )

    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
Get the DataLoader for cell instance segmentation in mCellSeg.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape (H, W) to use for training.
- batch_size: The batch size for training.
- val_fraction: The fraction of data to use for validation.
- split: The split to use, either "train" or "val". Required if val_fraction is set.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to return a binary segmentation target.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.