torch_em.data.datasets.light_microscopy.glioma_c6

The Glioma C6 dataset contains phase-contrast microscopy images of Glioma C6 rat brain tumor cells annotated for instance segmentation. It consists of two subsets:

  • Glioma C6-spec: 45 images (30 train / 4 val / 11 test) under controlled conditions.
  • Glioma C6-gen: 30 images acquired under varied imaging conditions for generalization.

Images are 2592 × 1944 pixels (8-bit TIFF). Annotations are provided in COCO format with over 20,000 annotated cell and nuclei instances.

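The dataset is located at https://zenodo.org/records/15083188. This dataset is from the following publication: Malashin et al. (2025), https://doi.org/10.48550/arXiv.2511.07286. Please cite it if you use this dataset in your research.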

  1"""The Glioma C6 dataset contains phase-contrast microscopy images of Glioma C6
  2rat brain tumor cells annotated for instance segmentation. It consists of two subsets:
  3
  4- Glioma C6-spec: 45 images (30 train / 4 val / 11 test) under controlled conditions.
  5- Glioma C6-gen: 30 images acquired under varied imaging conditions for generalization.
  6
  7Images are 2592 × 1944 pixels (8-bit TIFF). Annotations are provided in COCO format
  8with over 20,000 annotated cell and nuclei instances.
  9
 10The dataset is located at https://zenodo.org/records/15083188.
 11This dataset is from the following publication:
 12- Malashin et al. (2025): https://doi.org/10.48550/arXiv.2511.07286
 13Please cite it if you use this dataset in your research.
 14"""
 15
 16import os
 17import json
 18from collections import defaultdict
 19from glob import glob
 20from natsort import natsorted
 21from typing import List, Literal, Optional, Tuple, Union
 22
 23import numpy as np
 24import imageio.v3 as imageio
 25
 26from torch.utils.data import Dataset, DataLoader
 27
 28import torch_em
 29
 30from .. import util
 31
 32
 # Download link for the dataset archive (dataset.zip) of the Zenodo record 15083188.
 33URL = "https://zenodo.org/records/15083188/files/dataset.zip?download=1"
 # Passed as `checksum` to `util.download_source`.
 # NOTE(review): with None the archive is presumably not verified after the download - confirm.
 34CHECKSUM = None
 35
 36
 37def _coco_to_instance_masks(image_dir: str, annotation_file: str, mask_dir: str) -> None:
 38    """Convert COCO polygon annotations to per-image instance segmentation TIF masks.
 39
 40    Only cell annotations (supercategory 'cell') are included; nucleus annotations
 41    (supercategory 'cell_part') are skipped.
 42    """
 43    from skimage.draw import polygon as draw_polygon
 44
 45    with open(annotation_file, "r") as f:
 46        coco = json.load(f)
 47
 48    # Keep only cell categories, not cell parts (nuclei).
 49    cell_cat_ids = {c["id"] for c in coco["categories"] if c.get("supercategory") != "cell_part"}
 50
 51    images = {img["id"]: img for img in coco["images"]}
 52
 53    ann_by_image = defaultdict(list)
 54    for ann in coco["annotations"]:
 55        if ann["category_id"] in cell_cat_ids:
 56            ann_by_image[ann["image_id"]].append(ann)
 57
 58    os.makedirs(mask_dir, exist_ok=True)
 59
 60    for img_id, img_info in images.items():
 61        fname = img_info["file_name"]
 62        h, w = img_info["height"], img_info["width"]
 63
 64        mask = np.zeros((h, w), dtype=np.int32)
 65        instance_id = 1
 66
 67        for ann in ann_by_image[img_id]:
 68            segs = ann.get("segmentation", [])
 69            if isinstance(segs, dict):
 70                # RLE format - skip (requires pycocotools)
 71                continue
 72            for seg in segs:
 73                pts = np.array(seg).reshape(-1, 2)
 74                rr, cc = draw_polygon(pts[:, 1], pts[:, 0], shape=(h, w))
 75                mask[rr, cc] = instance_id
 76                instance_id += 1
 77
 78        mask_name = os.path.splitext(os.path.basename(fname))[0] + "_mask.tif"
 79        imageio.imwrite(os.path.join(mask_dir, mask_name), mask)
 80
 81
 82def get_glioma_c6_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 83    """Download the Glioma C6 dataset and convert COCO annotations to instance masks.
 84
 85    Args:
 86        path: Filepath to a folder where the downloaded data will be saved.
 87        download: Whether to download the data if it is not present.
 88
 89    Returns:
 90        The filepath to the extracted data directory.
 91    """
 92    data_dir = os.path.join(path, "GliomaC6")
 93    if os.path.exists(data_dir):
 94        return data_dir
 95
 96    os.makedirs(data_dir, exist_ok=True)
 97    zip_path = os.path.join(path, "glioma_c6_dataset.zip")
 98    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
 99    util.unzip(zip_path, data_dir)
100
101    # Convert COCO annotations to instance masks for each subset/split.
102    for ann_file in natsorted(glob(os.path.join(data_dir, "**", "*.json"), recursive=True)):
103        subset_dir = os.path.dirname(ann_file)
104        image_dir = os.path.join(subset_dir, "images")
105        if not os.path.isdir(image_dir):
106            image_dir = subset_dir
107
108        split_name = os.path.splitext(os.path.basename(ann_file))[0]
109        mask_dir = os.path.join(subset_dir, "masks", split_name)
110        _coco_to_instance_masks(image_dir, ann_file, mask_dir)
111
112    return data_dir
113
114
def get_glioma_c6_paths(
    path: Union[os.PathLike, str],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the Glioma C6 data.

    Images and masks are both returned in natural sort order and are paired by position.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        subset: The dataset subset. Either 'spec' (controlled, predefined splits) or
            'gen' (generalization, varied conditions).
        split: The data split. One of 'train', 'val', 'test'. Only applies to 'spec',
            where None returns the images of all three splits. It is ignored for 'gen'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        RuntimeError: If the folder of the subset does not exist, if no images are found,
            or if the number of images and masks differs.
    """
    data_dir = get_glioma_c6_data(path, download)
    # Zip extracts as dataset/{subset}/... inside data_dir.
    dataset_dir = os.path.join(data_dir, "dataset", subset)

    if not os.path.isdir(dataset_dir):
        raise RuntimeError(
            f"Could not find '{subset}' subset at {dataset_dir}. "
            "Please check the dataset structure after downloading."
        )

    if subset == "gen":
        image_dir = os.path.join(dataset_dir, "images")
        mask_dir = os.path.join(dataset_dir, "masks", "anno_gen")
    else:
        if split is None:
            # Return all splits combined.
            raw_paths, label_paths = [], []
            for split_name in ("train", "val", "test"):
                split_raw, split_labels = get_glioma_c6_paths(path, subset, split_name, download)
                raw_paths.extend(split_raw)
                label_paths.extend(split_labels)
            return raw_paths, label_paths

        # spec subset: each split lives in its own subdirectory.
        # The on-disk directory for "val" is "valid".
        split_dir_name = "valid" if split == "val" else split
        split_dir = os.path.join(dataset_dir, split_dir_name)
        image_dir = os.path.join(split_dir, "images")
        mask_dir = os.path.join(split_dir, "masks", f"anno_{split_dir_name}")

    raw_paths = natsorted(glob(os.path.join(image_dir, "*.tif")))
    label_paths = natsorted(glob(os.path.join(mask_dir, "*.tif")))

    if not raw_paths:
        raise RuntimeError(f"No images found for subset='{subset}', split='{split}' in {dataset_dir}.")

    # Images and masks are matched by their position in the sorted lists, so a different
    # number of files would silently pair images with the wrong masks.
    if len(raw_paths) != len(label_paths):
        raise RuntimeError(
            f"Found {len(raw_paths)} images in {image_dir} but {len(label_paths)} masks in {mask_dir}."
        )

    return raw_paths, label_paths
173
174
def get_glioma_c6_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the Glioma C6 dataset for phase-contrast cell instance segmentation.

    The image and mask paths are looked up with `get_glioma_c6_paths` and handed to
    `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        subset: The dataset subset. Either 'spec' or 'gen'.
        split: The data split. One of 'train', 'val', 'test' (only for 'spec').
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_glioma_c6_paths(path, subset, split, download)

    # The raw and label keys are passed as None and the dataset is not created as a
    # 'seg dataset'; everything else can be configured via the keyword arguments.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs,
    )
    return dataset
207
208
def get_glioma_c6_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the Glioma C6 dataloader for phase-contrast cell instance segmentation.

    The keyword arguments are divided with `util.split_kwargs` into the ones for the
    dataset and the remaining ones, which are passed on to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        subset: The dataset subset. Either 'spec' or 'gen'.
        split: The data split. One of 'train', 'val', 'test' (only for 'spec').
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_glioma_c6_dataset(
        path, patch_shape, subset, split, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader
# Download link for the dataset archive (dataset.zip) of the Zenodo record 15083188.
URL = 'https://zenodo.org/records/15083188/files/dataset.zip?download=1'
# Passed as `checksum` to `util.download_source`.
# NOTE(review): with None the archive is presumably not verified after the download - confirm.
CHECKSUM = None
def get_glioma_c6_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 83def get_glioma_c6_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 84    """Download the Glioma C6 dataset and convert COCO annotations to instance masks.
 85
 86    Args:
 87        path: Filepath to a folder where the downloaded data will be saved.
 88        download: Whether to download the data if it is not present.
 89
 90    Returns:
 91        The filepath to the extracted data directory.
 92    """
 93    data_dir = os.path.join(path, "GliomaC6")
 94    if os.path.exists(data_dir):
 95        return data_dir
 96
 97    os.makedirs(data_dir, exist_ok=True)
 98    zip_path = os.path.join(path, "glioma_c6_dataset.zip")
 99    util.download_source(zip_path, URL, download, checksum=CHECKSUM)
100    util.unzip(zip_path, data_dir)
101
102    # Convert COCO annotations to instance masks for each subset/split.
103    for ann_file in natsorted(glob(os.path.join(data_dir, "**", "*.json"), recursive=True)):
104        subset_dir = os.path.dirname(ann_file)
105        image_dir = os.path.join(subset_dir, "images")
106        if not os.path.isdir(image_dir):
107            image_dir = subset_dir
108
109        split_name = os.path.splitext(os.path.basename(ann_file))[0]
110        mask_dir = os.path.join(subset_dir, "masks", split_name)
111        _coco_to_instance_masks(image_dir, ann_file, mask_dir)
112
113    return data_dir

Download the Glioma C6 dataset and convert COCO annotations to instance masks.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the extracted data directory.

def get_glioma_c6_paths( path: Union[os.PathLike, str], subset: Literal['spec', 'gen'] = 'spec', split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_glioma_c6_paths(
    path: Union[os.PathLike, str],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the Glioma C6 data.

    Images and masks are both returned in natural sort order and are paired by position.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        subset: The dataset subset. Either 'spec' (controlled, predefined splits) or
            'gen' (generalization, varied conditions).
        split: The data split. One of 'train', 'val', 'test'. Only applies to 'spec',
            where None returns the images of all three splits. It is ignored for 'gen'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        RuntimeError: If the folder of the subset does not exist, if no images are found,
            or if the number of images and masks differs.
    """
    data_dir = get_glioma_c6_data(path, download)
    # Zip extracts as dataset/{subset}/... inside data_dir.
    dataset_dir = os.path.join(data_dir, "dataset", subset)

    if not os.path.isdir(dataset_dir):
        raise RuntimeError(
            f"Could not find '{subset}' subset at {dataset_dir}. "
            "Please check the dataset structure after downloading."
        )

    if subset == "gen":
        image_dir = os.path.join(dataset_dir, "images")
        mask_dir = os.path.join(dataset_dir, "masks", "anno_gen")
    else:
        if split is None:
            # Return all splits combined.
            raw_paths, label_paths = [], []
            for split_name in ("train", "val", "test"):
                split_raw, split_labels = get_glioma_c6_paths(path, subset, split_name, download)
                raw_paths.extend(split_raw)
                label_paths.extend(split_labels)
            return raw_paths, label_paths

        # spec subset: each split lives in its own subdirectory.
        # The on-disk directory for "val" is "valid".
        split_dir_name = "valid" if split == "val" else split
        split_dir = os.path.join(dataset_dir, split_dir_name)
        image_dir = os.path.join(split_dir, "images")
        mask_dir = os.path.join(split_dir, "masks", f"anno_{split_dir_name}")

    raw_paths = natsorted(glob(os.path.join(image_dir, "*.tif")))
    label_paths = natsorted(glob(os.path.join(mask_dir, "*.tif")))

    if not raw_paths:
        raise RuntimeError(f"No images found for subset='{subset}', split='{split}' in {dataset_dir}.")

    # Images and masks are matched by their position in the sorted lists, so a different
    # number of files would silently pair images with the wrong masks.
    if len(raw_paths) != len(label_paths):
        raise RuntimeError(
            f"Found {len(raw_paths)} images in {image_dir} but {len(label_paths)} masks in {mask_dir}."
        )

    return raw_paths, label_paths

Get paths to the Glioma C6 data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • subset: The dataset subset. Either 'spec' (controlled, predefined splits) or 'gen' (generalization, varied conditions).
  • split: The data split. One of 'train', 'val', 'test'. Only applies to 'spec'. For 'gen', pass None to return all images.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_glioma_c6_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], subset: Literal['spec', 'gen'] = 'spec', split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_glioma_c6_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the Glioma C6 dataset for phase-contrast cell instance segmentation.

    The image and mask paths are looked up with `get_glioma_c6_paths` and handed to
    `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        subset: The dataset subset. Either 'spec' or 'gen'.
        split: The data split. One of 'train', 'val', 'test' (only for 'spec').
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_glioma_c6_paths(path, subset, split, download)

    # The raw and label keys are passed as None and the dataset is not created as a
    # 'seg dataset'; everything else can be configured via the keyword arguments.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs,
    )
    return dataset

Get the Glioma C6 dataset for phase-contrast cell instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • subset: The dataset subset. Either 'spec' or 'gen'.
  • split: The data split. One of 'train', 'val', 'test' (only for 'spec').
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_glioma_c6_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], subset: Literal['spec', 'gen'] = 'spec', split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_glioma_c6_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the Glioma C6 dataloader for phase-contrast cell instance segmentation.

    The keyword arguments are divided with `util.split_kwargs` into the ones for the
    dataset and the remaining ones, which are passed on to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        subset: The dataset subset. Either 'spec' or 'gen'.
        split: The data split. One of 'train', 'val', 'test' (only for 'spec').
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_glioma_c6_dataset(
        path, patch_shape, subset, split, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader

Get the Glioma C6 dataloader for phase-contrast cell instance segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • subset: The dataset subset. Either 'spec' or 'gen'.
  • split: The data split. One of 'train', 'val', 'test' (only for 'spec').
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.