torch_em.data.datasets.light_microscopy.glioma_c6
The Glioma C6 dataset contains phase-contrast microscopy images of Glioma C6 rat brain tumor cells annotated for instance segmentation. It consists of two subsets:
- Glioma C6-spec: 45 images (30 train / 4 val / 11 test) under controlled conditions.
- Glioma C6-gen: 30 images acquired under varied imaging conditions for generalization.
Images are 2592 × 1944 pixels (8-bit TIFF). Annotations are provided in COCO format with over 20,000 annotated cell and nuclei instances.
The dataset is located at https://zenodo.org/records/15083188. This dataset is from the following publication:
- Malashin et al. (2025): https://doi.org/10.48550/arXiv.2511.07286
Please cite it if you use this dataset in your research.
"""The Glioma C6 dataset contains phase-contrast microscopy images of Glioma C6
rat brain tumor cells annotated for instance segmentation. It consists of two subsets:

- Glioma C6-spec: 45 images (30 train / 4 val / 11 test) under controlled conditions.
- Glioma C6-gen: 30 images acquired under varied imaging conditions for generalization.

Images are 2592 × 1944 pixels (8-bit TIFF). Annotations are provided in COCO format
with over 20,000 annotated cell and nuclei instances.

The dataset is located at https://zenodo.org/records/15083188.
This dataset is from the following publication:
- Malashin et al. (2025): https://doi.org/10.48550/arXiv.2511.07286
Please cite it if you use this dataset in your research.
"""

import os
import json
from collections import defaultdict
from glob import glob
from natsort import natsorted
from typing import List, Literal, Optional, Tuple, Union

import numpy as np
import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://zenodo.org/records/15083188/files/dataset.zip?download=1"
CHECKSUM = None


def _coco_to_instance_masks(image_dir: str, annotation_file: str, mask_dir: str) -> None:
    """Convert COCO polygon annotations to per-image instance segmentation TIF masks.

    Every annotation whose category does not have the supercategory 'cell_part'
    (i.e. nuclei) is rasterized; 'cell_part' annotations are skipped.
    Annotations stored as RLE (a dict instead of a list of polygons) are skipped as well.

    Args:
        image_dir: Folder with the images that belong to the annotation file.
            Currently unused; image sizes and names are read from the annotation file.
        annotation_file: Filepath to the COCO json file.
        mask_dir: Folder where the masks are written as '<image stem>_mask.tif'.
            It is created if it does not exist.
    """
    from skimage.draw import polygon as draw_polygon

    with open(annotation_file, "r") as f:
        coco = json.load(f)

    # Keep everything except cell parts (nuclei).
    cell_cat_ids = {c["id"] for c in coco["categories"] if c.get("supercategory") != "cell_part"}

    ann_by_image = defaultdict(list)
    for ann in coco["annotations"]:
        if ann["category_id"] in cell_cat_ids:
            ann_by_image[ann["image_id"]].append(ann)

    os.makedirs(mask_dir, exist_ok=True)

    # A mask is written for every image listed in the json, also for images without annotations.
    for img_info in coco["images"]:
        h, w = img_info["height"], img_info["width"]
        mask = np.zeros((h, w), dtype=np.int32)

        instance_id = 1
        for ann in ann_by_image.get(img_info["id"], []):
            segs = ann.get("segmentation", [])
            if isinstance(segs, dict):
                # RLE format - skip (requires pycocotools)
                continue
            for seg in segs:
                # COCO polygons are flat [x0, y0, x1, y1, ...] lists; draw_polygon expects rows, then cols.
                pts = np.array(seg).reshape(-1, 2)
                rr, cc = draw_polygon(pts[:, 1], pts[:, 0], shape=(h, w))
                mask[rr, cc] = instance_id
            # NOTE(review): one id per annotation, so all polygons of an annotation share it - confirm.
            instance_id += 1

        mask_name = os.path.splitext(os.path.basename(img_info["file_name"]))[0] + "_mask.tif"
        imageio.imwrite(os.path.join(mask_dir, mask_name), mask)


def get_glioma_c6_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the Glioma C6 dataset and convert COCO annotations to instance masks.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    data_dir = os.path.join(path, "GliomaC6")
    # Only a non-empty folder counts as prepared data. An empty folder can be left behind
    # by an earlier failed attempt and must not prevent the download from being retried.
    if os.path.isdir(data_dir) and os.listdir(data_dir):
        return data_dir

    # The parent folder has to exist so that the zip file can be written into it.
    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, "glioma_c6_dataset.zip")
    util.download_source(zip_path, URL, download, checksum=CHECKSUM)

    # Create the target folder only after the archive is available.
    os.makedirs(data_dir, exist_ok=True)
    util.unzip(zip_path, data_dir)

    # Convert COCO annotations to instance masks for each subset/split.
    for ann_file in natsorted(glob(os.path.join(data_dir, "**", "*.json"), recursive=True)):
        subset_dir = os.path.dirname(ann_file)
        image_dir = os.path.join(subset_dir, "images")
        if not os.path.isdir(image_dir):
            image_dir = subset_dir

        # Masks go to 'masks/<json stem>' next to the json file, e.g. 'masks/anno_train'.
        split_name = os.path.splitext(os.path.basename(ann_file))[0]
        mask_dir = os.path.join(subset_dir, "masks", split_name)
        _coco_to_instance_masks(image_dir, ann_file, mask_dir)

    return data_dir


def get_glioma_c6_paths(
    path: Union[os.PathLike, str],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the Glioma C6 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        subset: The dataset subset. Either 'spec' (controlled, predefined splits) or
            'gen' (generalization, varied conditions).
        split: The data split. One of 'train', 'val', 'test'. Only applies to 'spec',
            where None returns all splits combined. It is ignored for 'gen'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        RuntimeError: If the subset folder is missing, if no images are found,
            or if the number of images and masks differs.
    """
    data_dir = get_glioma_c6_data(path, download)
    # Zip extracts as dataset/{subset}/... inside data_dir.
    dataset_dir = os.path.join(data_dir, "dataset", subset)

    if not os.path.isdir(dataset_dir):
        raise RuntimeError(
            f"Could not find '{subset}' subset at {dataset_dir}. "
            "Please check the dataset structure after downloading."
        )

    # Each entry: (split label for error messages, folder holding 'images' and 'masks', mask sub-folder).
    if subset == "gen":
        locations = [(split, dataset_dir, "anno_gen")]
    elif split is None:
        # Return all splits combined. The on-disk directory for "val" is "valid".
        locations = [
            (name, os.path.join(dataset_dir, folder), f"anno_{folder}")
            for name, folder in (("train", "train"), ("val", "valid"), ("test", "test"))
        ]
    else:
        folder = "valid" if split == "val" else split
        locations = [(split, os.path.join(dataset_dir, folder), f"anno_{folder}")]

    raw_paths, label_paths = [], []
    for label, root, mask_folder in locations:
        images = natsorted(glob(os.path.join(root, "images", "*.tif")))
        masks = natsorted(glob(os.path.join(root, "masks", mask_folder, "*.tif")))

        if not images:
            raise RuntimeError(f"No images found for subset='{subset}', split='{label}' in {dataset_dir}.")
        # Images and masks are paired by position, so a count mismatch would silently misalign them.
        if len(images) != len(masks):
            raise RuntimeError(
                f"Found {len(images)} images but {len(masks)} masks for "
                f"subset='{subset}', split='{label}' in {root}."
            )

        raw_paths.extend(images)
        label_paths.extend(masks)

    return raw_paths, label_paths


def get_glioma_c6_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the Glioma C6 dataset for phase-contrast cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        subset: The dataset subset. Either 'spec' or 'gen'.
        split: The data split. One of 'train', 'val', 'test' (only for 'spec').
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_glioma_c6_paths(path, subset, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs,
    )


def get_glioma_c6_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the Glioma C6 dataloader for phase-contrast cell instance segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        subset: The dataset subset. Either 'spec' or 'gen'.
        split: The data split. One of 'train', 'val', 'test' (only for 'spec').
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_glioma_c6_dataset(path, patch_shape, subset, split, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_glioma_c6_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the Glioma C6 dataset and convert COCO annotations to instance masks.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the extracted data directory.
    """
    data_dir = os.path.join(path, "GliomaC6")
    # Only a non-empty folder counts as prepared data. An empty folder can be left behind
    # by an earlier failed attempt and must not prevent the download from being retried.
    if os.path.isdir(data_dir) and os.listdir(data_dir):
        return data_dir

    # The parent folder has to exist so that the zip file can be written into it.
    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, "glioma_c6_dataset.zip")
    util.download_source(zip_path, URL, download, checksum=CHECKSUM)

    # Create the target folder only after the archive is available.
    os.makedirs(data_dir, exist_ok=True)
    util.unzip(zip_path, data_dir)

    # Convert COCO annotations to instance masks for each subset/split.
    for ann_file in natsorted(glob(os.path.join(data_dir, "**", "*.json"), recursive=True)):
        subset_dir = os.path.dirname(ann_file)
        image_dir = os.path.join(subset_dir, "images")
        if not os.path.isdir(image_dir):
            image_dir = subset_dir

        # Masks go to 'masks/<json stem>' next to the json file, e.g. 'masks/anno_train'.
        split_name = os.path.splitext(os.path.basename(ann_file))[0]
        mask_dir = os.path.join(subset_dir, "masks", split_name)
        _coco_to_instance_masks(image_dir, ann_file, mask_dir)

    return data_dir
Download the Glioma C6 dataset and convert COCO annotations to instance masks.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the extracted data directory.
def get_glioma_c6_paths(
    path: Union[os.PathLike, str],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the Glioma C6 data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        subset: The dataset subset. Either 'spec' (controlled, predefined splits) or
            'gen' (generalization, varied conditions).
        split: The data split. One of 'train', 'val', 'test'. Only applies to 'spec',
            where None returns all splits combined. It is ignored for 'gen'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        RuntimeError: If the subset folder is missing, if no images are found,
            or if the number of images and masks differs.
    """
    data_dir = get_glioma_c6_data(path, download)
    # Zip extracts as dataset/{subset}/... inside data_dir.
    dataset_dir = os.path.join(data_dir, "dataset", subset)

    if not os.path.isdir(dataset_dir):
        raise RuntimeError(
            f"Could not find '{subset}' subset at {dataset_dir}. "
            "Please check the dataset structure after downloading."
        )

    # Each entry: (split label for error messages, folder holding 'images' and 'masks', mask sub-folder).
    if subset == "gen":
        locations = [(split, dataset_dir, "anno_gen")]
    elif split is None:
        # Return all splits combined. The on-disk directory for "val" is "valid".
        locations = [
            (name, os.path.join(dataset_dir, folder), f"anno_{folder}")
            for name, folder in (("train", "train"), ("val", "valid"), ("test", "test"))
        ]
    else:
        folder = "valid" if split == "val" else split
        locations = [(split, os.path.join(dataset_dir, folder), f"anno_{folder}")]

    raw_paths, label_paths = [], []
    for label, root, mask_folder in locations:
        images = natsorted(glob(os.path.join(root, "images", "*.tif")))
        masks = natsorted(glob(os.path.join(root, "masks", mask_folder, "*.tif")))

        if not images:
            raise RuntimeError(f"No images found for subset='{subset}', split='{label}' in {dataset_dir}.")
        # Images and masks are paired by position, so a count mismatch would silently misalign them.
        if len(images) != len(masks):
            raise RuntimeError(
                f"Found {len(images)} images but {len(masks)} masks for "
                f"subset='{subset}', split='{label}' in {root}."
            )

        raw_paths.extend(images)
        label_paths.extend(masks)

    return raw_paths, label_paths
Get paths to the Glioma C6 data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- subset: The dataset subset. Either 'spec' (controlled, predefined splits) or 'gen' (generalization, varied conditions).
- split: The data split. One of 'train', 'val', 'test'. Only applies to 'spec'. For 'gen', pass None to return all images.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def get_glioma_c6_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the Glioma C6 segmentation dataset (phase-contrast cell instance segmentation).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        subset: The dataset subset. Either 'spec' or 'gen'.
        split: The data split. One of 'train', 'val', 'test' (only for 'spec').
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Resolve the image / mask file lists for the requested subset and split.
    image_files, mask_files = get_glioma_c6_paths(path=path, subset=subset, split=split, download=download)

    # Images and masks are plain image files, hence no internal keys and `is_seg_dataset=False`.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs,
    )
    return dataset
Get the Glioma C6 dataset for phase-contrast cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- subset: The dataset subset. Either 'spec' or 'gen'.
- split: The data split. One of 'train', 'val', 'test' (only for 'spec').
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_glioma_c6_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    subset: Literal["spec", "gen"] = "spec",
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the Glioma C6 dataloader (phase-contrast cell instance segmentation).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        subset: The dataset subset. Either 'spec' or 'gen'.
        split: The data split. One of 'train', 'val', 'test' (only for 'spec').
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_glioma_c6_dataset(
        path=path,
        patch_shape=patch_shape,
        subset=subset,
        split=split,
        download=download,
        **dataset_kwargs,
    )
    loader = torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
    return loader
Get the Glioma C6 dataloader for phase-contrast cell instance segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- subset: The dataset subset. Either 'spec' or 'gen'.
- split: The data split. One of 'train', 'val', 'test' (only for 'spec').
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.