torch_em.data.datasets.light_microscopy.enseg
The ENSeg dataset contains annotations for enteric neuron cells in microscopy images.
The dataset is located at https://www.kaggle.com/datasets/gustavozanonifelipe/enseg-dataset. This dataset is from the publication https://doi.org/10.3390/app15031046. Please cite it if you use this dataset in your research.
"""The ENSeg dataset contains annotations for enteric neuron cells in microscopy images.

The dataset is located at https://www.kaggle.com/datasets/gustavozanonifelipe/enseg-dataset.
This dataset is from the publication https://doi.org/10.3390/app15031046.
Please cite it if you use this dataset in your research.
"""

import os
import json
import base64
from glob import glob
from tqdm import tqdm
from natsort import natsorted
from typing import Union, Tuple, Optional, Sequence, List

import numpy as np
import imageio.v3 as imageio

from skimage.draw import polygon as draw_polygon

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


KAGGLE_DATASET_NAME = "gustavozanonifelipe/enseg-dataset"

ANIMAL_TAGS = ["2C", "4C", "5C", "22TW", "23TW", "28TW"]


def _process_json(json_path, image_dir, seg_dir):
    """Extract the image and the instance segmentation mask from a LabelMe JSON file.

    Both outputs are named '<animalTag>_<json file stem>' and are only written
    if they do not exist yet.

    Args:
        json_path: Filepath to the JSON annotation file.
        image_dir: Folder where the extracted image is saved as png.
        seg_dir: Folder where the instance mask is saved as tif.

    Returns:
        Filepath of the extracted image.
        Filepath of the instance mask.
        The animal tag stored in the JSON file.
    """
    with open(json_path) as f:
        data = json.load(f)

    animal_tag = data["animalTag"]
    # Strip only the file extension; a plain str.replace would also remove
    # any '.json' that occurs inside the file name itself.
    json_stem = os.path.splitext(os.path.basename(json_path))[0]
    stem = f"{animal_tag}_{json_stem}"

    image_path = os.path.join(image_dir, f"{stem}.png")
    seg_path = os.path.join(seg_dir, f"{stem}.tif")

    # Extract image from base64 data.
    if not os.path.exists(image_path):
        image_bytes = base64.b64decode(data["imageData"])
        image = imageio.imread(image_bytes, extension=".jpg")
        imageio.imwrite(image_path, image)

    # Create instance segmentation mask from polygon annotations.
    if not os.path.exists(seg_path):
        shape = (data["imageHeight"], data["imageWidth"])
        seg = np.zeros(shape, dtype="uint16")
        # Instance ids start at 1, so that 0 stays the background value.
        for seg_id, obj in enumerate(data["shapes"], 1):
            points = np.array(obj["points"])
            # Column 1 of the points is passed as rows and column 0 as columns.
            rr, cc = draw_polygon(points[:, 1], points[:, 0], shape=shape)
            # Where polygons overlap, the one that comes later in the file wins.
            seg[rr, cc] = seg_id

        imageio.imwrite(seg_path, seg, compression="zlib")

    return image_path, seg_path, animal_tag


def _preprocess_data(data_dir, image_dir, seg_dir):
    """Extract all images and create instance masks from LabelMe JSON files.

    Args:
        data_dir: Folder with the unzipped data; JSON files are searched one folder level below it.
        image_dir: Folder where the extracted images are saved. Created if missing.
        seg_dir: Folder where the instance masks are saved. Created if missing.
    """
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(seg_dir, exist_ok=True)

    json_paths = natsorted(glob(os.path.join(data_dir, "*", "*.json")))
    assert len(json_paths) > 0, f"No JSON annotation files found in {data_dir}"

    for json_path in tqdm(json_paths, desc="Processing ENSeg data"):
        _process_json(json_path, image_dir, seg_dir)


def get_enseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the ENSeg dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, "data")

    # Download and unzip only if the data folder does not exist yet.
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        util.download_source_kaggle(path=path, dataset_name=KAGGLE_DATASET_NAME, download=download)
        zip_path = os.path.join(path, "enseg-dataset.zip")
        util.unzip(zip_path=zip_path, dst=data_dir)

    return data_dir


def get_enseg_paths(
    path: Union[os.PathLike, str],
    animal_tags: Optional[Sequence[str]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the ENSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If 'animal_tags' is not a list or tuple, if it contains an invalid tag,
            or if no images (matching the tags) are found.
    """
    # Validate the requested tags first, so that invalid input fails
    # before the data is downloaded and preprocessed.
    if animal_tags is not None:
        assert isinstance(animal_tags, (list, tuple)), \
            f"'animal_tags' must be a list or tuple, got {type(animal_tags)}"
        for tag in animal_tags:
            assert tag in ANIMAL_TAGS, f"'{tag}' is not a valid animal tag. Choose from {ANIMAL_TAGS}."

    data_dir = get_enseg_data(path, download)

    image_dir = os.path.join(path, "images")
    seg_dir = os.path.join(path, "segmentations")

    # Preprocess: extract images from JSON and create instance masks.
    # This only runs if one of the two output folders is missing.
    if not os.path.exists(image_dir) or not os.path.exists(seg_dir):
        _preprocess_data(data_dir, image_dir, seg_dir)

    seg_paths = natsorted(glob(os.path.join(seg_dir, "*.tif")))
    image_paths = natsorted(glob(os.path.join(image_dir, "*.png")))
    assert len(image_paths) == len(seg_paths) and len(image_paths) > 0

    if animal_tags is None:
        return image_paths, seg_paths

    # Filter by animal tag using the filename prefix (<animal_tag>_<number>):
    # the tag is everything before the last underscore of the file name.
    keep = [os.path.basename(image_path).rsplit("_", 1)[0] in animal_tags for image_path in image_paths]
    image_paths = [image_path for image_path, selected in zip(image_paths, keep) if selected]
    seg_paths = [seg_path for seg_path, selected in zip(seg_paths, keep) if selected]
    assert len(image_paths) > 0, f"No images found for animal tags {animal_tags}."

    return image_paths, seg_paths


def get_enseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    animal_tags: Optional[Sequence[str]] = None,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the ENSeg dataset for enteric neuron cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_enseg_paths(path=path, animal_tags=animal_tags, download=download)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets,
    )

    # The images and masks are plain image files, so no internal keys are needed.
    dataset_args = dict(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        with_channels=True,
        ndim=2,
    )
    return torch_em.default_segmentation_dataset(**dataset_args, **kwargs)


def get_enseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    animal_tags: Optional[Sequence[str]] = None,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the ENSeg dataloader for enteric neuron cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the DataLoader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_enseg_dataset(
        path,
        patch_shape,
        animal_tags=animal_tags,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        download=download,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **dataloader_kwargs)
KAGGLE_DATASET_NAME =
'gustavozanonifelipe/enseg-dataset'
ANIMAL_TAGS =
['2C', '4C', '5C', '22TW', '23TW', '28TW']
def
get_enseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_enseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the ENSeg dataset.

    Nothing is downloaded if the 'data' folder inside `path` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded.
    """
    data_dir = os.path.join(path, "data")

    # Download and unzip only if the data folder does not exist yet.
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        util.download_source_kaggle(path=path, dataset_name=KAGGLE_DATASET_NAME, download=download)
        zip_path = os.path.join(path, "enseg-dataset.zip")
        util.unzip(zip_path=zip_path, dst=data_dir)

    return data_dir
Download the ENSeg dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is downloaded.
def
get_enseg_paths( path: Union[os.PathLike, str], animal_tags: Optional[Sequence[str]] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_enseg_paths(
    path: Union[os.PathLike, str],
    animal_tags: Optional[Sequence[str]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the ENSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        AssertionError: If 'animal_tags' is not a list or tuple, if it contains an invalid tag,
            or if no images (matching the tags) are found.
    """
    # Validate the requested tags first, so that invalid input fails
    # before the data is downloaded and preprocessed.
    if animal_tags is not None:
        assert isinstance(animal_tags, (list, tuple)), \
            f"'animal_tags' must be a list or tuple, got {type(animal_tags)}"
        for tag in animal_tags:
            assert tag in ANIMAL_TAGS, f"'{tag}' is not a valid animal tag. Choose from {ANIMAL_TAGS}."

    data_dir = get_enseg_data(path, download)

    image_dir = os.path.join(path, "images")
    seg_dir = os.path.join(path, "segmentations")

    # Preprocess: extract images from JSON and create instance masks.
    # This only runs if one of the two output folders is missing.
    if not os.path.exists(image_dir) or not os.path.exists(seg_dir):
        _preprocess_data(data_dir, image_dir, seg_dir)

    seg_paths = natsorted(glob(os.path.join(seg_dir, "*.tif")))
    image_paths = natsorted(glob(os.path.join(image_dir, "*.png")))
    assert len(image_paths) == len(seg_paths) and len(image_paths) > 0

    if animal_tags is None:
        return image_paths, seg_paths

    # Filter by animal tag using the filename prefix (<animal_tag>_<number>):
    # the tag is everything before the last underscore of the file name.
    keep = [os.path.basename(image_path).rsplit("_", 1)[0] in animal_tags for image_path in image_paths]
    image_paths = [image_path for image_path, selected in zip(image_paths, keep) if selected]
    seg_paths = [seg_path for seg_path, selected in zip(seg_paths, keep) if selected]
    assert len(image_paths) > 0, f"No images found for animal tags {animal_tags}."

    return image_paths, seg_paths
Get paths to the ENSeg data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- animal_tags: Filter images by animal tags (e.g. ['2C', '4C']). Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_enseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], animal_tags: Optional[Sequence[str]] = None, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_enseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    animal_tags: Optional[Sequence[str]] = None,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the ENSeg dataset for enteric neuron cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_enseg_paths(path=path, animal_tags=animal_tags, download=download)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets,
    )

    # The images and masks are plain image files, so no internal keys are needed.
    dataset_args = dict(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        with_channels=True,
        ndim=2,
    )
    return torch_em.default_segmentation_dataset(**dataset_args, **kwargs)
Get the ENSeg dataset for enteric neuron cell segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- animal_tags: Filter images by animal tags (e.g. ['2C', '4C']). Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_enseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], animal_tags: Optional[Sequence[str]] = None, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_enseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    animal_tags: Optional[Sequence[str]] = None,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the ENSeg dataloader for enteric neuron cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the DataLoader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_enseg_dataset(
        path,
        patch_shape,
        animal_tags=animal_tags,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        download=download,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **dataloader_kwargs)
Get the ENSeg dataloader for enteric neuron cell segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- animal_tags: Filter images by animal tags (e.g. ['2C', '4C']). Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.