torch_em.data.datasets.light_microscopy.enseg

The ENSeg dataset contains annotations for enteric neuron cells in microscopy images.

The dataset is located at https://www.kaggle.com/datasets/gustavozanonifelipe/enseg-dataset. This dataset is from the publication https://doi.org/10.3390/app15031046. Please cite it if you use this dataset in your research.

  1"""The ENSeg dataset contains annotations for enteric neuron cells in microscopy images.
  2
  3The dataset is located at https://www.kaggle.com/datasets/gustavozanonifelipe/enseg-dataset.
  4This dataset is from the publication https://doi.org/10.3390/app15031046.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9import json
 10import base64
 11from glob import glob
 12from tqdm import tqdm
 13from natsort import natsorted
 14from typing import Union, Tuple, Optional, Sequence, List
 15
 16import numpy as np
 17import imageio.v3 as imageio
 18
 19from skimage.draw import polygon as draw_polygon
 20
 21from torch.utils.data import Dataset, DataLoader
 22
 23import torch_em
 24
 25from .. import util
 26
 27
 28KAGGLE_DATASET_NAME = "gustavozanonifelipe/enseg-dataset"
 29
 30ANIMAL_TAGS = ["2C", "4C", "5C", "22TW", "23TW", "28TW"]
 31
 32
 33def _process_json(json_path, image_dir, seg_dir):
 34    """Extract image and instance segmentation mask from a LabelMe JSON file."""
 35    with open(json_path) as f:
 36        data = json.load(f)
 37
 38    animal_tag = data["animalTag"]
 39    stem = f"{animal_tag}_{os.path.basename(json_path).replace('.json', '')}"
 40
 41    image_path = os.path.join(image_dir, f"{stem}.png")
 42    seg_path = os.path.join(seg_dir, f"{stem}.tif")
 43
 44    # Extract image from base64 data.
 45    if not os.path.exists(image_path):
 46        image_bytes = base64.b64decode(data["imageData"])
 47        image = imageio.imread(image_bytes, extension=".jpg")
 48        imageio.imwrite(image_path, image)
 49
 50    # Create instance segmentation mask from polygon annotations.
 51    if not os.path.exists(seg_path):
 52        shape = (data["imageHeight"], data["imageWidth"])
 53        seg = np.zeros(shape, dtype="uint16")
 54        for seg_id, obj in enumerate(data["shapes"], 1):
 55            points = np.array(obj["points"])
 56            rr, cc = draw_polygon(points[:, 1], points[:, 0], shape=shape)
 57            seg[rr, cc] = seg_id
 58
 59        imageio.imwrite(seg_path, seg, compression="zlib")
 60
 61    return image_path, seg_path, animal_tag
 62
 63
 64def _preprocess_data(data_dir, image_dir, seg_dir):
 65    """Extract all images and create instance masks from LabelMe JSON files."""
 66    os.makedirs(image_dir, exist_ok=True)
 67    os.makedirs(seg_dir, exist_ok=True)
 68
 69    json_paths = natsorted(glob(os.path.join(data_dir, "*", "*.json")))
 70    assert len(json_paths) > 0, f"No JSON annotation files found in {data_dir}"
 71
 72    for json_path in tqdm(json_paths, desc="Processing ENSeg data"):
 73        _process_json(json_path, image_dir, seg_dir)
 74
 75
 76def get_enseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 77    """Download the ENSeg dataset.
 78
 79    Args:
 80        path: Filepath to a folder where the downloaded data will be saved.
 81        download: Whether to download the data if it is not present.
 82
 83    Returns:
 84        Filepath where the data is downloaded.
 85    """
 86    data_dir = os.path.join(path, "data")
 87    if os.path.exists(data_dir):
 88        return data_dir
 89
 90    os.makedirs(path, exist_ok=True)
 91    util.download_source_kaggle(path=path, dataset_name=KAGGLE_DATASET_NAME, download=download)
 92    util.unzip(zip_path=os.path.join(path, "enseg-dataset.zip"), dst=data_dir)
 93
 94    return data_dir
 95
 96
 97def get_enseg_paths(
 98    path: Union[os.PathLike, str],
 99    animal_tags: Optional[Sequence[str]] = None,
100    download: bool = False,
101) -> Tuple[List[str], List[str]]:
102    """Get paths to the ENSeg data.
103
104    Args:
105        path: Filepath to a folder where the downloaded data will be saved.
106        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
107            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
108        download: Whether to download the data if it is not present.
109
110    Returns:
111        List of filepaths for the image data.
112        List of filepaths for the label data.
113    """
114    data_dir = get_enseg_data(path, download)
115
116    image_dir = os.path.join(path, "images")
117    seg_dir = os.path.join(path, "segmentations")
118
119    # Preprocess: extract images from JSON and create instance masks.
120    if not os.path.exists(image_dir) or not os.path.exists(seg_dir):
121        _preprocess_data(data_dir, image_dir, seg_dir)
122
123    seg_paths = natsorted(glob(os.path.join(seg_dir, "*.tif")))
124    image_paths = natsorted(glob(os.path.join(image_dir, "*.png")))
125    assert len(image_paths) == len(seg_paths) and len(image_paths) > 0
126
127    if animal_tags is not None:
128        assert isinstance(animal_tags, (list, tuple)), \
129            f"'animal_tags' must be a list or tuple, got {type(animal_tags)}"
130        for tag in animal_tags:
131            assert tag in ANIMAL_TAGS, f"'{tag}' is not a valid animal tag. Choose from {ANIMAL_TAGS}."
132
133        # Filter by animal tag using the filename prefix (<animal_tag>_<number>).
134        filtered_image_paths, filtered_seg_paths = [], []
135        for image_path, seg_path in zip(image_paths, seg_paths):
136            fname = os.path.basename(image_path)
137            # The tag is everything before the last underscore-number part.
138            tag = fname.rsplit("_", 1)[0]
139            if tag in animal_tags:
140                filtered_image_paths.append(image_path)
141                filtered_seg_paths.append(seg_path)
142
143        image_paths, seg_paths = filtered_image_paths, filtered_seg_paths
144        assert len(image_paths) > 0, f"No images found for animal tags {animal_tags}."
145
146    return image_paths, seg_paths
147
148
def get_enseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    animal_tags: Optional[Sequence[str]] = None,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the ENSeg dataset for enteric neuron cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, seg_paths = get_enseg_paths(path, animal_tags, download)

    # Set up the transforms for 2d data and the label transform for the requested target type.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, offsets=offsets, boundaries=boundaries, binary=binary
    )

    # The data is stored as regular image files (PNG / TIFF), hence no internal keys are given.
    # 'with_channels=True' presumably accounts for color images -- TODO confirm.
    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=seg_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        with_channels=True,
        ndim=2,
        **kwargs
    )
193
194
def get_enseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    animal_tags: Optional[Sequence[str]] = None,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the ENSeg dataloader for enteric neuron cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the data loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_enseg_dataset(
        path=path,
        patch_shape=patch_shape,
        animal_tags=animal_tags,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
KAGGLE_DATASET_NAME = 'gustavozanonifelipe/enseg-dataset'
ANIMAL_TAGS = ['2C', '4C', '5C', '22TW', '23TW', '28TW']
def get_enseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_enseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the ENSeg dataset.

    The archive 'enseg-dataset.zip' is fetched into `path` via `util.download_source_kaggle`
    and extracted to the subfolder 'data'. If that subfolder already exists, nothing is
    downloaded or extracted.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the folder that contains the extracted data.
    """
    data_dir = os.path.join(path, "data")

    # Only download and extract when the extracted folder is missing.
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)
        util.download_source_kaggle(path=path, dataset_name=KAGGLE_DATASET_NAME, download=download)
        zip_path = os.path.join(path, "enseg-dataset.zip")
        util.unzip(zip_path=zip_path, dst=data_dir)

    return data_dir

Download the ENSeg dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

Filepath to the folder that contains the extracted data.

def get_enseg_paths( path: Union[os.PathLike, str], animal_tags: Optional[Sequence[str]] = None, download: bool = False) -> Tuple[List[str], List[str]]:
 98def get_enseg_paths(
 99    path: Union[os.PathLike, str],
100    animal_tags: Optional[Sequence[str]] = None,
101    download: bool = False,
102) -> Tuple[List[str], List[str]]:
103    """Get paths to the ENSeg data.
104
105    Args:
106        path: Filepath to a folder where the downloaded data will be saved.
107        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
108            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
109        download: Whether to download the data if it is not present.
110
111    Returns:
112        List of filepaths for the image data.
113        List of filepaths for the label data.
114    """
115    data_dir = get_enseg_data(path, download)
116
117    image_dir = os.path.join(path, "images")
118    seg_dir = os.path.join(path, "segmentations")
119
120    # Preprocess: extract images from JSON and create instance masks.
121    if not os.path.exists(image_dir) or not os.path.exists(seg_dir):
122        _preprocess_data(data_dir, image_dir, seg_dir)
123
124    seg_paths = natsorted(glob(os.path.join(seg_dir, "*.tif")))
125    image_paths = natsorted(glob(os.path.join(image_dir, "*.png")))
126    assert len(image_paths) == len(seg_paths) and len(image_paths) > 0
127
128    if animal_tags is not None:
129        assert isinstance(animal_tags, (list, tuple)), \
130            f"'animal_tags' must be a list or tuple, got {type(animal_tags)}"
131        for tag in animal_tags:
132            assert tag in ANIMAL_TAGS, f"'{tag}' is not a valid animal tag. Choose from {ANIMAL_TAGS}."
133
134        # Filter by animal tag using the filename prefix (<animal_tag>_<number>).
135        filtered_image_paths, filtered_seg_paths = [], []
136        for image_path, seg_path in zip(image_paths, seg_paths):
137            fname = os.path.basename(image_path)
138            # The tag is everything before the last underscore-number part.
139            tag = fname.rsplit("_", 1)[0]
140            if tag in animal_tags:
141                filtered_image_paths.append(image_path)
142                filtered_seg_paths.append(seg_path)
143
144        image_paths, seg_paths = filtered_image_paths, filtered_seg_paths
145        assert len(image_paths) > 0, f"No images found for animal tags {animal_tags}."
146
147    return image_paths, seg_paths

Get paths to the ENSeg data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • animal_tags: Filter images by animal tags (e.g. ['2C', '4C']). Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
  • download: Whether to download the data if it is not present.
Returns:

A pair of lists: the filepaths for the image data and the filepaths for the label data.

def get_enseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], animal_tags: Optional[Sequence[str]] = None, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_enseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    animal_tags: Optional[Sequence[str]] = None,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the ENSeg dataset for enteric neuron cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, seg_paths = get_enseg_paths(path, animal_tags, download)

    # Set up the transforms for 2d data and the label transform for the requested target type.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, offsets=offsets, boundaries=boundaries, binary=binary
    )

    # The data is stored as regular image files (PNG / TIFF), hence no internal keys are given.
    # 'with_channels=True' presumably accounts for color images -- TODO confirm.
    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=seg_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        with_channels=True,
        ndim=2,
        **kwargs
    )

Get the ENSeg dataset for enteric neuron cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • animal_tags: Filter images by animal tags (e.g. ['2C', '4C']). Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_enseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], animal_tags: Optional[Sequence[str]] = None, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_enseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    animal_tags: Optional[Sequence[str]] = None,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the ENSeg dataloader for enteric neuron cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        animal_tags: Filter images by animal tags (e.g. ['2C', '4C']).
            Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the data loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_enseg_dataset(
        path=path,
        patch_shape=patch_shape,
        animal_tags=animal_tags,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)

Get the ENSeg dataloader for enteric neuron cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • animal_tags: Filter images by animal tags (e.g. ['2C', '4C']). Valid tags: '2C', '4C', '5C' (Control) and '22TW', '23TW', '28TW' (Tumor).
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.