torch_em.data.datasets.light_microscopy.dic_hepg2

This dataset contains annotations for cell segmentation in differential interference contrast (DIC) microscopy images.

This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2024.109151. Please cite it if you use this dataset in your research.

View Source

  1"""This dataset contains annotations for cell segmentation in
  2differential interference contrast (DIC) microscopy images.
  3
  4This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2024.109151.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9from tqdm import tqdm
 10from glob import glob
 11from pathlib import Path
 12from natsort import natsorted
 13from typing import Union, Literal, Tuple, Optional, List
 14
 15import imageio.v3 as imageio
 16
 17from torch.utils.data import Dataset, DataLoader
 18
 19import torch_em
 20
 21try:
 22    from pycocotools.coco import COCO
 23except ImportError:
 24    COCO = None
 25
 26from .. import util
 27from .livecell import _annotations_to_instances
 28
 29
 30URL = "https://zenodo.org/records/13120679/files/2021-11-15_HepG2_Calcein_AM.zip"
 31CHECKSUM = "42b939d01c5fc2517dc3ad34bde596ac38dbeba2a96173f37e1b6dfe14cbe3a2"
 32
 33
 34def get_dic_hepg2_data(path: Union[str, os.PathLike], download: bool = False) -> str:
 35    """Download the DIC HepG2 dataset.
 36
 37    Args:
 38        path: Filepath to a folder where the downloaded data will be stored.
 39        download: Whether to download the data if it is not present.
 40
 41    Returns:
 42        The path to the folder where data is stored.
 43    """
 44    if os.path.exists(path):
 45        return path
 46
 47    os.makedirs(path, exist_ok=True)
 48    zip_path = os.path.join(path, "2021-11-15_HepG2_Calcein_AM.zip")
 49    util.download_source(zip_path, URL, download, CHECKSUM)
 50    util.unzip(zip_path, path, True)
 51
 52    return path
 53
 54
 55def _create_segmentations_from_coco_annotation(path, split):
 56    assert COCO is not None, "pycocotools is required for processing the LiveCELL ground-truth."
 57
 58    base_dir = os.path.join(path, "2021-11-15_HepG2_Calcein_AM", "coco_format", split)
 59    image_folder = os.path.join(base_dir, "images")
 60    gt_folder = os.path.join(base_dir, "annotations")
 61    if os.path.exists(gt_folder):
 62        return image_folder, gt_folder
 63
 64    os.makedirs(gt_folder, exist_ok=True)
 65
 66    ann_file = os.path.join(base_dir, "annotations.json")
 67    assert os.path.exists(ann_file)
 68    coco = COCO(ann_file)
 69    category_ids = coco.getCatIds(catNms=["cell"])
 70    image_ids = coco.getImgIds(catIds=category_ids)
 71
 72    for image_id in tqdm(
 73        image_ids, desc="Creating DIC HepG2 segmentations from coco-style annotations"
 74    ):
 75        image_metadata = coco.loadImgs(image_id)[0]
 76        fname = image_metadata["file_name"]
 77
 78        gt_path = os.path.join(gt_folder, Path(fname).with_suffix(".tif"))
 79
 80        gt = _annotations_to_instances(coco, image_metadata, category_ids)
 81        imageio.imwrite(gt_path, gt, compression="zlib")
 82
 83    return image_folder, gt_folder
 84
 85
 86def get_dic_hepg2_paths(
 87    path: Union[os.PathLike, str], split: str, download: bool = False
 88) -> Tuple[List[str], List[str]]:
 89    """Get paths to DIC HepG2 data.
 90
 91    Args:
 92        path: Filepath to a folder where the downloaded data will be saved.
 93        split: The data split to use. Either 'train', 'val' or 'test'.
 94        download: Whether to download the data if it is not present.
 95
 96    Returns:
 97        List of filepaths for the image data.
 98        List of filepaths for the label data.
 99    """
100    path = get_dic_hepg2_data(path=path, download=download)
101
102    image_folder, gt_folder = _create_segmentations_from_coco_annotation(path=path, split=split)
103    gt_paths = natsorted(glob(os.path.join(gt_folder, "*.tif")))
104    image_paths = [os.path.join(image_folder, f"{Path(gt_path).stem}.png") for gt_path in gt_paths]
105
106    return image_paths, gt_paths
107
108
def get_dic_hepg2_dataset(
    path: Union[str, os.PathLike],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the DIC HepG2 dataset for segmenting cells in differential interference contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Forward `download`; without it the flag would be silently ignored and
    # the data would never be fetched through this entry point.
    image_paths, gt_paths = get_dic_hepg2_paths(path=path, split=split, download=download)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, offsets=offsets, boundaries=boundaries, binary=binary
    )

    # Images and labels are stored as individual 2d files, hence no dataset keys are given.
    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=gt_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
150
151
def get_dic_hepg2_loader(
    path: Union[str, os.PathLike],
    split: Literal['train', 'val', 'test'],
    patch_shape: Tuple[int, int],
    batch_size: int,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build a dataloader over the DIC HepG2 cell segmentation data.

    The extra keyword arguments are divided with `util.split_kwargs`: one part is passed to
    `get_dic_hepg2_dataset`, the other part to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )

    hepg2_dataset = get_dic_hepg2_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs
    )

    loader = torch_em.get_data_loader(dataset=hepg2_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

URL = 'https://zenodo.org/records/13120679/files/2021-11-15_HepG2_Calcein_AM.zip'

CHECKSUM = '42b939d01c5fc2517dc3ad34bde596ac38dbeba2a96173f37e1b6dfe14cbe3a2'

def get_dic_hepg2_data(path: Union[str, os.PathLike], download: bool = False) -> str: View Source

def get_dic_hepg2_data(path: Union[str, os.PathLike], download: bool = False) -> str:
    """Download the DIC HepG2 dataset.

    The zip archive is fetched into `path` and unpacked there. The unpacked
    "2021-11-15_HepG2_Calcein_AM" folder (not `path` itself) marks the data as present,
    so an existing but empty `path` still triggers the download.

    Args:
        path: Filepath to a folder where the downloaded data will be stored.
        download: Whether to download the data if it is not present.

    Returns:
        The path to the folder where data is stored.
    """
    # Checking only `path` would skip the download for any pre-existing folder,
    # even one that does not contain the dataset yet.
    data_dir = os.path.join(path, "2021-11-15_HepG2_Calcein_AM")
    if os.path.exists(data_dir):
        return path

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, "2021-11-15_HepG2_Calcein_AM.zip")
    util.download_source(zip_path, URL, download, CHECKSUM)
    util.unzip(zip_path, path, True)

    return path

Download the DIC HepG2 dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be stored.
download: Whether to download the data if it is not present.

Returns:

The path to the folder where data is stored.

def get_dic_hepg2_paths( path: Union[os.PathLike, str], split: str, download: bool = False) -> Tuple[List[str], List[str]]: View Source

 87def get_dic_hepg2_paths(
 88    path: Union[os.PathLike, str], split: str, download: bool = False
 89) -> Tuple[List[str], List[str]]:
 90    """Get paths to DIC HepG2 data.
 91
 92    Args:
 93        path: Filepath to a folder where the downloaded data will be saved.
 94        split: The data split to use. Either 'train', 'val' or 'test'.
 95        download: Whether to download the data if it is not present.
 96
 97    Returns:
 98        List of filepaths for the image data.
 99        List of filepaths for the label data.
100    """
101    path = get_dic_hepg2_data(path=path, download=download)
102
103    image_folder, gt_folder = _create_segmentations_from_coco_annotation(path=path, split=split)
104    gt_paths = natsorted(glob(os.path.join(gt_folder, "*.tif")))
105    image_paths = [os.path.join(image_folder, f"{Path(gt_path).stem}.png") for gt_path in gt_paths]
106
107    return image_paths, gt_paths

Get paths to DIC HepG2 data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_dic_hepg2_dataset( path: Union[str, os.PathLike], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_dic_hepg2_dataset(
    path: Union[str, os.PathLike],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the DIC HepG2 dataset for segmenting cells in differential interference contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', 'val' or 'test'.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Forward `download`; without it the flag would be silently ignored and
    # the data would never be fetched through this entry point.
    image_paths, gt_paths = get_dic_hepg2_paths(path=path, split=split, download=download)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, offsets=offsets, boundaries=boundaries, binary=binary
    )

    # Images and labels are stored as individual 2d files, hence no dataset keys are given.
    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=gt_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )

Get the DIC HepG2 dataset for segmenting cells in differential interference contrast microscopy.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
patch_shape: The patch shape to use for training.
download: Whether to download the data if it is not present.
offsets: Offset values for affinity computation used as target.
boundaries: Whether to compute boundaries as the target.
binary: Whether to use a binary segmentation target.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_dic_hepg2_loader( path: Union[str, os.PathLike], split: Literal['train', 'val', 'test'], patch_shape: Tuple[int, int], batch_size: int, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_dic_hepg2_loader(
    path: Union[str, os.PathLike],
    split: Literal['train', 'val', 'test'],
    patch_shape: Tuple[int, int],
    batch_size: int,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build a dataloader over the DIC HepG2 cell segmentation data.

    The extra keyword arguments are divided with `util.split_kwargs`: one part is passed to
    `get_dic_hepg2_dataset`, the other part to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )

    hepg2_dataset = get_dic_hepg2_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs
    )

    loader = torch_em.get_data_loader(dataset=hepg2_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

Get the DIC HepG2 dataloader for segmenting cells in differential interference contrast microscopy.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
patch_shape: The patch shape to use for training.
batch_size: The batch size for training.
download: Whether to download the data if it is not present.
offsets: Offset values for affinity computation used as target.
boundaries: Whether to compute boundaries as the target.
binary: Whether to use a binary segmentation target.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.