The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.

"""The LIVECell dataset contains phase-contrast microscopy images
and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
Please cite it if you use this dataset in your research.
"""

import os
from shutil import copyfileobj
from typing import List, Optional, Sequence, Tuple, Union

import imageio.v3 as imageio
import numpy as np
import requests
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import torch_em

from .. import util
from ... import ImageCollectionDataset

# pycocotools is an optional dependency; it is only needed to process the ground-truth annotations.
try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None

 30URLS = {
 31    "images": "",
 32    "train": (""
 33              "LIVECell/livecell_coco_train.json"),
 34    "val": (""
 35            "LIVECell/livecell_coco_val.json"),
 36    "test": (""
 37             "LIVECell/livecell_coco_test.json")
 39# TODO
 40CHECKSUM = None
 42CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
 45# TODO use download flag
 46def _download_annotation_file(path, split, download):
 47    annotation_file = os.path.join(path, f"{split}.json")
 48    if not os.path.exists(annotation_file):
 49        url = URLS[split]
 50        print("Downloading livecell annotation file from", url)
 51        with requests.get(url, stream=True) as r:
 52            with open(annotation_file, 'wb') as f:
 53                copyfileobj(r.raw, f)
 54    return annotation_file
 57def _annotations_to_instances(coco, image_metadata, category_ids):
 58    import vigra
 60    # create and save the segmentation
 61    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
 62    annotations = coco.loadAnns(annotation_ids)
 63    assert len(annotations) <= np.iinfo("uint16").max
 64    shape = (image_metadata["height"], image_metadata["width"])
 65    seg = np.zeros(shape, dtype="uint32")
 67    # sort annotations by size, except for iscrowd which go first
 68    # we do this to minimize small noise from overlapping multi annotations
 69    # (see below)
 70    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
 71    sorting = np.argsort(sizes)
 72    annotations = [annotations[i] for i in sorting]
 74    for seg_id, annotation in enumerate(annotations, 1):
 75        mask = coco.annToMask(annotation).astype("bool")
 76        assert mask.shape == seg.shape
 77        seg[mask] = seg_id
 79    # some images have multiple masks per object with slightly different foreground
 80    # this causes small noise objects we need to filter
 81    min_size = 50
 82    seg_ids, sizes = np.unique(seg, return_counts=True)
 83    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0
 85    vigra.analysis.relabelConsecutive(seg, out=seg)
 87    return seg.astype("uint16")
 90def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
 91    assert COCO is not None, "pycocotools is required for processing the LIVECell ground-truth."
 93    coco = COCO(annotation_file)
 94    category_ids = coco.getCatIds(catNms=["cell"])
 95    image_ids = coco.getImgIds(catIds=category_ids)
 97    image_paths, seg_paths = [], []
 98    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
 99        # get the path for the image data and make sure the corresponding image exists
100        image_metadata = coco.loadImgs(image_id)[0]
101        file_name = image_metadata["file_name"]
103        # if cell_type names are given we only select file names that match a cell_type
104        if cell_types is not None and (not any([cell_type in file_name for cell_type in cell_types])):
105            continue
107        sub_folder = file_name.split("_")[0]
108        image_path = os.path.join(image_folder, sub_folder, file_name)
109        # something changed in the image layout? we keep the old version around in case this changes back...
110        if not os.path.exists(image_path):
111            image_path = os.path.join(image_folder, file_name)
112        assert os.path.exists(image_path), image_path
113        image_paths.append(image_path)
115        # get the output path
116        out_folder = os.path.join(seg_folder, sub_folder)
117        os.makedirs(out_folder, exist_ok=True)
118        seg_path = os.path.join(out_folder, file_name)
119        seg_paths.append(seg_path)
120        if os.path.exists(seg_path):
121            continue
123        seg = _annotations_to_instances(coco, image_metadata, category_ids)
124        imageio.imwrite(seg_path, seg)
126    assert len(image_paths) == len(seg_paths)
127    assert len(image_paths) > 0, \
128        f"No matching image paths were found. Did you pass invalid cell type names ({cell_types})?"
130    return image_paths, seg_paths
def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Get the annotation file for a split and create the corresponding label images.

    Args:
        path: Folder that contains the "images" folder and the annotation files.
        split: The data split. The "test" split uses its own image folder,
            all other splits share the folder with the train and val images.
        download: Passed on to `_download_annotation_file`.
        cell_types: The cell types for which to get the data paths.
        label_path: Folder below which the "annotations" folder with the label images is created.
            The label images are stored below `path` if this is None.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    annotation_file = _download_annotation_file(path, split, download)

    split_name = "livecell_test_images" if split == "test" else "livecell_train_val_images"

    image_folder = os.path.join(path, "images", split_name)
    seg_root = path if label_path is None else label_path
    seg_folder = os.path.join(seg_root, "annotations", split_name)

    # the images have to be downloaded and extracted before the label images can be created
    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    Nothing is downloaded if the folder `<path>/images` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)

    image_path = os.path.join(path, "images")
    if os.path.exists(image_path):
        return

    # The archive needs its own file name inside of the target folder;
    # joining with an empty name would point at the folder itself.
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, URLS["images"], download, CHECKSUM)
    # NOTE(review): the last argument presumably removes the archive after extraction; confirm in `util.unzip`.
    util.unzip(zip_path, path, True)
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get the filepaths of the LIVECell images and of the corresponding label images.

    The image data is fetched via `get_livecell_data` first; afterwards the label images
    for the requested split are created from the annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_livecell_data(path, download)
    raw_paths, label_paths = _download_livecell_annotations(
        path=path, split=split, download=download, cell_types=cell_types, label_path=label_path
    )
    return raw_paths, label_paths
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths. Must be a list or tuple if given.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments. They are passed through `util.ensure_transforms` and
            `util.add_instance_label_transform` and then forwarded to `ImageCollectionDataset`.

    Returns:
        The segmentation dataset.
    """
    # validate the arguments before anything is downloaded or processed
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    # set up the transforms for 2d data and for the requested kind of target
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)
    dataset_kwargs, label_dtype = util.add_instance_label_transform(
        dataset_kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    dataset = ImageCollectionDataset(
        raw_image_paths=raw_paths,
        label_image_paths=label_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **dataset_kwargs
    )
    return dataset
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments. They are split with `util.split_kwargs` into the arguments
            of `torch_em.default_segmentation_dataset`, which are passed to `get_livecell_dataset`,
            and the remaining arguments, which are passed to `torch_em.get_data_loader`.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader
