torch_em.data.datasets.light_microscopy.livecell

The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.

  1"""The LIVECell dataset contains phase-contrast microscopy images
  2and annotations for cell segmentations for 8 different cell lines.
  3
  4This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9import json
 10import requests
 11from tqdm import tqdm
 12from shutil import copyfileobj
 13from typing import List, Optional, Sequence, Tuple, Union
 14
 15import numpy as np
 16import imageio.v3 as imageio
 17
 18import torch
 19from torch.utils.data import Dataset, DataLoader
 20
 21import torch_em
 22
 23from .. import util
 24from ... import ImageCollectionDataset
 25
 26try:
 27    from pycocotools.coco import COCO
 28except ImportError:
 29    COCO = None
 30
# Download locations: "images" is the zip archive with all images, the remaining keys
# are the split names and point to the COCO-style annotation file of that split.
# NOTE(review): all URLs use plain http and no checksum is configured below,
# so the integrity of the downloads is not verified.
URLS = {
    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
    "train": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
              "LIVECell/livecell_coco_train.json"),
    "val": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
            "LIVECell/livecell_coco_val.json"),
    "test": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
             "LIVECell/livecell_coco_test.json")
}
# TODO: add the checksum of the image archive; None is passed on to `util.download_source` as is.
CHECKSUM = None

# Cell type names that can be passed as `cell_types`; they are matched against the image file names.
# NOTE(review): presumably these are the 8 cell lines of the dataset - confirm against the publication.
CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
 44
 45
 46# TODO use download flag
 47def _download_annotation_file(path, split, download):
 48    annotation_file = os.path.join(path, f"{split}.json")
 49    if not os.path.exists(annotation_file):
 50        url = URLS[split]
 51        print("Downloading livecell annotation file from", url)
 52        with requests.get(url, stream=True) as r:
 53            with open(annotation_file, 'wb') as f:
 54                copyfileobj(r.raw, f)
 55    return annotation_file
 56
 57
 58def _annotations_to_instances(coco, image_metadata, category_ids):
 59    import vigra
 60
 61    # create and save the segmentation
 62    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
 63    annotations = coco.loadAnns(annotation_ids)
 64    assert len(annotations) <= np.iinfo("uint16").max
 65    shape = (image_metadata["height"], image_metadata["width"])
 66    seg = np.zeros(shape, dtype="uint32")
 67
 68    # sort annotations by size, except for iscrowd which go first
 69    # we do this to minimize small noise from overlapping multi annotations
 70    # (see below)
 71    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
 72    sorting = np.argsort(sizes)
 73    annotations = [annotations[i] for i in sorting]
 74
 75    for seg_id, annotation in enumerate(annotations, 1):
 76        mask = coco.annToMask(annotation).astype("bool")
 77        assert mask.shape == seg.shape
 78        seg[mask] = seg_id
 79
 80    # some images have multiple masks per object with slightly different foreground
 81    # this causes small noise objects we need to filter
 82    min_size = 50
 83    seg_ids, sizes = np.unique(seg, return_counts=True)
 84    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0
 85
 86    vigra.analysis.relabelConsecutive(seg, out=seg)
 87
 88    return seg.astype("uint16")
 89
 90
 91def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
 92    # Use a per-cell_types cache to avoid reloading the COCO JSON when data is already prepared.
 93    cache_key = "all" if cell_types is None else "_".join(sorted(cell_types))
 94    cache_file = os.path.join(seg_folder, f"seg_paths_{cache_key}.json")
 95    if os.path.exists(cache_file):
 96        with open(cache_file) as f:
 97            cached = json.load(f)
 98        image_paths = [os.path.join(seg_folder, fname) for fname in cached["image_paths"]]
 99        seg_paths = [os.path.join(seg_folder, fname) for fname in cached["seg_paths"]]
100        return image_paths, seg_paths
101
102    if COCO is None:
103        raise ModuleNotFoundError(
104            "'pycocotools' is required for processing the LIVECell ground-truth. "
105            "Install it with 'conda install -c conda-forge pycocotools'."
106        )
107
108    coco = COCO(annotation_file)
109    category_ids = coco.getCatIds(catNms=["cell"])
110    image_ids = coco.getImgIds(catIds=category_ids)
111
112    image_paths, seg_paths = [], []
113    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
114        # get the path for the image data and make sure the corresponding image exists
115        image_metadata = coco.loadImgs(image_id)[0]
116        file_name = image_metadata["file_name"]
117
118        # if cell_type names are given we only select file names that match a cell_type
119        if cell_types is not None and (not any([cell_type in file_name for cell_type in cell_types])):
120            continue
121
122        sub_folder = file_name.split("_")[0]
123        image_path = os.path.join(image_folder, sub_folder, file_name)
124        # something changed in the image layout? we keep the old version around in case this changes back...
125        if not os.path.exists(image_path):
126            image_path = os.path.join(image_folder, file_name)
127        assert os.path.exists(image_path), image_path
128        image_paths.append(image_path)
129
130        # get the output path
131        out_folder = os.path.join(seg_folder, sub_folder)
132        os.makedirs(out_folder, exist_ok=True)
133        seg_path = os.path.join(out_folder, file_name)
134        seg_paths.append(seg_path)
135        if os.path.exists(seg_path):
136            continue
137
138        seg = _annotations_to_instances(coco, image_metadata, category_ids)
139        imageio.imwrite(seg_path, seg)
140
141    assert len(image_paths) == len(seg_paths)
142    assert len(image_paths) > 0, \
143        f"No matching image paths were found. Did you pass invalid cell type names ({cell_types})?"
144
145    cache_dir = os.path.dirname(cache_file)
146    image_paths_rel = [os.path.relpath(image_path, start=cache_dir) for image_path in image_paths]
147    seg_paths_rel = [os.path.relpath(seg_path, start=cache_dir) for seg_path in seg_paths]
148    with open(cache_file, "w") as f:
149        json.dump({"image_paths": image_paths_rel, "seg_paths": seg_paths_rel}, f)
150
151    return image_paths, seg_paths
152
153
def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Fetch the annotation file of a split and create the corresponding label images.

    Args:
        path: Folder that contains the LIVECell data.
        split: The data split; 'test' uses its own image folder, all other values share one.
        download: Passed on to the annotation download.
        cell_types: Cell type names used to select images, or None to use all images.
        label_path: Optional root folder for the label images; `path` is used if it is None.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    annotation_file = _download_annotation_file(path, split, download)
    split_name = "livecell_test_images" if split == "test" else "livecell_train_val_images"

    image_folder = os.path.join(path, "images", split_name)
    label_root = path if label_path is None else label_path
    seg_folder = os.path.join(label_root, "annotations", split_name)

    # The images have to be extracted before the labels can be created.
    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)
168
169
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    Nothing is downloaded if the folder `images` already exists inside of `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)

    # The extracted image folder marks the data as already available.
    if os.path.exists(os.path.join(path, "images")):
        return

    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, URLS["images"], download, CHECKSUM)
    util.unzip(zip_path, path, True)
188
189
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    # The images have to be available before the labels can be derived from the annotations.
    get_livecell_data(path, download)
    raw_paths, label_paths = _download_livecell_annotations(
        path=path, split=split, download=download, cell_types=cell_types, label_path=label_path
    )
    return raw_paths, label_paths
213
214
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    # Set up the transformations and the label transform that matches the requested target.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)
    dataset_kwargs, label_dtype = util.add_instance_label_transform(
        dataset_kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    return ImageCollectionDataset(
        raw_image_paths=raw_paths, label_image_paths=label_paths,
        patch_shape=patch_shape, label_dtype=label_dtype, **dataset_kwargs
    )
265
266
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset from the ones of the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_livecell_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
# Download locations: "images" is the zip archive with all images, the remaining keys
# are the split names and point to the COCO-style annotation file of that split.
URLS = {'images': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip', 'train': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json', 'val': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_val.json', 'test': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_test.json'}
# No checksum is configured for the image archive.
CHECKSUM = None
# Cell type names that can be passed as `cell_types`; they are matched against the image file names.
CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
def get_livecell_data(path: os.PathLike, download: bool = False):
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    Nothing is downloaded if the folder `images` already exists inside of `path`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)

    # The extracted image folder marks the data as already available.
    if os.path.exists(os.path.join(path, "images")):
        return

    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, URLS["images"], download, CHECKSUM)
    util.unzip(zip_path, path, True)

Download the LIVECell dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
def get_livecell_paths( path: Union[os.PathLike, str], split: str, download: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None) -> Tuple[List[str], List[str]]:
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    # The images have to be available before the labels can be derived from the annotations.
    get_livecell_data(path, download)
    raw_paths, label_paths = _download_livecell_annotations(
        path=path, split=split, download=download, cell_types=cell_types, label_path=label_path
    )
    return raw_paths, label_paths

Get paths to the LIVECell data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_livecell_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    # Set up the transformations and the label transform that matches the requested target.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)
    dataset_kwargs, label_dtype = util.add_instance_label_transform(
        dataset_kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    return ImageCollectionDataset(
        raw_image_paths=raw_paths, label_image_paths=label_paths,
        patch_shape=patch_shape, label_dtype=label_dtype, **dataset_kwargs
    )

Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
  • label_dtype: The datatype of the label data.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_livecell_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset from the ones of the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_livecell_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
  • label_dtype: The datatype of the label data.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.