torch_em.data.datasets.light_microscopy.livecell

The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.

  1"""The LIVECell dataset contains phase-contrast microscopy images
  2and annotations for cell segmentations for 8 different cell lines.
  3
  4This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9from shutil import copyfileobj
 10from typing import List, Optional, Sequence, Tuple, Union
 11
 12import imageio
 13import numpy as np
 14import requests
 15import vigra
 16from tqdm import tqdm
 17
 18import torch_em
 19import torch.utils.data
 20from torch.utils.data import Dataset, DataLoader
 21from .. import util
 22
 23try:
 24    from pycocotools.coco import COCO
 25except ImportError:
 26    COCO = None
 27
 28URLS = {
 29    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
 30    "train": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 31              "LIVECell/livecell_coco_train.json"),
 32    "val": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 33            "LIVECell/livecell_coco_val.json"),
 34    "test": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 35             "LIVECell/livecell_coco_test.json")
 36}
 37# TODO
 38CHECKSUM = None
 39
 40
 41def _download_livecell_images(path, download):
 42    os.makedirs(path, exist_ok=True)
 43    image_path = os.path.join(path, "images")
 44
 45    if os.path.exists(image_path):
 46        return
 47
 48    url = URLS["images"]
 49    checksum = CHECKSUM
 50    zip_path = os.path.join(path, "livecell.zip")
 51    util.download_source(zip_path, url, download, checksum)
 52    util.unzip(zip_path, path, True)
 53
 54
 55# TODO use download flag
 56def _download_annotation_file(path, split, download):
 57    annotation_file = os.path.join(path, f"{split}.json")
 58    if not os.path.exists(annotation_file):
 59        url = URLS[split]
 60        print("Downloading livecell annotation file from", url)
 61        with requests.get(url, stream=True) as r:
 62            with open(annotation_file, 'wb') as f:
 63                copyfileobj(r.raw, f)
 64    return annotation_file
 65
 66
 67def _annotations_to_instances(coco, image_metadata, category_ids):
 68    # create and save the segmentation
 69    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
 70    annotations = coco.loadAnns(annotation_ids)
 71    assert len(annotations) <= np.iinfo("uint16").max
 72    shape = (image_metadata["height"], image_metadata["width"])
 73    seg = np.zeros(shape, dtype="uint32")
 74
 75    # sort annotations by size, except for iscrowd which go first
 76    # we do this to minimize small noise from overlapping multi annotations
 77    # (see below)
 78    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
 79    sorting = np.argsort(sizes)
 80    annotations = [annotations[i] for i in sorting]
 81
 82    for seg_id, annotation in enumerate(annotations, 1):
 83        mask = coco.annToMask(annotation).astype("bool")
 84        assert mask.shape == seg.shape
 85        seg[mask] = seg_id
 86
 87    # some images have multiple masks per object with slightly different foreground
 88    # this causes small noise objects we need to filter
 89    min_size = 50
 90    seg_ids, sizes = np.unique(seg, return_counts=True)
 91    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0
 92
 93    vigra.analysis.relabelConsecutive(seg, out=seg)
 94
 95    return seg.astype("uint16")
 96
 97
 98def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
 99    assert COCO is not None, "pycocotools is required for processing the LiveCELL ground-truth."
100
101    coco = COCO(annotation_file)
102    category_ids = coco.getCatIds(catNms=["cell"])
103    image_ids = coco.getImgIds(catIds=category_ids)
104
105    image_paths, seg_paths = [], []
106    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
107        # get the path for the image data and make sure the corresponding image exists
108        image_metadata = coco.loadImgs(image_id)[0]
109        file_name = image_metadata["file_name"]
110
111        # if cell_type names are given we only select file names that match a cell_type
112        if cell_types is not None and (not any([cell_type in file_name for cell_type in cell_types])):
113            continue
114
115        sub_folder = file_name.split("_")[0]
116        image_path = os.path.join(image_folder, sub_folder, file_name)
117        # something changed in the image layout? we keep the old version around in case this changes back...
118        if not os.path.exists(image_path):
119            image_path = os.path.join(image_folder, file_name)
120        assert os.path.exists(image_path), image_path
121        image_paths.append(image_path)
122
123        # get the output path
124        out_folder = os.path.join(seg_folder, sub_folder)
125        os.makedirs(out_folder, exist_ok=True)
126        seg_path = os.path.join(out_folder, file_name)
127        seg_paths.append(seg_path)
128        if os.path.exists(seg_path):
129            continue
130
131        seg = _annotations_to_instances(coco, image_metadata, category_ids)
132        imageio.imwrite(seg_path, seg)
133
134    assert len(image_paths) == len(seg_paths)
135    assert len(image_paths) > 0, \
136        f"No matching image paths were found. Did you pass invalid cell type naems ({cell_types})?"
137    return image_paths, seg_paths
138
139
def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Get the annotation file for a split and derive the segmentations from it.

    Args:
        path: Folder that contains the downloaded data.
        split: The data split; 'test' uses its own image folder, every other value shares
            the train/val image folder.
        download: Forwarded to `_download_annotation_file`.
        cell_types: Forwarded to `_create_segmentations_from_annotations`.
        label_path: Optional alternative root folder for the segmentations; `path` is used if None.

    Returns:
        The paths to the image data.
        The paths to the label data.
    """
    annotation_file = _download_annotation_file(path, split, download)

    split_name = "livecell_test_images" if split == "test" else "livecell_train_val_images"
    image_folder = os.path.join(path, "images", split_name)

    label_root = path if label_path is None else label_path
    seg_folder = os.path.join(label_root, "annotations", split_name)

    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)
154
155
def get_livecell_data(
    path: Union[os.PathLike, str],
    split: str,
    download: bool,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Download the LIVECell images and annotations and return the matching file paths.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        The paths to the image data.
        The paths to the label data.
    """
    # The images must be available first: the annotation step checks for the image folder.
    _download_livecell_images(path, download)
    images, labels = _download_livecell_annotations(path, split, download, cell_types, label_path)
    return images, labels
179
180
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Build the LIVECell dataset for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths. Must be a list or tuple.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    images, labels = get_livecell_data(path, split, download, cell_types, label_path)

    # Let the shared helpers fill in the transforms for 2d data and for the requested target;
    # the second helper also returns the label dtype to use.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    return torch_em.data.ImageCollectionDataset(
        images, labels, patch_shape=patch_shape, label_dtype=label_dtype, **kwargs
    )
229
230
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Build the LIVECell dataloader for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
# Download locations: the zipped image archive and one COCO-style annotation file per split.
URLS = {
    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
    "train": (
        "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
        "LIVECell/livecell_coco_train.json"
    ),
    "val": (
        "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
        "LIVECell/livecell_coco_val.json"
    ),
    "test": (
        "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
        "LIVECell/livecell_coco_test.json"
    ),
}
# No checksum has been recorded for the image archive.
CHECKSUM = None
def get_livecell_data( path: Union[os.PathLike, str], split: str, download: bool, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None) -> Tuple[List[str], List[str]]:
def get_livecell_data(
    path: Union[os.PathLike, str],
    split: str,
    download: bool,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Download the LIVECell images and annotations and return the matching file paths.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        The paths to the image data.
        The paths to the label data.
    """
    # The images must be available first: the annotation step checks for the image folder.
    _download_livecell_images(path, download)
    images, labels = _download_livecell_annotations(path, split, download, cell_types, label_path)
    return images, labels

Download the LIVECell dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
Returns:

The paths to the image data. The paths to the label data.

def get_livecell_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Build the LIVECell dataset for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths. Must be a list or tuple.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    images, labels = get_livecell_data(path, split, download, cell_types, label_path)

    # Let the shared helpers fill in the transforms for 2d data and for the requested target;
    # the second helper also returns the label dtype to use.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    return torch_em.data.ImageCollectionDataset(
        images, labels, patch_shape=patch_shape, label_dtype=label_dtype, **kwargs
    )

Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
  • label_dtype: The datatype of the label data.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_livecell_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Build the LIVECell dataloader for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
  • label_dtype: The datatype of the label data.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.