torch_em.data.datasets.light_microscopy.livecell

The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.

  1"""The LIVECell dataset contains phase-contrast microscopy images
  2and annotations for cell segmentations for 8 different cell lines.
  3
  4This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9import requests
 10from tqdm import tqdm
 11from shutil import copyfileobj
 12from typing import List, Optional, Sequence, Tuple, Union
 13
 14import numpy as np
 15import imageio.v3 as imageio
 16
 17import torch
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23from ... import ImageCollectionDataset
 24
 25try:
 26    from pycocotools.coco import COCO
 27except ImportError:
 28    COCO = None
 29
 30URLS = {
 31    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
 32    "train": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 33              "LIVECell/livecell_coco_train.json"),
 34    "val": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 35            "LIVECell/livecell_coco_val.json"),
 36    "test": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 37             "LIVECell/livecell_coco_test.json")
 38}
 39# TODO
 40CHECKSUM = None
 41
 42CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
 43
 44
 45# TODO use download flag
 46def _download_annotation_file(path, split, download):
 47    annotation_file = os.path.join(path, f"{split}.json")
 48    if not os.path.exists(annotation_file):
 49        url = URLS[split]
 50        print("Downloading livecell annotation file from", url)
 51        with requests.get(url, stream=True) as r:
 52            with open(annotation_file, 'wb') as f:
 53                copyfileobj(r.raw, f)
 54    return annotation_file
 55
 56
 57def _annotations_to_instances(coco, image_metadata, category_ids):
 58    import vigra
 59
 60    # create and save the segmentation
 61    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
 62    annotations = coco.loadAnns(annotation_ids)
 63    assert len(annotations) <= np.iinfo("uint16").max
 64    shape = (image_metadata["height"], image_metadata["width"])
 65    seg = np.zeros(shape, dtype="uint32")
 66
 67    # sort annotations by size, except for iscrowd which go first
 68    # we do this to minimize small noise from overlapping multi annotations
 69    # (see below)
 70    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
 71    sorting = np.argsort(sizes)
 72    annotations = [annotations[i] for i in sorting]
 73
 74    for seg_id, annotation in enumerate(annotations, 1):
 75        mask = coco.annToMask(annotation).astype("bool")
 76        assert mask.shape == seg.shape
 77        seg[mask] = seg_id
 78
 79    # some images have multiple masks per object with slightly different foreground
 80    # this causes small noise objects we need to filter
 81    min_size = 50
 82    seg_ids, sizes = np.unique(seg, return_counts=True)
 83    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0
 84
 85    vigra.analysis.relabelConsecutive(seg, out=seg)
 86
 87    return seg.astype("uint16")
 88
 89
 90def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
 91    assert COCO is not None, "pycocotools is required for processing the LIVECell ground-truth."
 92
 93    coco = COCO(annotation_file)
 94    category_ids = coco.getCatIds(catNms=["cell"])
 95    image_ids = coco.getImgIds(catIds=category_ids)
 96
 97    image_paths, seg_paths = [], []
 98    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
 99        # get the path for the image data and make sure the corresponding image exists
100        image_metadata = coco.loadImgs(image_id)[0]
101        file_name = image_metadata["file_name"]
102
103        # if cell_type names are given we only select file names that match a cell_type
104        if cell_types is not None and (not any([cell_type in file_name for cell_type in cell_types])):
105            continue
106
107        sub_folder = file_name.split("_")[0]
108        image_path = os.path.join(image_folder, sub_folder, file_name)
109        # something changed in the image layout? we keep the old version around in case this changes back...
110        if not os.path.exists(image_path):
111            image_path = os.path.join(image_folder, file_name)
112        assert os.path.exists(image_path), image_path
113        image_paths.append(image_path)
114
115        # get the output path
116        out_folder = os.path.join(seg_folder, sub_folder)
117        os.makedirs(out_folder, exist_ok=True)
118        seg_path = os.path.join(out_folder, file_name)
119        seg_paths.append(seg_path)
120        if os.path.exists(seg_path):
121            continue
122
123        seg = _annotations_to_instances(coco, image_metadata, category_ids)
124        imageio.imwrite(seg_path, seg)
125
126    assert len(image_paths) == len(seg_paths)
127    assert len(image_paths) > 0, \
128        f"No matching image paths were found. Did you pass invalid cell type names ({cell_types})?"
129
130    return image_paths, seg_paths
131
132
def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Fetch the annotation file of a split and create the label images for it.

    Args:
        path: Folder that contains the "images" folder and the annotation files.
        split: The data split; 'test' uses the test image folder, any other value the train/val one.
        download: Passed on to the annotation file download.
        cell_types: Optional cell type names used to filter the images.
        label_path: Optional folder for the label images; `path` is used if it is None.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    annotation_file = _download_annotation_file(path, split, download)

    split_name = "livecell_test_images" if split == "test" else "livecell_train_val_images"
    image_folder = os.path.join(path, "images", split_name)

    label_root = path if label_path is None else label_path
    seg_folder = os.path.join(label_root, "annotations", split_name)

    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)
147
148
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False) -> None:
    """Download the LIVECell dataset.

    Nothing is downloaded if the folder `<path>/images` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    # The presence of the image folder is taken as the sign that the data is available.
    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    util.unzip(zip_path, path, True)
167
168
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Reject unknown splits before anything is created or fetched. In particular 'images' is a
    # key of the URL table, so it would otherwise be accepted and the image archive would be
    # stored as an annotation file.
    if split not in ("train", "val", "test"):
        raise ValueError(f"Invalid split '{split}'. Choose one of 'train', 'val' or 'test'.")

    get_livecell_data(path, download)
    image_paths, seg_paths = _download_livecell_annotations(path, split, download, cell_types, label_path)
    return image_paths, seg_paths
192
193
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Create the LIVECell dataset for cell segmentation in phase-contrast microscopy.

    Args:
        path: Folder in which the downloaded data is stored.
        split: Which data split to use: 'train', 'val' or 'test'.
        patch_shape: Shape of the patches used for training.
        download: If True, data that is not present yet is downloaded.
        offsets: Offsets for computing affinities as the target.
        boundaries: If True, boundaries are computed as the target.
        binary: If True, a binary segmentation target is used.
        cell_types: Restrict the data to these cell types. Must be a list or tuple.
        label_path: Optional folder for the label data.
        label_dtype: Datatype of the label data.
        kwargs: Further keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(
        path, split, download=download, cell_types=cell_types, label_path=label_path
    )

    # Set up the transforms for 2d data and the label transform that matches the requested target.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=label_dtype, offsets=offsets, boundaries=boundaries, binary=binary
    )

    dataset = ImageCollectionDataset(
        raw_image_paths=raw_paths,
        label_image_paths=label_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **kwargs
    )
    return dataset
244
245
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Create the LIVECell dataloader for cell segmentation in phase-contrast microscopy.

    Args:
        path: Folder in which the downloaded data is stored.
        split: Which data split to use: 'train', 'val' or 'test'.
        patch_shape: Shape of the patches used for training.
        batch_size: Batch size used for training.
        download: If True, data that is not present yet is downloaded.
        offsets: Offsets for computing affinities as the target.
        boundaries: If True, boundaries are computed as the target.
        binary: If True, a binary segmentation target is used.
        cell_types: Restrict the data to these cell types.
        label_path: Optional folder for the label data.
        label_dtype: Datatype of the label data.
        kwargs: Further keyword arguments, either for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader
# Download locations: the image archive plus one COCO-style annotation file per split.
URLS = {
    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
    "train": (
        "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
        "LIVECell/livecell_coco_train.json"
    ),
    "val": (
        "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
        "LIVECell/livecell_coco_val.json"
    ),
    "test": (
        "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
        "LIVECell/livecell_coco_test.json"
    ),
}
# TODO: no checksum is recorded for the image archive yet.
CHECKSUM = None
# Cell type names that can be used to filter the data (matched against the image file names).
CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
def get_livecell_data(path: os.PathLike, download: bool = False):
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False) -> None:
    """Download the LIVECell dataset.

    Nothing is downloaded if the folder `<path>/images` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    # The presence of the image folder is taken as the sign that the data is available.
    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    util.unzip(zip_path, path, True)

Download the LIVECell dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
def get_livecell_paths( path: Union[os.PathLike, str], split: str, download: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None) -> Tuple[List[str], List[str]]:
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of 'train', 'val' or 'test'.
    """
    # Reject unknown splits before anything is created or fetched. In particular 'images' is a
    # key of the URL table, so it would otherwise be accepted and the image archive would be
    # stored as an annotation file.
    if split not in ("train", "val", "test"):
        raise ValueError(f"Invalid split '{split}'. Choose one of 'train', 'val' or 'test'.")

    get_livecell_data(path, download)
    image_paths, seg_paths = _download_livecell_annotations(path, split, download, cell_types, label_path)
    return image_paths, seg_paths

Get paths to the LIVECell data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • download: Whether to download the data if it is not present.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_livecell_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Create the LIVECell dataset for cell segmentation in phase-contrast microscopy.

    Args:
        path: Folder in which the downloaded data is stored.
        split: Which data split to use: 'train', 'val' or 'test'.
        patch_shape: Shape of the patches used for training.
        download: If True, data that is not present yet is downloaded.
        offsets: Offsets for computing affinities as the target.
        boundaries: If True, boundaries are computed as the target.
        binary: If True, a binary segmentation target is used.
        cell_types: Restrict the data to these cell types. Must be a list or tuple.
        label_path: Optional folder for the label data.
        label_dtype: Datatype of the label data.
        kwargs: Further keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(
        path, split, download=download, cell_types=cell_types, label_path=label_path
    )

    # Set up the transforms for 2d data and the label transform that matches the requested target.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=label_dtype, offsets=offsets, boundaries=boundaries, binary=binary
    )

    dataset = ImageCollectionDataset(
        raw_image_paths=raw_paths,
        label_image_paths=label_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **kwargs
    )
    return dataset

Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
  • label_dtype: The datatype of the label data.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_livecell_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Create the LIVECell dataloader for cell segmentation in phase-contrast microscopy.

    Args:
        path: Folder in which the downloaded data is stored.
        split: Which data split to use: 'train', 'val' or 'test'.
        patch_shape: Shape of the patches used for training.
        batch_size: Batch size used for training.
        download: If True, data that is not present yet is downloaded.
        offsets: Offsets for computing affinities as the target.
        boundaries: If True, boundaries are computed as the target.
        binary: If True, a binary segmentation target is used.
        cell_types: Restrict the data to these cell types.
        label_path: Optional folder for the label data.
        label_dtype: Datatype of the label data.
        kwargs: Further keyword arguments, either for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader

Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The data split to use. Either 'train', 'val' or 'test'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • cell_types: The cell types for which to get the data paths.
  • label_path: Optional path for loading the label data.
  • label_dtype: The datatype of the label data.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.