torch_em.data.datasets.light_microscopy.livecell

The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.

View Source

  1"""The LIVECell dataset contains phase-contrast microscopy images
  2and annotations for cell segmentations for 8 different cell lines.
  3
  4This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
  5Please cite it if you use this dataset in your research.
  6"""
  7
  8import os
  9import requests
 10from tqdm import tqdm
 11from shutil import copyfileobj
 12from typing import List, Optional, Sequence, Tuple, Union
 13
 14import numpy as np
 15import imageio.v3 as imageio
 16
 17import torch
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23from ... import ImageCollectionDataset
 24
 25try:
 26    from pycocotools.coco import COCO
 27except ImportError:
 28    COCO = None
 29
 30URLS = {
 31    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
 32    "train": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 33              "LIVECell/livecell_coco_train.json"),
 34    "val": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 35            "LIVECell/livecell_coco_val.json"),
 36    "test": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
 37             "LIVECell/livecell_coco_test.json")
 38}
 39# TODO
 40CHECKSUM = None
 41
 42CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
 43
 44
 45# TODO use download flag
 46def _download_annotation_file(path, split, download):
 47    annotation_file = os.path.join(path, f"{split}.json")
 48    if not os.path.exists(annotation_file):
 49        url = URLS[split]
 50        print("Downloading livecell annotation file from", url)
 51        with requests.get(url, stream=True) as r:
 52            with open(annotation_file, 'wb') as f:
 53                copyfileobj(r.raw, f)
 54    return annotation_file
 55
 56
 57def _annotations_to_instances(coco, image_metadata, category_ids):
 58    import vigra
 59
 60    # create and save the segmentation
 61    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
 62    annotations = coco.loadAnns(annotation_ids)
 63    assert len(annotations) <= np.iinfo("uint16").max
 64    shape = (image_metadata["height"], image_metadata["width"])
 65    seg = np.zeros(shape, dtype="uint32")
 66
 67    # sort annotations by size, except for iscrowd which go first
 68    # we do this to minimize small noise from overlapping multi annotations
 69    # (see below)
 70    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
 71    sorting = np.argsort(sizes)
 72    annotations = [annotations[i] for i in sorting]
 73
 74    for seg_id, annotation in enumerate(annotations, 1):
 75        mask = coco.annToMask(annotation).astype("bool")
 76        assert mask.shape == seg.shape
 77        seg[mask] = seg_id
 78
 79    # some images have multiple masks per object with slightly different foreground
 80    # this causes small noise objects we need to filter
 81    min_size = 50
 82    seg_ids, sizes = np.unique(seg, return_counts=True)
 83    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0
 84
 85    vigra.analysis.relabelConsecutive(seg, out=seg)
 86
 87    return seg.astype("uint16")
 88
 89
 90def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
 91    if COCO is None:
 92        raise ModuleNotFoundError(
 93            "'pycocotools' is required for processing the LIVECell ground-truth. "
 94            "Install it with 'conda install -c conda-forge pycocotools'."
 95        )
 96
 97    coco = COCO(annotation_file)
 98    category_ids = coco.getCatIds(catNms=["cell"])
 99    image_ids = coco.getImgIds(catIds=category_ids)
100
101    image_paths, seg_paths = [], []
102    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
103        # get the path for the image data and make sure the corresponding image exists
104        image_metadata = coco.loadImgs(image_id)[0]
105        file_name = image_metadata["file_name"]
106
107        # if cell_type names are given we only select file names that match a cell_type
108        if cell_types is not None and (not any([cell_type in file_name for cell_type in cell_types])):
109            continue
110
111        sub_folder = file_name.split("_")[0]
112        image_path = os.path.join(image_folder, sub_folder, file_name)
113        # something changed in the image layout? we keep the old version around in case this changes back...
114        if not os.path.exists(image_path):
115            image_path = os.path.join(image_folder, file_name)
116        assert os.path.exists(image_path), image_path
117        image_paths.append(image_path)
118
119        # get the output path
120        out_folder = os.path.join(seg_folder, sub_folder)
121        os.makedirs(out_folder, exist_ok=True)
122        seg_path = os.path.join(out_folder, file_name)
123        seg_paths.append(seg_path)
124        if os.path.exists(seg_path):
125            continue
126
127        seg = _annotations_to_instances(coco, image_metadata, category_ids)
128        imageio.imwrite(seg_path, seg)
129
130    assert len(image_paths) == len(seg_paths)
131    assert len(image_paths) > 0, \
132        f"No matching image paths were found. Did you pass invalid cell type names ({cell_types})?"
133
134    return image_paths, seg_paths
135
136
def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Fetch the annotation file of a split and create the segmentations that belong to it.

    Args:
        path: Root folder of the dataset; the images are expected in its 'images' sub-folder.
        split: The data split; 'test' uses the test image folder, anything else the train/val one.
        download: Passed on to the annotation file download.
        cell_types: Optional cell type names used to select images.
        label_path: Optional root folder for the segmentations; `path` is used if it is None.

    Returns:
        The list of image paths and the list of segmentation paths.
    """
    annotation_file = _download_annotation_file(path, split, download)

    split_name = "livecell_test_images" if split == "test" else "livecell_train_val_images"
    image_folder = os.path.join(path, "images", split_name)

    label_root = path if label_path is None else label_path
    seg_folder = os.path.join(label_root, "annotations", split_name)

    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)
151
152
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    The image archive is only fetched and unpacked if `path` does not contain
    an 'images' folder yet; otherwise the function returns without doing anything.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
            The folder is created if it does not exist.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)

    # An existing image folder is taken as the result of an earlier extraction.
    if os.path.exists(os.path.join(path, "images")):
        return

    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, URLS["images"], download, CHECKSUM)
    # NOTE(review): the positional `True` presumably removes the archive after unpacking - confirm in `util.unzip`.
    util.unzip(zip_path, path, True)
171
172
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    The image data is made available first; afterwards the annotations of the
    requested split are fetched and converted into segmentation images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_livecell_data(path, download)
    return _download_livecell_annotations(path, split, download, cell_types, label_path)
196
197
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths. Must be a list or tuple.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments, forwarded to `ImageCollectionDataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    # Fill in the transforms and the label transform that matches the requested target.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)
    dataset_kwargs, label_dtype = util.add_instance_label_transform(
        dataset_kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    return ImageCollectionDataset(
        raw_image_paths=raw_paths,
        label_image_paths=label_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **dataset_kwargs
    )
248
249
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **ds_kwargs
    )

    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

URLS = {'images': 'https://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip', 'train': 'https://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json', 'val': 'https://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_val.json', 'test': 'https://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_test.json'}

CHECKSUM = None

CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']

def get_livecell_data(path: os.PathLike, download: bool = False): View Source

def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    The image archive is only fetched and unpacked if `path` does not contain
    an 'images' folder yet; otherwise the function returns without doing anything.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
            The folder is created if it does not exist.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)

    # An existing image folder is taken as the result of an earlier extraction.
    if os.path.exists(os.path.join(path, "images")):
        return

    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, URLS["images"], download, CHECKSUM)
    # NOTE(review): the positional `True` presumably removes the archive after unpacking - confirm in `util.unzip`.
    util.unzip(zip_path, path, True)

Download the LIVECell dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

def get_livecell_paths( path: Union[os.PathLike, str], split: str, download: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None) -> Tuple[List[str], List[str]]: View Source

def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    The image data is made available first; afterwards the annotations of the
    requested split are fetched and converted into segmentation images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_livecell_data(path, download)
    return _download_livecell_annotations(path, split, download, cell_types, label_path)

Get paths to the LIVECell data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
download: Whether to download the data if it is not present.
cell_types: The cell types for which to get the data paths.
label_path: Optional path for loading the label data.

Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_livecell_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths. Must be a list or tuple.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments, forwarded to `ImageCollectionDataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    # Fill in the transforms and the label transform that matches the requested target.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)
    dataset_kwargs, label_dtype = util.add_instance_label_transform(
        dataset_kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    return ImageCollectionDataset(
        raw_image_paths=raw_paths,
        label_image_paths=label_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **dataset_kwargs
    )

Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
patch_shape: The patch shape to use for training.
download: Whether to download the data if it is not present.
offsets: Offset values for affinity computation used as target.
boundaries: Whether to compute boundaries as the target.
binary: Whether to use a binary segmentation target.
cell_types: The cell types for which to get the data paths.
label_path: Optional path for loading the label data.
label_dtype: The datatype of the label data.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_livecell_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **ds_kwargs
    )

    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
patch_shape: The patch shape to use for training.
batch_size: The batch size for training.
download: Whether to download the data if it is not present.
offsets: Offset values for affinity computation used as target.
boundaries: Whether to compute boundaries as the target.
binary: Whether to use a binary segmentation target.
cell_types: The cell types for which to get the data paths.
label_path: Optional path for loading the label data.
label_dtype: The datatype of the label data.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.