torch_em.data.datasets.light_microscopy.livecell
The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.
This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.
"""The LIVECell dataset contains phase-contrast microscopy images
and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
Please cite it if you use this dataset in your research.
"""

import os
import json
import requests
from tqdm import tqdm
from shutil import copyfileobj
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import imageio.v3 as imageio

import torch
from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util
from ... import ImageCollectionDataset

try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None

URLS = {
    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
    "train": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
              "LIVECell/livecell_coco_train.json"),
    "val": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
            "LIVECell/livecell_coco_val.json"),
    "test": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
             "LIVECell/livecell_coco_test.json")
}
# TODO
CHECKSUM = None

CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']


# TODO use download flag
def _download_annotation_file(path, split, download):
    """Fetch the COCO-style annotation file for a split unless it is already on disk.

    Args:
        path: Folder in which the annotation file `<split>.json` is stored.
        split: Key into `URLS` that selects the annotation file ('train', 'val' or 'test').
        download: Currently not consulted (see the TODO above); a missing file is always downloaded.

    Returns:
        The filepath of the annotation file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    annotation_file = os.path.join(path, f"{split}.json")
    if os.path.exists(annotation_file):
        return annotation_file

    url = URLS[split]
    print("Downloading livecell annotation file from", url)

    # Download into a temporary file and move it into place only on success. Otherwise an
    # interrupted or failed request would leave a broken '<split>.json' behind that is
    # mistaken for a valid annotation file on the next call.
    tmp_file = annotation_file + ".part"
    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(tmp_file, "wb") as f:
                copyfileobj(r.raw, f)
        os.replace(tmp_file, annotation_file)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    return annotation_file


def _annotations_to_instances(coco, image_metadata, category_ids):
    """Rasterize the COCO annotations of one image into an instance label image.

    Args:
        coco: The COCO annotation object.
        image_metadata: The COCO image record; its 'id', 'height' and 'width' entries are used.
        category_ids: The category ids for which annotations are selected.

    Returns:
        The instance segmentation as a uint16 array of shape (height, width).
    """
    import vigra

    # create and save the segmentation
    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
    annotations = coco.loadAnns(annotation_ids)
    assert len(annotations) <= np.iinfo("uint16").max
    shape = (image_metadata["height"], image_metadata["width"])
    seg = np.zeros(shape, dtype="uint32")

    # sort annotations by size, except for iscrowd which go first
    # we do this to minimize small noise from overlapping multi annotations
    # (see below)
    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
    sorting = np.argsort(sizes)
    annotations = [annotations[i] for i in sorting]

    # annotations are painted in the sorted order, so where masks overlap the later one wins
    for seg_id, annotation in enumerate(annotations, 1):
        mask = coco.annToMask(annotation).astype("bool")
        assert mask.shape == seg.shape
        seg[mask] = seg_id

    # some images have multiple masks per object with slightly different foreground
    # this causes small noise objects we need to filter
    min_size = 50
    seg_ids, sizes = np.unique(seg, return_counts=True)
    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0

    vigra.analysis.relabelConsecutive(seg, out=seg)

    return seg.astype("uint16")


def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
    """Create the label images for one annotation file, or look up their paths from the cache.

    Args:
        annotation_file: Filepath of the COCO-style annotation file of the split.
        image_folder: Folder that contains the images of the split.
        seg_folder: Folder to which the label images and the path cache are written.
        cell_types: Optional cell type names; only files whose name contains one of them are used.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ModuleNotFoundError: If the labels have to be created but 'pycocotools' is not installed.
    """
    # Use a cache to avoid reloading the COCO JSON when data is already prepared.
    # The cache is keyed by the annotation file (i.e. the split) as well as by the cell types:
    # 'train' and 'val' share the same segmentation folder, so a cache keyed only by the
    # cell types would return the paths of whichever of the two splits was processed first.
    split_key = os.path.splitext(os.path.basename(annotation_file))[0]
    cell_type_key = "all" if cell_types is None else "_".join(sorted(cell_types))
    cache_file = os.path.join(seg_folder, f"seg_paths_{split_key}_{cell_type_key}.json")
    if os.path.exists(cache_file):
        with open(cache_file) as f:
            cached = json.load(f)
        # The cached entries are relative to the cache folder and may contain '..' components.
        # They were computed lexically by os.path.relpath, so they are resolved lexically as well;
        # leaving the '..' to the file system would go wrong if the folder is reached via a symlink.
        image_paths = [os.path.normpath(os.path.join(seg_folder, fname)) for fname in cached["image_paths"]]
        seg_paths = [os.path.normpath(os.path.join(seg_folder, fname)) for fname in cached["seg_paths"]]
        return image_paths, seg_paths

    if COCO is None:
        raise ModuleNotFoundError(
            "'pycocotools' is required for processing the LIVECell ground-truth. "
            "Install it with 'conda install -c conda-forge pycocotools'."
        )

    coco = COCO(annotation_file)
    category_ids = coco.getCatIds(catNms=["cell"])
    image_ids = coco.getImgIds(catIds=category_ids)

    image_paths, seg_paths = [], []
    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
        # get the path for the image data and make sure the corresponding image exists
        image_metadata = coco.loadImgs(image_id)[0]
        file_name = image_metadata["file_name"]

        # if cell_type names are given we only select file names that match a cell_type
        if cell_types is not None and not any(cell_type in file_name for cell_type in cell_types):
            continue

        sub_folder = file_name.split("_")[0]
        image_path = os.path.join(image_folder, sub_folder, file_name)
        # something changed in the image layout? we keep the old version around in case this changes back...
        if not os.path.exists(image_path):
            image_path = os.path.join(image_folder, file_name)
        assert os.path.exists(image_path), image_path
        image_paths.append(image_path)

        # get the output path
        out_folder = os.path.join(seg_folder, sub_folder)
        os.makedirs(out_folder, exist_ok=True)
        seg_path = os.path.join(out_folder, file_name)
        seg_paths.append(seg_path)
        # label images that were already written by an earlier run are reused
        if os.path.exists(seg_path):
            continue

        seg = _annotations_to_instances(coco, image_metadata, category_ids)
        imageio.imwrite(seg_path, seg)

    assert len(image_paths) == len(seg_paths)
    assert len(image_paths) > 0, \
        f"No matching image paths were found. Did you pass invalid cell type names ({cell_types})?"

    # store the paths relative to the cache folder
    cache_dir = os.path.dirname(cache_file)
    image_paths_rel = [os.path.relpath(image_path, start=cache_dir) for image_path in image_paths]
    seg_paths_rel = [os.path.relpath(seg_path, start=cache_dir) for seg_path in seg_paths]
    with open(cache_file, "w") as f:
        json.dump({"image_paths": image_paths_rel, "seg_paths": seg_paths_rel}, f)

    return image_paths, seg_paths


def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Get the annotation file of a split and create the corresponding label images.

    Args:
        path: Folder that contains the LIVECell data.
        split: The data split. Either 'train', 'val' or 'test'.
        download: Passed on to `_download_annotation_file`.
        cell_types: Optional cell type names used to select a subset of the images.
        label_path: Optional folder for the label images; `path` is used if it is None.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    annotation_file = _download_annotation_file(path, split, download)

    # the 'train' and 'val' images are stored in the same folder, the 'test' images in their own
    split_name = "livecell_test_images" if split == "test" else "livecell_train_val_images"

    image_folder = os.path.join(path, "images", split_name)
    label_root = path if label_path is None else label_path
    seg_folder = os.path.join(label_root, "annotations", split_name)

    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)


def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    # an existing 'images' folder is taken as the sign that the data is already there
    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    util.unzip(zip_path, path, True)


def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_livecell_data(path, download)
    image_paths, seg_paths = _download_livecell_annotations(path, split, download, cell_types, label_path)
    return image_paths, seg_paths


def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    if cell_types is not None:
        assert isinstance(cell_types, (list, tuple)), \
            f"cell_types must be passed as a list or tuple instead of {cell_types}"

    image_paths, seg_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=label_dtype, offsets=offsets, boundaries=boundaries, binary=binary
    )

    return ImageCollectionDataset(
        raw_image_paths=image_paths,
        label_image_paths=seg_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **kwargs
    )


def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_livecell_dataset(
        path, split, patch_shape, download=download, offsets=offsets, boundaries=boundaries, binary=binary,
        cell_types=cell_types, label_path=label_path, label_dtype=label_dtype, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS =
{'images': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip', 'train': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json', 'val': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_val.json', 'test': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_test.json'}
CHECKSUM =
None
CELL_TYPES =
['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
def
get_livecell_data(path: os.PathLike, download: bool = False):
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    Nothing is done if the folder `<path>/images` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    # an existing 'images' folder is taken as the sign that the data is already there
    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    # NOTE(review): the trailing True presumably removes the archive after extraction - confirm against util.unzip
    util.unzip(zip_path, path, True)
Download the LIVECell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
def
get_livecell_paths( path: Union[os.PathLike, str], split: str, download: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None) -> Tuple[List[str], List[str]]:
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    # the image data has to be in place before the annotations are processed
    get_livecell_data(path, download)
    return _download_livecell_annotations(path, split, download, cell_types, label_path)
Get paths to the LIVECell data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_livecell_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # validate the arguments before any data is touched
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    # set up the transforms for 2d data and the label transform for the requested target
    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    dataset = ImageCollectionDataset(
        raw_image_paths=raw_paths,
        label_image_paths=label_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **kwargs
    )
    return dataset
Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
- label_dtype: The datatype of the label data.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_livecell_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # separate the keyword arguments for the dataset from the ones for the loader
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader
Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
- label_dtype: The datatype of the label data.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.