torch_em.data.datasets.light_microscopy.livecell
The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.
This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.
"""The LIVECell dataset contains phase-contrast microscopy images
and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
Please cite it if you use this dataset in your research.
"""

import os
import requests
from tqdm import tqdm
from shutil import copyfileobj
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import imageio.v3 as imageio

import torch
from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util
from ... import ImageCollectionDataset

try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None

URLS = {
    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
    "train": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
              "LIVECell/livecell_coco_train.json"),
    "val": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
            "LIVECell/livecell_coco_val.json"),
    "test": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
             "LIVECell/livecell_coco_test.json")
}
# TODO
CHECKSUM = None

CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']


# TODO use download flag
def _download_annotation_file(path, split, download):
    """Return the path to the COCO annotation file of a split, fetching it if it is missing.

    Args:
        path: Folder in which the annotation file `<split>.json` is stored.
        split: Key into `URLS` selecting the annotation file ('train', 'val' or 'test').
        download: Currently unused; the file is always fetched when it is missing (see TODO above).

    Returns:
        The filepath of the annotation file.

    Raises:
        requests.HTTPError: If the server answers the download request with an error status.
    """
    annotation_file = os.path.join(path, f"{split}.json")
    if not os.path.exists(annotation_file):
        url = URLS[split]
        print("Downloading livecell annotation file from", url)
        # Only the existence of 'annotation_file' is checked on later calls, so it must never
        # hold an error page or a truncated download: stream into a temporary file and move it
        # into place only after the transfer has completed successfully.
        tmp_file = annotation_file + ".part"
        try:
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(tmp_file, 'wb') as f:
                    copyfileobj(r.raw, f)
            os.replace(tmp_file, annotation_file)
        finally:
            # After a successful os.replace the temporary file is gone; this only cleans up failures.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
    return annotation_file


def _annotations_to_instances(coco, image_metadata, category_ids):
    """Rasterize the COCO annotations of one image into an instance label image.

    Args:
        coco: The COCO object holding the annotations.
        image_metadata: The COCO image record; its 'id', 'height' and 'width' entries are used.
        category_ids: The category ids of the annotations to rasterize.

    Returns:
        The instance segmentation as uint16 array of shape (height, width), with 0 as background.
    """
    import vigra

    # create and save the segmentation
    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
    annotations = coco.loadAnns(annotation_ids)
    # the result is returned as uint16, so the number of ids must fit into that range
    assert len(annotations) <= np.iinfo("uint16").max
    shape = (image_metadata["height"], image_metadata["width"])
    seg = np.zeros(shape, dtype="uint32")

    # sort annotations by size, except for iscrowd which go first
    # we do this to minimize small noise from overlapping multi annotations
    # (see below)
    # NOTE: np.argsort is ascending, so masks are painted from smallest to largest
    # and a later mask overwrites earlier ones where they overlap.
    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
    sorting = np.argsort(sizes)
    annotations = [annotations[i] for i in sorting]

    for seg_id, annotation in enumerate(annotations, 1):
        mask = coco.annToMask(annotation).astype("bool")
        assert mask.shape == seg.shape
        seg[mask] = seg_id

    # some images have multiple masks per object with slightly different foreground
    # this causes small noise objects we need to filter
    min_size = 50
    seg_ids, sizes = np.unique(seg, return_counts=True)
    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0

    vigra.analysis.relabelConsecutive(seg, out=seg)

    return seg.astype("uint16")


def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
    """Write one label image per annotated image and collect the matching image / label paths.

    Label images that already exist in `seg_folder` are not recomputed.

    Args:
        annotation_file: Filepath of the COCO-style annotation file.
        image_folder: Folder that contains the images referenced by the annotations.
        seg_folder: Folder to which the label images are written.
        cell_types: Optional cell type names; if given, only images whose file name
            contains one of the names are used.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ModuleNotFoundError: If 'pycocotools' is not installed.
    """
    if COCO is None:
        raise ModuleNotFoundError(
            "'pycocotools' is required for processing the LIVECell ground-truth. "
            "Install it with 'conda install -c conda-forge pycocotools'."
        )

    coco = COCO(annotation_file)
    category_ids = coco.getCatIds(catNms=["cell"])
    image_ids = coco.getImgIds(catIds=category_ids)

    image_paths, seg_paths = [], []
    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
        # get the path for the image data and make sure the corresponding image exists
        image_metadata = coco.loadImgs(image_id)[0]
        file_name = image_metadata["file_name"]

        # if cell_type names are given we only select file names that match a cell_type
        if cell_types is not None and not any(cell_type in file_name for cell_type in cell_types):
            continue

        sub_folder = file_name.split("_")[0]
        image_path = os.path.join(image_folder, sub_folder, file_name)
        # something changed in the image layout? we keep the old version around in case this changes back...
        if not os.path.exists(image_path):
            image_path = os.path.join(image_folder, file_name)
        assert os.path.exists(image_path), image_path
        image_paths.append(image_path)

        # get the output path
        out_folder = os.path.join(seg_folder, sub_folder)
        os.makedirs(out_folder, exist_ok=True)
        seg_path = os.path.join(out_folder, file_name)
        seg_paths.append(seg_path)
        if os.path.exists(seg_path):
            continue

        seg = _annotations_to_instances(coco, image_metadata, category_ids)
        imageio.imwrite(seg_path, seg)

    assert len(image_paths) == len(seg_paths)
    assert len(image_paths) > 0, \
        f"No matching image paths were found. Did you pass invalid cell type names ({cell_types})?"

    return image_paths, seg_paths


def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Fetch the annotation file of a split and create the label images for it.

    Args:
        path: Folder that contains the LIVECell data; the images are expected in `<path>/images`.
        split: The data split. 'test' uses the test image folder, any other value the train/val folder.
        download: Passed on to `_download_annotation_file`.
        cell_types: Optional cell type names used to filter the images.
        label_path: Optional root folder for the label images; defaults to `path`.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    annotation_file = _download_annotation_file(path, split, download)
    if split == "test":
        split_name = "livecell_test_images"
    else:
        split_name = "livecell_train_val_images"

    image_folder = os.path.join(path, "images", split_name)
    seg_folder = os.path.join(path, "annotations", split_name) if label_path is None\
        else os.path.join(label_path, "annotations", split_name)

    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)


def get_livecell_data(path: Union[os.PathLike, str], download: bool = False) -> None:
    """Download the LIVECell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    # an existing image folder is treated as "data already downloaded"
    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    # NOTE(review): the third argument presumably removes the archive after extraction - confirm in util.unzip
    util.unzip(zip_path, path, True)


def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_livecell_data(path, download)
    image_paths, seg_paths = _download_livecell_annotations(path, split, download, cell_types, label_path)
    return image_paths, seg_paths


def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    if cell_types is not None:
        assert isinstance(cell_types, (list, tuple)), \
            f"cell_types must be passed as a list or tuple instead of {cell_types}"

    image_paths, seg_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=label_dtype, offsets=offsets, boundaries=boundaries, binary=binary
    )

    return ImageCollectionDataset(
        raw_image_paths=image_paths,
        label_image_paths=seg_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **kwargs
    )


def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_livecell_dataset(
        path, split, patch_shape, download=download, offsets=offsets, boundaries=boundaries, binary=binary,
        cell_types=cell_types, label_path=label_path, label_dtype=label_dtype, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS =
{'images': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip', 'train': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json', 'val': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_val.json', 'test': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_test.json'}
CHECKSUM =
None
CELL_TYPES =
['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
def
get_livecell_data(path: os.PathLike, download: bool = False):
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False) -> None:
    """Download the LIVECell dataset.

    Nothing is downloaded if the folder `<path>/images` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    # an existing image folder is treated as "data already downloaded"
    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    # NOTE(review): the third argument presumably removes the archive after extraction - confirm in util.unzip
    util.unzip(zip_path, path, True)
Download the LIVECell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
def
get_livecell_paths( path: Union[os.PathLike, str], split: str, download: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None) -> Tuple[List[str], List[str]]:
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Collect the image and label filepaths of one LIVECell split.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    # Step 1: make sure the image data is available under 'path'.
    get_livecell_data(path, download)

    # Step 2: fetch the annotations for the split and resolve the matching image / label files.
    paths = _download_livecell_annotations(
        path, split, download, cell_types, label_path
    )
    raw_paths, label_paths = paths
    return raw_paths, label_paths
Get paths to the LIVECell data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_livecell_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Build the LIVECell dataset for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Validate the arguments before any data is touched.
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    # Complete the dataset arguments with the transforms and the label transform for the requested target.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)
    dataset_kwargs, label_dtype = util.add_instance_label_transform(
        dataset_kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    dataset = ImageCollectionDataset(
        raw_image_paths=raw_paths,
        label_image_paths=label_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **dataset_kwargs
    )
    return dataset
Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
- label_dtype: The datatype of the label data.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_livecell_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Build the LIVECell dataloader for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    livecell_dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **dataset_kwargs
    )

    loader = torch_em.get_data_loader(livecell_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
- label_dtype: The datatype of the label data.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.