torch_em.data.datasets.light_microscopy.livecell
The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.
This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.
"""The LIVECell dataset contains phase-contrast microscopy images
and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
Please cite it if you use this dataset in your research.
"""

import os
import requests
from tqdm import tqdm
from shutil import copyfileobj
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import imageio.v3 as imageio

import torch
from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util
from ... import ImageCollectionDataset

try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None

URLS = {
    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
    "train": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
              "LIVECell/livecell_coco_train.json"),
    "val": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
            "LIVECell/livecell_coco_val.json"),
    "test": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
             "LIVECell/livecell_coco_test.json")
}
# TODO
CHECKSUM = None

CELL_TYPES = ['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']


# TODO use download flag
def _download_annotation_file(path, split, download):
    """Fetch the COCO annotation file for a split unless it is already on disk.

    Args:
        path: Folder in which the annotation file `<split>.json` is stored.
        split: Key into `URLS` selecting the annotation file.
        download: Currently not evaluated; a missing file is always downloaded.

    Returns:
        The filepath of the annotation file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    annotation_file = os.path.join(path, f"{split}.json")
    if os.path.exists(annotation_file):
        return annotation_file

    url = URLS[split]
    print("Downloading livecell annotation file from", url)

    # Write to a temporary file first, so that an interrupted or failed transfer
    # never leaves a truncated file at the final location (which would be treated
    # as a valid annotation file by the existence check above on the next call).
    tmp_file = annotation_file + ".part"
    try:
        with requests.get(url, stream=True) as r:
            # Without this check an error page would be stored as the annotation file.
            r.raise_for_status()
            # Make the raw stream undo any transfer compression (gzip / deflate).
            r.raw.decode_content = True
            with open(tmp_file, "wb") as f:
                copyfileobj(r.raw, f)
        os.replace(tmp_file, annotation_file)
    finally:
        # Only still present if the download or the rename failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    return annotation_file


def _annotations_to_instances(coco, image_metadata, category_ids):
    """Convert the COCO annotations of a single image into an instance label image.

    Args:
        coco: The COCO object holding the annotations.
        image_metadata: The COCO image entry; its "id", "height" and "width" are used.
        category_ids: The category ids used to select the annotations.

    Returns:
        The instance segmentation as uint16 array of shape (height, width).
    """
    import vigra

    # create and save the segmentation
    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
    annotations = coco.loadAnns(annotation_ids)
    # the result is returned as uint16, so the number of ids has to fit into it
    assert len(annotations) <= np.iinfo("uint16").max
    shape = (image_metadata["height"], image_metadata["width"])
    seg = np.zeros(shape, dtype="uint32")

    # sort annotations by size, except for iscrowd which go first
    # we do this to minimize small noise from overlapping multi annotations
    # (see below)
    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
    sorting = np.argsort(sizes)
    annotations = [annotations[i] for i in sorting]

    # annotations that are painted later overwrite earlier ones where they overlap
    for seg_id, annotation in enumerate(annotations, 1):
        mask = coco.annToMask(annotation).astype("bool")
        assert mask.shape == seg.shape
        seg[mask] = seg_id

    # some images have multiple masks per object with slightly different foreground
    # this causes small noise objects we need to filter
    min_size = 50
    seg_ids, sizes = np.unique(seg, return_counts=True)
    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0

    # the removed ids leave gaps in the labeling, so the ids are made consecutive in-place
    vigra.analysis.relabelConsecutive(seg, out=seg)

    return seg.astype("uint16")


def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
    """Write one label image per annotated image and collect the matching file paths.

    Label images that already exist in `seg_folder` are not recomputed.

    Args:
        annotation_file: Filepath to the COCO-style annotation file.
        image_folder: Folder that contains the images.
        seg_folder: Folder where the label images are written to.
        cell_types: Optional cell type names; if given, only images whose file name
            contains one of these names are used.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    assert COCO is not None, "pycocotools is required for processing the LIVECell ground-truth."

    coco = COCO(annotation_file)
    category_ids = coco.getCatIds(catNms=["cell"])
    image_ids = coco.getImgIds(catIds=category_ids)

    image_paths, seg_paths = [], []
    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
        # get the path for the image data and make sure the corresponding image exists
        image_metadata = coco.loadImgs(image_id)[0]
        file_name = image_metadata["file_name"]

        # if cell_type names are given we only select file names that match a cell_type
        if cell_types is not None and not any(cell_type in file_name for cell_type in cell_types):
            continue

        sub_folder = file_name.split("_")[0]
        image_path = os.path.join(image_folder, sub_folder, file_name)
        # something changed in the image layout? we keep the old version around in case this changes back...
        if not os.path.exists(image_path):
            image_path = os.path.join(image_folder, file_name)
        assert os.path.exists(image_path), image_path
        image_paths.append(image_path)

        # get the output path
        out_folder = os.path.join(seg_folder, sub_folder)
        os.makedirs(out_folder, exist_ok=True)
        seg_path = os.path.join(out_folder, file_name)
        seg_paths.append(seg_path)
        # the label image was already created in a previous run
        if os.path.exists(seg_path):
            continue

        seg = _annotations_to_instances(coco, image_metadata, category_ids)
        imageio.imwrite(seg_path, seg)

    assert len(image_paths) == len(seg_paths)
    assert len(image_paths) > 0, \
        f"No matching image paths were found. Did you pass invalid cell type names ({cell_types})?"

    return image_paths, seg_paths


def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Get the annotation file for a split and create the corresponding label images.

    Args:
        path: Folder that contains the LIVECell data.
        split: The data split. The 'test' split uses the test image folder,
            every other split uses the shared train / val image folder.
        download: Passed on to the annotation file download.
        cell_types: Optional cell type names used to select images.
        label_path: Optional folder for the label images; `path` is used if it is None.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    annotation_file = _download_annotation_file(path, split, download)
    if split == "test":
        split_name = "livecell_test_images"
    else:
        split_name = "livecell_train_val_images"

    image_folder = os.path.join(path, "images", split_name)
    seg_folder = os.path.join(path, "annotations", split_name) if label_path is None\
        else os.path.join(label_path, "annotations", split_name)

    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)


def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    Nothing is done if the folder `<path>/images` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    # NOTE(review): the last argument presumably removes the archive after extraction - confirm in util.unzip
    util.unzip(zip_path, path, True)


def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Get paths to the LIVECell data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_livecell_data(path, download)
    image_paths, seg_paths = _download_livecell_annotations(path, split, download, cell_types, label_path)
    return image_paths, seg_paths


def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    if cell_types is not None:
        assert isinstance(cell_types, (list, tuple)), \
            f"cell_types must be passed as a list or tuple instead of {cell_types}"

    image_paths, seg_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=label_dtype, offsets=offsets, boundaries=boundaries, binary=binary
    )

    return ImageCollectionDataset(
        raw_image_paths=image_paths,
        label_image_paths=seg_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **kwargs
    )


def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_livecell_dataset(
        path, split, patch_shape, download=download, offsets=offsets, boundaries=boundaries, binary=binary,
        cell_types=cell_types, label_path=label_path, label_dtype=label_dtype, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS =
{'images': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip', 'train': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json', 'val': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_val.json', 'test': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_test.json'}
CHECKSUM =
None
CELL_TYPES =
['A172', 'BT474', 'BV2', 'Huh7', 'MCF7', 'SHSY5Y', 'SkBr3', 'SKOV3']
def
get_livecell_data(path: os.PathLike, download: bool = False):
def get_livecell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the LIVECell dataset.

    Nothing is done if the folder `<path>/images` already exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
            The folder is created if it does not exist yet.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    # The extracted image folder marks the data as already available.
    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    # NOTE(review): the last argument presumably removes the archive after extraction - confirm in util.unzip
    util.unzip(zip_path, path, True)
Download the LIVECell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
def
get_livecell_paths( path: Union[os.PathLike, str], split: str, download: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None) -> Tuple[List[str], List[str]]:
def get_livecell_paths(
    path: Union[os.PathLike, str],
    split: str,
    download: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Collect the image and label file paths of a LIVECell split.

    The image data is fetched first via `get_livecell_data`; afterwards the
    annotations of the split are processed via `_download_livecell_annotations`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    # Make sure the images are available before the annotations are processed.
    get_livecell_data(path, download)

    raw_files, label_files = _download_livecell_annotations(
        path=path,
        split=split,
        download=download,
        cell_types=cell_types,
        label_path=label_path,
    )
    return raw_files, label_files
Get paths to the LIVECell data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_livecell_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Build the LIVECell dataset for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
            Has to be a list or tuple if given.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    valid_splits = ("train", "val", "test")
    assert split in valid_splits
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_paths(path, split, download, cell_types, label_path)

    # Set up the transforms for 2d data, then add the label transform for the requested target.
    dataset_kwargs = util.ensure_transforms(ndim=2, **kwargs)
    dataset_kwargs, label_dtype = util.add_instance_label_transform(
        dataset_kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    dataset = ImageCollectionDataset(
        raw_image_paths=raw_paths,
        label_image_paths=label_paths,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        **dataset_kwargs
    )
    return dataset
Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
- label_dtype: The datatype of the label data.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_livecell_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Build the LIVECell dataloader for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    livecell_dataset = get_livecell_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(livecell_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
- label_dtype: The datatype of the label data.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.