torch_em.data.datasets.light_microscopy.livecell
The LIVECell dataset contains phase-contrast microscopy images and annotations for cell segmentations for 8 different cell lines.
This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6. Please cite it if you use this dataset in your research.
"""The LIVECell dataset contains phase-contrast microscopy images
and annotations for cell segmentations for 8 different cell lines.

This dataset is described in the publication https://doi.org/10.1038/s41592-021-01249-6.
Please cite it if you use this dataset in your research.
"""

import os
from shutil import copyfileobj
from typing import List, Optional, Sequence, Tuple, Union

import imageio
import numpy as np
import requests
import vigra
from tqdm import tqdm

import torch_em
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from .. import util

try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None

# Download locations of the image archive and of the COCO-style annotation file of each split.
URLS = {
    "images": "http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip",
    "train": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
              "LIVECell/livecell_coco_train.json"),
    "val": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
            "LIVECell/livecell_coco_val.json"),
    "test": ("http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/"
             "LIVECell/livecell_coco_test.json")
}
# TODO
CHECKSUM = None


def _download_livecell_images(path, download):
    """Make sure the LIVECell images are available in `<path>/images`.

    Nothing is downloaded if the `images` folder already exists. Otherwise the image
    archive is fetched with `util.download_source` and extracted into `path`.

    Args:
        path: Folder where the data is stored. It is created if it does not exist.
        download: Whether downloading is allowed; forwarded to `util.download_source`.
    """
    os.makedirs(path, exist_ok=True)
    image_path = os.path.join(path, "images")

    if os.path.exists(image_path):
        return

    url = URLS["images"]
    checksum = CHECKSUM
    zip_path = os.path.join(path, "livecell.zip")
    util.download_source(zip_path, url, download, checksum)
    # NOTE(review): the third argument is presumably the flag for removing the archive after extraction.
    util.unzip(zip_path, path, True)


def _download_annotation_file(path, split, download):
    """Return the path to the COCO-style annotation file of a split, downloading it if needed.

    Args:
        path: Folder where the annotation file is stored as `<split>.json`.
        split: The data split. Must be a key of `URLS` ('train', 'val' or 'test').
        download: Whether the file may be downloaded if it is not present.

    Returns:
        The path to the annotation file.

    Raises:
        RuntimeError: If the file is missing and `download` is False.
        requests.HTTPError: If the server answers the download request with an error status.
    """
    annotation_file = os.path.join(path, f"{split}.json")
    if os.path.exists(annotation_file):
        return annotation_file

    if not download:
        raise RuntimeError(
            f"Cannot find the annotation file at {annotation_file}, but download was set to False."
        )

    url = URLS[split]
    os.makedirs(path, exist_ok=True)
    print("Downloading livecell annotation file from", url)

    # Stream into a temporary file and rename it only after the download succeeded.
    # Otherwise an interrupted download would leave a truncated json behind,
    # which would be mistaken for a valid annotation file in the next run.
    tmp_file = annotation_file + ".part"
    try:
        with requests.get(url, stream=True) as r:
            # Fail here instead of saving an HTTP error page as the annotation file.
            r.raise_for_status()
            # Make sure a compressed transfer encoding is decoded when reading the raw stream.
            r.raw.decode_content = True
            with open(tmp_file, "wb") as f:
                copyfileobj(r.raw, f)
        os.replace(tmp_file, annotation_file)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    return annotation_file


def _annotations_to_instances(coco, image_metadata, category_ids):
    """Convert the COCO annotations of one image into an instance segmentation.

    Args:
        coco: The COCO object holding the annotations.
        image_metadata: The COCO metadata of the image; 'id', 'height' and 'width' are read.
        category_ids: The category ids of the annotations to use.

    Returns:
        The instance segmentation as uint16 array of shape (height, width),
        with 0 as background and consecutive ids for the objects.
    """
    # create and save the segmentation
    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"], catIds=category_ids)
    annotations = coco.loadAnns(annotation_ids)
    # the segmentation is returned as uint16, so the object ids must fit into this datatype
    assert len(annotations) <= np.iinfo("uint16").max
    shape = (image_metadata["height"], image_metadata["width"])
    seg = np.zeros(shape, dtype="uint32")

    # sort annotations by size, except for iscrowd which go first
    # we do this to minimize small noise from overlapping multi annotations
    # (see below)
    sizes = [ann["area"] if ann["iscrowd"] == 0 else 1 for ann in annotations]
    sorting = np.argsort(sizes)
    annotations = [annotations[i] for i in sorting]

    # later (= larger) annotations overwrite earlier ones where masks overlap
    for seg_id, annotation in enumerate(annotations, 1):
        mask = coco.annToMask(annotation).astype("bool")
        assert mask.shape == seg.shape
        seg[mask] = seg_id

    # some images have multiple masks per object with slightly different foreground
    # this causes small noise objects we need to filter
    min_size = 50
    seg_ids, counts = np.unique(seg, return_counts=True)
    seg[np.isin(seg, seg_ids[counts < min_size])] = 0

    vigra.analysis.relabelConsecutive(seg, out=seg)

    return seg.astype("uint16")


def _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types):
    """Create the instance segmentations for all images listed in a COCO annotation file.

    Segmentations that already exist in `seg_folder` are not recomputed.

    Args:
        annotation_file: The path to the COCO-style annotation file.
        image_folder: The folder with the images of the split.
        seg_folder: The folder where the segmentations are saved, in one sub-folder per file name prefix.
        cell_types: Optional cell type names. If given, only images whose file name
            contains one of the names are used.

    Returns:
        The paths to the image data.
        The paths to the segmentations.
    """
    assert COCO is not None, "pycocotools is required for processing the LiveCELL ground-truth."

    coco = COCO(annotation_file)
    category_ids = coco.getCatIds(catNms=["cell"])
    image_ids = coco.getImgIds(catIds=category_ids)

    image_paths, seg_paths = [], []
    for image_id in tqdm(image_ids, desc="creating livecell segmentations from coco-style annotations"):
        # get the path for the image data and make sure the corresponding image exists
        image_metadata = coco.loadImgs(image_id)[0]
        file_name = image_metadata["file_name"]

        # if cell_type names are given we only select file names that match a cell_type
        if cell_types is not None and not any(cell_type in file_name for cell_type in cell_types):
            continue

        sub_folder = file_name.split("_")[0]
        image_path = os.path.join(image_folder, sub_folder, file_name)
        # something changed in the image layout? we keep the old version around in case this changes back...
        if not os.path.exists(image_path):
            image_path = os.path.join(image_folder, file_name)
        assert os.path.exists(image_path), image_path
        image_paths.append(image_path)

        # get the output path
        out_folder = os.path.join(seg_folder, sub_folder)
        os.makedirs(out_folder, exist_ok=True)
        seg_path = os.path.join(out_folder, file_name)
        seg_paths.append(seg_path)
        if os.path.exists(seg_path):
            continue

        seg = _annotations_to_instances(coco, image_metadata, category_ids)
        imageio.imwrite(seg_path, seg)

    assert len(image_paths) == len(seg_paths)
    assert len(image_paths) > 0, \
        f"No matching image paths were found. Did you pass invalid cell type names ({cell_types})?"
    return image_paths, seg_paths


def _download_livecell_annotations(path, split, download, cell_types, label_path):
    """Get the annotation file of a split and create the segmentations from it.

    Args:
        path: Folder with the LIVECell data. The images must already be in `<path>/images`.
        split: The data split. Either 'train', 'val' or 'test'.
        download: Whether the annotation file may be downloaded if it is not present.
        cell_types: Optional cell type names for selecting images.
        label_path: Optional folder for the segmentations. They are stored in `path` if it is None.

    Returns:
        The paths to the image data.
        The paths to the label data.
    """
    annotation_file = _download_annotation_file(path, split, download)

    # the train and val images are stored in the same folder
    split_name = "livecell_test_images" if split == "test" else "livecell_train_val_images"

    image_folder = os.path.join(path, "images", split_name)
    label_root = path if label_path is None else label_path
    seg_folder = os.path.join(label_root, "annotations", split_name)

    assert os.path.exists(image_folder), image_folder

    return _create_segmentations_from_annotations(annotation_file, image_folder, seg_folder, cell_types)


def get_livecell_data(
    path: Union[os.PathLike, str],
    split: str,
    download: bool,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Download the LIVECell dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        The paths to the image data.
        The paths to the label data.
    """
    _download_livecell_images(path, download)
    image_paths, seg_paths = _download_livecell_annotations(path, split, download, cell_types, label_path)
    return image_paths, seg_paths


def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "val", "test")
    if cell_types is not None:
        assert isinstance(cell_types, (list, tuple)), \
            f"cell_types must be passed as a list or tuple instead of {cell_types}"

    image_paths, seg_paths = get_livecell_data(path, split, download, cell_types, label_path)

    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=label_dtype,
        offsets=offsets, boundaries=boundaries, binary=binary
    )

    dataset = torch_em.data.ImageCollectionDataset(
        image_paths, seg_paths, patch_shape=patch_shape, label_dtype=label_dtype, **kwargs
    )
    return dataset


def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_livecell_dataset(
        path, split, patch_shape, download=download, offsets=offsets, boundaries=boundaries, binary=binary,
        cell_types=cell_types, label_path=label_path, label_dtype=label_dtype, **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader
URLS =
{'images': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/images.zip', 'train': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_train.json', 'val': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_val.json', 'test': 'http://livecell-dataset.s3.eu-central-1.amazonaws.com/LIVECell_dataset_2021/annotations/LIVECell/livecell_coco_test.json'}
CHECKSUM =
None
def
get_livecell_data( path: Union[os.PathLike, str], split: str, download: bool, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None) -> Tuple[List[str], List[str]]:
def get_livecell_data(
    path: Union[os.PathLike, str],
    split: str,
    download: bool,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None
) -> Tuple[List[str], List[str]]:
    """Make the LIVECell images and labels of a split available and return their paths.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.

    Returns:
        The paths to the image data.
        The paths to the label data.
    """
    # The images have to be fetched first; the annotation step runs on the image folder.
    _download_livecell_images(path, download)
    # The annotation step already yields the (image paths, label paths) pair.
    return _download_livecell_annotations(path, split, download, cell_types, label_path)
Download the LIVECell dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
Returns:
The paths to the image data. The paths to the label data.
def
get_livecell_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_livecell_dataset(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> Dataset:
    """Create the LIVECell dataset for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Validate the arguments before any data is touched.
    assert split in ("train", "val", "test")
    assert cell_types is None or isinstance(cell_types, (list, tuple)), \
        f"cell_types must be passed as a list or tuple instead of {cell_types}"

    raw_paths, label_paths = get_livecell_data(path, split, download, cell_types, label_path)

    # Set up the transforms for 2d data and the label transform for the requested target.
    kwargs = util.ensure_transforms(ndim=2, **kwargs)
    kwargs, label_dtype = util.add_instance_label_transform(
        kwargs,
        add_binary_target=True,
        label_dtype=label_dtype,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
    )

    return torch_em.data.ImageCollectionDataset(
        raw_paths, label_paths, patch_shape=patch_shape, label_dtype=label_dtype, **kwargs
    )
Get the LIVECell dataset for segmenting cells in phase-contrast microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
- label_dtype: The datatype of the label data.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
.
Returns:
The segmentation dataset.
def
get_livecell_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int], batch_size: int, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, cell_types: Optional[Sequence[str]] = None, label_path: Union[os.PathLike, str, NoneType] = None, label_dtype=torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_livecell_loader(
    path: Union[os.PathLike, str],
    split: str,
    patch_shape: Tuple[int, int],
    batch_size: int,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    cell_types: Optional[Sequence[str]] = None,
    label_path: Optional[Union[os.PathLike, str]] = None,
    label_dtype=torch.int64,
    **kwargs
) -> DataLoader:
    """Create the LIVECell dataloader for cell segmentation in phase-contrast microscopy.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        cell_types: The cell types for which to get the data paths.
        label_path: Optional path for loading the label data.
        label_dtype: The datatype of the label data.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    livecell_dataset = get_livecell_dataset(
        path,
        split,
        patch_shape,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        cell_types=cell_types,
        label_path=label_path,
        label_dtype=label_dtype,
        **ds_kwargs
    )
    return torch_em.get_data_loader(livecell_dataset, batch_size, **loader_kwargs)
Get the LIVECell dataloader for segmenting cells in phase-contrast microscopy.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- cell_types: The cell types for which to get the data paths.
- label_path: Optional path for loading the label data.
- label_dtype: The datatype of the label data.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
or for the PyTorch DataLoader.
Returns:
The DataLoader.