torch_em.data.datasets.light_microscopy.ctc
The Cell Tracking Challenge contains annotated data for cell segmentation and tracking. We currently provide the 2d datasets with segmentation annotations.
If you use this data in your research please cite https://doi.org/10.1038/nmeth.4473.
"""The Cell Tracking Challenge contains annotated data for cell segmentation and tracking.
We currently provide the 2d datasets with segmentation annotations.

If you use this data in your research please cite https://doi.org/10.1038/nmeth.4473.
"""

import os
from glob import glob
from shutil import copyfile
from typing import Optional, Tuple, Union

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# SHA256 checksums for the downloaded zip archives, keyed by split and dataset name.
CTC_CHECKSUMS = {
    "train": {
        "BF-C2DL-HSC": "0aa68ec37a9b06e72a5dfa07d809f56e1775157fb674bb75ff904936149657b1",
        "BF-C2DL-MuSC": "ca72b59042809120578a198ba236e5ed3504dd6a122ef969428b7c64f0a5e67d",
        "DIC-C2DH-HeLa": "832fed2d05bb7488cf9c51a2994b75f8f3f53b3c3098856211f2d39023c34e1a",
        "Fluo-C2DL-Huh7": "1912658c1b3d8b38b314eb658b559e7b39c256917150e9b3dd8bfdc77347617d",
        "Fluo-C2DL-MSC": "a083521f0cb673ae02d4957c5e6580c2e021943ef88101f6a2f61b944d671af2",
        "Fluo-N2DH-GOWT1": "1a7bd9a7d1d10c4122c7782427b437246fb69cc3322a975485c04e206f64fc2c",
        "Fluo-N2DH-SIM+": "3e809148c87ace80c72f563b56c35e0d9448dcdeb461a09c83f61e93f5e40ec8",
        "Fluo-N2DL-HeLa": "35dd99d58e071aba0b03880128d920bd1c063783cc280f9531fbdc5be614c82e",
        "PhC-C2DH-U373": "b18185c18fce54e8eeb93e4bbb9b201d757add9409bbf2283b8114185a11bc9e",
        "PhC-C2DL-PSC": "9d54bb8febc8798934a21bf92e05d92f5e8557c87e28834b2832591cdda78422",
    },
    "test": {
        "BF-C2DL-HSC": "fd1c05ec625fd0526c8369d1139babe137e885457eee98c10d957da578d0d5bc",
        "BF-C2DL-MuSC": "c5cae259e6090e82a2596967fb54c8a768717c1772398f8546ad1c8df0820450",
        "DIC-C2DH-HeLa": "5e5d5f2aa90aef99d750cf03f5c12d799d50b892f98c86950e07a2c5955ac01f",
        "Fluo-C2DL-Huh7": "cc7359f8fb6b0c43995365e83ce0116d32f477ac644b2ca02b98bc253e2bcbbe",
        "Fluo-C2DL-MSC": "c90b13e603dde52f17801d4f0cadde04ed7f21cc05296b1f0957d92dbfc8ffa6",
        "Fluo-N2DH-GOWT1": "c6893ec2d63459de49d4dc21009b04275573403c62cc02e6ee8d0cb1a5068add",
        "Fluo-N2DH-SIM+": "c4f257add739b284d02176057814de345dee2ac1a7438e360ccd2df73618db68",
        "Fluo-N2DL-HeLa": "45cf3daf05e8495aa2ce0febacca4cf0928fab808c0b14ed2eb7289a819e6bb8",
        "PhC-C2DH-U373": "7aa3162e4363a416b259149adc13c9b09cb8aecfe8165eb1428dd534b66bec8a",
        "PhC-C2DL-PSC": "8c98ac6203e7490157ceb6aa1131d60a3863001b61fb75e784bc49d47ee264d5",
    }
}


def _get_ctc_url_and_checksum(dataset_name, split):
    """Return the download URL and expected SHA256 checksum for a dataset and split."""
    if split == "train":
        _link_to_split = "training-datasets"
    else:
        _link_to_split = "test-datasets"

    url = f"http://data.celltrackingchallenge.net/{_link_to_split}/{dataset_name}.zip"
    checksum = CTC_CHECKSUMS[split][dataset_name]
    return url, checksum


def get_ctc_segmentation_data(
    path: Union[os.PathLike, str], dataset_name: str, split: str, download: bool = False,
) -> str:
    f"""Download training data from the Cell Tracking Challenge.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets are:
            {', '.join(CTC_CHECKSUMS['train'].keys())}
        split: The split to download. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the training data.
    """
    dataset_names = list(CTC_CHECKSUMS["train"].keys())
    if dataset_name not in dataset_names:
        raise ValueError(f"Invalid dataset: {dataset_name}, choose one of {dataset_names}.")

    data_path = os.path.join(path, split, dataset_name)

    # The data was already downloaded and extracted.
    if os.path.exists(data_path):
        return data_path

    os.makedirs(data_path)
    url, checksum = _get_ctc_url_and_checksum(dataset_name, split)
    zip_path = os.path.join(path, f"{dataset_name}.zip")
    util.download_source(zip_path, url, download, checksum=checksum)
    # The archive contains the '<dataset_name>' folder, so extract into the split folder.
    util.unzip(zip_path, os.path.join(path, split), remove=True)

    return data_path


def _require_gt_images(data_path, vol_ids):
    """Collect image/label folder pairs for the given volume ids, copying the raw frames
    that have a segmentation annotation into a per-volume 'IM' folder next to 'SEG'."""
    image_paths, label_paths = [], []

    if isinstance(vol_ids, str):
        vol_ids = [vol_ids]

    for vol_id in vol_ids:
        image_folder = os.path.join(data_path, vol_id)
        # FIX: the original used `assert os.path.join(image_folder)`, which is always
        # truthy and therefore never triggered; check for existence instead.
        assert os.path.exists(image_folder), f"Cannot find volume id, {vol_id} in {data_path}."

        label_folder = os.path.join(data_path, f"{vol_id}_GT", "SEG")

        # Copy over the images corresponding to the labeled frames.
        label_image_folder = os.path.join(data_path, f"{vol_id}_GT", "IM")
        os.makedirs(label_image_folder, exist_ok=True)

        this_label_paths = glob(os.path.join(label_folder, "*.tif"))
        for label_path in this_label_paths:
            fname = os.path.basename(label_path)
            image_label_path = os.path.join(label_image_folder, fname)
            if not os.path.exists(image_label_path):
                # NOTE(review): lstrip strips the character *set* {m,a,i,n,_,s,e,g},
                # not the literal prefix. This works because the frame id that follows
                # is numeric (digits are not in the set), but it would misbehave for
                # other naming schemes — confirm against the CTC file naming convention.
                im_name = "t" + fname.lstrip("main_seg")
                image_path = os.path.join(image_folder, im_name)
                # FIX: same always-truthy `os.path.join` assert as above.
                assert os.path.exists(image_path), image_path
                copyfile(image_path, image_label_path)

        image_paths.append(label_image_folder)
        label_paths.append(label_folder)

    return image_paths, label_paths


def get_ctc_segmentation_paths(
    path: Union[os.PathLike, str],
    dataset_name: str,
    split: str = "train",
    vol_id: Optional[int] = None,
    download: bool = False,
) -> Tuple[str, str]:
    f"""Get paths to the Cell Tracking Challenge data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets are:
            {', '.join(CTC_CHECKSUMS['train'].keys())}
        split: The split to download. Currently only supports 'train'.
        vol_id: The train id to load.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the folder where image data is stored.
        Filepath to the folder where label data is stored.
    """
    data_path = get_ctc_segmentation_data(path, dataset_name, split, download)

    if vol_id is None:
        vol_ids = glob(os.path.join(data_path, "*_GT"))
        vol_ids = [os.path.basename(vol_id) for vol_id in vol_ids]
        # FIX: the original `vol_id.rstrip("_GT")` strips the character set {_, G, T},
        # not the suffix. The glob guarantees the "_GT" suffix, so slice it off exactly.
        vol_ids = [vol_id[:-len("_GT")] for vol_id in vol_ids]
    else:
        vol_ids = vol_id

    image_path, label_path = _require_gt_images(data_path, vol_ids)
    return image_path, label_path


def get_ctc_segmentation_dataset(
    path: Union[os.PathLike, str],
    dataset_name: str,
    patch_shape: Tuple[int, int, int],
    split: str = "train",
    vol_id: Optional[int] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    f"""Get the CTC dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets are:
            {', '.join(CTC_CHECKSUMS['train'].keys())}
        patch_shape: The patch shape to use for training.
        split: The split to download. Currently only supports 'train'.
        vol_id: The train id to load.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ["train"]

    image_path, label_path = get_ctc_segmentation_paths(path, dataset_name, split, vol_id, download)

    # The data is 2d, so sample 2d patches from the frame stacks.
    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    return torch_em.default_segmentation_dataset(
        raw_paths=image_path,
        raw_key="*.tif",
        label_paths=label_path,
        label_key="*.tif",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )


def get_ctc_segmentation_loader(
    path: Union[os.PathLike, str],
    dataset_name: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    split: str = "train",
    vol_id: Optional[int] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    f"""Get the CTC dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets are:
            {', '.join(CTC_CHECKSUMS['train'].keys())}
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to download. Currently only supports 'train'.
        vol_id: The train id to load.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split kwargs between the dataset constructor and the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_ctc_segmentation_dataset(path, dataset_name, patch_shape, split, vol_id, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
CTC_CHECKSUMS =
{'train': {'BF-C2DL-HSC': '0aa68ec37a9b06e72a5dfa07d809f56e1775157fb674bb75ff904936149657b1', 'BF-C2DL-MuSC': 'ca72b59042809120578a198ba236e5ed3504dd6a122ef969428b7c64f0a5e67d', 'DIC-C2DH-HeLa': '832fed2d05bb7488cf9c51a2994b75f8f3f53b3c3098856211f2d39023c34e1a', 'Fluo-C2DL-Huh7': '1912658c1b3d8b38b314eb658b559e7b39c256917150e9b3dd8bfdc77347617d', 'Fluo-C2DL-MSC': 'a083521f0cb673ae02d4957c5e6580c2e021943ef88101f6a2f61b944d671af2', 'Fluo-N2DH-GOWT1': '1a7bd9a7d1d10c4122c7782427b437246fb69cc3322a975485c04e206f64fc2c', 'Fluo-N2DH-SIM+': '3e809148c87ace80c72f563b56c35e0d9448dcdeb461a09c83f61e93f5e40ec8', 'Fluo-N2DL-HeLa': '35dd99d58e071aba0b03880128d920bd1c063783cc280f9531fbdc5be614c82e', 'PhC-C2DH-U373': 'b18185c18fce54e8eeb93e4bbb9b201d757add9409bbf2283b8114185a11bc9e', 'PhC-C2DL-PSC': '9d54bb8febc8798934a21bf92e05d92f5e8557c87e28834b2832591cdda78422'}, 'test': {'BF-C2DL-HSC': 'fd1c05ec625fd0526c8369d1139babe137e885457eee98c10d957da578d0d5bc', 'BF-C2DL-MuSC': 'c5cae259e6090e82a2596967fb54c8a768717c1772398f8546ad1c8df0820450', 'DIC-C2DH-HeLa': '5e5d5f2aa90aef99d750cf03f5c12d799d50b892f98c86950e07a2c5955ac01f', 'Fluo-C2DL-Huh7': 'cc7359f8fb6b0c43995365e83ce0116d32f477ac644b2ca02b98bc253e2bcbbe', 'Fluo-C2DL-MSC': 'c90b13e603dde52f17801d4f0cadde04ed7f21cc05296b1f0957d92dbfc8ffa6', 'Fluo-N2DH-GOWT1': 'c6893ec2d63459de49d4dc21009b04275573403c62cc02e6ee8d0cb1a5068add', 'Fluo-N2DH-SIM+': 'c4f257add739b284d02176057814de345dee2ac1a7438e360ccd2df73618db68', 'Fluo-N2DL-HeLa': '45cf3daf05e8495aa2ce0febacca4cf0928fab808c0b14ed2eb7289a819e6bb8', 'PhC-C2DH-U373': '7aa3162e4363a416b259149adc13c9b09cb8aecfe8165eb1428dd534b66bec8a', 'PhC-C2DL-PSC': '8c98ac6203e7490157ceb6aa1131d60a3863001b61fb75e784bc49d47ee264d5'}}
def
get_ctc_segmentation_data( path: Union[os.PathLike, str], dataset_name: str, split: str, download: bool = False) -> str:
59def get_ctc_segmentation_data( 60 path: Union[os.PathLike, str], dataset_name: str, split: str, download: bool = False, 61) -> str: 62 f"""Download training data from the Cell Tracking Challenge. 63 64 Args: 65 path: Filepath to a folder where the downloaded data will be saved. 66 dataset_name: Name of the dataset to be downloaded. The available datasets are: 67 {', '.join(CTC_CHECKSUMS['train'].keys())} 68 split: The split to download. Either 'train' or 'test'. 69 download: Whether to download the data if it is not present. 70 71 Returns: 72 The filepath to the training data. 73 """ 74 dataset_names = list(CTC_CHECKSUMS["train"].keys()) 75 if dataset_name not in dataset_names: 76 raise ValueError(f"Invalid dataset: {dataset_name}, choose one of {dataset_names}.") 77 78 data_path = os.path.join(path, split, dataset_name) 79 80 if os.path.exists(data_path): 81 return data_path 82 83 os.makedirs(data_path) 84 url, checksum = _get_ctc_url_and_checksum(dataset_name, split) 85 zip_path = os.path.join(path, f"{dataset_name}.zip") 86 util.download_source(zip_path, url, download, checksum=checksum) 87 util.unzip(zip_path, os.path.join(path, split), remove=True) 88 89 return data_path
def
get_ctc_segmentation_paths( path: Union[os.PathLike, str], dataset_name: str, split: str = 'train', vol_id: Optional[int] = None, download: bool = False) -> Tuple[str, str]:
124def get_ctc_segmentation_paths( 125 path: Union[os.PathLike, str], 126 dataset_name: str, 127 split: str = "train", 128 vol_id: Optional[int] = None, 129 download: bool = False, 130) -> Tuple[str, str]: 131 f"""Get paths to the Cell Tracking Challenge data. 132 133 Args: 134 path: Filepath to a folder where the downloaded data will be saved. 135 dataset_name: Name of the dataset to be downloaded. The available datasets are: 136 {', '.join(CTC_CHECKSUMS['train'].keys())} 137 split: The split to download. Currently only supports 'train'. 138 vol_id: The train id to load. 139 download: Whether to download the data if it is not present. 140 141 Returns: 142 Filepath to the folder where image data is stored. 143 Filepath to the folder where label data is stored. 144 """ 145 data_path = get_ctc_segmentation_data(path, dataset_name, split, download) 146 147 if vol_id is None: 148 vol_ids = glob(os.path.join(data_path, "*_GT")) 149 vol_ids = [os.path.basename(vol_id) for vol_id in vol_ids] 150 vol_ids = [vol_id.rstrip("_GT") for vol_id in vol_ids] 151 else: 152 vol_ids = vol_id 153 154 image_path, label_path = _require_gt_images(data_path, vol_ids) 155 return image_path, label_path
def
get_ctc_segmentation_dataset( path: Union[os.PathLike, str], dataset_name: str, patch_shape: Tuple[int, int, int], split: str = 'train', vol_id: Optional[int] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
158def get_ctc_segmentation_dataset( 159 path: Union[os.PathLike, str], 160 dataset_name: str, 161 patch_shape: Tuple[int, int, int], 162 split: str = "train", 163 vol_id: Optional[int] = None, 164 download: bool = False, 165 **kwargs, 166) -> Dataset: 167 f"""Get the CTC dataset for cell segmentation. 168 169 Args: 170 path: Filepath to a folder where the downloaded data will be saved. 171 dataset_name: Name of the dataset to be downloaded. The available datasets are: 172 {', '.join(CTC_CHECKSUMS['train'].keys())} 173 patch_shape: The patch shape to use for training. 174 split: The split to download. Currently only supports 'train'. 175 vol_id: The train id to load. 176 download: Whether to download the data if it is not present. 177 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 178 179 Returns: 180 The segmentation dataset. 181 """ 182 assert split in ["train"] 183 184 image_path, label_path = get_ctc_segmentation_paths(path, dataset_name, split, vol_id, download) 185 186 kwargs = util.update_kwargs(kwargs, "ndim", 2) 187 188 return torch_em.default_segmentation_dataset( 189 raw_paths=image_path, 190 raw_key="*.tif", 191 label_paths=label_path, 192 label_key="*.tif", 193 patch_shape=patch_shape, 194 is_seg_dataset=True, 195 **kwargs 196 )
def
get_ctc_segmentation_loader( path: Union[os.PathLike, str], dataset_name: str, patch_shape: Tuple[int, int, int], batch_size: int, split: str = 'train', vol_id: Optional[int] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
199def get_ctc_segmentation_loader( 200 path: Union[os.PathLike, str], 201 dataset_name: str, 202 patch_shape: Tuple[int, int, int], 203 batch_size: int, 204 split: str = "train", 205 vol_id: Optional[int] = None, 206 download: bool = False, 207 **kwargs, 208) -> DataLoader: 209 f"""Get the CTC dataloader for cell segmentation. 210 211 Args: 212 path: Filepath to a folder where the downloaded data will be saved. 213 dataset_name: Name of the dataset to be downloaded. The available datasets are: 214 {', '.join(CTC_CHECKSUMS['train'].keys())} 215 patch_shape: The patch shape to use for training. 216 batch_size: The batch size for training. 217 split: The split to download. Currently only supports 'train'. 218 vol_id: The train id to load. 219 download: Whether to download the data if it is not present. 220 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 221 222 Returns: 223 The DataLoader. 224 """ 225 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 226 dataset = get_ctc_segmentation_dataset(path, dataset_name, patch_shape, split, vol_id, download, **ds_kwargs) 227 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)