torch_em.data.datasets.light_microscopy.ctc

The Cell Tracking Challenge contains annotated data for cell segmentation and tracking. We currently provide the 2d datasets with segmentation annotations.

If you use this data in your research please cite https://doi.org/10.1038/nmeth.4473.

  1"""The Cell Tracking Challenge contains annotated data for cell segmentation and tracking.
  2We currently provide the 2d datasets with segmentation annotations.
  3
  4If you use this data in your research please cite https://doi.org/10.1038/nmeth.4473.
  5"""
  6
  7import os
  8from glob import glob
  9from shutil import copyfile
 10from typing import Optional, Tuple, Union
 11
 12from torch.utils.data import Dataset, DataLoader
 13
 14import torch_em
 15
 16from .. import util
 17
 18
# Checksums for the downloadable zip archives, passed to `util.download_source`
# to verify the downloads (64 hex chars — presumably SHA256; confirm against
# util.download_source). Keyed first by split ('train' / 'test'), then by
# dataset name; the keys of the 'train' dict double as the list of available
# 2d CTC datasets.
CTC_CHECKSUMS = {
    "train": {
        "BF-C2DL-HSC": "0aa68ec37a9b06e72a5dfa07d809f56e1775157fb674bb75ff904936149657b1",
        "BF-C2DL-MuSC": "ca72b59042809120578a198ba236e5ed3504dd6a122ef969428b7c64f0a5e67d",
        "DIC-C2DH-HeLa": "832fed2d05bb7488cf9c51a2994b75f8f3f53b3c3098856211f2d39023c34e1a",
        "Fluo-C2DL-Huh7": "1912658c1b3d8b38b314eb658b559e7b39c256917150e9b3dd8bfdc77347617d",
        "Fluo-C2DL-MSC": "a083521f0cb673ae02d4957c5e6580c2e021943ef88101f6a2f61b944d671af2",
        "Fluo-N2DH-GOWT1": "1a7bd9a7d1d10c4122c7782427b437246fb69cc3322a975485c04e206f64fc2c",
        "Fluo-N2DH-SIM+": "3e809148c87ace80c72f563b56c35e0d9448dcdeb461a09c83f61e93f5e40ec8",
        "Fluo-N2DL-HeLa": "35dd99d58e071aba0b03880128d920bd1c063783cc280f9531fbdc5be614c82e",
        "PhC-C2DH-U373": "b18185c18fce54e8eeb93e4bbb9b201d757add9409bbf2283b8114185a11bc9e",
        "PhC-C2DL-PSC": "9d54bb8febc8798934a21bf92e05d92f5e8557c87e28834b2832591cdda78422",
    },
    "test": {
        "BF-C2DL-HSC": "fd1c05ec625fd0526c8369d1139babe137e885457eee98c10d957da578d0d5bc",
        "BF-C2DL-MuSC": "c5cae259e6090e82a2596967fb54c8a768717c1772398f8546ad1c8df0820450",
        "DIC-C2DH-HeLa": "5e5d5f2aa90aef99d750cf03f5c12d799d50b892f98c86950e07a2c5955ac01f",
        "Fluo-C2DL-Huh7": "cc7359f8fb6b0c43995365e83ce0116d32f477ac644b2ca02b98bc253e2bcbbe",
        "Fluo-C2DL-MSC": "c90b13e603dde52f17801d4f0cadde04ed7f21cc05296b1f0957d92dbfc8ffa6",
        "Fluo-N2DH-GOWT1": "c6893ec2d63459de49d4dc21009b04275573403c62cc02e6ee8d0cb1a5068add",
        "Fluo-N2DH-SIM+": "c4f257add739b284d02176057814de345dee2ac1a7438e360ccd2df73618db68",
        "Fluo-N2DL-HeLa": "45cf3daf05e8495aa2ce0febacca4cf0928fab808c0b14ed2eb7289a819e6bb8",
        "PhC-C2DH-U373": "7aa3162e4363a416b259149adc13c9b09cb8aecfe8165eb1428dd534b66bec8a",
        "PhC-C2DL-PSC": "8c98ac6203e7490157ceb6aa1131d60a3863001b61fb75e784bc49d47ee264d5",
    }
}
 45
 46
 47def _get_ctc_url_and_checksum(dataset_name, split):
 48    if split == "train":
 49        _link_to_split = "training-datasets"
 50    else:
 51        _link_to_split = "test-datasets"
 52
 53    url = f"http://data.celltrackingchallenge.net/{_link_to_split}/{dataset_name}.zip"
 54    checksum = CTC_CHECKSUMS[split][dataset_name]
 55    return url, checksum
 56
 57
 58def get_ctc_segmentation_data(
 59    path: Union[os.PathLike, str], dataset_name: str, split: str, download: bool = False,
 60) -> str:
 61    f"""Download training data from the Cell Tracking Challenge.
 62
 63    Args:
 64        path: Filepath to a folder where the downloaded data will be saved.
 65        dataset_name: Name of the dataset to be downloaded. The available datasets are:
 66            {', '.join(CTC_CHECKSUMS['train'].keys())}
 67        split: The split to download. Either 'train' or 'test'.
 68        download: Whether to download the data if it is not present.
 69
 70    Returns:
 71        The filepath to the training data.
 72    """
 73    dataset_names = list(CTC_CHECKSUMS["train"].keys())
 74    if dataset_name not in dataset_names:
 75        raise ValueError(f"Invalid dataset: {dataset_name}, choose one of {dataset_names}.")
 76
 77    data_path = os.path.join(path, split, dataset_name)
 78
 79    if os.path.exists(data_path):
 80        return data_path
 81
 82    os.makedirs(data_path)
 83    url, checksum = _get_ctc_url_and_checksum(dataset_name, split)
 84    zip_path = os.path.join(path, f"{dataset_name}.zip")
 85    util.download_source(zip_path, url, download, checksum=checksum)
 86    util.unzip(zip_path, os.path.join(path, split), remove=True)
 87
 88    return data_path
 89
 90
 91def _require_gt_images(data_path, vol_ids):
 92    image_paths, label_paths = [], []
 93
 94    if isinstance(vol_ids, str):
 95        vol_ids = [vol_ids]
 96
 97    for vol_id in vol_ids:
 98        image_folder = os.path.join(data_path, vol_id)
 99        assert os.path.join(image_folder), f"Cannot find volume id, {vol_id} in {data_path}."
100
101        label_folder = os.path.join(data_path, f"{vol_id}_GT", "SEG")
102
103        # copy over the images corresponding to the labeled frames
104        label_image_folder = os.path.join(data_path, f"{vol_id}_GT", "IM")
105        os.makedirs(label_image_folder, exist_ok=True)
106
107        this_label_paths = glob(os.path.join(label_folder, "*.tif"))
108        for label_path in this_label_paths:
109            fname = os.path.basename(label_path)
110            image_label_path = os.path.join(label_image_folder, fname)
111            if not os.path.exists(image_label_path):
112                im_name = "t" + fname.lstrip("main_seg")
113                image_path = os.path.join(image_folder, im_name)
114                assert os.path.join(image_path), image_path
115                copyfile(image_path, image_label_path)
116
117        image_paths.append(label_image_folder)
118        label_paths.append(label_folder)
119
120    return image_paths, label_paths
121
122
def get_ctc_segmentation_paths(
    path: Union[os.PathLike, str],
    dataset_name: str,
    split: str = "train",
    vol_id: Optional[int] = None,
    download: bool = False,
) -> Tuple[str, str]:
    """Get paths to the Cell Tracking Challenge data.

    Note: a plain (non f-string) docstring is used on purpose here; an f-string
    is not a string literal and would not be assigned to ``__doc__``.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets
            are the keys of ``CTC_CHECKSUMS['train']``.
        split: The split to download. Currently only supports 'train'.
        vol_id: The train id to load. By default all volumes with ground-truth are used.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath(s) to the folder(s) where image data is stored.
        Filepath(s) to the folder(s) where label data is stored.
    """
    data_path = get_ctc_segmentation_data(path, dataset_name, split, download)

    if vol_id is None:
        # Derive the volume ids from the ground-truth folders, e.g. '01_GT' -> '01'.
        # BUGFIX: slice off the '_GT' suffix; the original `rstrip("_GT")` strips
        # the character set {'_', 'G', 'T'} and would also eat trailing 'G'/'T'
        # characters belonging to the id itself.
        gt_folders = glob(os.path.join(data_path, "*_GT"))
        vol_ids = [os.path.basename(folder)[:-len("_GT")] for folder in gt_folders]
    else:
        vol_ids = vol_id

    image_path, label_path = _require_gt_images(data_path, vol_ids)
    return image_path, label_path
155
156
def get_ctc_segmentation_dataset(
    path: Union[os.PathLike, str],
    dataset_name: str,
    patch_shape: Tuple[int, int, int],
    split: str = "train",
    vol_id: Optional[int] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the CTC dataset for cell segmentation.

    Note: a plain (non f-string) docstring is used on purpose here; an f-string
    is not a string literal and would not be assigned to ``__doc__``.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets
            are the keys of ``CTC_CHECKSUMS['train']``.
        patch_shape: The patch shape to use for training.
        split: The split to download. Currently only supports 'train'.
        vol_id: The train id to load. By default all volumes with ground-truth are used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If `split` is not 'train'.
    """
    # Validate with an explicit exception instead of `assert`, which is
    # silently stripped when Python runs with -O.
    if split != "train":
        raise ValueError(f"Invalid split: {split}, only 'train' is supported.")

    image_path, label_path = get_ctc_segmentation_paths(path, dataset_name, split, vol_id, download)

    # The data is stored as 2d frames, hence train with ndim=2 by default.
    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    return torch_em.default_segmentation_dataset(
        raw_paths=image_path,
        raw_key="*.tif",
        label_paths=label_path,
        label_key="*.tif",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )
196
197
def get_ctc_segmentation_loader(
    path: Union[os.PathLike, str],
    dataset_name: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    split: str = "train",
    vol_id: Optional[int] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the CTC dataloader for cell segmentation.

    Note: a plain (non f-string) docstring is used on purpose here; an f-string
    is not a string literal and would not be assigned to ``__doc__``.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets
            are the keys of ``CTC_CHECKSUMS['train']``.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to download. Currently only supports 'train'.
        vol_id: The train id to load. By default all volumes with ground-truth are used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Split the kwargs into those consumed by the dataset and those consumed
    # by the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_ctc_segmentation_dataset(path, dataset_name, patch_shape, split, vol_id, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
CTC_CHECKSUMS = {'train': {'BF-C2DL-HSC': '0aa68ec37a9b06e72a5dfa07d809f56e1775157fb674bb75ff904936149657b1', 'BF-C2DL-MuSC': 'ca72b59042809120578a198ba236e5ed3504dd6a122ef969428b7c64f0a5e67d', 'DIC-C2DH-HeLa': '832fed2d05bb7488cf9c51a2994b75f8f3f53b3c3098856211f2d39023c34e1a', 'Fluo-C2DL-Huh7': '1912658c1b3d8b38b314eb658b559e7b39c256917150e9b3dd8bfdc77347617d', 'Fluo-C2DL-MSC': 'a083521f0cb673ae02d4957c5e6580c2e021943ef88101f6a2f61b944d671af2', 'Fluo-N2DH-GOWT1': '1a7bd9a7d1d10c4122c7782427b437246fb69cc3322a975485c04e206f64fc2c', 'Fluo-N2DH-SIM+': '3e809148c87ace80c72f563b56c35e0d9448dcdeb461a09c83f61e93f5e40ec8', 'Fluo-N2DL-HeLa': '35dd99d58e071aba0b03880128d920bd1c063783cc280f9531fbdc5be614c82e', 'PhC-C2DH-U373': 'b18185c18fce54e8eeb93e4bbb9b201d757add9409bbf2283b8114185a11bc9e', 'PhC-C2DL-PSC': '9d54bb8febc8798934a21bf92e05d92f5e8557c87e28834b2832591cdda78422'}, 'test': {'BF-C2DL-HSC': 'fd1c05ec625fd0526c8369d1139babe137e885457eee98c10d957da578d0d5bc', 'BF-C2DL-MuSC': 'c5cae259e6090e82a2596967fb54c8a768717c1772398f8546ad1c8df0820450', 'DIC-C2DH-HeLa': '5e5d5f2aa90aef99d750cf03f5c12d799d50b892f98c86950e07a2c5955ac01f', 'Fluo-C2DL-Huh7': 'cc7359f8fb6b0c43995365e83ce0116d32f477ac644b2ca02b98bc253e2bcbbe', 'Fluo-C2DL-MSC': 'c90b13e603dde52f17801d4f0cadde04ed7f21cc05296b1f0957d92dbfc8ffa6', 'Fluo-N2DH-GOWT1': 'c6893ec2d63459de49d4dc21009b04275573403c62cc02e6ee8d0cb1a5068add', 'Fluo-N2DH-SIM+': 'c4f257add739b284d02176057814de345dee2ac1a7438e360ccd2df73618db68', 'Fluo-N2DL-HeLa': '45cf3daf05e8495aa2ce0febacca4cf0928fab808c0b14ed2eb7289a819e6bb8', 'PhC-C2DH-U373': '7aa3162e4363a416b259149adc13c9b09cb8aecfe8165eb1428dd534b66bec8a', 'PhC-C2DL-PSC': '8c98ac6203e7490157ceb6aa1131d60a3863001b61fb75e784bc49d47ee264d5'}}
def get_ctc_segmentation_data( path: Union[os.PathLike, str], dataset_name: str, split: str, download: bool = False) -> str:
59def get_ctc_segmentation_data(
60    path: Union[os.PathLike, str], dataset_name: str, split: str, download: bool = False,
61) -> str:
62    f"""Download training data from the Cell Tracking Challenge.
63
64    Args:
65        path: Filepath to a folder where the downloaded data will be saved.
66        dataset_name: Name of the dataset to be downloaded. The available datasets are:
67            {', '.join(CTC_CHECKSUMS['train'].keys())}
68        split: The split to download. Either 'train' or 'test'.
69        download: Whether to download the data if it is not present.
70
71    Returns:
72        The filepath to the training data.
73    """
74    dataset_names = list(CTC_CHECKSUMS["train"].keys())
75    if dataset_name not in dataset_names:
76        raise ValueError(f"Invalid dataset: {dataset_name}, choose one of {dataset_names}.")
77
78    data_path = os.path.join(path, split, dataset_name)
79
80    if os.path.exists(data_path):
81        return data_path
82
83    os.makedirs(data_path)
84    url, checksum = _get_ctc_url_and_checksum(dataset_name, split)
85    zip_path = os.path.join(path, f"{dataset_name}.zip")
86    util.download_source(zip_path, url, download, checksum=checksum)
87    util.unzip(zip_path, os.path.join(path, split), remove=True)
88
89    return data_path
def get_ctc_segmentation_paths( path: Union[os.PathLike, str], dataset_name: str, split: str = 'train', vol_id: Optional[int] = None, download: bool = False) -> Tuple[str, str]:
124def get_ctc_segmentation_paths(
125    path: Union[os.PathLike, str],
126    dataset_name: str,
127    split: str = "train",
128    vol_id: Optional[int] = None,
129    download: bool = False,
130) -> Tuple[str, str]:
131    f"""Get paths to the Cell Tracking Challenge data.
132
133    Args:
134        path: Filepath to a folder where the downloaded data will be saved.
135        dataset_name: Name of the dataset to be downloaded. The available datasets are:
136            {', '.join(CTC_CHECKSUMS['train'].keys())}
137        split: The split to download. Currently only supports 'train'.
138        vol_id: The train id to load.
139        download: Whether to download the data if it is not present.
140
141    Returns:
142        Filepath to the folder where image data is stored.
143        Filepath to the folder where label data is stored.
144    """
145    data_path = get_ctc_segmentation_data(path, dataset_name, split, download)
146
147    if vol_id is None:
148        vol_ids = glob(os.path.join(data_path, "*_GT"))
149        vol_ids = [os.path.basename(vol_id) for vol_id in vol_ids]
150        vol_ids = [vol_id.rstrip("_GT") for vol_id in vol_ids]
151    else:
152        vol_ids = vol_id
153
154    image_path, label_path = _require_gt_images(data_path, vol_ids)
155    return image_path, label_path
def get_ctc_segmentation_dataset( path: Union[os.PathLike, str], dataset_name: str, patch_shape: Tuple[int, int, int], split: str = 'train', vol_id: Optional[int] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
158def get_ctc_segmentation_dataset(
159    path: Union[os.PathLike, str],
160    dataset_name: str,
161    patch_shape: Tuple[int, int, int],
162    split: str = "train",
163    vol_id: Optional[int] = None,
164    download: bool = False,
165    **kwargs,
166) -> Dataset:
167    f"""Get the CTC dataset for cell segmentation.
168
169    Args:
170        path: Filepath to a folder where the downloaded data will be saved.
171        dataset_name: Name of the dataset to be downloaded. The available datasets are:
172            {', '.join(CTC_CHECKSUMS['train'].keys())}
173        patch_shape: The patch shape to use for training.
174        split: The split to download. Currently only supports 'train'.
175        vol_id: The train id to load.
176        download: Whether to download the data if it is not present.
177        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
178
179    Returns:
180       The segmentation dataset.
181    """
182    assert split in ["train"]
183
184    image_path, label_path = get_ctc_segmentation_paths(path, dataset_name, split, vol_id, download)
185
186    kwargs = util.update_kwargs(kwargs, "ndim", 2)
187
188    return torch_em.default_segmentation_dataset(
189        raw_paths=image_path,
190        raw_key="*.tif",
191        label_paths=label_path,
192        label_key="*.tif",
193        patch_shape=patch_shape,
194        is_seg_dataset=True,
195        **kwargs
196    )
def get_ctc_segmentation_loader( path: Union[os.PathLike, str], dataset_name: str, patch_shape: Tuple[int, int, int], batch_size: int, split: str = 'train', vol_id: Optional[int] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
199def get_ctc_segmentation_loader(
200    path: Union[os.PathLike, str],
201    dataset_name: str,
202    patch_shape: Tuple[int, int, int],
203    batch_size: int,
204    split: str = "train",
205    vol_id: Optional[int] = None,
206    download: bool = False,
207    **kwargs,
208) -> DataLoader:
209    f"""Get the CTC dataloader for cell segmentation.
210
211    Args:
212        path: Filepath to a folder where the downloaded data will be saved.
213        dataset_name: Name of the dataset to be downloaded. The available datasets are:
214            {', '.join(CTC_CHECKSUMS['train'].keys())}
215        patch_shape: The patch shape to use for training.
216        batch_size: The batch size for training.
217        split: The split to download. Currently only supports 'train'.
218        vol_id: The train id to load.
219        download: Whether to download the data if it is not present.
220        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
221
222    Returns:
223       The DataLoader.
224    """
225    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
226    dataset = get_ctc_segmentation_dataset(path, dataset_name, patch_shape, split, vol_id, download, **ds_kwargs)
227    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)