torch_em.data.datasets.light_microscopy.ctc

The Cell Tracking Challenge contains annotated data for cell segmentation and tracking.

We currently provide the 2d datasets with segmentation annotations. If you use this data in your research please cite https://doi.org/10.1038/nmeth.4473.

"""The Cell Tracking Challenge contains annotated data for cell segmentation and tracking.

We currently provide the 2d datasets with segmentation annotations.
If you use this data in your research please cite https://doi.org/10.1038/nmeth.4473.
"""
  6
  7import os
  8from glob import glob
  9from shutil import copyfile
 10from typing import Optional, Tuple, Union
 11
 12import torch_em
 13from torch.utils.data import Dataset, DataLoader
 14from .. import util
 15
 16
# SHA256 checksums for the zip archives of the 2d CTC datasets,
# keyed first by split ("train" / "test") and then by dataset name.
# Used to verify the integrity of the downloads in `get_ctc_data`.
CTC_CHECKSUMS = {
    "train": {
        "BF-C2DL-HSC": "0aa68ec37a9b06e72a5dfa07d809f56e1775157fb674bb75ff904936149657b1",
        "BF-C2DL-MuSC": "ca72b59042809120578a198ba236e5ed3504dd6a122ef969428b7c64f0a5e67d",
        "DIC-C2DH-HeLa": "832fed2d05bb7488cf9c51a2994b75f8f3f53b3c3098856211f2d39023c34e1a",
        "Fluo-C2DL-Huh7": "1912658c1b3d8b38b314eb658b559e7b39c256917150e9b3dd8bfdc77347617d",
        "Fluo-C2DL-MSC": "a083521f0cb673ae02d4957c5e6580c2e021943ef88101f6a2f61b944d671af2",
        "Fluo-N2DH-GOWT1": "1a7bd9a7d1d10c4122c7782427b437246fb69cc3322a975485c04e206f64fc2c",
        "Fluo-N2DH-SIM+": "3e809148c87ace80c72f563b56c35e0d9448dcdeb461a09c83f61e93f5e40ec8",
        "Fluo-N2DL-HeLa": "35dd99d58e071aba0b03880128d920bd1c063783cc280f9531fbdc5be614c82e",
        "PhC-C2DH-U373": "b18185c18fce54e8eeb93e4bbb9b201d757add9409bbf2283b8114185a11bc9e",
        "PhC-C2DL-PSC": "9d54bb8febc8798934a21bf92e05d92f5e8557c87e28834b2832591cdda78422",
    },
    "test": {
        "BF-C2DL-HSC": "fd1c05ec625fd0526c8369d1139babe137e885457eee98c10d957da578d0d5bc",
        "BF-C2DL-MuSC": "c5cae259e6090e82a2596967fb54c8a768717c1772398f8546ad1c8df0820450",
        "DIC-C2DH-HeLa": "5e5d5f2aa90aef99d750cf03f5c12d799d50b892f98c86950e07a2c5955ac01f",
        "Fluo-C2DL-Huh7": "cc7359f8fb6b0c43995365e83ce0116d32f477ac644b2ca02b98bc253e2bcbbe",
        "Fluo-C2DL-MSC": "c90b13e603dde52f17801d4f0cadde04ed7f21cc05296b1f0957d92dbfc8ffa6",
        "Fluo-N2DH-GOWT1": "c6893ec2d63459de49d4dc21009b04275573403c62cc02e6ee8d0cb1a5068add",
        "Fluo-N2DH-SIM+": "c4f257add739b284d02176057814de345dee2ac1a7438e360ccd2df73618db68",
        "Fluo-N2DL-HeLa": "45cf3daf05e8495aa2ce0febacca4cf0928fab808c0b14ed2eb7289a819e6bb8",
        "PhC-C2DH-U373": "7aa3162e4363a416b259149adc13c9b09cb8aecfe8165eb1428dd534b66bec8a",
        "PhC-C2DL-PSC": "8c98ac6203e7490157ceb6aa1131d60a3863001b61fb75e784bc49d47ee264d5",
    }
}
 43
 44
 45def _get_ctc_url_and_checksum(dataset_name, split):
 46    if split == "train":
 47        _link_to_split = "training-datasets"
 48    else:
 49        _link_to_split = "test-datasets"
 50
 51    url = f"http://data.celltrackingchallenge.net/{_link_to_split}/{dataset_name}.zip"
 52    checksum = CTC_CHECKSUMS[split][dataset_name]
 53    return url, checksum
 54
 55
 56def get_ctc_data(
 57    path: Union[os.PathLike, str],
 58    dataset_name: str,
 59    download: bool,
 60    split: str
 61) -> str:
 62    f"""Download training data from the cell tracking challenge.
 63
 64    Args:
 65        path: Filepath to a folder where the downloaded data will be saved.
 66        dataset_name: Name of the dataset to be downloaded. The available datasets are:
 67            {', '.join(CTC_CHECKSUMS['train'].keys())}
 68        download: Whether to download the data if it is not present.
 69        split: The split to download. Either 'train' or 'test'.
 70
 71    Returns:
 72        The filepath to the training data.
 73    """
 74    dataset_names = list(CTC_CHECKSUMS["train"].keys())
 75    if dataset_name not in dataset_names:
 76        raise ValueError(f"Invalid dataset: {dataset_name}, choose one of {dataset_names}.")
 77
 78    data_path = os.path.join(path, split, dataset_name)
 79
 80    if os.path.exists(data_path):
 81        return data_path
 82
 83    os.makedirs(data_path)
 84    url, checksum = _get_ctc_url_and_checksum(dataset_name, split)
 85    zip_path = os.path.join(path, f"{dataset_name}.zip")
 86    util.download_source(zip_path, url, download, checksum=checksum)
 87    util.unzip(zip_path, os.path.join(path, split), remove=True)
 88
 89    return data_path
 90
 91
 92def _require_gt_images(data_path, vol_ids):
 93    image_paths, label_paths = [], []
 94
 95    if isinstance(vol_ids, str):
 96        vol_ids = [vol_ids]
 97
 98    for vol_id in vol_ids:
 99        image_folder = os.path.join(data_path, vol_id)
100        assert os.path.join(image_folder), f"Cannot find volume id, {vol_id} in {data_path}."
101
102        label_folder = os.path.join(data_path, f"{vol_id}_GT", "SEG")
103
104        # copy over the images corresponding to the labeled frames
105        label_image_folder = os.path.join(data_path, f"{vol_id}_GT", "IM")
106        os.makedirs(label_image_folder, exist_ok=True)
107
108        this_label_paths = glob(os.path.join(label_folder, "*.tif"))
109        for label_path in this_label_paths:
110            fname = os.path.basename(label_path)
111            image_label_path = os.path.join(label_image_folder, fname)
112            if not os.path.exists(image_label_path):
113                im_name = "t" + fname.lstrip("main_seg")
114                image_path = os.path.join(image_folder, im_name)
115                assert os.path.join(image_path), image_path
116                copyfile(image_path, image_label_path)
117
118        image_paths.append(label_image_folder)
119        label_paths.append(label_folder)
120
121    return image_paths, label_paths
122
123
def get_ctc_segmentation_dataset(
    path: Union[os.PathLike, str],
    dataset_name: str,
    patch_shape: Tuple[int, int, int],
    split: str = "train",
    vol_id: Optional[str] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the CTC dataset for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets are:
            BF-C2DL-HSC, BF-C2DL-MuSC, DIC-C2DH-HeLa, Fluo-C2DL-Huh7, Fluo-C2DL-MSC,
            Fluo-N2DH-GOWT1, Fluo-N2DH-SIM+, Fluo-N2DL-HeLa, PhC-C2DH-U373, PhC-C2DL-PSC.
        patch_shape: The patch shape to use for training.
        split: The split to download. Currently only supports 'train'.
        vol_id: The volume id to load, e.g. '01'. By default all volumes with
            ground-truth annotations are loaded.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    # Only the training split comes with segmentation annotations.
    if split != "train":
        raise ValueError(f"Invalid split: {split}, only 'train' is supported.")

    data_path = get_ctc_data(path, dataset_name, download, split)

    if vol_id is None:
        # Derive the volume ids from the ground-truth folders, e.g. '01_GT' -> '01'.
        # (Previously vol_id.rstrip("_GT") was used, which strips a *character set*
        # from the right and could corrupt ids ending in '_', 'G' or 'T'.)
        gt_folders = glob(os.path.join(data_path, "*_GT"))
        vol_ids = [os.path.basename(folder)[:-len("_GT")] for folder in gt_folders]
    else:
        vol_ids = vol_id

    image_path, label_path = _require_gt_images(data_path, vol_ids)

    kwargs = util.update_kwargs(kwargs, "ndim", 2)
    return torch_em.default_segmentation_dataset(
        image_path, "*.tif", label_path, "*.tif", patch_shape, is_seg_dataset=True, **kwargs
    )
165
166
def get_ctc_segmentation_loader(
    path: Union[os.PathLike, str],
    dataset_name: str,
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    split: str = "train",
    vol_id: Optional[int] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the CTC dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        dataset_name: Name of the dataset to be downloaded. The available datasets are:
            BF-C2DL-HSC, BF-C2DL-MuSC, DIC-C2DH-HeLa, Fluo-C2DL-Huh7, Fluo-C2DL-MSC,
            Fluo-N2DH-GOWT1, Fluo-N2DH-SIM+, Fluo-N2DL-HeLa, PhC-C2DH-U373, PhC-C2DL-PSC.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to download. Currently only supports 'train'.
        vol_id: The train id to load.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
       The DataLoader.
    """
    # Route the dataset-specific kwargs to the dataset and the rest to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    seg_dataset = get_ctc_segmentation_dataset(
        path, dataset_name, patch_shape, split=split, vol_id=vol_id, download=download, **dataset_kwargs,
    )
    return torch_em.get_data_loader(seg_dataset, batch_size, **dataloader_kwargs)
CTC_CHECKSUMS = {'train': {'BF-C2DL-HSC': '0aa68ec37a9b06e72a5dfa07d809f56e1775157fb674bb75ff904936149657b1', 'BF-C2DL-MuSC': 'ca72b59042809120578a198ba236e5ed3504dd6a122ef969428b7c64f0a5e67d', 'DIC-C2DH-HeLa': '832fed2d05bb7488cf9c51a2994b75f8f3f53b3c3098856211f2d39023c34e1a', 'Fluo-C2DL-Huh7': '1912658c1b3d8b38b314eb658b559e7b39c256917150e9b3dd8bfdc77347617d', 'Fluo-C2DL-MSC': 'a083521f0cb673ae02d4957c5e6580c2e021943ef88101f6a2f61b944d671af2', 'Fluo-N2DH-GOWT1': '1a7bd9a7d1d10c4122c7782427b437246fb69cc3322a975485c04e206f64fc2c', 'Fluo-N2DH-SIM+': '3e809148c87ace80c72f563b56c35e0d9448dcdeb461a09c83f61e93f5e40ec8', 'Fluo-N2DL-HeLa': '35dd99d58e071aba0b03880128d920bd1c063783cc280f9531fbdc5be614c82e', 'PhC-C2DH-U373': 'b18185c18fce54e8eeb93e4bbb9b201d757add9409bbf2283b8114185a11bc9e', 'PhC-C2DL-PSC': '9d54bb8febc8798934a21bf92e05d92f5e8557c87e28834b2832591cdda78422'}, 'test': {'BF-C2DL-HSC': 'fd1c05ec625fd0526c8369d1139babe137e885457eee98c10d957da578d0d5bc', 'BF-C2DL-MuSC': 'c5cae259e6090e82a2596967fb54c8a768717c1772398f8546ad1c8df0820450', 'DIC-C2DH-HeLa': '5e5d5f2aa90aef99d750cf03f5c12d799d50b892f98c86950e07a2c5955ac01f', 'Fluo-C2DL-Huh7': 'cc7359f8fb6b0c43995365e83ce0116d32f477ac644b2ca02b98bc253e2bcbbe', 'Fluo-C2DL-MSC': 'c90b13e603dde52f17801d4f0cadde04ed7f21cc05296b1f0957d92dbfc8ffa6', 'Fluo-N2DH-GOWT1': 'c6893ec2d63459de49d4dc21009b04275573403c62cc02e6ee8d0cb1a5068add', 'Fluo-N2DH-SIM+': 'c4f257add739b284d02176057814de345dee2ac1a7438e360ccd2df73618db68', 'Fluo-N2DL-HeLa': '45cf3daf05e8495aa2ce0febacca4cf0928fab808c0b14ed2eb7289a819e6bb8', 'PhC-C2DH-U373': '7aa3162e4363a416b259149adc13c9b09cb8aecfe8165eb1428dd534b66bec8a', 'PhC-C2DL-PSC': '8c98ac6203e7490157ceb6aa1131d60a3863001b61fb75e784bc49d47ee264d5'}}
def get_ctc_data( path: Union[os.PathLike, str], dataset_name: str, download: bool, split: str) -> str:
57def get_ctc_data(
58    path: Union[os.PathLike, str],
59    dataset_name: str,
60    download: bool,
61    split: str
62) -> str:
63    f"""Download training data from the cell tracking challenge.
64
65    Args:
66        path: Filepath to a folder where the downloaded data will be saved.
67        dataset_name: Name of the dataset to be downloaded. The available datasets are:
68            {', '.join(CTC_CHECKSUMS['train'].keys())}
69        download: Whether to download the data if it is not present.
70        split: The split to download. Either 'train' or 'test'.
71
72    Returns:
73        The filepath to the training data.
74    """
75    dataset_names = list(CTC_CHECKSUMS["train"].keys())
76    if dataset_name not in dataset_names:
77        raise ValueError(f"Invalid dataset: {dataset_name}, choose one of {dataset_names}.")
78
79    data_path = os.path.join(path, split, dataset_name)
80
81    if os.path.exists(data_path):
82        return data_path
83
84    os.makedirs(data_path)
85    url, checksum = _get_ctc_url_and_checksum(dataset_name, split)
86    zip_path = os.path.join(path, f"{dataset_name}.zip")
87    util.download_source(zip_path, url, download, checksum=checksum)
88    util.unzip(zip_path, os.path.join(path, split), remove=True)
89
90    return data_path
def get_ctc_segmentation_dataset( path: Union[os.PathLike, str], dataset_name: str, patch_shape: Tuple[int, int, int], split: str = 'train', vol_id: Optional[int] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
125def get_ctc_segmentation_dataset(
126    path: Union[os.PathLike, str],
127    dataset_name: str,
128    patch_shape: Tuple[int, int, int],
129    split: str = "train",
130    vol_id: Optional[int] = None,
131    download: bool = False,
132    **kwargs,
133) -> Dataset:
134    """Get the CTC dataset for cell segmentation.
135
136    Args:
137        path: Filepath to a folder where the downloaded data will be saved.
138        dataset_name: Name of the dataset to be downloaded. The available datasets are:
139            {', '.join(CTC_CHECKSUMS['train'].keys())}
140        patch_shape: The patch shape to use for training.
141        split: The split to download. Currently only supports 'train'.
142        vol_id: The train id to load.
143        download: Whether to download the data if it is not present.
144        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
145
146    Returns:
147       The segmentation dataset.
148    """
149    assert split in ["train"]
150
151    data_path = get_ctc_data(path, dataset_name, download, split)
152
153    if vol_id is None:
154        vol_ids = glob(os.path.join(data_path, "*_GT"))
155        vol_ids = [os.path.basename(vol_id) for vol_id in vol_ids]
156        vol_ids = [vol_id.rstrip("_GT") for vol_id in vol_ids]
157    else:
158        vol_ids = vol_id
159
160    image_path, label_path = _require_gt_images(data_path, vol_ids)
161
162    kwargs = util.update_kwargs(kwargs, "ndim", 2)
163    return torch_em.default_segmentation_dataset(
164        image_path, "*.tif", label_path, "*.tif", patch_shape, is_seg_dataset=True, **kwargs
165    )

Get the CTC dataset for cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • dataset_name: Name of the dataset to be downloaded. The available datasets are: BF-C2DL-HSC, BF-C2DL-MuSC, DIC-C2DH-HeLa, Fluo-C2DL-Huh7, Fluo-C2DL-MSC, Fluo-N2DH-GOWT1, Fluo-N2DH-SIM+, Fluo-N2DL-HeLa, PhC-C2DH-U373, PhC-C2DL-PSC
  • patch_shape: The patch shape to use for training.
  • split: The split to download. Currently only supports 'train'.
  • vol_id: The train id to load.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_ctc_segmentation_loader( path: Union[os.PathLike, str], dataset_name: str, patch_shape: Tuple[int, int, int], batch_size: int, split: str = 'train', vol_id: Optional[int] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
168def get_ctc_segmentation_loader(
169    path: Union[os.PathLike, str],
170    dataset_name: str,
171    patch_shape: Tuple[int, int, int],
172    batch_size: int,
173    split: str = "train",
174    vol_id: Optional[int] = None,
175    download: bool = False,
176    **kwargs,
177) -> DataLoader:
178    """Get the CTC dataloader for cell segmentation.
179
180    Args:
181        path: Filepath to a folder where the downloaded data will be saved.
182        dataset_name: Name of the dataset to be downloaded. The available datasets are:
183            {', '.join(CTC_CHECKSUMS['train'].keys())}
184        patch_shape: The patch shape to use for training.
185        batch_size: The batch size for training.
186        split: The split to download. Currently only supports 'train'.
187        vol_id: The train id to load.
188        download: Whether to download the data if it is not present.
189        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
190
191    Returns:
192       The DataLoader.
193    """
194    ds_kwargs, loader_kwargs = util.split_kwargs(
195        torch_em.default_segmentation_dataset, **kwargs
196    )
197    dataset = get_ctc_segmentation_dataset(
198        path, dataset_name, patch_shape, split=split, vol_id=vol_id, download=download, **ds_kwargs,
199    )
200
201    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
202    return loader

Get the CTC dataloader for cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • dataset_name: Name of the dataset to be downloaded. The available datasets are: BF-C2DL-HSC, BF-C2DL-MuSC, DIC-C2DH-HeLa, Fluo-C2DL-Huh7, Fluo-C2DL-MSC, Fluo-N2DH-GOWT1, Fluo-N2DH-SIM+, Fluo-N2DL-HeLa, PhC-C2DH-U373, PhC-C2DL-PSC
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • split: The split to download. Currently only supports 'train'.
  • vol_id: The train id to load.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.