torch_em.data.datasets.light_microscopy.cshaper

The CShaper dataset contains 3D fluorescence microscopy images of Caenorhabditis elegans early embryos with cell instance segmentation annotations.

The dataset is organised into training and evaluation splits:

  • Training: Sample01, Sample02 (27 timepoints each)
  • Evaluation: Sample02, Sample03, Sample04 (7 timepoints each)

Each timepoint is a separate 3D NIfTI volume (.nii.gz):

  • Raw membrane images: RawMemb/{sample}_{tp}_rawMemb.nii.gz
  • Cell segmentation: SegCell/{sample}_{tp}_segCell.nii.gz

NOTE: The data must be downloaded manually. Download the zip from the SharePoint link provided at https://doi.org/10.6084/m9.figshare.12839315 and place it as {path}/OneDrive.zip (or whatever filename it downloads as).

The dataset is from the publication https://doi.org/10.1038/s41467-020-19863-x. Please cite it if you use this dataset in your research.

  1"""The CShaper dataset contains 3D fluorescence microscopy images of Caenorhabditis
  2elegans early embryos with cell instance segmentation annotations.
  3
  4The dataset is organised into training and evaluation splits:
  5- Training: Sample01, Sample02 (27 timepoints each)
  6- Evaluation: Sample02, Sample03, Sample04 (7 timepoints each)
  7
  8Each timepoint is a separate 3D NIfTI volume (.nii.gz):
  9- Raw membrane images: RawMemb/{sample}_{tp}_rawMemb.nii.gz
 10- Cell segmentation: SegCell/{sample}_{tp}_segCell.nii.gz
 11
 12NOTE: The data must be downloaded manually. Download the zip from the SharePoint link
 13provided at https://doi.org/10.6084/m9.figshare.12839315 and place it as
 14`{path}/OneDrive.zip` (or whatever filename it downloads as).
 15
 16The dataset is from the publication https://doi.org/10.1038/s41467-020-19863-x.
 17Please cite it if you use this dataset in your research.
 18"""
 19
 20import os
 21from glob import glob
 22from natsort import natsorted
 23from typing import List, Literal, Optional, Tuple, Union
 24
 25from torch.utils.data import Dataset, DataLoader
 26
 27import torch_em
 28
 29from .. import util
 30
 31
 32# Root path inside the zip after extraction
 33_ZIP_ROOT = "CShaper Supplementary Data/DMapNet Training and Evaluation"
 34
 35TRAIN_SAMPLES = ["Sample01", "Sample02"]
 36EVAL_SAMPLES = ["Sample02", "Sample03", "Sample04"]
 37
 38
 39def get_cshaper_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 40    """Extract the CShaper dataset zip.
 41
 42    NOTE: The zip must be downloaded manually from the SharePoint link at
 43    https://doi.org/10.6084/m9.figshare.12839315 and placed inside `path`.
 44    Any zip file found in `path` will be extracted automatically.
 45
 46    Args:
 47        path: Filepath to a folder containing the downloaded CShaper zip.
 48        download: Ignored (manual download required).
 49
 50    Returns:
 51        The filepath to the extracted data root directory.
 52    """
 53    data_dir = os.path.join(path, _ZIP_ROOT)
 54    if os.path.exists(data_dir):
 55        return data_dir
 56
 57    # Find any zip in path
 58    zips = glob(os.path.join(path, "*.zip"))
 59    if not zips:
 60        raise RuntimeError(
 61            f"No zip file found in {path}. "
 62            "Please download the CShaper data manually from the SharePoint link at "
 63            "https://doi.org/10.6084/m9.figshare.12839315 and place the zip in `path`."
 64        )
 65
 66    util.unzip(zips[0], path)
 67    return data_dir
 68
 69
 70def _convert_to_h5(data_dir: str, split: str) -> str:
 71    """Convert NIfTI timepoint files to per-timepoint HDF5 files.
 72
 73    Args:
 74        data_dir: The extracted CShaper root directory.
 75        split: "train" or "val".
 76
 77    Returns:
 78        The directory containing the converted HDF5 files.
 79    """
 80    try:
 81        import nibabel as nib
 82    except ImportError:
 83        raise RuntimeError(
 84            "The 'nibabel' package is required to read CShaper NIfTI files. "
 85            "Install with: pip install nibabel"
 86        )
 87    import h5py
 88
 89    split_subdir = "TrainingData" if split == "train" else "EvaluationData"
 90    split_dir = os.path.join(data_dir, split_subdir)
 91
 92    h5_dir = os.path.join(data_dir, f"h5_{split}")
 93    if os.path.exists(h5_dir) and len(glob(os.path.join(h5_dir, "*.h5"))) > 0:
 94        return h5_dir
 95    os.makedirs(h5_dir, exist_ok=True)
 96
 97    sample_dirs = natsorted([
 98        d for d in glob(os.path.join(split_dir, "*/")) if os.path.isdir(d)
 99    ])
100
101    for sample_dir in sample_dirs:
102        raw_files = natsorted(glob(os.path.join(sample_dir, "RawMemb", "*.nii.gz")))
103        seg_dir = os.path.join(sample_dir, "SegCell")
104
105        for raw_path in raw_files:
106            # e.g. Sample01_030_rawMemb.nii.gz → Sample01_030
107            basename = os.path.basename(raw_path)
108            tp_stem = basename.replace("_rawMemb.nii.gz", "")
109            h5_path = os.path.join(h5_dir, f"{tp_stem}.h5")
110
111            if os.path.exists(h5_path):
112                continue
113
114            seg_path = os.path.join(seg_dir, f"{tp_stem}_segCell.nii.gz")
115            if not os.path.exists(seg_path):
116                continue
117
118            raw_vol = nib.load(raw_path).get_fdata().astype("float32")
119            seg_vol = nib.load(seg_path).get_fdata().astype("int32")
120
121            with h5py.File(h5_path, "w") as f:
122                f.create_dataset("raw", data=raw_vol, compression="gzip")
123                f.create_dataset("labels", data=seg_vol, compression="gzip")
124
125    return h5_dir
126
127
128def get_cshaper_paths(
129    path: Union[os.PathLike, str],
130    split: Literal["train", "val"] = "train",
131    samples: Optional[List[str]] = None,
132    download: bool = False,
133) -> Tuple[List[str], List[str]]:
134    """Get paths to the CShaper data.
135
136    Args:
137        path: Filepath to a folder containing the downloaded CShaper zip.
138        split: The data split to use. Either "train" (Sample01, Sample02) or
139            "val" (Sample02, Sample03, Sample04).
140        samples: Optional list of sample names to restrict to (e.g., ["Sample01"]).
141            If None, all samples for the split are used.
142        download: Ignored (manual download required).
143
144    Returns:
145        List of filepaths for the HDF5 image data (key: "raw").
146        List of filepaths for the HDF5 label data (key: "labels").
147    """
148    if split not in ("train", "val"):
149        raise ValueError(f"Invalid split '{split}'. Choose 'train' or 'val'.")
150
151    data_dir = get_cshaper_data(path, download)
152    h5_dir = _convert_to_h5(data_dir, split)
153
154    h5_files = natsorted(glob(os.path.join(h5_dir, "*.h5")))
155
156    if len(h5_files) == 0:
157        raise RuntimeError(f"No HDF5 files found in {h5_dir}. Check the dataset structure.")
158
159    if samples is not None:
160        h5_files = [p for p in h5_files if any(os.path.basename(p).startswith(s) for s in samples)]
161
162    return h5_files, h5_files
163
164
165def get_cshaper_dataset(
166    path: Union[os.PathLike, str],
167    patch_shape: Tuple[int, ...],
168    split: Literal["train", "val"] = "train",
169    samples: Optional[List[str]] = None,
170    raw_key: str = "raw",
171    label_key: str = "labels",
172    download: bool = False,
173    **kwargs,
174) -> Dataset:
175    """Get the CShaper dataset for C. elegans embryo cell segmentation.
176
177    Args:
178        path: Filepath to a folder containing the downloaded CShaper zip.
179        patch_shape: The patch shape to use for training.
180        split: The data split to use. Either "train" or "val".
181        samples: Optional list of sample names to restrict to (e.g., ["Sample01"]).
182        raw_key: The HDF5 key for raw image data.
183        label_key: The HDF5 key for label data.
184        download: Ignored (manual download required).
185        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
186
187    Returns:
188        The segmentation dataset.
189    """
190    raw_paths, label_paths = get_cshaper_paths(path, split, samples, download)
191
192    return torch_em.default_segmentation_dataset(
193        raw_paths=raw_paths,
194        raw_key=raw_key,
195        label_paths=label_paths,
196        label_key=label_key,
197        patch_shape=patch_shape,
198        **kwargs,
199    )
200
201
202def get_cshaper_loader(
203    path: Union[os.PathLike, str],
204    batch_size: int,
205    patch_shape: Tuple[int, ...],
206    split: Literal["train", "val"] = "train",
207    samples: Optional[List[str]] = None,
208    raw_key: str = "raw",
209    label_key: str = "labels",
210    download: bool = False,
211    **kwargs,
212) -> DataLoader:
213    """Get the CShaper dataloader for C. elegans embryo cell segmentation.
214
215    Args:
216        path: Filepath to a folder containing the downloaded CShaper zip.
217        batch_size: The batch size for training.
218        patch_shape: The patch shape to use for training.
219        split: The data split to use. Either "train" or "val".
220        samples: Optional list of sample names to restrict to (e.g., ["Sample01"]).
221        raw_key: The HDF5 key for raw image data.
222        label_key: The HDF5 key for label data.
223        download: Ignored (manual download required).
224        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
225
226    Returns:
227        The DataLoader.
228    """
229    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
230    dataset = get_cshaper_dataset(path, patch_shape, split, samples, raw_key, label_key, download, **ds_kwargs)
231    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
TRAIN_SAMPLES = ['Sample01', 'Sample02']
EVAL_SAMPLES = ['Sample02', 'Sample03', 'Sample04']
def get_cshaper_data(path: Union[os.PathLike, str], download: bool = False) -> str:
40def get_cshaper_data(path: Union[os.PathLike, str], download: bool = False) -> str:
41    """Extract the CShaper dataset zip.
42
43    NOTE: The zip must be downloaded manually from the SharePoint link at
44    https://doi.org/10.6084/m9.figshare.12839315 and placed inside `path`.
45    Any zip file found in `path` will be extracted automatically.
46
47    Args:
48        path: Filepath to a folder containing the downloaded CShaper zip.
49        download: Ignored (manual download required).
50
51    Returns:
52        The filepath to the extracted data root directory.
53    """
54    data_dir = os.path.join(path, _ZIP_ROOT)
55    if os.path.exists(data_dir):
56        return data_dir
57
58    # Find any zip in path
59    zips = glob(os.path.join(path, "*.zip"))
60    if not zips:
61        raise RuntimeError(
62            f"No zip file found in {path}. "
63            "Please download the CShaper data manually from the SharePoint link at "
64            "https://doi.org/10.6084/m9.figshare.12839315 and place the zip in `path`."
65        )
66
67    util.unzip(zips[0], path)
68    return data_dir

Extract the CShaper dataset zip.

NOTE: The zip must be downloaded manually from the SharePoint link at https://doi.org/10.6084/m9.figshare.12839315 and placed inside path. Any zip file found in path will be extracted automatically.

Arguments:
  • path: Filepath to a folder containing the downloaded CShaper zip.
  • download: Ignored (manual download required).
Returns:

The filepath to the extracted data root directory.

def get_cshaper_paths( path: Union[os.PathLike, str], split: Literal['train', 'val'] = 'train', samples: Optional[List[str]] = None, download: bool = False) -> Tuple[List[str], List[str]]:
129def get_cshaper_paths(
130    path: Union[os.PathLike, str],
131    split: Literal["train", "val"] = "train",
132    samples: Optional[List[str]] = None,
133    download: bool = False,
134) -> Tuple[List[str], List[str]]:
135    """Get paths to the CShaper data.
136
137    Args:
138        path: Filepath to a folder containing the downloaded CShaper zip.
139        split: The data split to use. Either "train" (Sample01, Sample02) or
140            "val" (Sample02, Sample03, Sample04).
141        samples: Optional list of sample names to restrict to (e.g., ["Sample01"]).
142            If None, all samples for the split are used.
143        download: Ignored (manual download required).
144
145    Returns:
146        List of filepaths for the HDF5 image data (key: "raw").
147        List of filepaths for the HDF5 label data (key: "labels").
148    """
149    if split not in ("train", "val"):
150        raise ValueError(f"Invalid split '{split}'. Choose 'train' or 'val'.")
151
152    data_dir = get_cshaper_data(path, download)
153    h5_dir = _convert_to_h5(data_dir, split)
154
155    h5_files = natsorted(glob(os.path.join(h5_dir, "*.h5")))
156
157    if len(h5_files) == 0:
158        raise RuntimeError(f"No HDF5 files found in {h5_dir}. Check the dataset structure.")
159
160    if samples is not None:
161        h5_files = [p for p in h5_files if any(os.path.basename(p).startswith(s) for s in samples)]
162
163    return h5_files, h5_files

Get paths to the CShaper data.

Arguments:
  • path: Filepath to a folder containing the downloaded CShaper zip.
  • split: The data split to use. Either "train" (Sample01, Sample02) or "val" (Sample02, Sample03, Sample04).
  • samples: Optional list of sample names to restrict to (e.g., ["Sample01"]). If None, all samples for the split are used.
  • download: Ignored (manual download required).
Returns:

List of filepaths for the HDF5 image data (key: "raw"). List of filepaths for the HDF5 label data (key: "labels").

def get_cshaper_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val'] = 'train', samples: Optional[List[str]] = None, raw_key: str = 'raw', label_key: str = 'labels', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
166def get_cshaper_dataset(
167    path: Union[os.PathLike, str],
168    patch_shape: Tuple[int, ...],
169    split: Literal["train", "val"] = "train",
170    samples: Optional[List[str]] = None,
171    raw_key: str = "raw",
172    label_key: str = "labels",
173    download: bool = False,
174    **kwargs,
175) -> Dataset:
176    """Get the CShaper dataset for C. elegans embryo cell segmentation.
177
178    Args:
179        path: Filepath to a folder containing the downloaded CShaper zip.
180        patch_shape: The patch shape to use for training.
181        split: The data split to use. Either "train" or "val".
182        samples: Optional list of sample names to restrict to (e.g., ["Sample01"]).
183        raw_key: The HDF5 key for raw image data.
184        label_key: The HDF5 key for label data.
185        download: Ignored (manual download required).
186        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
187
188    Returns:
189        The segmentation dataset.
190    """
191    raw_paths, label_paths = get_cshaper_paths(path, split, samples, download)
192
193    return torch_em.default_segmentation_dataset(
194        raw_paths=raw_paths,
195        raw_key=raw_key,
196        label_paths=label_paths,
197        label_key=label_key,
198        patch_shape=patch_shape,
199        **kwargs,
200    )

Get the CShaper dataset for C. elegans embryo cell segmentation.

Arguments:
  • path: Filepath to a folder containing the downloaded CShaper zip.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either "train" or "val".
  • samples: Optional list of sample names to restrict to (e.g., ["Sample01"]).
  • raw_key: The HDF5 key for raw image data.
  • label_key: The HDF5 key for label data.
  • download: Ignored (manual download required).
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_cshaper_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val'] = 'train', samples: Optional[List[str]] = None, raw_key: str = 'raw', label_key: str = 'labels', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
203def get_cshaper_loader(
204    path: Union[os.PathLike, str],
205    batch_size: int,
206    patch_shape: Tuple[int, ...],
207    split: Literal["train", "val"] = "train",
208    samples: Optional[List[str]] = None,
209    raw_key: str = "raw",
210    label_key: str = "labels",
211    download: bool = False,
212    **kwargs,
213) -> DataLoader:
214    """Get the CShaper dataloader for C. elegans embryo cell segmentation.
215
216    Args:
217        path: Filepath to a folder containing the downloaded CShaper zip.
218        batch_size: The batch size for training.
219        patch_shape: The patch shape to use for training.
220        split: The data split to use. Either "train" or "val".
221        samples: Optional list of sample names to restrict to (e.g., ["Sample01"]).
222        raw_key: The HDF5 key for raw image data.
223        label_key: The HDF5 key for label data.
224        download: Ignored (manual download required).
225        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
226
227    Returns:
228        The DataLoader.
229    """
230    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
231    dataset = get_cshaper_dataset(path, patch_shape, split, samples, raw_key, label_key, download, **ds_kwargs)
232    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the CShaper dataloader for C. elegans embryo cell segmentation.

Arguments:
  • path: Filepath to a folder containing the downloaded CShaper zip.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either "train" or "val".
  • samples: Optional list of sample names to restrict to (e.g., ["Sample01"]).
  • raw_key: The HDF5 key for raw image data.
  • label_key: The HDF5 key for label data.
  • download: Ignored (manual download required).
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.