torch_em.data.datasets.light_microscopy.gonuclear

This dataset contains annotations for nucleus segmentation in 3d fluorescence microscopy.

This dataset is from the publication https://doi.org/10.1242/dev.202800. Please cite it if you use this dataset in your research.
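
The example below sketches a typical entry point: building a 3d training loader for nucleus segmentation. The download folder, patch shape, and batch size are placeholder choices, not values fixed by the dataset.

    from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_loader

    # "./data/gonuclear" is a placeholder folder; the 3d patch shape and batch size
    # are illustrative and should be adapted to the available memory.
    loader = get_gonuclear_loader(
        path="./data/gonuclear",
        patch_shape=(32, 256, 256),
        batch_size=1,
        segmentation_task="nuclei",
        download=True,
    )
    raw, labels = next(iter(loader))
    print(raw.shape, labels.shape)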

  1"""This dataset contains annotation for nucleus segmentation in 3d fluorescence microscopy.
  2
  3This dataset is from the publication https://doi.org/10.1242/dev.202800.
  4Please cite it if you use this dataset in your research.
  5"""
  6
  7import os
  8from glob import glob
  9from shutil import rmtree
 10from typing import Optional, Tuple, Union, List
 11
 12import numpy as np
 13import imageio.v3 as imageio
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20
 21
 22URL = "https://www.ebi.ac.uk/biostudies/files/S-BIAD1026/Nuclei_training_segmentation/Training%20image%20dataset_Tiff%20Files.zip"  # noqa
 23CHECKSUM = "b103388a4aed01c7aadb2d5f49392d2dd08dd7cbeb2357b0c56355384ebb93a9"
 24
 25
 26def _load_tif(path):
 27    vol = None
 28
 29    path_tif = path + ".tif"
 30    if os.path.exists(path_tif):
 31        vol = imageio.imread(path_tif)
 32
 33    path_tiff = path + ".tiff"
 34    if os.path.exists(path_tiff):
 35        vol = imageio.imread(path_tiff)
 36
 37    if vol is None:
 38        raise RuntimeError(f"Can't find tif or tiff file for {path}.")
 39
 40    return vol
 41
 42
 43def _clip_shape(raw, labels):
 44    shape = raw.shape
 45    labels = labels[:shape[0], :shape[1], :shape[2]]
 46
 47    shape = labels.shape
 48    raw = raw[:shape[0], :shape[1], :shape[2]]
 49
 50    assert labels.shape == raw.shape, f"{labels.shape}, {raw.shape}"
 51    return raw, labels
 52
 53
 54def _process_data(in_folder, out_folder):
 55    import h5py
 56
 57    os.makedirs(out_folder, exist_ok=True)
 58
 59    sample_folders = glob(os.path.join(in_folder, "*"))
 60    for folder in sample_folders:
 61        sample = os.path.basename(folder)
 62        out_path = os.path.join(out_folder, f"{sample}.h5")
 63
 64        cell_raw = _load_tif(os.path.join(folder, f"{sample}_cellwall"))
 65        cell_labels = _load_tif(os.path.join(folder, f"{sample}_cellseg"))
 66        cell_labels = cell_labels[:, ::-1]
 67        cell_raw, cell_labels = _clip_shape(cell_raw, cell_labels)
 68
 69        nucleus_raw = _load_tif(os.path.join(folder, f"{sample}_n_H2BtdTomato"))
 70        nucleus_labels = _load_tif(os.path.join(folder, f"{sample}_n_stain_StarDist_goldGT"))
 71        nucleus_labels = nucleus_labels[:, ::-1]
 72        nucleus_raw, nucleus_labels = _clip_shape(nucleus_raw, nucleus_labels)
 73
 74        # Remove last frames with artifacts for two volumes (1137 and 1170).
 75        if sample in ["1137", "1170"]:
 76            nucleus_raw, nucleus_labels = nucleus_raw[:-1], nucleus_labels[:-1]
 77            cell_raw, cell_labels = cell_raw[:-1], cell_labels[:-1]
 78
 79        # Fix the cell labels for one volume (1136), which are misaligned.
 80        if sample == "1136":
 81            cell_labels = np.fliplr(cell_labels)
 82
 83        with h5py.File(out_path, "w") as f:
 84            f.create_dataset("raw/cells", data=cell_raw, compression="gzip")
 85            f.create_dataset("raw/nuclei", data=nucleus_raw, compression="gzip")
 86
 87            f.create_dataset("labels/cells", data=cell_labels, compression="gzip")
 88            f.create_dataset("labels/nuclei", data=nucleus_labels, compression="gzip")
 89
 90
 91def get_gonuclear_data(path: Union[os.PathLike, str], download: bool) -> str:
 92    """Download the GoNuclear training data.
 93
 94    Args:
 95        path: Filepath to a folder where the downloaded data will be saved.
 96        download: Whether to download the data if it is not present.
 97
 98    Returns:
 99        The filepath to the training data.
100    """
101    data_path = os.path.join(path, "gonuclear_datasets")
102    if os.path.exists(data_path):
103        return data_path
104
105    os.makedirs(path, exist_ok=True)
106    zip_path = os.path.join(path, "gonuclear.zip")
107    util.download_source(zip_path, URL, download, CHECKSUM)
108    util.unzip(zip_path, path, True)
109
110    extracted_path = os.path.join(path, "Training image dataset_Tiff Files")
111    assert os.path.exists(extracted_path), extracted_path
112    _process_data(extracted_path, data_path)
113    assert os.path.exists(data_path)
114
115    rmtree(extracted_path)
116    return data_path
117
118
119def get_gonuclear_paths(
120    path: Union[os.PathLike, str],
121    sample_ids: Optional[Union[int, Tuple[int, ...]]] = None,
122    download: bool = False
123) -> List[str]:
124    """Get paths to the GoNuclear data.
125
126    Args:
127        path: Filepath to a folder where the downloaded data will be saved.
128        sample_ids: The sample ids to load. The valid sample ids are:
129            1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded.
130        download: Whether to download the data if it is not present.
131
132    Returns:
133        List of filepaths for the stored data.
134    """
135    data_root = get_gonuclear_data(path, download)
136
137    if sample_ids is None:
138        paths = sorted(glob(os.path.join(data_root, "*.h5")))
139    else:
140        paths = []
141        for sample_id in sample_ids:
142            sample_path = os.path.join(data_root, f"{sample_id}.h5")
143            if not os.path.exists(sample_path):
144                raise ValueError(f"Invalid sample id {sample_id}.")
145            paths.append(sample_path)
146
147    return paths
148
149
150def get_gonuclear_dataset(
151    path: Union[os.PathLike, str],
152    patch_shape: Tuple[int, int],
153    segmentation_task: str = "nuclei",
154    sample_ids: Optional[Union[int, Tuple[int, ...]]] = None,
155    offsets: Optional[List[List[int]]] = None,
156    boundaries: bool = False,
157    binary: bool = False,
158    download: bool = False,
159    **kwargs
160) -> Dataset:
161    """Get the GoNuclear dataset for segmenting nuclei in 3d fluorescence microscopy.
162
163    Args:
164        path: Filepath to a folder where the downloaded data will be saved.
165        patch_shape: The patch shape to use for training.
166        segmentation_task: The segmentation task. Either 'nuclei' or 'cells'.
167        sample_ids: The sample ids to load. The valid sample ids are:
168            1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded.
169        offsets: Offset values for affinity computation used as target.
170        boundaries: Whether to compute boundaries as the target.
171        binary: Whether to use a binary segmentation target.
172        download: Whether to download the data if it is not present.
173        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
174
175    Returns:
176       The segmentation dataset.
177    """
178    paths = get_gonuclear_paths(path, sample_ids, download)
179
180    if segmentation_task == "nuclei":
181        raw_key = "raw/nuclei"
182        label_key = "labels/nuclei"
183    elif segmentation_task == "cells":
184        raw_key = "raw/cells"
185        label_key = "labels/cells"
186    else:
187        raise ValueError(f"Invalid segmentation task {segmentation_task}, expect one of 'cells' or 'nuclei'.")
188
189    kwargs, _ = util.add_instance_label_transform(
190        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets,
191    )
192
193    return torch_em.default_segmentation_dataset(
194        raw_paths=paths,
195        raw_key=raw_key,
196        label_paths=paths,
197        label_key=label_key,
198        patch_shape=patch_shape,
199        **kwargs
200    )
201
202
203def get_gonuclear_loader(
204    path: Union[os.PathLike, str],
205    patch_shape: Tuple[int, int],
206    batch_size: int,
207    segmentation_task: str = "nuclei",
208    sample_ids: Optional[Union[int, Tuple[int, ...]]] = None,
209    offsets: Optional[List[List[int]]] = None,
210    boundaries: bool = False,
211    binary: bool = False,
212    download: bool = False,
213    **kwargs
214) -> DataLoader:
215    """Get the GoNuclear dataloader for segmenting nuclei in 3d fluorescence microscopy.
216
217    Args:
218        path: Filepath to a folder where the downloaded data will be saved.
219        patch_shape: The patch shape to use for training.
220        batch_size: The batch size for training.
221        segmentation_task: The segmentation task. Either 'nuclei' or 'cells'.
222        sample_ids: The sample ids to load. The valid sample ids are:
223            1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded.
224        offsets: Offset values for affinity computation used as target.
225        boundaries: Whether to compute boundaries as the target.
226        binary: Whether to use a binary segmentation target.
227        download: Whether to download the data if it is not present.
228        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
229
230    Returns:
231        The DataLoader.
232    """
233    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
234    dataset = get_gonuclear_dataset(
235        path=path,
236        patch_shape=patch_shape,
237        segmentation_task=segmentation_task,
238        sample_ids=sample_ids,
239        offsets=offsets,
240        boundaries=boundaries,
241        binary=binary,
242        download=download,
243        **ds_kwargs,
244    )
245    return torch_em.get_data_loader(dataset, batch_size=batch_size, **loader_kwargs)

URL = 'https://www.ebi.ac.uk/biostudies/files/S-BIAD1026/Nuclei_training_segmentation/Training%20image%20dataset_Tiff%20Files.zip'
CHECKSUM = 'b103388a4aed01c7aadb2d5f49392d2dd08dd7cbeb2357b0c56355384ebb93a9'

def get_gonuclear_data(path: Union[os.PathLike, str], download: bool) -> str:

Download the GoNuclear training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the training data.
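
A minimal usage sketch: the call below downloads the archive if needed, converts the per-sample TIFF stacks into HDF5 volumes, and returns the folder with the processed files. The target folder is a placeholder.

    import os
    from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_data

    # "./data/gonuclear" is a placeholder; any writable folder can be used.
    data_path = get_gonuclear_data("./data/gonuclear", download=True)
    # The folder holds one processed volume per sample, e.g. 1135.h5, 1136.h5, ...
    print(sorted(os.listdir(data_path)))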

def get_gonuclear_paths( path: Union[os.PathLike, str], sample_ids: Union[int, Tuple[int, ...], NoneType] = None, download: bool = False) -> List[str]:

Get paths to the GoNuclear data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • sample_ids: The sample ids to load. The valid sample ids are: 1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the stored data.
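
The sketch below restricts loading to two of the five samples and inspects the layout of one processed file: image data is stored under 'raw/nuclei' and 'raw/cells', annotations under 'labels/nuclei' and 'labels/cells'. The download folder is a placeholder.

    import h5py
    from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_paths

    # Load only two of the valid sample ids (1135, 1136, 1137, 1139, 1170).
    paths = get_gonuclear_paths("./data/gonuclear", sample_ids=(1135, 1139), download=True)

    with h5py.File(paths[0], "r") as f:
        print(f["raw/nuclei"].shape, f["labels/nuclei"].shape)
        print(f["raw/cells"].shape, f["labels/cells"].shape)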

def get_gonuclear_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], segmentation_task: str = 'nuclei', sample_ids: Union[int, Tuple[int, ...], NoneType] = None, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the GoNuclear dataset for segmenting nuclei in 3d fluorescence microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • segmentation_task: The segmentation task. Either 'nuclei' or 'cells'.
  • sample_ids: The sample ids to load. The valid sample ids are: 1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.
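
For example, a dataset for the 'cells' task with boundary targets could be constructed as follows; the patch shape is illustrative and the path is a placeholder.

    from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_dataset

    dataset = get_gonuclear_dataset(
        path="./data/gonuclear",
        patch_shape=(32, 128, 128),
        segmentation_task="cells",
        boundaries=True,
        download=True,
    )
    # Draw a single patch to check the tensor shapes of raw data and target.
    raw, target = dataset[0]
    print(raw.shape, target.shape)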

def get_gonuclear_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], batch_size: int, segmentation_task: str = 'nuclei', sample_ids: Union[int, Tuple[int, ...], NoneType] = None, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the GoNuclear dataloader for segmenting nuclei in 3d fluorescence microscopy.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • segmentation_task: The segmentation task. Either 'nuclei' or 'cells'.
  • sample_ids: The sample ids to load. The valid sample ids are: 1135, 1136, 1137, 1139, 1170. If none is given all samples will be loaded.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to use a binary segmentation target.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.
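
Since additional keyword arguments are split between the dataset and the PyTorch DataLoader, loader options such as num_workers or shuffle can be passed directly. The sketch below uses the sample ids for a simple train/validation split; the split, path, and patch shape are illustrative choices.

    from torch_em.data.datasets.light_microscopy.gonuclear import get_gonuclear_loader

    # Illustrative split of the five samples into training and validation volumes.
    train_loader = get_gonuclear_loader(
        path="./data/gonuclear",
        patch_shape=(32, 256, 256),
        batch_size=2,
        segmentation_task="nuclei",
        sample_ids=(1135, 1136, 1137),
        boundaries=True,
        download=True,
        num_workers=4,
        shuffle=True,
    )
    val_loader = get_gonuclear_loader(
        path="./data/gonuclear",
        patch_shape=(32, 256, 256),
        batch_size=1,
        segmentation_task="nuclei",
        sample_ids=(1139, 1170),
        boundaries=True,
        download=True,
    )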