torch_em.data.datasets.light_microscopy.lpc_nucseg

The LPC NucSeg dataset contains annotations for nuclear segmentation in fluorescence microscopy images.

The dataset provides 97 hand-segmented images with ~4,009 cells from U2OS (gnf) and NIH3T3 (ic100) cell lines.

The dataset is located at https://github.com/luispedro/Coelho2009_ISBI_NuclearSegmentation. This dataset is from the publication https://doi.org/10.1109/ISBI.2009.5193098. Please cite it if you use this dataset in your research.

  1"""The LPC NucSeg dataset contains annotations for nuclear segmentation
  2in fluorescence microscopy images.
  3
  4The dataset provides 97 hand-segmented images with ~4,009 cells from U2OS (gnf)
  5and NIH3T3 (ic100) cell lines.
  6
  7The dataset is located at https://github.com/luispedro/Coelho2009_ISBI_NuclearSegmentation.
  8This dataset is from the publication https://doi.org/10.1109/ISBI.2009.5193098.
  9Please cite it if you use this dataset in your research.
 10"""
 11
 12import os
 13from glob import glob
 14from typing import Union, Tuple, List, Optional
 15
 16import imageio.v3 as imageio
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
# Download URL for the dataset: a zip of the master branch of the GitHub
# repository, which contains both the raw DNA images and the label masks.
URLS = {
    "images": "https://github.com/luispedro/Coelho2009_ISBI_NuclearSegmentation/archive/refs/heads/master.zip",
}
 28
 29
 30def _create_h5_data(path, source):
 31    """Create h5 files with raw images and instance labels."""
 32    import h5py
 33    from tqdm import tqdm
 34
 35    repo_dir = os.path.join(path, "Coelho2009_ISBI_NuclearSegmentation-master")
 36    h5_dir = os.path.join(path, "h5_data", source)
 37    os.makedirs(h5_dir, exist_ok=True)
 38
 39    raw_dir = os.path.join(repo_dir, "data", "images", "dna-images", source)
 40    label_dir = os.path.join(repo_dir, "data", "preprocessed-data", source)
 41
 42    raw_paths = sorted(glob(os.path.join(raw_dir, "*.png")))
 43
 44    for raw_path in tqdm(raw_paths, desc=f"Creating h5 files for {source}"):
 45        fname = os.path.basename(raw_path)
 46        h5_path = os.path.join(h5_dir, fname.replace(".png", ".h5"))
 47
 48        if os.path.exists(h5_path):
 49            continue
 50
 51        label_path = os.path.join(label_dir, fname)
 52        if not os.path.exists(label_path):
 53            continue
 54
 55        raw = imageio.imread(raw_path)
 56        labels = imageio.imread(label_path)
 57
 58        # Convert RGB to grayscale if needed (DNA fluorescence should be single channel)
 59        if raw.ndim == 3:
 60            raw = raw[..., 0]  # Take first channel
 61
 62        with h5py.File(h5_path, "w") as f:
 63            f.create_dataset("raw", data=raw, compression="gzip")
 64            f.create_dataset("labels", data=labels.astype("int64"), compression="gzip")
 65
 66    return h5_dir
 67
 68
 69def get_lpc_nucseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 70    """Download the LPC NucSeg dataset.
 71
 72    Args:
 73        path: Filepath to a folder where the downloaded data will be saved.
 74        download: Whether to download the data if it is not present.
 75
 76    Returns:
 77        The filepath to the directory with the data.
 78    """
 79    repo_dir = os.path.join(path, "Coelho2009_ISBI_NuclearSegmentation-master")
 80    if os.path.exists(repo_dir):
 81        return repo_dir
 82
 83    os.makedirs(path, exist_ok=True)
 84
 85    zip_path = os.path.join(path, "master.zip")
 86    util.download_source(path=zip_path, url=URLS["images"], download=download, checksum=None)
 87    util.unzip(zip_path=zip_path, dst=path, remove=False)
 88
 89    return repo_dir
 90
 91
 92def get_lpc_nucseg_paths(
 93    path: Union[os.PathLike, str],
 94    source: Optional[Union[str, List[str]]] = None,
 95    download: bool = False,
 96) -> List[str]:
 97    """Get paths to the LPC NucSeg data.
 98
 99    Args:
100        path: Filepath to a folder where the downloaded data will be saved.
101        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
102            Can also be a list of sources. If None, all sources will be used.
103        download: Whether to download the data if it is not present.
104
105    Returns:
106        List of filepaths for the h5 data.
107    """
108    from natsort import natsorted
109
110    get_lpc_nucseg_data(path, download)
111
112    if source is None:
113        source = ["gnf", "ic100"]
114    elif isinstance(source, str):
115        source = [source]
116
117    all_h5_paths = []
118    for src in source:
119        assert src in ("gnf", "ic100"), f"'{src}' is not a valid source. Choose from 'gnf' or 'ic100'."
120
121        h5_dir = os.path.join(path, "h5_data", src)
122        if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
123            _create_h5_data(path, src)
124
125        h5_paths = glob(os.path.join(h5_dir, "*.h5"))
126        all_h5_paths.extend(h5_paths)
127
128    assert len(all_h5_paths) > 0, f"No data found for source '{source}'"
129
130    return natsorted(all_h5_paths)
131
132
def get_lpc_nucseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    source: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the LPC NucSeg dataset for nuclear segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
            Can also be a list of sources. If None, all sources will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_lpc_nucseg_paths(path, source, download)

    # Derive binary targets from the instance labels and make sure the
    # default raw / label transforms are set.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    dataset_kwargs = dict(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
    )
    return torch_em.default_segmentation_dataset(**dataset_kwargs, **kwargs)
169
170
def get_lpc_nucseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    source: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the LPC NucSeg dataloader for nuclear segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
            Can also be a list of sources. If None, all sources will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from the loader arguments.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_lpc_nucseg_dataset(
        path=path, patch_shape=patch_shape, source=source, download=download, **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset=ds, batch_size=batch_size, **loader_kwargs)
URLS = {'images': 'https://github.com/luispedro/Coelho2009_ISBI_NuclearSegmentation/archive/refs/heads/master.zip'}
def get_lpc_nucseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
70def get_lpc_nucseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
71    """Download the LPC NucSeg dataset.
72
73    Args:
74        path: Filepath to a folder where the downloaded data will be saved.
75        download: Whether to download the data if it is not present.
76
77    Returns:
78        The filepath to the directory with the data.
79    """
80    repo_dir = os.path.join(path, "Coelho2009_ISBI_NuclearSegmentation-master")
81    if os.path.exists(repo_dir):
82        return repo_dir
83
84    os.makedirs(path, exist_ok=True)
85
86    zip_path = os.path.join(path, "master.zip")
87    util.download_source(path=zip_path, url=URLS["images"], download=download, checksum=None)
88    util.unzip(zip_path=zip_path, dst=path, remove=False)
89
90    return repo_dir

Download the LPC NucSeg dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the directory with the data.

def get_lpc_nucseg_paths( path: Union[os.PathLike, str], source: Union[List[str], str, NoneType] = None, download: bool = False) -> List[str]:
 93def get_lpc_nucseg_paths(
 94    path: Union[os.PathLike, str],
 95    source: Optional[Union[str, List[str]]] = None,
 96    download: bool = False,
 97) -> List[str]:
 98    """Get paths to the LPC NucSeg data.
 99
100    Args:
101        path: Filepath to a folder where the downloaded data will be saved.
102        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
103            Can also be a list of sources. If None, all sources will be used.
104        download: Whether to download the data if it is not present.
105
106    Returns:
107        List of filepaths for the h5 data.
108    """
109    from natsort import natsorted
110
111    get_lpc_nucseg_data(path, download)
112
113    if source is None:
114        source = ["gnf", "ic100"]
115    elif isinstance(source, str):
116        source = [source]
117
118    all_h5_paths = []
119    for src in source:
120        assert src in ("gnf", "ic100"), f"'{src}' is not a valid source. Choose from 'gnf' or 'ic100'."
121
122        h5_dir = os.path.join(path, "h5_data", src)
123        if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
124            _create_h5_data(path, src)
125
126        h5_paths = glob(os.path.join(h5_dir, "*.h5"))
127        all_h5_paths.extend(h5_paths)
128
129    assert len(all_h5_paths) > 0, f"No data found for source '{source}'"
130
131    return natsorted(all_h5_paths)

Get paths to the LPC NucSeg data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). Can also be a list of sources. If None, all sources will be used.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the h5 data.

def get_lpc_nucseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], source: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
134def get_lpc_nucseg_dataset(
135    path: Union[os.PathLike, str],
136    patch_shape: Tuple[int, int],
137    source: Optional[Union[str, List[str]]] = None,
138    download: bool = False,
139    **kwargs
140) -> Dataset:
141    """Get the LPC NucSeg dataset for nuclear segmentation.
142
143    Args:
144        path: Filepath to a folder where the downloaded data will be saved.
145        patch_shape: The patch shape to use for training.
146        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
147            Can also be a list of sources. If None, all sources will be used.
148        download: Whether to download the data if it is not present.
149        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
150
151    Returns:
152        The segmentation dataset.
153    """
154    h5_paths = get_lpc_nucseg_paths(path, source, download)
155
156    kwargs, _ = util.add_instance_label_transform(
157        kwargs, add_binary_target=True,
158    )
159    kwargs = util.ensure_transforms(ndim=2, **kwargs)
160
161    return torch_em.default_segmentation_dataset(
162        raw_paths=h5_paths,
163        raw_key="raw",
164        label_paths=h5_paths,
165        label_key="labels",
166        patch_shape=patch_shape,
167        ndim=2,
168        **kwargs
169    )

Get the LPC NucSeg dataset for nuclear segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). Can also be a list of sources. If None, all sources will be used.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_lpc_nucseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], source: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
172def get_lpc_nucseg_loader(
173    path: Union[os.PathLike, str],
174    batch_size: int,
175    patch_shape: Tuple[int, int],
176    source: Optional[Union[str, List[str]]] = None,
177    download: bool = False,
178    **kwargs
179) -> DataLoader:
180    """Get the LPC NucSeg dataloader for nuclear segmentation.
181
182    Args:
183        path: Filepath to a folder where the downloaded data will be saved.
184        batch_size: The batch size for training.
185        patch_shape: The patch shape to use for training.
186        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
187            Can also be a list of sources. If None, all sources will be used.
188        download: Whether to download the data if it is not present.
189        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
190
191    Returns:
192        The DataLoader.
193    """
194    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
195    dataset = get_lpc_nucseg_dataset(
196        path=path,
197        patch_shape=patch_shape,
198        source=source,
199        download=download,
200        **ds_kwargs,
201    )
202    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)

Get the LPC NucSeg dataloader for nuclear segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). Can also be a list of sources. If None, all sources will be used.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.