torch_em.data.datasets.light_microscopy.lpc_nucseg
The LPC NucSeg dataset contains annotations for nuclear segmentation in fluorescence microscopy images.
The dataset provides 97 hand-segmented images with ~4,009 cells from U2OS (gnf) and NIH3T3 (ic100) cell lines.
The dataset is located at https://github.com/luispedro/Coelho2009_ISBI_NuclearSegmentation. This dataset is from the publication https://doi.org/10.1109/ISBI.2009.5193098. Please cite it if you use this dataset in your research.
"""The LPC NucSeg dataset contains annotations for nuclear segmentation
in fluorescence microscopy images.

The dataset provides 97 hand-segmented images with ~4,009 cells from U2OS (gnf)
and NIH3T3 (ic100) cell lines.

The dataset is located at https://github.com/luispedro/Coelho2009_ISBI_NuclearSegmentation.
This dataset is from the publication https://doi.org/10.1109/ISBI.2009.5193098.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from typing import Union, Tuple, List, Optional

import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URLS = {
    "images": "https://github.com/luispedro/Coelho2009_ISBI_NuclearSegmentation/archive/refs/heads/master.zip",
}


def _create_h5_data(path, source):
    """Create h5 files with raw images and instance labels."""
    import h5py
    from tqdm import tqdm

    repo_dir = os.path.join(path, "Coelho2009_ISBI_NuclearSegmentation-master")
    h5_dir = os.path.join(path, "h5_data", source)
    os.makedirs(h5_dir, exist_ok=True)

    raw_dir = os.path.join(repo_dir, "data", "images", "dna-images", source)
    label_dir = os.path.join(repo_dir, "data", "preprocessed-data", source)

    raw_paths = sorted(glob(os.path.join(raw_dir, "*.png")))

    for raw_path in tqdm(raw_paths, desc=f"Creating h5 files for {source}"):
        fname = os.path.basename(raw_path)
        h5_path = os.path.join(h5_dir, fname.replace(".png", ".h5"))

        # Skip images that were already converted in a previous run.
        if os.path.exists(h5_path):
            continue

        # Best-effort: images without a matching label file are skipped silently.
        label_path = os.path.join(label_dir, fname)
        if not os.path.exists(label_path):
            continue

        raw = imageio.imread(raw_path)
        labels = imageio.imread(label_path)

        # Convert RGB to grayscale if needed (DNA fluorescence should be single channel)
        if raw.ndim == 3:
            raw = raw[..., 0]  # Take first channel

        with h5py.File(h5_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels", data=labels.astype("int64"), compression="gzip")

    return h5_dir


def get_lpc_nucseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the LPC NucSeg dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.
    """
    repo_dir = os.path.join(path, "Coelho2009_ISBI_NuclearSegmentation-master")
    if os.path.exists(repo_dir):
        return repo_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "master.zip")
    util.download_source(path=zip_path, url=URLS["images"], download=download, checksum=None)
    util.unzip(zip_path=zip_path, dst=path, remove=False)

    return repo_dir


def get_lpc_nucseg_paths(
    path: Union[os.PathLike, str],
    source: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the LPC NucSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
            Can also be a list of sources. If None, all sources will be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.

    Raises:
        ValueError: If an invalid source name is passed.
        RuntimeError: If no data is found for the requested source(s).
    """
    from natsort import natsorted

    get_lpc_nucseg_data(path, download)

    if source is None:
        source = ["gnf", "ic100"]
    elif isinstance(source, str):
        source = [source]

    all_h5_paths = []
    for src in source:
        # Validate with an explicit raise instead of 'assert', so the check
        # is not stripped when running with 'python -O'.
        if src not in ("gnf", "ic100"):
            raise ValueError(f"'{src}' is not a valid source. Choose from 'gnf' or 'ic100'.")

        h5_dir = os.path.join(path, "h5_data", src)
        if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0:
            _create_h5_data(path, src)

        h5_paths = glob(os.path.join(h5_dir, "*.h5"))
        all_h5_paths.extend(h5_paths)

    if len(all_h5_paths) == 0:
        raise RuntimeError(f"No data found for source '{source}'")

    return natsorted(all_h5_paths)


def get_lpc_nucseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    source: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the LPC NucSeg dataset for nuclear segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
            Can also be a list of sources. If None, all sources will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_lpc_nucseg_paths(path, source, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True,
    )
    kwargs = util.ensure_transforms(ndim=2, **kwargs)

    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        **kwargs
    )


def get_lpc_nucseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    source: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the LPC NucSeg dataloader for nuclear segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells).
            Can also be a list of sources. If None, all sources will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_lpc_nucseg_dataset(
        path=path,
        patch_shape=patch_shape,
        source=source,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
URLS =
{'images': 'https://github.com/luispedro/Coelho2009_ISBI_NuclearSegmentation/archive/refs/heads/master.zip'}
def
get_lpc_nucseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
70def get_lpc_nucseg_data(path: Union[os.PathLike, str], download: bool = False) -> str: 71 """Download the LPC NucSeg dataset. 72 73 Args: 74 path: Filepath to a folder where the downloaded data will be saved. 75 download: Whether to download the data if it is not present. 76 77 Returns: 78 The filepath to the directory with the data. 79 """ 80 repo_dir = os.path.join(path, "Coelho2009_ISBI_NuclearSegmentation-master") 81 if os.path.exists(repo_dir): 82 return repo_dir 83 84 os.makedirs(path, exist_ok=True) 85 86 zip_path = os.path.join(path, "master.zip") 87 util.download_source(path=zip_path, url=URLS["images"], download=download, checksum=None) 88 util.unzip(zip_path=zip_path, dst=path, remove=False) 89 90 return repo_dir
Download the LPC NucSeg dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the directory with the data.
def
get_lpc_nucseg_paths( path: Union[os.PathLike, str], source: Union[List[str], str, NoneType] = None, download: bool = False) -> List[str]:
93def get_lpc_nucseg_paths( 94 path: Union[os.PathLike, str], 95 source: Optional[Union[str, List[str]]] = None, 96 download: bool = False, 97) -> List[str]: 98 """Get paths to the LPC NucSeg data. 99 100 Args: 101 path: Filepath to a folder where the downloaded data will be saved. 102 source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). 103 Can also be a list of sources. If None, all sources will be used. 104 download: Whether to download the data if it is not present. 105 106 Returns: 107 List of filepaths for the h5 data. 108 """ 109 from natsort import natsorted 110 111 get_lpc_nucseg_data(path, download) 112 113 if source is None: 114 source = ["gnf", "ic100"] 115 elif isinstance(source, str): 116 source = [source] 117 118 all_h5_paths = [] 119 for src in source: 120 assert src in ("gnf", "ic100"), f"'{src}' is not a valid source. Choose from 'gnf' or 'ic100'." 121 122 h5_dir = os.path.join(path, "h5_data", src) 123 if not os.path.exists(h5_dir) or len(glob(os.path.join(h5_dir, "*.h5"))) == 0: 124 _create_h5_data(path, src) 125 126 h5_paths = glob(os.path.join(h5_dir, "*.h5")) 127 all_h5_paths.extend(h5_paths) 128 129 assert len(all_h5_paths) > 0, f"No data found for source '{source}'" 130 131 return natsorted(all_h5_paths)
Get paths to the LPC NucSeg data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). Can also be a list of sources. If None, all sources will be used.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the h5 data.
def
get_lpc_nucseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], source: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
134def get_lpc_nucseg_dataset( 135 path: Union[os.PathLike, str], 136 patch_shape: Tuple[int, int], 137 source: Optional[Union[str, List[str]]] = None, 138 download: bool = False, 139 **kwargs 140) -> Dataset: 141 """Get the LPC NucSeg dataset for nuclear segmentation. 142 143 Args: 144 path: Filepath to a folder where the downloaded data will be saved. 145 patch_shape: The patch shape to use for training. 146 source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). 147 Can also be a list of sources. If None, all sources will be used. 148 download: Whether to download the data if it is not present. 149 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 150 151 Returns: 152 The segmentation dataset. 153 """ 154 h5_paths = get_lpc_nucseg_paths(path, source, download) 155 156 kwargs, _ = util.add_instance_label_transform( 157 kwargs, add_binary_target=True, 158 ) 159 kwargs = util.ensure_transforms(ndim=2, **kwargs) 160 161 return torch_em.default_segmentation_dataset( 162 raw_paths=h5_paths, 163 raw_key="raw", 164 label_paths=h5_paths, 165 label_key="labels", 166 patch_shape=patch_shape, 167 ndim=2, 168 **kwargs 169 )
Get the LPC NucSeg dataset for nuclear segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). Can also be a list of sources. If None, all sources will be used.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_lpc_nucseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], source: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
172def get_lpc_nucseg_loader( 173 path: Union[os.PathLike, str], 174 batch_size: int, 175 patch_shape: Tuple[int, int], 176 source: Optional[Union[str, List[str]]] = None, 177 download: bool = False, 178 **kwargs 179) -> DataLoader: 180 """Get the LPC NucSeg dataloader for nuclear segmentation. 181 182 Args: 183 path: Filepath to a folder where the downloaded data will be saved. 184 batch_size: The batch size for training. 185 patch_shape: The patch shape to use for training. 186 source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). 187 Can also be a list of sources. If None, all sources will be used. 188 download: Whether to download the data if it is not present. 189 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 190 191 Returns: 192 The DataLoader. 193 """ 194 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 195 dataset = get_lpc_nucseg_dataset( 196 path=path, 197 patch_shape=patch_shape, 198 source=source, 199 download=download, 200 **ds_kwargs, 201 ) 202 return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
Get the LPC NucSeg dataloader for nuclear segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- source: The data source(s) to use. One of 'gnf' (U2OS cells) or 'ic100' (NIH3T3 cells). Can also be a list of sources. If None, all sources will be used.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
`torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.