torch_em.data.datasets.light_microscopy.morphonet
The MorphoNet dataset contains 3D segmentation annotations for five organisms from the MorphoNet 2.0 publication.
The dataset provides 3D instance segmentation labels for cell and nucleus segmentation across five model organisms imaged with confocal and light-sheet microscopy:
- Patiria miniata (starfish embryo, confocal, membrane/nuclei) from https://doi.org/10.1242/dev.202362.
- Tribolium castaneum (beetle embryo, light-sheet, nuclei) from https://doi.org/10.1038/s41592-023-01879-y.
- Arabidopsis thaliana (plant shoot apical meristem, confocal, membranes) from https://doi.org/10.1073/pnas.1616768113.
- Caenorhabditis elegans (nematode embryo, confocal, nuclei) from https://doi.org/10.1038/s41592-023-01879-y.
- Phallusia mammillata (ascidian embryo, light-sheet, membranes) from https://doi.org/10.1126/science.aar5663.
The dataset is located at https://doi.org/10.6084/m9.figshare.30529745.v2. This dataset is from the publication https://doi.org/10.7554/eLife.106227.2. Please cite it if you use this dataset in your research.
"""The MorphoNet dataset contains 3D segmentation annotations for five organisms
from the MorphoNet 2.0 publication.

The dataset provides 3D instance segmentation labels for cell and nucleus segmentation
across five model organisms imaged with confocal and light-sheet microscopy:
- Patiria miniata (starfish embryo, confocal, membrane/nuclei) from https://doi.org/10.1242/dev.202362.
- Tribolium castaneum (beetle embryo, light-sheet, nuclei) from https://doi.org/10.1038/s41592-023-01879-y.
- Arabidopsis thaliana (plant shoot apical meristem, confocal, membranes) from https://doi.org/10.1073/pnas.1616768113.
- Caenorhabditis elegans (nematode embryo, confocal, nuclei) from https://doi.org/10.1038/s41592-023-01879-y.
- Phallusia mammillata (ascidian embryo, light-sheet, membranes) from https://doi.org/10.1126/science.aar5663.

The dataset is located at https://doi.org/10.6084/m9.figshare.30529745.v2.
This dataset is from the publication https://doi.org/10.7554/eLife.106227.2.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from typing import Union, Tuple, List, Optional

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# Download link of the zip archive for each organism.
URLS = {
    "patiria_miniata": "https://ndownloader.figshare.com/files/59296676",
    "tribolium_castaneum": "https://ndownloader.figshare.com/files/59296685",
    "arabidopsis_thaliana": "https://ndownloader.figshare.com/files/59296700",
    "caenorhabditis_elegans": "https://ndownloader.figshare.com/files/59296703",
    "phallusia_mammillata": "https://ndownloader.figshare.com/files/59296712",
}

# Name of the folder expected under the data root for each organism after unzipping.
DIR_NAMES = {
    "patiria_miniata": "Patiria miniata",
    "tribolium_castaneum": "Tribolium castaneum",
    "arabidopsis_thaliana": "Arabidopsis thaliana",
    "caenorhabditis_elegans": "Caenorhabditis elegans",
    "phallusia_mammillata": "Phallusia mammillata",
}

ORGANISMS = list(URLS.keys())


def _get_tif_files(directory):
    """Get all TIF/TIFF files from a directory (non-recursive), skipping dot-files."""
    files = glob(os.path.join(directory, "*.tif")) + glob(os.path.join(directory, "*.tiff"))
    # Exclude macOS metadata files.
    files = [f for f in files if not os.path.basename(f).startswith(".")]
    return files


def _match_raw_seg_files(raw_dir, seg_dir, organism):
    """Match RAW and SEG files for a given organism.

    Both folders are listed, naturally sorted and paired by position.

    Args:
        raw_dir: Folder that contains the raw image volumes.
        seg_dir: Folder that contains the segmentation volumes.
        organism: The organism key; used for the Tribolium channel filter and for error messages.

    Returns:
        A list of (raw_path, seg_path) tuples.

    Raises:
        AssertionError: If either folder has no TIF files or if the number of files differs.
    """
    from natsort import natsorted

    raw_files = natsorted(_get_tif_files(raw_dir))
    seg_files = natsorted(_get_tif_files(seg_dir))

    # For Tribolium, filter out the empty channel 0 from RAW (only ch1 has nuclei).
    if organism == "tribolium_castaneum":
        raw_files = [f for f in raw_files if "ch0" not in os.path.basename(f)]

    # The checks are raised explicitly instead of via `assert` so that they are not stripped
    # under `python -O` (where `zip` below would silently truncate mismatched lists).
    # The exception type stays AssertionError for backward compatibility.
    if not raw_files:
        raise AssertionError(f"No RAW files found in {raw_dir}")
    if not seg_files:
        raise AssertionError(f"No SEG files found in {seg_dir}")
    if len(raw_files) != len(seg_files):
        raise AssertionError(
            f"Mismatch for {organism}: {len(raw_files)} RAW files vs {len(seg_files)} SEG files"
        )

    return list(zip(raw_files, seg_files))


def _create_h5_data(path, organism):
    """Create h5 files with raw images and instance segmentation labels.

    One file `t<index>.h5` with the datasets "raw" and "labels" is written per matched
    RAW/SEG pair into `<path>/h5_data/<organism>`. Files that already exist are skipped,
    so the conversion can be resumed.

    Args:
        path: The root folder of the dataset.
        organism: The organism key to convert.

    Returns:
        The folder that contains the h5 files.
    """
    import h5py
    import imageio.v3 as imageio
    from tqdm import tqdm

    h5_dir = os.path.join(path, "h5_data", organism)
    os.makedirs(h5_dir, exist_ok=True)

    org_dir = os.path.join(path, DIR_NAMES[organism])
    raw_dir = os.path.join(org_dir, "published", "RAW")
    seg_dir = os.path.join(org_dir, "published", "SEG")

    pairs = _match_raw_seg_files(raw_dir, seg_dir, organism)

    for i, (raw_path, seg_path) in enumerate(tqdm(pairs, desc=f"Creating h5 for {organism}")):
        h5_path = os.path.join(h5_dir, f"t{i:04d}.h5")

        if os.path.exists(h5_path):
            continue

        raw = imageio.imread(raw_path)
        seg = imageio.imread(seg_path)

        # Write to a temporary name and rename afterwards: an interrupted write then never leaves
        # a truncated `t*.h5` file behind that the existence check above would treat as complete.
        tmp_path = h5_path + ".tmp"
        with h5py.File(tmp_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels", data=seg.astype("int64"), compression="gzip")
        os.replace(tmp_path, h5_path)

    return h5_dir


def get_morphonet_data(
    path: Union[os.PathLike, str],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> str:
    """Download the MorphoNet dataset.

    An organism whose folder already exists under `path` is skipped.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        organism: The organism(s) to download data for. If None, all organisms will be downloaded.
            Available organisms: patiria_miniata, tribolium_castaneum, arabidopsis_thaliana,
            caenorhabditis_elegans, phallusia_mammillata.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.

    Raises:
        AssertionError: If a requested organism is not one of the available organisms.
    """
    if organism is None:
        organism = ORGANISMS
    elif isinstance(organism, str):
        organism = [organism]
    else:
        organism = list(organism)

    # Validate every name before touching the network, so that a typo in a later entry
    # does not trigger downloads of the earlier entries first.
    # Raised explicitly (not via `assert`) so that the check survives `python -O`.
    for org in organism:
        if org not in ORGANISMS:
            raise AssertionError(f"'{org}' is not a valid organism. Choose from {ORGANISMS}.")

    for org in organism:
        org_dir = os.path.join(path, DIR_NAMES[org])
        if os.path.exists(org_dir):
            continue

        os.makedirs(path, exist_ok=True)
        zip_path = os.path.join(path, f"{org}.zip")
        util.download_source(path=zip_path, url=URLS[org], download=download, checksum=None)
        util.unzip(zip_path=zip_path, dst=path)

    return path


def get_morphonet_paths(
    path: Union[os.PathLike, str],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the MorphoNet data.

    The data is downloaded if necessary and converted to h5 files on first use.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.

    Raises:
        AssertionError: If no h5 files are found for the requested organism(s).
    """
    from natsort import natsorted

    if organism is None:
        organism = ORGANISMS
    elif isinstance(organism, str):
        organism = [organism]

    get_morphonet_data(path, organism, download)

    all_h5_paths = []
    for org in organism:
        h5_pattern = os.path.join(path, "h5_data", org, "*.h5")

        # One h5 file is created per SEG volume. Converting whenever fewer files than that exist
        # (and not only when none exist) resumes a conversion that was interrupted part-way;
        # `_create_h5_data` skips the files that were already written.
        seg_dir = os.path.join(path, DIR_NAMES[org], "published", "SEG")
        n_expected = len(_get_tif_files(seg_dir))
        if len(glob(h5_pattern)) < max(n_expected, 1):
            _create_h5_data(path, org)

        all_h5_paths.extend(glob(h5_pattern))

    # Raised explicitly (not via `assert`) so that the check survives `python -O`.
    if len(all_h5_paths) == 0:
        raise AssertionError(f"No data found for organism(s) '{organism}'")

    return natsorted(all_h5_paths)


def get_morphonet_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the MorphoNet dataset for 3D cell/nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_morphonet_paths(path, organism, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True,
    )
    kwargs = util.ensure_transforms(ndim=3, **kwargs)

    # Raw data and labels are stored in the same h5 files under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=3,
        **kwargs
    )


def get_morphonet_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the MorphoNet dataloader for 3D cell/nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_morphonet_dataset(
        path=path,
        patch_shape=patch_shape,
        organism=organism,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
def get_morphonet_data(
    path: Union[os.PathLike, str],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> str:
    """Download the MorphoNet dataset.

    An organism whose folder already exists under `path` is skipped.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        organism: The organism(s) to download data for. If None, all organisms will be downloaded.
            Available organisms: patiria_miniata, tribolium_castaneum, arabidopsis_thaliana,
            caenorhabditis_elegans, phallusia_mammillata.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.

    Raises:
        AssertionError: If a requested organism is not one of the available organisms.
    """
    if organism is None:
        organism = ORGANISMS
    elif isinstance(organism, str):
        organism = [organism]
    else:
        organism = list(organism)

    # Validate every name before touching the network, so that a typo in a later entry
    # does not trigger downloads of the earlier entries first.
    # Raised explicitly (not via `assert`) so that the check survives `python -O`;
    # the exception type stays AssertionError for backward compatibility.
    for org in organism:
        if org not in ORGANISMS:
            raise AssertionError(f"'{org}' is not a valid organism. Choose from {ORGANISMS}.")

    for org in organism:
        org_dir = os.path.join(path, DIR_NAMES[org])
        if os.path.exists(org_dir):
            continue

        os.makedirs(path, exist_ok=True)
        zip_path = os.path.join(path, f"{org}.zip")
        util.download_source(path=zip_path, url=URLS[org], download=download, checksum=None)
        util.unzip(zip_path=zip_path, dst=path)

    return path
Download the MorphoNet dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- organism: The organism(s) to download data for. If None, all organisms will be downloaded. Available organisms: patiria_miniata, tribolium_castaneum, arabidopsis_thaliana, caenorhabditis_elegans, phallusia_mammillata.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the directory with the data.
def get_morphonet_paths(
    path: Union[os.PathLike, str],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the MorphoNet data.

    The data is downloaded if necessary and converted to h5 files on first use.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data.

    Raises:
        AssertionError: If no h5 files are found for the requested organism(s).
    """
    from natsort import natsorted

    if organism is None:
        organism = ORGANISMS
    elif isinstance(organism, str):
        organism = [organism]

    get_morphonet_data(path, organism, download)

    all_h5_paths = []
    for org in organism:
        h5_pattern = os.path.join(path, "h5_data", org, "*.h5")

        # One h5 file is created per SEG volume. Converting whenever fewer files than that exist
        # (and not only when none exist) resumes a conversion that was interrupted part-way;
        # `_create_h5_data` skips the files that were already written.
        seg_dir = os.path.join(path, DIR_NAMES[org], "published", "SEG")
        n_expected = len(_get_tif_files(seg_dir))
        if len(glob(h5_pattern)) < max(n_expected, 1):
            _create_h5_data(path, org)

        all_h5_paths.extend(glob(h5_pattern))

    # Raised explicitly (not via `assert`) so that the check survives `python -O`;
    # the exception type stays AssertionError for backward compatibility.
    if len(all_h5_paths) == 0:
        raise AssertionError(f"No data found for organism(s) '{organism}'")

    return natsorted(all_h5_paths)
Get paths to the MorphoNet data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- organism: The organism(s) to use. If None, all organisms will be used.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the h5 data.
def get_morphonet_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the 3D MorphoNet segmentation dataset (cells / nuclei).

    The h5 volumes are resolved via `get_morphonet_paths`; raw data is read from the key "raw"
    and labels from the key "labels" of the same files.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_morphonet_paths(path, organism, download)

    # Set up the label transform for instance labels (with a binary target) and
    # make sure the transforms for 3D data are present in the dataset arguments.
    label_kwargs, _unused = util.add_instance_label_transform(kwargs, add_binary_target=True)
    dataset_kwargs = util.ensure_transforms(ndim=3, **label_kwargs)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=3,
        **dataset_kwargs
    )
    return dataset
Get the MorphoNet dataset for 3D cell/nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- organism: The organism(s) to use. If None, all organisms will be used.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_morphonet_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build a DataLoader over the 3D MorphoNet segmentation dataset (cells / nuclei).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    split = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset_kwargs, dataloader_kwargs = split

    morphonet_dataset = get_morphonet_dataset(
        path=path, patch_shape=patch_shape, organism=organism, download=download, **dataset_kwargs
    )

    loader = torch_em.get_data_loader(dataset=morphonet_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
Get the MorphoNet dataloader for 3D cell/nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- organism: The organism(s) to use. If None, all organisms will be used.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.