torch_em.data.datasets.light_microscopy.idr0095
The IDR0095 dataset (idr0095-ali-asymmetry) contains fluorescence microscopy images of Escherichia coli cells from three experiments studying regulatory asymmetry in transcription factor autoregulatory gene networks.
Each acquisition contains three imaging channels:
- Phase contrast (channel 0): cell morphology - used as raw input for segmentation
- mCherry (channel 1): transcription factor gene expression level
- YFP (channel 2): downstream target gene expression level
The Phase channel images are extracted from Nikon ND2 files and paired with
hand-segmented cell instance masks. Note: annotations are sparse - not all
visible cells in each field of view are labeled. Reading ND2 files requires
the nd2 package: pip install nd2
Data is hosted on EBI FTP: ftp.ebi.ac.uk/pub/databases/IDR/idr0095-ali-asymmetry/ The dataset accession on IDR is idr0095.
This dataset is from the following publication:
- Ali et al. (2020): https://doi.org/10.7554/eLife.56517 Please cite it if you use this dataset in your research.
"""The IDR0095 dataset (idr0095-ali-asymmetry) contains fluorescence microscopy
images of Escherichia coli cells from three experiments studying regulatory asymmetry
in transcription factor autoregulatory gene networks.

Each acquisition contains three imaging channels:
- Phase contrast (channel 0): cell morphology - used as raw input for segmentation
- mCherry (channel 1): transcription factor gene expression level
- YFP (channel 2): downstream target gene expression level

The Phase channel images are extracted from Nikon ND2 files and paired with
hand-segmented cell instance masks. Note: annotations are sparse - not all
visible cells in each field of view are labeled. Reading ND2 files requires
the `nd2` package: pip install nd2

Data is hosted on EBI FTP: ftp.ebi.ac.uk/pub/databases/IDR/idr0095-ali-asymmetry/
The dataset accession on IDR is idr0095.

This dataset is from the following publication:
- Ali et al. (2020): https://doi.org/10.7554/eLife.56517
Please cite it if you use this dataset in your research.
"""

import ftplib
import os
from glob import glob
from natsort import natsorted
from typing import List, Literal, Tuple, Union

import numpy as np
import imageio.v3 as imageio
from tqdm import tqdm

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


FTP_HOST = "ftp.ebi.ac.uk"
FTP_BASE = "/pub/databases/IDR/idr0095-ali-asymmetry/20200804-ftp"

# Maps the short experiment identifier to the folder name used on the FTP server and on disk.
EXPERIMENT_DIRS = {
    "A": "Experiment_A_Figure3",
    "B": "Experiment_B_Figure4",
    "C": "Experiment_C_Figure5",
}


def _ftp_download_recursive(ftp: ftplib.FTP, remote_dir: str, local_dir: str) -> None:
    """Mirror a remote FTP directory tree into a local directory.

    Files that already exist locally are skipped. Each file is first written to a
    temporary '<name>.part' file and only renamed to its final name after the transfer
    finished, so an interrupted download never leaves a truncated file that a later
    run would mistake for a complete one.

    Args:
        ftp: A connected and logged-in FTP client.
        remote_dir: The absolute remote directory to mirror.
        local_dir: The local directory to mirror into. It is created if missing.
    """
    os.makedirs(local_dir, exist_ok=True)
    ftp.cwd(remote_dir)

    entries = []
    ftp.retrlines("LIST", entries.append)

    for entry in entries:
        # A Unix-style LIST line has eight metadata columns followed by the name.
        # Limiting the split keeps names that contain spaces in one piece.
        parts = entry.split(None, 8)
        if not parts:
            continue
        name = parts[-1]
        if entry.startswith("l") and " -> " in name:
            # Symlinks are listed as 'name -> target'; fetch them via the link name.
            name = name.split(" -> ", 1)[0]
        if name in (".", ".."):
            continue  # never recurse into the directory itself or its parent

        local_path = os.path.join(local_dir, name)

        if entry.startswith("d"):
            _ftp_download_recursive(ftp, f"{remote_dir}/{name}", local_path)
            ftp.cwd(remote_dir)  # return to parent after recursion
        elif not os.path.exists(local_path):
            tmp_path = local_path + ".part"
            try:
                with open(tmp_path, "wb") as f:
                    ftp.retrbinary(f"RETR {name}", f.write)
                os.replace(tmp_path, local_path)
            finally:
                # Only still present if the transfer (or the rename) failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)


def _get_phase_channel_index(nd2_file) -> int:
    """Return the index of the Phase channel, defaulting to 0.

    The channel names are read from `nd2_file.metadata.channels`; the first channel
    whose lower-cased name contains 'phase', 'bf' or 'trans' is selected.
    """
    try:
        names = [ch.channel.name.lower() for ch in nd2_file.metadata.channels]
    except Exception:
        # Deliberately best-effort: missing or unexpected metadata falls back to channel 0.
        return 0

    for i, name in enumerate(names):
        if "phase" in name or "bf" in name or "trans" in name:
            return i
    return 0


def _extract_phase_tiffs(exp_dir: str, experiment: str) -> None:
    """Extract Phase channel frames from all ND2 files in an experiment directory.

    For every ND2 file '<condition>.nd2' that has a sibling folder '<condition>' containing
    '*-Mask.tif' files, one '*-Phase.tif' is written next to each mask. The i-th mask
    (in natural sort order) is paired with the i-th frame of the ND2 file; if there are
    more masks than frames the last frame is reused. ND2 files that cannot be read are
    skipped with a warning.

    Args:
        exp_dir: The local directory of the experiment.
        experiment: The experiment identifier, only used for the progress bar description.

    Raises:
        ImportError: If the `nd2` package is not installed.
        RuntimeError: If no ND2 files are found below `exp_dir`.
    """
    try:
        import nd2
    except ImportError as err:
        raise ImportError(
            "The 'nd2' package is required to read ND2 files from IDR0095. "
            "Please install it with: pip install nd2"
        ) from err

    nd2_files = natsorted(glob(os.path.join(exp_dir, "**", "*.nd2"), recursive=True))
    if not nd2_files:
        raise RuntimeError(f"No ND2 files found in {exp_dir}.")

    for nd2_path in tqdm(nd2_files, desc=f"Extracting Phase TIFFs (Experiment {experiment})"):
        condition = os.path.splitext(os.path.basename(nd2_path))[0]
        mask_dir = os.path.join(os.path.dirname(nd2_path), condition)

        if not os.path.isdir(mask_dir):
            continue

        mask_paths = natsorted(glob(os.path.join(mask_dir, "*-Mask.tif")))
        if not mask_paths:
            continue

        phase_paths = [p.replace("-Mask.tif", "-Phase.tif") for p in mask_paths]
        if all(os.path.exists(p) for p in phase_paths):
            continue  # already extracted

        try:
            with nd2.ND2File(nd2_path) as f:
                phase_idx = _get_phase_channel_index(f)
                # Axis names of the file (e.g. 'P', 'C', 'Y', 'X'); used to tell a
                # channel axis apart from a position axis for 3-D data.
                axis_names = tuple(getattr(f, "sizes", {}))
                arr = f.asarray()  # shape varies by acquisition settings
        except Exception as e:
            print(f"Warning: skipping {nd2_path} - could not read ND2 file: {e}")
            continue

        # Normalize to (N_fields, N_channels, H, W).
        if arr.ndim == 2:
            arr = arr[np.newaxis, np.newaxis]  # (1, 1, H, W)
        elif arr.ndim == 3:
            if "C" in axis_names:
                # Single field of view with several channels: (C, H, W) -> (1, C, H, W).
                arr = arr[np.newaxis]
            else:
                # Several frames with a single channel: (P, H, W) -> (P, 1, H, W).
                arr = arr[:, np.newaxis]
        # If 4-D, assume (P, C, H, W) - standard nd2 layout for multi-position/channel.
        # NOTE(review): arrays with more than 4 dimensions are indexed as [frame, 0] below
        # and are not reduced to a single 2-D image - confirm whether such files occur.

        n_frames = arr.shape[0]

        for i, phase_path in enumerate(phase_paths):
            if os.path.exists(phase_path):
                continue
            frame_idx = min(i, n_frames - 1)
            if arr.ndim == 4:
                # Guard against a metadata channel index that exceeds the channel axis.
                channel_idx = min(phase_idx, arr.shape[1] - 1)
                frame = arr[frame_idx, channel_idx]
            else:
                frame = arr[frame_idx, 0]
            imageio.imwrite(phase_path, frame.astype(np.uint16))


def get_idr0095_data(
    path: Union[os.PathLike, str],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
) -> str:
    """Download the IDR0095 dataset from EBI FTP and extract Phase channel TIFFs.

    NOTE: This dataset is large - Experiment A is ~6 GB, B ~9 GB, C ~18 GB.
    Raw images are in Nikon ND2 format; the `nd2` package (pip install nd2)
    is required to extract the Phase channel TIFFs on first use.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        experiment: The experiment to download. One of 'A', 'B', or 'C',
            corresponding to Figures 3, 4, and 5 of Ali et al. (2020).
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data directory containing the experiment folder.
    """
    assert experiment in EXPERIMENT_DIRS, \
        f"'{experiment}' is not a valid experiment. Choose from {list(EXPERIMENT_DIRS)}."

    data_dir = os.path.join(path, "idr0095")
    exp_dir = os.path.join(data_dir, EXPERIMENT_DIRS[experiment])

    if not download and not os.path.exists(exp_dir):
        raise RuntimeError(
            f"IDR0095 experiment {experiment} not found at {exp_dir}. "
            "Set download=True to download from EBI FTP."
        )

    if download:
        os.makedirs(data_dir, exist_ok=True)
        print(f"Connecting to {FTP_HOST} to download IDR0095 Experiment {experiment} ...")
        print("This may take a long time depending on experiment size (~6–18 GB).")
        with ftplib.FTP(FTP_HOST) as ftp:
            ftp.login()  # anonymous login
            # _ftp_download_recursive skips files that already exist, safe to re-run.
            _ftp_download_recursive(ftp, f"{FTP_BASE}/{EXPERIMENT_DIRS[experiment]}", exp_dir)

    _extract_phase_tiffs(exp_dir, experiment)
    return data_dir


def get_idr0095_paths(
    path: Union[os.PathLike, str],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to IDR0095 Phase-channel images and cell segmentation masks.

    Only masks for which a matching Phase TIFF exists are returned.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the Phase-channel TIFF images.
        List of filepaths to the instance segmentation mask TIFFs.
    """
    data_dir = get_idr0095_data(path, experiment, download)
    exp_dir = os.path.join(data_dir, EXPERIMENT_DIRS[experiment])

    raw_paths, label_paths = [], []
    for mask_path in natsorted(glob(os.path.join(exp_dir, "**", "*-Mask.tif"), recursive=True)):
        phase_path = mask_path.replace("-Mask.tif", "-Phase.tif")
        if os.path.exists(phase_path):
            raw_paths.append(phase_path)
            label_paths.append(mask_path)

    if not raw_paths:
        raise RuntimeError(
            f"No Phase TIFFs found in {exp_dir}. "
            "Ensure the dataset was downloaded and nd2 is installed for Phase extraction."
        )

    return raw_paths, label_paths


def get_idr0095_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get the IDR0095 dataset for E. coli phase-contrast cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_idr0095_paths(path, experiment, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )


def get_idr0095_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the IDR0095 dataloader for E. coli phase-contrast cell segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_idr0095_dataset(path, patch_shape, experiment, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_idr0095_data(
    path: Union[os.PathLike, str],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
) -> str:
    """Fetch one IDR0095 experiment from the EBI FTP server and prepare its Phase TIFFs.

    NOTE: The data is large - roughly 6 GB for Experiment A, 9 GB for B and 18 GB for C.
    The raw images are stored as Nikon ND2 files, so the `nd2` package (pip install nd2)
    is needed the first time the Phase channel TIFFs are extracted.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        experiment: Which experiment to fetch: 'A', 'B' or 'C'
            (Figures 3, 4 and 5 of Ali et al. (2020), respectively).
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data directory containing the experiment folder.
    """
    assert experiment in EXPERIMENT_DIRS, \
        f"'{experiment}' is not a valid experiment. Choose from {list(EXPERIMENT_DIRS)}."

    experiment_folder = EXPERIMENT_DIRS[experiment]
    root = os.path.join(path, "idr0095")
    target = os.path.join(root, experiment_folder)

    if download:
        os.makedirs(root, exist_ok=True)
        print(f"Connecting to {FTP_HOST} to download IDR0095 Experiment {experiment} ...")
        print("This may take a long time depending on experiment size (~6–18 GB).")
        with ftplib.FTP(FTP_HOST) as connection:
            connection.login()  # no credentials: anonymous access
            # Existing files are skipped by the helper, so repeated calls are fine.
            _ftp_download_recursive(connection, f"{FTP_BASE}/{experiment_folder}", target)
    elif not os.path.exists(target):
        # Nothing on disk and no permission to download.
        raise RuntimeError(
            f"IDR0095 experiment {experiment} not found at {target}. "
            "Set download=True to download from EBI FTP."
        )

    _extract_phase_tiffs(target, experiment)
    return root
Download the IDR0095 dataset from EBI FTP and extract Phase channel TIFFs.
NOTE: This dataset is large - Experiment A is ~6 GB, B ~9 GB, C ~18 GB.
Raw images are in Nikon ND2 format; the nd2 package (pip install nd2)
is required to extract the Phase channel TIFFs on first use.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- experiment: The experiment to download. One of 'A', 'B', or 'C', corresponding to Figures 3, 4, and 5 of Ali et al. (2020).
- download: Whether to download the data if it is not present.
Returns:
The filepath to the data directory containing the experiment folder.
def get_idr0095_paths(
    path: Union[os.PathLike, str],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Collect matching Phase-channel image and segmentation mask paths for IDR0095.

    A mask is only included when the Phase TIFF with the corresponding name exists.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the Phase-channel TIFF images.
        List of filepaths to the instance segmentation mask TIFFs.
    """
    root = get_idr0095_data(path, experiment, download)
    experiment_root = os.path.join(root, EXPERIMENT_DIRS[experiment])

    mask_pattern = os.path.join(experiment_root, "**", "*-Mask.tif")
    candidate_masks = natsorted(glob(mask_pattern, recursive=True))

    image_files, mask_files = [], []
    for mask_file in candidate_masks:
        phase_file = mask_file.replace("-Mask.tif", "-Phase.tif")
        if not os.path.exists(phase_file):
            continue  # no extracted Phase image for this mask
        image_files.append(phase_file)
        mask_files.append(mask_file)

    if len(image_files) == 0:
        raise RuntimeError(
            f"No Phase TIFFs found in {experiment_root}. "
            "Ensure the dataset was downloaded and nd2 is installed for Phase extraction."
        )

    return image_files, mask_files
Get paths to IDR0095 Phase-channel images and cell segmentation masks.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- experiment: The experiment to use. One of 'A', 'B', or 'C'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the Phase-channel TIFF images. List of filepaths to the instance segmentation mask TIFFs.
def get_idr0095_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the IDR0095 segmentation dataset (E. coli cells in phase contrast).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_idr0095_paths(path, experiment, download)

    # Both images and masks are passed without an internal dataset key.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
Get the IDR0095 dataset for E. coli phase-contrast cell segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- experiment: The experiment to use. One of 'A', 'B', or 'C'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def get_idr0095_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the IDR0095 dataloader (E. coli cells in phase contrast).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    ds = get_idr0095_dataset(path, patch_shape, experiment, download, **dataset_kwargs)
    loader = torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)
    return loader
Get the IDR0095 dataloader for E. coli phase-contrast cell segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- experiment: The experiment to use. One of 'A', 'B', or 'C'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.