torch_em.data.datasets.light_microscopy.idr0095

The IDR0095 dataset (idr0095-ali-asymmetry) contains fluorescence microscopy images of Escherichia coli cells from three experiments studying regulatory asymmetry in transcription factor autoregulatory gene networks.

Each acquisition contains three imaging channels:

  • Phase contrast (channel 0): cell morphology - used as raw input for segmentation
  • mCherry (channel 1): transcription factor gene expression level
  • YFP (channel 2): downstream target gene expression level

The Phase channel images are extracted from Nikon ND2 files and paired with hand-segmented cell instance masks. Note: annotations are sparse - not all visible cells in each field of view are labeled. Reading ND2 files requires the nd2 package: pip install nd2

The data is hosted on the EBI FTP server at ftp.ebi.ac.uk/pub/databases/IDR/idr0095-ali-asymmetry/. The dataset accession on IDR is idr0095.

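This dataset is from the following publication:

  • Ali et al. (2020): https://doi.org/10.7554/eLife.56517

Please cite it if you use this dataset in your research.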

  1"""The IDR0095 dataset (idr0095-ali-asymmetry) contains fluorescence microscopy
  2images of Escherichia coli cells from three experiments studying regulatory asymmetry
  3in transcription factor autoregulatory gene networks.
  4
  5Each acquisition contains three imaging channels:
  6- Phase contrast (channel 0): cell morphology - used as raw input for segmentation
  7- mCherry (channel 1): transcription factor gene expression level
  8- YFP (channel 2): downstream target gene expression level
  9
 10The Phase channel images are extracted from Nikon ND2 files and paired with
 11hand-segmented cell instance masks. Note: annotations are sparse - not all
 12visible cells in each field of view are labeled. Reading ND2 files requires
 13the `nd2` package: pip install nd2
 14
 15Data is hosted on EBI FTP: ftp.ebi.ac.uk/pub/databases/IDR/idr0095-ali-asymmetry/
 16The dataset accession on IDR is idr0095.
 17
 18This dataset is from the following publication:
 19- Ali et al. (2020): https://doi.org/10.7554/eLife.56517
 20Please cite it if you use this dataset in your research.
 21"""
 22
 23import ftplib
 24import os
 25from glob import glob
 26from natsort import natsorted
 27from typing import List, Literal, Tuple, Union
 28
 29import numpy as np
 30import imageio.v3 as imageio
 31from tqdm import tqdm
 32
 33from torch.utils.data import Dataset, DataLoader
 34
 35import torch_em
 36
 37from .. import util
 38
 39
 40FTP_HOST = "ftp.ebi.ac.uk"
 41FTP_BASE = "/pub/databases/IDR/idr0095-ali-asymmetry/20200804-ftp"
 42
 43EXPERIMENT_DIRS = {
 44    "A": "Experiment_A_Figure3",
 45    "B": "Experiment_B_Figure4",
 46    "C": "Experiment_C_Figure5",
 47}
 48
 49
 50def _ftp_download_recursive(ftp: ftplib.FTP, remote_dir: str, local_dir: str) -> None:
 51    os.makedirs(local_dir, exist_ok=True)
 52    ftp.cwd(remote_dir)
 53
 54    entries = []
 55    ftp.retrlines("LIST", entries.append)
 56
 57    for entry in entries:
 58        parts = entry.split()
 59        name = parts[-1]
 60        is_dir = entry.startswith("d")
 61        local_path = os.path.join(local_dir, name)
 62
 63        if is_dir:
 64            _ftp_download_recursive(ftp, f"{remote_dir}/{name}", local_path)
 65            ftp.cwd(remote_dir)  # return to parent after recursion
 66        else:
 67            if not os.path.exists(local_path):
 68                with open(local_path, "wb") as f:
 69                    ftp.retrbinary(f"RETR {name}", f.write)
 70
 71
 72def _get_phase_channel_index(nd2_file) -> int:
 73    """Return the index of the Phase channel, defaulting to 0."""
 74    try:
 75        names = [ch.channel.name.lower() for ch in nd2_file.metadata.channels]
 76        for i, name in enumerate(names):
 77            if "phase" in name or "bf" in name or "trans" in name:
 78                return i
 79    except Exception:
 80        pass
 81    return 0
 82
 83
 84def _extract_phase_tiffs(exp_dir: str, experiment: str) -> None:
 85    """Extract Phase channel frames from all ND2 files in an experiment directory."""
 86    try:
 87        import nd2
 88    except ImportError:
 89        raise ImportError(
 90            "The 'nd2' package is required to read ND2 files from IDR0095. "
 91            "Please install it with: pip install nd2"
 92        )
 93
 94    nd2_files = natsorted(glob(os.path.join(exp_dir, "**", "*.nd2"), recursive=True))
 95    if not nd2_files:
 96        raise RuntimeError(f"No ND2 files found in {exp_dir}.")
 97
 98    for nd2_path in tqdm(nd2_files, desc=f"Extracting Phase TIFFs (Experiment {experiment})"):
 99        condition = os.path.splitext(os.path.basename(nd2_path))[0]
100        mask_dir = os.path.join(os.path.dirname(nd2_path), condition)
101
102        if not os.path.isdir(mask_dir):
103            continue
104
105        mask_paths = natsorted(glob(os.path.join(mask_dir, "*-Mask.tif")))
106        if not mask_paths:
107            continue
108
109        phase_paths = [p.replace("-Mask.tif", "-Phase.tif") for p in mask_paths]
110        if all(os.path.exists(p) for p in phase_paths):
111            continue  # already extracted
112
113        try:
114            with nd2.ND2File(nd2_path) as f:
115                phase_idx = _get_phase_channel_index(f)
116                arr = f.asarray()  # shape varies by acquisition settings
117        except Exception as e:
118            print(f"Warning: skipping {nd2_path} - could not read ND2 file: {e}")
119            continue
120
121        # Normalize to (N_fields, N_channels, H, W).
122        # nd2.asarray() may return (P, C, Y, X), (C, Y, X), (Y, X), etc.
123        if arr.ndim == 2:
124            arr = arr[np.newaxis, np.newaxis]  # (1, 1, H, W)
125        elif arr.ndim == 3:
126            arr = arr[:, np.newaxis]  # (P, 1, H, W) or (C, H, W)?
127        # If 4-D, assume (P, C, H, W) - standard nd2 layout for multi-position/channel.
128
129        n_frames = arr.shape[0]
130
131        for i, (mask_path, phase_path) in enumerate(zip(mask_paths, phase_paths)):
132            if os.path.exists(phase_path):
133                continue
134            frame_idx = min(i, n_frames - 1)
135            frame = arr[frame_idx, phase_idx] if arr.ndim == 4 else arr[frame_idx, 0]
136            imageio.imwrite(phase_path, frame.astype(np.uint16))
137
138
def get_idr0095_data(
    path: Union[os.PathLike, str],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
) -> str:
    """Fetch one IDR0095 experiment from EBI FTP and prepare its Phase channel TIFFs.

    NOTE: The experiments are large - roughly 6 GB for A, 9 GB for B and 18 GB for C.
    The raw images are stored as Nikon ND2 files, so the `nd2` package
    (pip install nd2) is needed for the Phase channel extraction.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        experiment: The experiment to download. One of 'A', 'B', or 'C',
            corresponding to Figures 3, 4, and 5 of Ali et al. (2020).
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data directory containing the experiment folder.
    """
    assert experiment in EXPERIMENT_DIRS, \
        f"'{experiment}' is not a valid experiment. Choose from {list(EXPERIMENT_DIRS)}."

    folder_name = EXPERIMENT_DIRS[experiment]
    data_dir = os.path.join(path, "idr0095")
    exp_dir = os.path.join(data_dir, folder_name)

    if download:
        os.makedirs(data_dir, exist_ok=True)
        print(f"Connecting to {FTP_HOST} to download IDR0095 Experiment {experiment} ...")
        print("This may take a long time depending on experiment size (~6–18 GB).")
        remote_dir = f"{FTP_BASE}/{folder_name}"
        with ftplib.FTP(FTP_HOST) as ftp:
            ftp.login()  # anonymous login
            # Files that are already present locally are skipped by the
            # download helper, so repeating the download is safe.
            _ftp_download_recursive(ftp, remote_dir, exp_dir)
    elif not os.path.exists(exp_dir):
        raise RuntimeError(
            f"IDR0095 experiment {experiment} not found at {exp_dir}. "
            "Set download=True to download from EBI FTP."
        )

    # Runs on every call; the ND2 files found below the experiment folder are processed.
    _extract_phase_tiffs(exp_dir, experiment)
    return data_dir
182
183
def get_idr0095_paths(
    path: Union[os.PathLike, str],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Collect matching Phase-channel image and segmentation mask files of IDR0095.

    Only masks for which the corresponding `*-Phase.tif` file exists are returned.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the Phase-channel TIFF images.
        List of filepaths to the instance segmentation mask TIFFs.

    Raises:
        RuntimeError: If no mask with a matching Phase TIFF is found.
    """
    data_dir = get_idr0095_data(path, experiment, download)
    exp_dir = os.path.join(data_dir, EXPERIMENT_DIRS[experiment])

    raw_paths, label_paths = [], []
    mask_pattern = os.path.join(exp_dir, "**", "*-Mask.tif")
    for mask_file in natsorted(glob(mask_pattern, recursive=True)):
        # The Phase image is stored next to its mask and differs only in the name suffix.
        phase_file = mask_file.replace("-Mask.tif", "-Phase.tif")
        if os.path.exists(phase_file):
            raw_paths.append(phase_file)
            label_paths.append(mask_file)

    if not raw_paths:
        raise RuntimeError(
            f"No Phase TIFFs found in {exp_dir}. "
            "Ensure the dataset was downloaded and nd2 is installed for Phase extraction."
        )

    return raw_paths, label_paths
218
219
def get_idr0095_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the IDR0095 segmentation dataset of E. coli cells in phase-contrast images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_idr0095_paths(path, experiment, download)

    # Images and masks are plain TIFF files, hence no internal dataset key is given.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset
249
250
def get_idr0095_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the IDR0095 dataloader of E. coli cells in phase-contrast images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the remaining ones,
    # which are forwarded to the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_idr0095_dataset(
        path, patch_shape, experiment, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader
# Host name of the FTP server the dataset is downloaded from.
FTP_HOST = "ftp.ebi.ac.uk"
# Remote directory on FTP_HOST that contains one sub-folder per experiment.
FTP_BASE = "/pub/databases/IDR/idr0095-ali-asymmetry/20200804-ftp"
# Maps the short experiment identifier to the name of its folder.
EXPERIMENT_DIRS = {
    "A": "Experiment_A_Figure3",
    "B": "Experiment_B_Figure4",
    "C": "Experiment_C_Figure5",
}
def get_idr0095_data( path: Union[os.PathLike, str], experiment: Literal['A', 'B', 'C'] = 'A', download: bool = False) -> str:
def get_idr0095_data(
    path: Union[os.PathLike, str],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
) -> str:
    """Fetch one IDR0095 experiment from EBI FTP and prepare its Phase channel TIFFs.

    NOTE: The experiments are large - roughly 6 GB for A, 9 GB for B and 18 GB for C.
    The raw images are stored as Nikon ND2 files, so the `nd2` package
    (pip install nd2) is needed for the Phase channel extraction.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        experiment: The experiment to download. One of 'A', 'B', or 'C',
            corresponding to Figures 3, 4, and 5 of Ali et al. (2020).
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the data directory containing the experiment folder.
    """
    assert experiment in EXPERIMENT_DIRS, \
        f"'{experiment}' is not a valid experiment. Choose from {list(EXPERIMENT_DIRS)}."

    folder_name = EXPERIMENT_DIRS[experiment]
    data_dir = os.path.join(path, "idr0095")
    exp_dir = os.path.join(data_dir, folder_name)

    if download:
        os.makedirs(data_dir, exist_ok=True)
        print(f"Connecting to {FTP_HOST} to download IDR0095 Experiment {experiment} ...")
        print("This may take a long time depending on experiment size (~6–18 GB).")
        remote_dir = f"{FTP_BASE}/{folder_name}"
        with ftplib.FTP(FTP_HOST) as ftp:
            ftp.login()  # anonymous login
            # Files that are already present locally are skipped by the
            # download helper, so repeating the download is safe.
            _ftp_download_recursive(ftp, remote_dir, exp_dir)
    elif not os.path.exists(exp_dir):
        raise RuntimeError(
            f"IDR0095 experiment {experiment} not found at {exp_dir}. "
            "Set download=True to download from EBI FTP."
        )

    # Runs on every call; the ND2 files found below the experiment folder are processed.
    _extract_phase_tiffs(exp_dir, experiment)
    return data_dir

Download the IDR0095 dataset from EBI FTP and extract Phase channel TIFFs.

NOTE: This dataset is large - Experiment A is ~6 GB, B ~9 GB, C ~18 GB. Raw images are in Nikon ND2 format; the nd2 package (pip install nd2) is required to extract the Phase channel TIFFs on first use.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • experiment: The experiment to download. One of 'A', 'B', or 'C', corresponding to Figures 3, 4, and 5 of Ali et al. (2020).
  • download: Whether to download the data if it is not present.
Returns:

The filepath to the data directory containing the experiment folder.

def get_idr0095_paths( path: Union[os.PathLike, str], experiment: Literal['A', 'B', 'C'] = 'A', download: bool = False) -> Tuple[List[str], List[str]]:
def get_idr0095_paths(
    path: Union[os.PathLike, str],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Collect matching Phase-channel image and segmentation mask files of IDR0095.

    Only masks for which the corresponding `*-Phase.tif` file exists are returned.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the Phase-channel TIFF images.
        List of filepaths to the instance segmentation mask TIFFs.

    Raises:
        RuntimeError: If no mask with a matching Phase TIFF is found.
    """
    data_dir = get_idr0095_data(path, experiment, download)
    exp_dir = os.path.join(data_dir, EXPERIMENT_DIRS[experiment])

    raw_paths, label_paths = [], []
    mask_pattern = os.path.join(exp_dir, "**", "*-Mask.tif")
    for mask_file in natsorted(glob(mask_pattern, recursive=True)):
        # The Phase image is stored next to its mask and differs only in the name suffix.
        phase_file = mask_file.replace("-Mask.tif", "-Phase.tif")
        if os.path.exists(phase_file):
            raw_paths.append(phase_file)
            label_paths.append(mask_file)

    if not raw_paths:
        raise RuntimeError(
            f"No Phase TIFFs found in {exp_dir}. "
            "Ensure the dataset was downloaded and nd2 is installed for Phase extraction."
        )

    return raw_paths, label_paths

Get paths to IDR0095 Phase-channel images and cell segmentation masks.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • experiment: The experiment to use. One of 'A', 'B', or 'C'.
  • download: Whether to download the data if it is not present.
Returns:

A pair of lists: the filepaths to the Phase-channel TIFF images, and the filepaths to the corresponding instance segmentation mask TIFFs.

def get_idr0095_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], experiment: Literal['A', 'B', 'C'] = 'A', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_idr0095_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Build the IDR0095 segmentation dataset of E. coli cells in phase-contrast images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_idr0095_paths(path, experiment, download)

    # Images and masks are plain TIFF files, hence no internal dataset key is given.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        patch_shape=patch_shape,
        **kwargs,
    )
    return dataset

Get the IDR0095 dataset for E. coli phase-contrast cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • experiment: The experiment to use. One of 'A', 'B', or 'C'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_idr0095_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], experiment: Literal['A', 'B', 'C'] = 'A', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_idr0095_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    experiment: Literal["A", "B", "C"] = "A",
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Build the IDR0095 dataloader of E. coli cells in phase-contrast images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        experiment: The experiment to use. One of 'A', 'B', or 'C'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the remaining ones,
    # which are forwarded to the data loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_idr0095_dataset(
        path, patch_shape, experiment, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader

Get the IDR0095 dataloader for E. coli phase-contrast cell segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • experiment: The experiment to use. One of 'A', 'B', or 'C'.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.