torch_em.data.datasets.light_microscopy.pnas_arabidopsis

The PNAS Arabidopsis dataset contains cell segmentation annotations for confocal microscopy images of Arabidopsis plantlets.

NOTE: There is tracking information available for this data.

This dataset is from the publication https://doi.org/10.1073/pnas.1616768113. Please cite it if you use this dataset for your research.

  1"""The PNAS Arabidopsis dataset contains cell segmentation in confocal microscopy images of
  2arabidopsis plantlets.
  3
  4NOTE: There is tracking information available for this data.
  5
  6This dataset is from the publication https://doi.org/10.1073/pnas.1616768113.
  7Please cite it if you use this dataset for your research.
  8"""
  9
 10import os
 11import shutil
 12from glob import glob
 13from tqdm import tqdm
 14from pathlib import Path
 15from natsort import natsorted
 16from typing import Union, Tuple, List
 17
 18import imageio.v3 as imageio
 19
 20from torch.utils.data import Dataset, DataLoader
 21
 22import torch_em
 23
 24from .. import util
 25
 26
 # Download location of the zipped dataset (PNAS.zip, hosted at repository.cam.ac.uk).
 27URL = "https://www.repository.cam.ac.uk/bitstream/handle/1810/262530/PNAS.zip?sequence=4&isAllowed=y"
 # Checksum handed to `util.download_source` together with the URL.
 # NOTE(review): 64 hex digits, presumably a SHA-256 digest of the archive - confirm.
 28CHECKSUM = "39341398389baf6d93c3f652b7e2e8aedc5579c29dfaf2b82b41ebfc3caa05c4"
 29
 30
 31def get_pnas_arabidopsis_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 32    """Download the PNAS Arabidopsis dataset.
 33
 34    Args:
 35        path: Filepath to a folder where the data is downloaded for further processing.
 36        download: Whether to download the data if it is not present.
 37
 38    Returns:
 39        Filepath where the data is downloaded and pre-processed.
 40    """
 41    data_dir = os.path.join(path, "data")
 42    if os.path.exists(data_dir):
 43        return data_dir
 44
 45    os.makedirs(data_dir)
 46
 47    zip_path = os.path.join(path, "PNAS.zip")
 48    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
 49    util.unzip(zip_path=zip_path, dst=data_dir)
 50
 51    # Convert the data to h5 (It's hard to keep a track of filenames as they are not completely consistent)
 52    import h5py
 53
 54    raw_paths = natsorted(glob(os.path.join(data_dir, "PNAS", "plant*", "processed_tiffs", "*trim-acylYFP.tif")))
 55    for rpath in tqdm(raw_paths, desc="Preprocessing images"):
 56        # Let's find the label.
 57        label_path = rpath.replace("processed_tiffs", "segmentation_tiffs")
 58        label_path = glob(label_path.replace(".tif", "*.tif"))
 59
 60        if len(label_path) != 1:
 61            print(f"It seems like there are no matching labels for '{os.path.basename(rpath)}'.")
 62            continue
 63
 64        label_path = label_path[0]
 65
 66        raw = imageio.imread(rpath)
 67        labels = imageio.imread(label_path)
 68
 69        # Store both image and corresponding labels in a h5 file.
 70        vol_path = os.path.join(data_dir, Path(os.path.basename(rpath)).with_suffix(".h5"))
 71        with h5py.File(vol_path, "w") as f:
 72            f.create_dataset("raw", data=raw, dtype=raw.dtype, compression="gzip")
 73            f.create_dataset("labels", data=labels, dtype=labels.dtype, compression="gzip")
 74
 75    # Remove old data folder
 76    shutil.rmtree(os.path.join(path, "data", "PNAS"))
 77
 78    return data_dir
 79
 80
 81def get_pnas_arabidopsis_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
 82    """Get paths to the PNAS Arabidopsis data.
 83
 84    Args:
 85        path: Filepath to a folder where the data is downloaded for further processing.
 86        download: Whether to download the data if it is not present.
 87
 88    Returns:
 89        List of filepaths for the volumetric data.
 90    """
 91    data_dir = get_pnas_arabidopsis_data(path, download)
 92    volume_paths = glob(os.path.join(data_dir, "*.h5"))
 93    return volume_paths
 94
 95
 96def get_pnas_arabidopsis_dataset(
 97    path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], download: bool = False, **kwargs
 98) -> Dataset:
 99    """Get the PNAS Arabidopsis dataset for cell segmentation.
100
101    Args:
102        path: Filepath to a folder where the data is downloaded for further processing.
103        patch_shape: The patch shape to use for training.
104        download: Whether to download the data if it is not present.
105        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
106
107    Returns:
108        The segmentation dataset.
109    """
110    volume_paths = get_pnas_arabidopsis_paths(path, download)
111
112    return torch_em.default_segmentation_dataset(
113        raw_paths=volume_paths,
114        raw_key="raw",
115        label_paths=volume_paths,
116        label_key="labels",
117        patch_shape=patch_shape,
118        is_seg_dataset=True,
119        **kwargs
120    )
121
122
def get_pnas_arabidopsis_loader(
    path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], download: bool = False, **kwargs
) -> DataLoader:
    """Get the PNAS Arabidopsis dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments. They are split via `util.split_kwargs` into arguments for
            `torch_em.default_segmentation_dataset` and arguments that are passed on to `torch_em.get_data_loader`.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    segmentation_dataset = get_pnas_arabidopsis_dataset(path, patch_shape, download, **dataset_kwargs)
    loader = torch_em.get_data_loader(segmentation_dataset, batch_size, **dataloader_kwargs)
    return loader
# Download location of the zipped dataset (PNAS.zip, hosted at repository.cam.ac.uk).
URL = 'https://www.repository.cam.ac.uk/bitstream/handle/1810/262530/PNAS.zip?sequence=4&isAllowed=y'
# Checksum handed to `util.download_source` together with the URL.
# NOTE(review): 64 hex digits, presumably a SHA-256 digest of the archive - confirm.
CHECKSUM = '39341398389baf6d93c3f652b7e2e8aedc5579c29dfaf2b82b41ebfc3caa05c4'
def get_pnas_arabidopsis_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_pnas_arabidopsis_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the PNAS Arabidopsis dataset and convert it to h5 volumes.

    The archive is extracted into `<path>/data`. Each raw image (`*trim-acylYFP.tif` in a `processed_tiffs`
    folder) with exactly one matching file in the neighboring `segmentation_tiffs` folder is written,
    together with its labels, to one h5 file in `<path>/data` (datasets "raw" and "labels").
    Images without a unique label match are skipped with a printed message.
    The extracted `PNAS` folder is removed once the conversion is done.

    If `<path>/data` already exists, it is returned as is and nothing is downloaded or converted.
    If the extraction or conversion fails, `<path>/data` is removed again before the error is re-raised.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded and pre-processed.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    # Import early, so that a missing h5py installation is noticed before anything is downloaded.
    import h5py

    # The archive is stored directly in `path`, so this folder has to exist for the download.
    zip_path = os.path.join(path, "PNAS.zip")
    os.makedirs(os.path.dirname(os.path.abspath(zip_path)), exist_ok=True)
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    # Create the data folder only after the archive is available. Its existence is what marks the data
    # as ready (see the early return above), so it must not be left behind by a failed download.
    os.makedirs(data_dir)
    try:
        util.unzip(zip_path=zip_path, dst=data_dir)

        # Convert the data to h5 (It's hard to keep a track of filenames as they are not completely consistent)
        raw_paths = natsorted(
            glob(os.path.join(data_dir, "PNAS", "plant*", "processed_tiffs", "*trim-acylYFP.tif"))
        )
        for rpath in tqdm(raw_paths, desc="Preprocessing images"):
            raw_dir, raw_name = os.path.split(rpath)
            stem = os.path.splitext(raw_name)[0]

            # The labels are in the 'segmentation_tiffs' folder next to 'processed_tiffs'. Their names start
            # with the name of the raw image, followed by a varying suffix.
            # Build the pattern from the path components, so that only the folder and file name are touched
            # and not other parts of the (user-provided) path.
            label_dir = os.path.join(os.path.dirname(raw_dir), "segmentation_tiffs")
            label_candidates = glob(os.path.join(label_dir, f"{stem}*.tif"))

            if len(label_candidates) != 1:
                print(
                    f"Expected exactly one label file for '{raw_name}', "
                    f"but found {len(label_candidates)}. Skipping this image."
                )
                continue

            raw = imageio.imread(rpath)
            labels = imageio.imread(label_candidates[0])

            # Store both image and corresponding labels in a h5 file.
            vol_path = os.path.join(data_dir, f"{stem}.h5")
            with h5py.File(vol_path, "w") as f:
                f.create_dataset("raw", data=raw, dtype=raw.dtype, compression="gzip")
                f.create_dataset("labels", data=labels, dtype=labels.dtype, compression="gzip")

        # Remove old data folder
        shutil.rmtree(os.path.join(data_dir, "PNAS"))

    except BaseException:
        # Do not leave a partially processed folder behind: it would be mistaken for complete data
        # by the early return on the next call.
        shutil.rmtree(data_dir, ignore_errors=True)
        raise

    return data_dir

Download the PNAS Arabidopsis dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is downloaded and pre-processed.

def get_pnas_arabidopsis_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
def get_pnas_arabidopsis_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
    """Get paths to the PNAS Arabidopsis data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the volumetric data, in natural sort order.
    """
    data_dir = get_pnas_arabidopsis_data(path, download)
    # `glob` yields the files in arbitrary order. Sort them, so that the order of the volumes
    # (and with it the dataset built from them) is the same on every run and file system.
    volume_paths = natsorted(glob(os.path.join(data_dir, "*.h5")))
    return volume_paths

Get paths to the PNAS Arabidopsis data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the volumetric data.

def get_pnas_arabidopsis_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 97def get_pnas_arabidopsis_dataset(
 98    path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], download: bool = False, **kwargs
 99) -> Dataset:
100    """Get the PNAS Arabidopsis dataset for cell segmentation.
101
102    Args:
103        path: Filepath to a folder where the data is downloaded for further processing.
104        patch_shape: The patch shape to use for training.
105        download: Whether to download the data if it is not present.
106        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
107
108    Returns:
109        The segmentation dataset.
110    """
111    volume_paths = get_pnas_arabidopsis_paths(path, download)
112
113    return torch_em.default_segmentation_dataset(
114        raw_paths=volume_paths,
115        raw_key="raw",
116        label_paths=volume_paths,
117        label_key="labels",
118        patch_shape=patch_shape,
119        is_seg_dataset=True,
120        **kwargs
121    )

Get the PNAS Arabidopsis dataset for cell segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_pnas_arabidopsis_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_pnas_arabidopsis_loader(
    path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], download: bool = False, **kwargs
) -> DataLoader:
    """Get the PNAS Arabidopsis dataloader for cell segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments. They are split via `util.split_kwargs` into arguments for
            `torch_em.default_segmentation_dataset` and arguments that are passed on to `torch_em.get_data_loader`.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    segmentation_dataset = get_pnas_arabidopsis_dataset(path, patch_shape, download, **dataset_kwargs)
    loader = torch_em.get_data_loader(segmentation_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the PNAS Arabidopsis data loader for cell segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the data loader (passed on to torch_em.get_data_loader).
Returns:

The DataLoader.