torch_em.data.datasets.electron_microscopy.lucchi

The Lucchi dataset is a segmentation dataset for mitochondrion segmentation in electron microscopy.

The dataset was published in https://doi.org/10.48550/arXiv.1812.06024. Please cite this publication if you use the dataset in your research. We use the version of the dataset from https://sites.google.com/view/connectomics/.

View Source

  1"""The Lucchi dataset is a segmentation dataset for mitochondrion segmentation in electron microscopy.
  2
  3The dataset was published in https://doi.org/10.48550/arXiv.1812.06024.
  4Please cite this publication if you use the dataset in your research.
  5We use the version of the dataset from https://sites.google.com/view/connectomics/.
  6"""
  7
  8import os
  9from glob import glob
 10from tqdm import tqdm
 11from shutil import rmtree
 12from concurrent import futures
 13from typing import Tuple, Union, Literal
 14
 15import imageio
 16import numpy as np
 17
 18import torch_em
 19
 20from torch.utils.data import Dataset, DataLoader
 21
 22from .. import util
 23
 24
 25URL = "http://www.casser.io/files/lucchi_pp.zip"
 26CHECKSUM = "770ce9e98fc6f29c1b1a250c637e6c5125f2b5f1260e5a7687b55a79e2e8844d"
 27
 28
 29def _load_volume(path, pattern):
 30    nz = len(glob(os.path.join(path, "*.png")))
 31    im0 = imageio.imread(os.path.join(path, pattern % 0))
 32    out = np.zeros((nz,) + im0.shape, dtype=im0.dtype)
 33    out[0] = im0
 34
 35    def _loadz(z):
 36        im = imageio.imread(os.path.join(path, pattern % z))
 37        out[z] = im
 38
 39    n_threads = 8
 40    with futures.ThreadPoolExecutor(n_threads) as tp:
 41        list(tqdm(
 42            tp.map(_loadz, range(1, nz)), desc="Load volume", total=nz-1
 43        ))
 44
 45    return out
 46
 47
 48def _create_data(root, inputs, out_path):
 49    import h5py
 50
 51    raw = _load_volume(os.path.join(root, inputs[0]), pattern="mask%04i.png")
 52    labels_argb = _load_volume(os.path.join(root, inputs[1]), pattern="%i.png")
 53    if labels_argb.ndim == 4:
 54        labels = np.zeros(raw.shape, dtype="uint8")
 55        fg_mask = (labels_argb == np.array([255, 255, 255, 255])[None, None, None]).all(axis=-1)
 56        labels[fg_mask] = 1
 57    else:
 58        assert labels_argb.ndim == 3
 59        labels = labels_argb
 60        labels[labels == 255] = 1
 61    assert (np.unique(labels) == np.array([0, 1])).all()
 62    assert raw.shape == labels.shape, f"{raw.shape}, {labels.shape}"
 63    with h5py.File(out_path, "w") as f:
 64        f.create_dataset("raw", data=raw, compression="gzip")
 65        f.create_dataset("labels", data=labels.astype("uint8"), compression="gzip")
 66
 67
 68def get_lucchi_data(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False) -> str:
 69    """Download the Lucchi dataset.
 70
 71    Args:
 72        path: Filepath to a folder where the downloaded data will be saved.
 73        split: The split to download, either 'train' or 'test'.
 74        download: Whether to download the data if it is not present.
 75
 76    Returns:
 77        The filepath for the downloaded data.
 78    """
 79    data_path = os.path.join(path, f"lucchi_{split}.h5")
 80    if os.path.exists(data_path):
 81        return data_path
 82
 83    os.makedirs(path, exist_ok=True)
 84    tmp_path = os.path.join(path, "lucchi.zip")
 85    util.download_source(tmp_path, URL, download, checksum=CHECKSUM)
 86    util.unzip(tmp_path, path, remove=True)
 87
 88    root = os.path.join(path, "Lucchi++")
 89    assert os.path.exists(root), root
 90
 91    inputs = [["Test_In", "Test_Out"], ["Train_In", "Train_Out"]]
 92    outputs = ["lucchi_train.h5", "lucchi_test.h5"]
 93    for inp, out in zip(inputs, outputs):
 94        out_path = os.path.join(path, out)
 95        _create_data(root, inp, out_path)
 96    rmtree(root)
 97
 98    assert os.path.exists(data_path), data_path
 99    return data_path
100
101
def get_lucchi_paths(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False) -> str:
    """Get the path to the Lucchi data for one split, fetching the data first if needed.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath for the stored data.
    """
    # `get_lucchi_data` returns exactly `<path>/lucchi_<split>.h5`, so its result is passed on.
    return get_lucchi_data(path, split, download)
116
117
def get_lucchi_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the segmentation dataset for mitochondria in the Lucchi EM data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "test")

    # Raw data and labels are stored in the same HDF5 file, under the keys 'raw' and 'labels'.
    volume_path = get_lucchi_paths(path, split, download)
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_path, raw_key="raw",
        label_paths=volume_path, label_key="labels",
        patch_shape=patch_shape, **kwargs
    )
149
150
def get_lucchi_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the dataloader for mitochondria segmentation in the Lucchi EM data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The PyTorch DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_lucchi_dataset(path, split, patch_shape, download=download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )

URL = 'http://www.casser.io/files/lucchi_pp.zip'

CHECKSUM = '770ce9e98fc6f29c1b1a250c637e6c5125f2b5f1260e5a7687b55a79e2e8844d'

def get_lucchi_data( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> str: View Source

 69def get_lucchi_data(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False) -> str:
 70    """Download the Lucchi dataset.
 71
 72    Args:
 73        path: Filepath to a folder where the downloaded data will be saved.
 74        split: The split to download, either 'train' or 'test'.
 75        download: Whether to download the data if it is not present.
 76
 77    Returns:
 78        The filepath for the downloaded data.
 79    """
 80    data_path = os.path.join(path, f"lucchi_{split}.h5")
 81    if os.path.exists(data_path):
 82        return data_path
 83
 84    os.makedirs(path, exist_ok=True)
 85    tmp_path = os.path.join(path, "lucchi.zip")
 86    util.download_source(tmp_path, URL, download, checksum=CHECKSUM)
 87    util.unzip(tmp_path, path, remove=True)
 88
 89    root = os.path.join(path, "Lucchi++")
 90    assert os.path.exists(root), root
 91
 92    inputs = [["Test_In", "Test_Out"], ["Train_In", "Train_Out"]]
 93    outputs = ["lucchi_train.h5", "lucchi_test.h5"]
 94    for inp, out in zip(inputs, outputs):
 95        out_path = os.path.join(path, out)
 96        _create_data(root, inp, out_path)
 97    rmtree(root)
 98
 99    assert os.path.exists(data_path), data_path
100    return data_path

Download the Lucchi dataset and convert it to HDF5 files.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The split to download, either 'train' or 'test'.
download: Whether to download the data if it is not present.

Returns:

The filepath for the downloaded data.

def get_lucchi_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> str: View Source

def get_lucchi_paths(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False) -> str:
    """Get the path to the Lucchi data for one split, fetching the data first if needed.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath for the stored data.
    """
    # `get_lucchi_data` returns exactly `<path>/lucchi_<split>.h5`, so its result is passed on.
    return get_lucchi_data(path, split, download)

Get the path to the Lucchi data for the given split.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split. Either 'train' or 'test'.
download: Whether to download the data if it is not present.

Returns:

The filepath for the stored data.

def get_lucchi_dataset( path: Union[os.PathLike, str], split: Literal['train', 'test'], patch_shape: Tuple[int, int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_lucchi_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    patch_shape: Tuple[int, int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the segmentation dataset for mitochondria in the Lucchi EM data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert split in ("train", "test")

    # Raw data and labels are stored in the same HDF5 file, under the keys 'raw' and 'labels'.
    volume_path = get_lucchi_paths(path, split, download)
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_path, raw_key="raw",
        label_paths=volume_path, label_key="labels",
        patch_shape=patch_shape, **kwargs
    )

Get dataset for EM mitochondrion segmentation in the Lucchi dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split. Either 'train' or 'test'.
patch_shape: The patch shape to use for training.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_lucchi_loader( path: Union[os.PathLike, str], split: Literal['train', 'test'], patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_lucchi_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the dataloader for mitochondria segmentation in the Lucchi EM data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split. Either 'train' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The PyTorch DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_lucchi_dataset(path, split, patch_shape, download=download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )

Get dataloader for EM mitochondrion segmentation in the Lucchi dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split. Either 'train' or 'test'.
patch_shape: The patch shape to use for training.
batch_size: The batch size for training.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The PyTorch DataLoader.