torch_em.data.datasets.electron_microscopy.kasthuri

The Kasthuri dataset is a segmentation dataset for mitochondrion segmentation in electron microscopy.

The dataset was published in https://doi.org/10.48550/arXiv.1812.06024. Please cite this publication if you use the dataset in your research. We use the version of the dataset from https://sites.google.com/view/connectomics/.

View Source

  1"""The Kasthuri dataset is a segmentation dataset for mitochondrion segmentation in electron microscopy.
  2
  3The dataset was published in https://doi.org/10.48550/arXiv.1812.06024.
  4Please cite this publication if you use the dataset in your research.
  5We use the version of the dataset from https://sites.google.com/view/connectomics/.
  6"""
  7
  8import os
  9from glob import glob
 10from tqdm import tqdm
 11from shutil import rmtree
 12from concurrent import futures
 13from typing import Tuple, Union
 14
 15import imageio
 16import numpy as np
 17
 18import torch_em
 19
 20from torch.utils.data import Dataset, DataLoader
 21
 22from .. import util
 23
 24
 25URL = "http://www.casser.io/files/kasthuri_pp.zip "
 26CHECKSUM = "bbb78fd205ec9b57feb8f93ebbdf1666261cbc3e0305e7f11583ab5157a3d792"
 27
 28# TODO: add sampler for foreground (-1 is empty area)
 29# TODO: and masking for the empty space
 30
 31
 32def _load_volume(path):
 33    files = glob(os.path.join(path, "*.png"))
 34    files.sort()
 35    nz = len(files)
 36
 37    im0 = imageio.imread(files[0])
 38    out = np.zeros((nz,) + im0.shape, dtype=im0.dtype)
 39    out[0] = im0
 40
 41    def _loadz(z):
 42        im = imageio.imread(files[z])
 43        out[z] = im
 44
 45    n_threads = 8
 46    with futures.ThreadPoolExecutor(n_threads) as tp:
 47        list(tqdm(
 48            tp.map(_loadz, range(1, nz)), desc="Load volume", total=nz-1
 49        ))
 50
 51    return out
 52
 53
 54def _create_data(root, inputs, out_path):
 55    import h5py
 56
 57    raw = _load_volume(os.path.join(root, inputs[0]))
 58    labels_argb = _load_volume(os.path.join(root, inputs[1]))
 59    assert labels_argb.ndim == 4
 60    labels = np.zeros(raw.shape, dtype="int8")
 61
 62    fg_mask = (labels_argb == np.array([255, 255, 255])[None, None, None]).all(axis=-1)
 63    labels[fg_mask] = 1
 64    bg_mask = (labels_argb == np.array([2, 2, 2])[None, None, None]).all(axis=-1)
 65    labels[bg_mask] = -1
 66    assert (np.unique(labels) == np.array([-1, 0, 1])).all()
 67    assert raw.shape == labels.shape, f"{raw.shape}, {labels.shape}"
 68    with h5py.File(out_path, "w") as f:
 69        f.create_dataset("raw", data=raw, compression="gzip")
 70        f.create_dataset("labels", data=labels, compression="gzip")
 71
 72
 73def get_kasthuri_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 74    """Download the kasthuri dataset.
 75
 76    Args:
 77        path: Filepath to a folder where the downloaded data will be saved.
 78        download: Whether to download the data if it is not present.
 79
 80    Returns:
 81        The filepath for the downloaded data.
 82    """
 83    if os.path.exists(path):
 84        return path
 85
 86    os.makedirs(path)
 87    tmp_path = os.path.join(path, "kasthuri.zip")
 88    util.download_source(tmp_path, URL, download, checksum=CHECKSUM)
 89    util.unzip(tmp_path, path, remove=True)
 90
 91    root = os.path.join(path, "Kasthuri++")
 92    assert os.path.exists(root), root
 93
 94    inputs = [["Test_In", "Test_Out"], ["Train_In", "Train_Out"]]
 95    outputs = ["kasthuri_train.h5", "kasthuri_test.h5"]
 96    for inp, out in zip(inputs, outputs):
 97        out_path = os.path.join(path, out)
 98        _create_data(root, inp, out_path)
 99
100    rmtree(root)
101    return path
102
103
104def get_kasthuri_paths(path: Union[os.PathLike, str], split: str, download: bool = False) -> str:
105    """Get paths to the Kasthuri data.
106
107    Args:
108        path: Filepath to a folder where the downloaded data will be saved.
109        split: The data split. Either 'train' or 'test'.
110        download: Whether to download the data if it is not present.
111
112    Returns:
113        The filepath to the stored data.
114    """
115    get_kasthuri_data(path, download)
116    data_path = os.path.join(path, f"kasthuri_{split}.h5")
117    assert os.path.exists(data_path), data_path
118    return data_path
119
120
121def get_kasthuri_dataset(
122    path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int, int], download: bool = False, **kwargs
123) -> Dataset:
124    """Get dataset for EM mitochondrion segmentation in the kasthuri dataset.
125
126    Args:
127        path: Filepath to a folder where the downloaded data will be saved.
128        split: The data split. Either 'train' or 'test'.
129        patch_shape: The patch shape to use for training.
130        download: Whether to download the data if it is not present.
131        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
132
133    Returns:
134        The segmentation dataset.
135    """
136    assert split in ("train", "test")
137
138    data_path = get_kasthuri_paths(path, split, download)
139
140    return torch_em.default_segmentation_dataset(
141        raw_paths=data_path,
142        raw_key="raw",
143        label_paths=data_path,
144        label_key="labels",
145        patch_shape=patch_shape,
146        **kwargs
147    )
148
149
150def get_kasthuri_loader(
151    path: Union[os.PathLike, str],
152    split: str,
153    patch_shape: Tuple[int, int, int],
154    batch_size: int,
155    download: bool = False,
156    **kwargs
157) -> DataLoader:
158    """Get dataloader for EM mitochondrion segmentation in the kasthuri dataset.
159
160    Args:
161        path: Filepath to a folder where the downloaded data will be saved.
162        split: The data split. Either 'train' or 'test'.
163        patch_shape: The patch shape to use for training.
164        batch_size: The batch size for training.
165        download: Whether to download the data if it is not present.
166        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
167
168    Returns:
169        The PyTorch DataLoader.
170    """
171    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
172    dataset = get_kasthuri_dataset(path, split, patch_shape, download=download, **ds_kwargs)
173    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

URL = 'http://www.casser.io/files/kasthuri_pp.zip '

CHECKSUM = 'bbb78fd205ec9b57feb8f93ebbdf1666261cbc3e0305e7f11583ab5157a3d792'

def get_kasthuri_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

 74def get_kasthuri_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 75    """Download the kasthuri dataset.
 76
 77    Args:
 78        path: Filepath to a folder where the downloaded data will be saved.
 79        download: Whether to download the data if it is not present.
 80
 81    Returns:
 82        The filepath for the downloaded data.
 83    """
 84    if os.path.exists(path):
 85        return path
 86
 87    os.makedirs(path)
 88    tmp_path = os.path.join(path, "kasthuri.zip")
 89    util.download_source(tmp_path, URL, download, checksum=CHECKSUM)
 90    util.unzip(tmp_path, path, remove=True)
 91
 92    root = os.path.join(path, "Kasthuri++")
 93    assert os.path.exists(root), root
 94
 95    inputs = [["Test_In", "Test_Out"], ["Train_In", "Train_Out"]]
 96    outputs = ["kasthuri_train.h5", "kasthuri_test.h5"]
 97    for inp, out in zip(inputs, outputs):
 98        out_path = os.path.join(path, out)
 99        _create_data(root, inp, out_path)
100
101    rmtree(root)
102    return path

Download the kasthuri dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The filepath for the downloaded data.

def get_kasthuri_paths(path: Union[os.PathLike, str], split: str, download: bool = False) -> str: View Source

105def get_kasthuri_paths(path: Union[os.PathLike, str], split: str, download: bool = False) -> str:
106    """Get paths to the Kasthuri data.
107
108    Args:
109        path: Filepath to a folder where the downloaded data will be saved.
110        split: The data split. Either 'train' or 'test'.
111        download: Whether to download the data if it is not present.
112
113    Returns:
114        The filepath to the stored data.
115    """
116    get_kasthuri_data(path, download)
117    data_path = os.path.join(path, f"kasthuri_{split}.h5")
118    assert os.path.exists(data_path), data_path
119    return data_path

Get paths to the Kasthuri data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split. Either 'train' or 'test'.
download: Whether to download the data if it is not present.

Returns:

The filepath to the stored data.

def get_kasthuri_dataset( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

122def get_kasthuri_dataset(
123    path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int, int], download: bool = False, **kwargs
124) -> Dataset:
125    """Get dataset for EM mitochondrion segmentation in the kasthuri dataset.
126
127    Args:
128        path: Filepath to a folder where the downloaded data will be saved.
129        split: The data split. Either 'train' or 'test'.
130        patch_shape: The patch shape to use for training.
131        download: Whether to download the data if it is not present.
132        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
133
134    Returns:
135        The segmentation dataset.
136    """
137    assert split in ("train", "test")
138
139    data_path = get_kasthuri_paths(path, split, download)
140
141    return torch_em.default_segmentation_dataset(
142        raw_paths=data_path,
143        raw_key="raw",
144        label_paths=data_path,
145        label_key="labels",
146        patch_shape=patch_shape,
147        **kwargs
148    )

Get dataset for EM mitochondrion segmentation in the kasthuri dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split. Either 'train' or 'test'.
patch_shape: The patch shape to use for training.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_kasthuri_loader( path: Union[os.PathLike, str], split: str, patch_shape: Tuple[int, int, int], batch_size: int, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

151def get_kasthuri_loader(
152    path: Union[os.PathLike, str],
153    split: str,
154    patch_shape: Tuple[int, int, int],
155    batch_size: int,
156    download: bool = False,
157    **kwargs
158) -> DataLoader:
159    """Get dataloader for EM mitochondrion segmentation in the kasthuri dataset.
160
161    Args:
162        path: Filepath to a folder where the downloaded data will be saved.
163        split: The data split. Either 'train' or 'test'.
164        patch_shape: The patch shape to use for training.
165        batch_size: The batch size for training.
166        download: Whether to download the data if it is not present.
167        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
168
169    Returns:
170        The PyTorch DataLoader.
171    """
172    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
173    dataset = get_kasthuri_dataset(path, split, patch_shape, download=download, **ds_kwargs)
174    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get dataloader for EM mitochondrion segmentation in the kasthuri dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split. Either 'train' or 'test'.
patch_shape: The patch shape to use for training.
batch_size: The batch size for training.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The PyTorch DataLoader.