torch_em.data.datasets.electron_microscopy.mitoem

MitoEM is a dataset for segmenting mitochondria in electron microscopy. It contains two large annotated volumes, one from rat cortex, the other from human cortex. This dataset was used for a segmentation challenge at ISBI 2022.

If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
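
For orientation, a minimal usage sketch is shown below. The download folder, patch shape and training options are illustrative choices, not values prescribed by the module:

from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_loader

# Illustrative settings: "./data/mitoem" is a hypothetical local folder,
# and (32, 256, 256) is an example 3D patch shape in (z, y, x) order.
loader = get_mitoem_loader(
    path="./data/mitoem",
    splits="train",              # 'train' or 'val'
    patch_shape=(32, 256, 256),
    batch_size=1,
    samples=("rat",),            # 'human', 'rat', or both
    boundaries=True,             # train against boundary targets
    download=True,               # the download is large and can take hours
)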

  1"""MitoEM is a dataset for segmenting mitochondria in electron microscopy.
  2It contains two large annotated volumes, one from rat cortex, the other from human cortex.
  3This dataset was used for a segmentation challenge at ISBI 2022.
  4
  5If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
  6"""
  7
  8import os
  9from tqdm import tqdm
 10import multiprocessing
 11from shutil import rmtree
 12from concurrent import futures
 13from typing import List, Optional, Sequence, Tuple, Union
 14
 15import imageio
 16import numpy as np
 17
 18import torch_em
 19
 20from torch.utils.data import Dataset, DataLoader
 21
 22from .. import util
 23
 24
 25URLS = {
 26    "raw": {
 27        "human": "https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1",
 28        "rat": "https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip"
 29    },
 30    "labels": {
 31        "human": "https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1",
 32        "rat": "https://huggingface.co/datasets/pytc/MitoEM/resolve/main/EM30-R-mito-train-val-v2.zip"
 33    }
 34}
 35CHECKSUMS = {
 36    "raw": {
 37        "human": "98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041",
 38        "rat": "6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe"
 39    },
 40    "labels": {
 41        "human": "0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b",
 42        "rat": "c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86"
 43    }
 44}
 45
 46
 47def _check_data(path, sample):
 48    splits = ["train", "val", "test"]
 49    expected_paths = [os.path.join(path, f"{sample}_{split}.n5") for split in splits]
 50    return all(os.path.exists(pp) for pp in expected_paths)
 51
 52
 53def get_slices(folder):
 54    files = os.listdir(folder)
 55    files.sort()
 56    files = [os.path.splitext(ff)[0] for ff in files]
 57    slice_ids = [int(ff[2:]) if ff.startswith('im') else int(ff[3:]) for ff in files]
 58    return slice_ids
 59
 60
 61def _load_vol(pattern, slice_ids, desc, n_threads, dtype=None):
 62    im0 = pattern % slice_ids[0]
 63    im0 = imageio.imread(im0)
 64
 65    shape = (len(slice_ids),) + im0.shape
 66
 67    dtype = im0.dtype if dtype is None else dtype
 68    out = np.zeros(shape, dtype=dtype)
 69    out[0] = im0
 70
 71    def load_slice(z, slice_id):
 72        out[z] = imageio.imread(pattern % slice_id)
 73
 74    zs = list(range(1, len(slice_ids)))
 75    assert len(zs) == len(slice_ids) - 1
 76    with futures.ThreadPoolExecutor(n_threads) as tp:
 77        list(tqdm(tp.map(load_slice, zs, slice_ids[1:]), total=len(slice_ids) - 1, desc=desc))
 78
 79    return out
 80
 81
 82def _create_volume(out_path, im_folder, label_folder=None, z_start=None):
 83    import z5py
 84
 85    if label_folder is None:
 86        assert z_start is not None
 87        n_slices = len(get_slices(im_folder))
 88        slices = list(range(z_start, n_slices))
 89    else:
 90        assert z_start is None
 91        slices = get_slices(label_folder)
 92
 93    n_threads = min(16, multiprocessing.cpu_count())
 94    raw = _load_vol(os.path.join(im_folder, "im%04i.png"), slices, "load raw", n_threads)
 95    if label_folder is not None:
 96        labels = _load_vol(os.path.join(label_folder, "seg%04i.tif"), slices, "load labels", n_threads, dtype="uint64")
 97
 98    print("Write volume to", out_path)
 99    chunks = (32, 256, 256)
100    with z5py.File(out_path, "a") as f:
101        f.create_dataset("raw", data=raw, chunks=chunks, compression="gzip", n_threads=n_threads)
102        if label_folder is not None:
103            ds = f.create_dataset("labels", data=labels, chunks=chunks, compression="gzip", n_threads=n_threads)
104            ds.attrs["maxId"] = int(labels.max()) + 1
105
106    return slices[-1]
107
108
109def _require_mitoem_sample(path, sample, download):
110    os.makedirs(path, exist_ok=True)
111
112    for name in ("raw", "labels"):
113        url = URLS[name][sample]
114        checksum = CHECKSUMS[name][sample]
115        zip_path = os.path.join(path, f"{sample}.zip")
116        util.download_source(zip_path, url, download, checksum)
117        util.unzip(zip_path, path, remove=True)
118
119    im_folder = os.path.join(path, "im")
120    train_folder = os.path.join(path, "mito-train-v2")
121    val_folder = os.path.join(path, "mito-val-v2")
122
123    print("Create train volume")
124    train_path = os.path.join(path, f"{sample}_train.n5")
125    _create_volume(train_path, im_folder, train_folder)
126
127    print("Create validation volume")
128    val_path = os.path.join(path, f"{sample}_val.n5")
129    z = _create_volume(val_path, im_folder, val_folder)
130
131    print("Create test volume")
132    test_path = os.path.join(path, f"{sample}_test.n5")
133    _create_volume(test_path, im_folder, z_start=z)
134
135    rmtree(im_folder)
136    rmtree(train_folder)
137    rmtree(val_folder)
138
139
140def get_mitoem_data(path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool):
141    """Download the MitoEM training data.
142
143    Args:
144        path: Filepath to a folder where the downloaded data will be saved.
145        samples: The samples to download. The available samples are 'human' and 'rat'.
146        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
147        download: Whether to download the data if it is not present.
148    """
149    assert len(set(splits) - {"train", "val"}) == 0, f"{splits}"
150    assert len(set(samples) - {"human", "rat"}) == 0, f"{samples}"
151    os.makedirs(path, exist_ok=True)
152
153    for sample in samples:
154        if not _check_data(path, sample):
155            print("The MitoEM data for sample", sample, "is not available yet and will be downloaded and created.")
156            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
157            _require_mitoem_sample(path, sample, download)
158            print("The MitoEM data for sample", sample, "has been created.")
159
160        for split in splits:
161            split_path = os.path.join(path, f"{sample}_{split}.n5")
162            assert os.path.exists(split_path), split_path
163
164
165def get_mitoem_paths(
166    path: Union[os.PathLike, str],
167    splits: Sequence[str],
168    samples: Sequence[str] = ("human", "rat"),
169    download: bool = False,
170) -> List[str]:
171    """Get paths for MitoEM data.
172
173    Args:
174        path: Filepath to a folder where the downloaded data will be saved.
175        samples: The samples to download. The available samples are 'human' and 'rat'.
176        splits: The data splits to download. The available splits are 'train', 'val' and 'test'.
177        download: Whether to download the data if it is not present.
178
179    Returns:
180        The filepaths for the stored data.
181    """
182    if isinstance(splits, str):
183        splits = [splits]
184
185    if isinstance(samples, str):
186        samples = [samples]
187
188    get_mitoem_data(path, samples, splits, download)
189    data_paths = [os.path.join(path, f"{sample}_{split}.n5") for split in splits for sample in samples]
190
191    return data_paths
192
193
194def get_mitoem_dataset(
195    path: Union[os.PathLike, str],
196    splits: Sequence[str],
197    patch_shape: Tuple[int, int, int],
198    samples: Sequence[str] = ("human", "rat"),
199    download: bool = False,
200    offsets: Optional[List[List[int]]] = None,
201    boundaries: bool = False,
202    binary: bool = False,
203    **kwargs,
204) -> Dataset:
205    """Get the MitoEM dataset for the segmentation of mitochondria in EM.
206
207    Args:
208        path: Filepath to a folder where the downloaded data will be saved.
209        splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
210        patch_shape: The patch shape to use for training.
211        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
212        download: Whether to download the data if it is not present.
213        offsets: Offset values for affinity computation used as target.
214        boundaries: Whether to compute boundaries as the target.
215        binary: Whether to return a binary segmentation target.
216        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
217
218    Returns:
219       The segmentation dataset.
220    """
221    assert len(patch_shape) == 3
222
223    data_paths = get_mitoem_paths(path, splits, samples, download)
224
225    kwargs, _ = util.add_instance_label_transform(
226        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
227    )
228
229    return torch_em.default_segmentation_dataset(
230        raw_paths=data_paths,
231        raw_key="raw",
232        label_paths=data_paths,
233        label_key="labels",
234        patch_shape=patch_shape,
235        **kwargs
236    )
237
238
239def get_mitoem_loader(
240    path: Union[os.PathLike, str],
241    splits: Sequence[str],
242    patch_shape: Tuple[int, int, int],
243    batch_size: int,
244    samples: Sequence[str] = ("human", "rat"),
245    download: bool = False,
246    offsets: Optional[List[List[int]]] = None,
247    boundaries: bool = False,
248    binary: bool = False,
249    **kwargs,
250) -> DataLoader:
251    """Get the MitoEM dataloader for the segmentation of mitochondria in EM.
252
253    Args:
254        path: Filepath to a folder where the downloaded data will be saved.
255        splits: The splits to use for the dataset. Available values are 'train', 'val' and 'test'.
256        patch_shape: The patch shape to use for training.
257        batch_size: The batch size for training.
258        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
259        download: Whether to download the data if it is not present.
260        offsets: Offset values for affinity computation used as target.
261        boundaries: Whether to compute boundaries as the target.
262        binary: Whether to return a binary segmentation target.
263        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
264
265    Returns:
266       The DataLoader.
267    """
268    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
269    dataset = get_mitoem_dataset(path, splits, patch_shape, samples, download, offsets, boundaries, binary, **ds_kwargs)
270    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS = {'raw': {'human': 'https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip'}, 'labels': {'human': 'https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/MitoEM/resolve/main/EM30-R-mito-train-val-v2.zip'}}
CHECKSUMS = {'raw': {'human': '98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041', 'rat': '6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe'}, 'labels': {'human': '0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b', 'rat': 'c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86'}}
def get_slices(folder):
def get_mitoem_data(path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool):

Download the MitoEM training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The samples to download. The available samples are 'human' and 'rat'.
  • splits: The data splits to download. The available splits are 'train' and 'val'.
  • download: Whether to download the data if it is not present.
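
A minimal sketch of a direct call; the target folder is hypothetical and requesting both samples triggers a very large download:

from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_data

# Download and convert both samples into train/val/test n5 volumes.
get_mitoem_data(
    path="./data/mitoem",        # hypothetical target folder
    samples=["human", "rat"],
    splits=["train", "val"],
    download=True,
)
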
def get_mitoem_paths(path: Union[os.PathLike, str], splits: Sequence[str], samples: Sequence[str] = ('human', 'rat'), download: bool = False) -> List[str]:

Get paths for MitoEM data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The samples to download. The available samples are 'human' and 'rat'.
  • splits: The data splits to download. The available splits are 'train' and 'val'.
  • download: Whether to download the data if it is not present.
Returns:
  The filepaths for the stored data.
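
A short sketch of the returned paths, assuming a hypothetical local folder:

from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_paths

paths = get_mitoem_paths("./data/mitoem", splits=["train", "val"], samples=("rat",), download=True)
# -> ['./data/mitoem/rat_train.n5', './data/mitoem/rat_val.n5']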

def get_mitoem_dataset(path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the MitoEM dataset for the segmentation of mitochondria in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • splits: The splits to use for the dataset. Available values are 'train' and 'val'.
  • patch_shape: The patch shape to use for training.
  • samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to return a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  The segmentation dataset.
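
A usage sketch with affinity targets; the folder and the offset values are illustrative examples rather than required settings:

from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_dataset

ds = get_mitoem_dataset(
    path="./data/mitoem",                          # hypothetical data folder
    splits="train",
    patch_shape=(32, 256, 256),
    samples=("human",),
    download=True,
    offsets=[[-1, 0, 0], [0, -1, 0], [0, 0, -1]],  # example offsets: direct 3D neighbor affinities
)
raw, target = ds[0]                                # one raw patch and its affinity target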

def get_mitoem_loader(path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the MitoEM dataloader for the segmentation of mitochondria in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • splits: The splits to use for the dataset. Available values are 'train' and 'val'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to return a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  The DataLoader.
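
A training-loader sketch; keyword arguments such as num_workers and shuffle are split off and passed to the PyTorch DataLoader, while the folder and patch shape below are illustrative:

from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_loader

train_loader = get_mitoem_loader(
    path="./data/mitoem",        # hypothetical data folder
    splits="train",
    patch_shape=(32, 256, 256),
    batch_size=2,
    samples=("rat",),
    boundaries=True,             # boundary targets instead of instance labels
    download=True,
    num_workers=4,               # forwarded to torch.utils.data.DataLoader
    shuffle=True,
)
for raw, target in train_loader:
    print(raw.shape, target.shape)
    break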