torch_em.data.datasets.electron_microscopy.mitoem

MitoEM is a dataset for segmenting mitochondria in electron microscopy.

It contains two large annotated volumes, one from rat cortex, the other from human cortex. This dataset was used for a segmentation challenge at ISBI 2022. If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
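A minimal usage sketch for building a training loader with this module; the download folder, patch shape, and batch size below are illustrative choices, not values prescribed by the dataset:

from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_loader

# Download (if needed) and create a training loader for both samples.
# "./data/mitoem" is a hypothetical target folder; the first call downloads
# and converts the volumes, which can take several hours.
loader = get_mitoem_loader(
    path="./data/mitoem",
    splits=["train"],
    patch_shape=(32, 256, 256),
    batch_size=1,
    samples=("human", "rat"),
    download=True,
)
x, y = next(iter(loader))  # a batch of raw patches and instance label patches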

  1"""MitoEM is a dataset for segmenting mitochondria in electron microscopy.
  2
  3It contains two large annotated volumes, one from rat cortex, the other from human cortex.
  4This dataset was used for a segmentation challenge at ISBI 2022.
  5If you use it in your research then please cite https://doi.org/10.1007/978-3-030-59722-1_7.
  6"""
  7
  8import os
  9import multiprocessing
 10from concurrent import futures
 11from shutil import rmtree
 12from typing import List, Optional, Sequence, Tuple, Union
 13
 14import imageio
 15import numpy as np
 16import torch_em
 17import z5py
 18
 19from torch.utils.data import Dataset, DataLoader
 20from tqdm import tqdm
 21from .. import util
 22
 23URLS = {
 24    "raw": {
 25        "human": "https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1",
 26        "rat": "https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip"
 27    },
 28    "labels": {
 29        "human": "https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1",
 30        "rat": "https://huggingface.co/datasets/pytc/MitoEM/blob/main/EM30-R-mito-train-val-v2.zip"
 31    }
 32}
 33CHECKSUMS = {
 34    "raw": {
 35        "human": "98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041",
 36        "rat": "6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe"
 37    },
 38    "labels": {
 39        "human": "0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b",
 40        "rat": "c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86"
 41    }
 42}
 43
 44
 45def _check_data(path, sample):
 46    splits = ["train", "val", "test"]
 47    expected_paths = [os.path.join(path, f"{sample}_{split}.n5") for split in splits]
 48    return all(os.path.exists(pp) for pp in expected_paths)
 49
 50
 51def get_slices(folder):
 52    files = os.listdir(folder)
 53    files.sort()
 54    files = [os.path.splitext(ff)[0] for ff in files]
 55    slice_ids = [int(ff[2:]) if ff.startswith('im') else int(ff[3:]) for ff in files]
 56    return slice_ids
 57
 58
 59def _load_vol(pattern, slice_ids, desc, n_threads, dtype=None):
 60    im0 = pattern % slice_ids[0]
 61    im0 = imageio.imread(im0)
 62
 63    shape = (len(slice_ids),) + im0.shape
 64
 65    dtype = im0.dtype if dtype is None else dtype
 66    out = np.zeros(shape, dtype=dtype)
 67    out[0] = im0
 68
 69    def load_slice(z, slice_id):
 70        out[z] = imageio.imread(pattern % slice_id)
 71
 72    zs = list(range(1, len(slice_ids)))
 73    assert len(zs) == len(slice_ids) - 1
 74    with futures.ThreadPoolExecutor(n_threads) as tp:
 75        list(tqdm(tp.map(load_slice, zs, slice_ids[1:]), total=len(slice_ids) - 1, desc=desc))
 76
 77    return out
 78
 79
 80def _create_volume(out_path, im_folder, label_folder=None, z_start=None):
 81    if label_folder is None:
 82        assert z_start is not None
 83        n_slices = len(get_slices(im_folder))
 84        slices = list(range(z_start, n_slices))
 85    else:
 86        assert z_start is None
 87        slices = get_slices(label_folder)
 88
 89    n_threads = min(16, multiprocessing.cpu_count())
 90    raw = _load_vol(os.path.join(im_folder, "im%04i.png"), slices, "load raw", n_threads)
 91    if label_folder is not None:
 92        labels = _load_vol(os.path.join(label_folder, "seg%04i.tif"), slices, "load labels", n_threads, dtype="uint64")
 93
 94    print("Write volume to", out_path)
 95    chunks = (32, 256, 256)
 96    with z5py.File(out_path, "a") as f:
 97        f.create_dataset("raw", data=raw, chunks=chunks, compression="gzip", n_threads=n_threads)
 98        if label_folder is not None:
 99            ds = f.create_dataset("labels", data=labels, chunks=chunks, compression="gzip", n_threads=n_threads)
100            ds.attrs["maxId"] = int(labels.max()) + 1
101
102    return slices[-1]
103
104
105def _require_mitoem_sample(path, sample, download):
106    os.makedirs(path, exist_ok=True)
107
108    for name in ("raw", "labels"):
109        url = URLS[name][sample]
110        checksum = CHECKSUMS[name][sample]
111        zip_path = os.path.join(path, f"{sample}.zip")
112        util.download_source(zip_path, url, download, checksum)
113        util.unzip(zip_path, path, remove=True)
114
115    im_folder = os.path.join(path, "im")
116    train_folder = os.path.join(path, "mito-train-v2")
117    val_folder = os.path.join(path, "mito-val-v2")
118
119    print("Create train volume")
120    train_path = os.path.join(path, f"{sample}_train.n5")
121    _create_volume(train_path, im_folder, train_folder)
122
123    print("Create validation volume")
124    val_path = os.path.join(path, f"{sample}_val.n5")
125    z = _create_volume(val_path, im_folder, val_folder)
126
127    print("Create test volume")
128    test_path = os.path.join(path, f"{sample}_test.n5")
129    _create_volume(test_path, im_folder, z_start=z)
130
131    rmtree(im_folder)
132    rmtree(train_folder)
133    rmtree(val_folder)
134
135
def get_mitoem_data(
    path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool
) -> List[str]:
    """Download the MitoEM training data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        samples: The samples to download. The available samples are 'human' and 'rat'.
        splits: The data splits to download. The available splits are 'train' and 'val';
            a 'test' volume (raw data only) is created alongside them during download.
        download: Whether to download the data if it is not present.

    Returns:
        The paths to the downloaded and converted files.
    """
    if isinstance(splits, str):
        splits = [splits]
    assert len(set(splits) - {"train", "val"}) == 0, f"{splits}"
    assert len(set(samples) - {"human", "rat"}) == 0, f"{samples}"
    os.makedirs(path, exist_ok=True)

    data_paths = []
    for sample in samples:
        if not _check_data(path, sample):
            print("The MitoEM data for sample", sample, "is not available yet and will be downloaded and created.")
            print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
            _require_mitoem_sample(path, sample, download)
            print("The MitoEM data for sample", sample, "has been created.")

        for split in splits:
            split_path = os.path.join(path, f"{sample}_{split}.n5")
            assert os.path.exists(split_path), split_path
            data_paths.append(split_path)
    return data_paths


def get_mitoem_dataset(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> Dataset:
    """Get the MitoEM dataset for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train' and 'val'.
        patch_shape: The patch shape to use for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    data_paths = get_mitoem_data(path, samples, splits, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )
    raw_key = "raw"
    label_key = "labels"
    return torch_em.default_segmentation_dataset(data_paths, raw_key, data_paths, label_key, patch_shape, **kwargs)


def get_mitoem_loader(
    path: Union[os.PathLike, str],
    splits: Sequence[str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    samples: Sequence[str] = ("human", "rat"),
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    **kwargs,
) -> DataLoader:
    """Get the MitoEM dataloader for the segmentation of mitochondria in EM.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        splits: The splits to use for the dataset. Available values are 'train' and 'val'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to return a binary segmentation target.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_mitoem_dataset(
        path, splits, patch_shape,
        samples=samples, download=download,
        offsets=offsets, boundaries=boundaries, binary=binary,
        **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader
URLS = {'raw': {'human': 'https://www.dropbox.com/s/z41qtu4y735j95e/EM30-H-im.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/EM30/resolve/main/EM30-R-im.zip'}, 'labels': {'human': 'https://www.dropbox.com/s/dhf89bc14kemw4e/EM30-H-mito-train-val-v2.zip?dl=1', 'rat': 'https://huggingface.co/datasets/pytc/MitoEM/blob/main/EM30-R-mito-train-val-v2.zip'}}
CHECKSUMS = {'raw': {'human': '98fe259f36a7d8d43f99981b7a0ef8cdeba2ce2615ff91595f428ae57207a041', 'rat': '6a2cac68adde5d01984542d3ee1d7753d1fa3e6eb2a042ce15ce297c95885bbe'}, 'labels': {'human': '0e8ed292cfcd0c58701d9f4299244a1b66d6aeb506c85754c34f98a4eda0ef1b', 'rat': 'c56380ac575428a818bd293ca3509d1249999846c3702ccbf11d308acdd2ae86'}}
def get_slices(folder):
def get_mitoem_data( path: Union[os.PathLike, str], samples: Sequence[str], splits: Sequence[str], download: bool) -> List[str]:

Download the MitoEM training data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • samples: The samples to download. The available samples are 'human' and 'rat'.
  • splits: The data splits to download. The available splits are 'train' and 'val'; a 'test' volume (raw data only) is created alongside them during download.
  • download: Whether to download the data if it is not present.
Returns:
  The paths to the downloaded and converted files.
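A short sketch of calling this function directly and inspecting one of the returned n5 volumes; the target folder is an illustrative choice:

import z5py
from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_data

# Download and convert the 'rat' sample, then open the training volume.
paths = get_mitoem_data("./data/mitoem", samples=["rat"], splits=["train", "val"], download=True)
with z5py.File(paths[0], "r") as f:
    print(f["raw"].shape, f["labels"].dtype)  # each volume stores 'raw' and 'labels' datasets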

def get_mitoem_dataset( path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the MitoEM dataset for the segmentation of mitochondria in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • splits: The splits to use for the dataset. Available values are 'train' and 'val'.
  • patch_shape: The patch shape to use for training.
  • samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to return a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  The segmentation dataset.
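For example, a dataset that yields boundary targets instead of instance labels could be constructed like this; the path and patch shape are illustrative:

from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_dataset

dataset = get_mitoem_dataset(
    "./data/mitoem", splits=["val"], patch_shape=(32, 256, 256),
    samples=("rat",), download=True, boundaries=True,
)
x, y = dataset[0]  # a raw patch and the derived target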

def get_mitoem_loader( path: Union[os.PathLike, str], splits: Sequence[str], patch_shape: Tuple[int, int, int], batch_size: int, samples: Sequence[str] = ('human', 'rat'), download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the MitoEM dataloader for the segmentation of mitochondria in EM.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • splits: The splits to use for the dataset. Available values are 'train' and 'val'.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • samples: The samples to use for the dataset. The available samples are 'human' and 'rat'.
  • download: Whether to download the data if it is not present.
  • offsets: Offset values for affinity computation used as target.
  • boundaries: Whether to compute boundaries as the target.
  • binary: Whether to return a binary segmentation target.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  The DataLoader.
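As a sketch, extra keyword arguments such as num_workers or shuffle are split off and forwarded to the PyTorch DataLoader; the offsets below are an illustrative affinity neighborhood, not a required setting:

from torch_em.data.datasets.electron_microscopy.mitoem import get_mitoem_loader

offsets = [[-1, 0, 0], [0, -1, 0], [0, 0, -1]]  # direct 3D neighbors for affinity targets
loader = get_mitoem_loader(
    "./data/mitoem", splits=["train"], patch_shape=(32, 256, 256), batch_size=2,
    samples=("human",), download=True, offsets=offsets,
    num_workers=4, shuffle=True,  # DataLoader kwargs, separated via util.split_kwargs
)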