torch_em.data.datasets.electron_microscopy.emneuron

EMNeuron is a dataset for neuron segmentation in EM. It contains multiple annotated volumes from 16 domain sources.

The dataset is hosted at https://huggingface.co/datasets/yanchaoz/EMNeuron. The dataset is published in https://papers.miccai.org/miccai-2024/677-Paper0518.html. Please cite this publication if you use the dataset in your research.

  1"""EMNeuron is a dataset for neuron segmentation in EM.
  2It contains multiple annotated volumes from 16 domain sources.
  3
  4The dataset is hosted at https://huggingface.co/datasets/yanchaoz/EMNeuron.
  5The dataset is published in https://papers.miccai.org/miccai-2024/677-Paper0518.html.
  6Please cite this publication if you use the dataset in your research.
  7"""
  8
  9import os
 10import shutil
 11from glob import glob
 12from natsort import natsorted
 13from typing import Union, Tuple, List, Literal
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20
 21
 22def _clean_redundant_files(path):
 23    # The "InDistribution" directory is redundant.
 24    target_dir = os.path.join(path, "valid", "InDistribution", "InDistribution")
 25    if os.path.exists(target_dir):
 26        shutil.rmtree(target_dir)
 27
 28
 29def get_emneuron_data(path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False):
 30    """Get the EMNeuron data.
 31
 32    NOTE: The automatic download feature is currently not supported in `get_emneuron_data`.
 33    You must follow the steps mentioned to download the data:
 34    - Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
 35    - Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
 36    - Login / create your account to access the "Dataset Card".
 37    - Go to "Files" in the dataset repo and download a) `labeled.rar` and b) `valid.rar`.
 38    - Finally, provide the filepath to the folder where rar files are stored.
 39
 40    Args:
 41        path: Filepath to a folder where the downloaded data will be saved.
 42        split: The split of the data to be used for training.
 43        download: Whether to download the data if it is not present.
 44    """
 45    if download:
 46        raise NotImplementedError(
 47            "Automatic download is not supported for this data. Please read the docstring for more details."
 48        )
 49
 50    os.makedirs(path, exist_ok=True)
 51
 52    if split == "train":
 53        rar_path = os.path.join(path, "labeled.rar")
 54    elif split == "val":
 55        rar_path = os.path.join(path, "valid.rar")
 56    else:
 57        raise ValueError(f"'{split}' is not a valid split. Please choose either 'train' or 'val'.")
 58
 59    if os.path.exists(os.path.splitext(rar_path)[0]):
 60        return
 61
 62    util.unzip_rarfile(rar_path=rar_path, dst=path, remove=False, use_rarfile=False)
 63
 64    _clean_redundant_files(path)
 65
 66
 67def get_emneuron_paths(
 68    path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False
 69) -> List[str]:
 70    """Get paths to the EMNeuron data.
 71
 72    Args:
 73        path: Filepath to a folder where the downloaded data will be saved.
 74        split: The split of the data to be used for training.
 75        download: Whether to download the data if it is not present.
 76
 77    Returns:
 78        List of filepaths to the stored data.
 79    """
 80    get_emneuron_data(path, split, download)
 81    if split == "train":
 82        label_paths = natsorted(glob(os.path.join(path, "labeled", "*", "*_MaskIns.tif")))
 83        raw_paths = [os.path.join(os.path.dirname(p), os.path.basename(p).replace("_MaskIns", "")) for p in label_paths]
 84
 85    else:  # 'val' split
 86        raw_paths = natsorted(glob(os.path.join(path, "valid", "*", "*", "raw.tif")))
 87        label_paths = [
 88            os.path.join(os.path.dirname(p), "label_0.tif")
 89            if os.path.exists(os.path.join(os.path.dirname(p), "label_0.tif"))
 90            else os.path.join(os.path.dirname(p), "label.tif") for p in raw_paths
 91        ]
 92
 93    assert len(raw_paths) == len(label_paths)
 94    return raw_paths, label_paths
 95
 96
 97def get_emneuron_dataset(
 98    path: Union[os.PathLike, str],
 99    patch_shape: Tuple[int, ...],
100    split: Literal['train', 'val'],
101    download: bool = False,
102    **kwargs
103) -> Dataset:
104    """Get the dataset for neuron segmentation.
105
106    Args:
107        path: Filepath to a folder where the downloaded data will be saved.
108        batch_size: The batch size for training.
109        patch_shape: The patch shape to use for training.
110        split: The split of the data to be used for training.
111        download: Whether to download the data if it is not present.
112        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
113
114    Returns:
115        The segmentation dataset.
116    """
117    raw_paths, label_paths = get_emneuron_paths(path, split, download)
118
119    return torch_em.default_segmentation_dataset(
120        raw_paths=raw_paths,
121        raw_key=None,
122        label_paths=label_paths,
123        label_key=None,
124        patch_shape=patch_shape,
125        is_seg_dataset=True,
126        **kwargs
127    )
128
129
def get_emneuron_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the dataloader for neuron segmentation on the EMNeuron data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split of the data to be used for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the arguments for the dataloader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    emneuron_dataset = get_emneuron_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset=emneuron_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
def get_emneuron_data( path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False):
def get_emneuron_data(path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False):
    """Extract the manually downloaded EMNeuron archive for the requested split.

    NOTE: Automatic download is not supported by `get_emneuron_data`.
    The data has to be downloaded manually:
    - Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
    - Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
    - Login / create your account to access the "Dataset Card".
    - Go to "Files" in the dataset repo and download a) `labeled.rar` and b) `valid.rar`.
    - Finally, provide the filepath to the folder where the rar files are stored.

    Args:
        path: Filepath to the folder where the rar files are stored. The data is extracted into this folder.
        split: The data split to extract. Either 'train' (`labeled.rar`) or 'val' (`valid.rar`).
        download: Whether to download the data if it is not present. Passing True raises an error.

    Raises:
        NotImplementedError: If `download` is True.
        ValueError: If `split` is neither 'train' nor 'val'.
    """
    if download:
        raise NotImplementedError(
            "Automatic download is not supported for this data. Please read the docstring for more details."
        )

    os.makedirs(path, exist_ok=True)

    if split not in ("train", "val"):
        raise ValueError(f"'{split}' is not a valid split. Please choose either 'train' or 'val'.")

    archive_name = "labeled.rar" if split == "train" else "valid.rar"
    archive_path = os.path.join(path, archive_name)

    # The archive name without its extension is the folder name that marks an already extracted split.
    extracted_dir, _ = os.path.splitext(archive_path)
    if os.path.exists(extracted_dir):
        return

    util.unzip_rarfile(rar_path=archive_path, dst=path, remove=False, use_rarfile=False)

    _clean_redundant_files(path)

Get the EMNeuron data.

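NOTE: The automatic download feature is currently not supported in get_emneuron_data. You must follow the steps mentioned to download the data:

  • Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
  • Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
  • Login / create your account to access the "Dataset Card".
  • Go to "Files" in the dataset repo and download a) labeled.rar and b) valid.rar.
  • Finally, provide the filepath to the folder where rar files are stored.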

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split of the data to be used for training.
  • download: Whether to download the data if it is not present.
def get_emneuron_paths( path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False) -> List[str]:
def get_emneuron_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the EMNeuron data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split of the data to be used for training.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the raw data.
        List of filepaths to the corresponding label data.
    """
    get_emneuron_data(path, split, download)

    if split == "train":
        # The label files are found via their "_MaskIns" suffix;
        # the raw file has the same name without this suffix.
        label_paths = natsorted(glob(os.path.join(path, "labeled", "*", "*_MaskIns.tif")))
        raw_paths = [
            os.path.join(os.path.dirname(p), os.path.basename(p).replace("_MaskIns", "")) for p in label_paths
        ]

    else:  # 'val' split
        raw_paths = natsorted(glob(os.path.join(path, "valid", "*", "*", "raw.tif")))
        label_paths = []
        for raw_path in raw_paths:
            folder = os.path.dirname(raw_path)
            # Use "label_0.tif" if it exists in the folder, otherwise fall back to "label.tif".
            preferred = os.path.join(folder, "label_0.tif")
            label_paths.append(preferred if os.path.exists(preferred) else os.path.join(folder, "label.tif"))

    return raw_paths, label_paths

Get paths to the EMNeuron data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split of the data to be used for training.
  • download: Whether to download the data if it is not present.
Returns:

A tuple of two lists: the filepaths to the raw data and the filepaths to the corresponding label data.

def get_emneuron_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 98def get_emneuron_dataset(
 99    path: Union[os.PathLike, str],
100    patch_shape: Tuple[int, ...],
101    split: Literal['train', 'val'],
102    download: bool = False,
103    **kwargs
104) -> Dataset:
105    """Get the dataset for neuron segmentation.
106
107    Args:
108        path: Filepath to a folder where the downloaded data will be saved.
109        batch_size: The batch size for training.
110        patch_shape: The patch shape to use for training.
111        split: The split of the data to be used for training.
112        download: Whether to download the data if it is not present.
113        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
114
115    Returns:
116        The segmentation dataset.
117    """
118    raw_paths, label_paths = get_emneuron_paths(path, split, download)
119
120    return torch_em.default_segmentation_dataset(
121        raw_paths=raw_paths,
122        raw_key=None,
123        label_paths=label_paths,
124        label_key=None,
125        patch_shape=patch_shape,
126        is_seg_dataset=True,
127        **kwargs
128    )

Get the dataset for neuron segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The split of the data to be used for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_emneuron_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_emneuron_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the dataloader for neuron segmentation on the EMNeuron data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split of the data to be used for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the arguments for the dataloader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    emneuron_dataset = get_emneuron_dataset(
        path=path, patch_shape=patch_shape, split=split, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset=emneuron_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

Get the dataloader for neuron segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The split of the data to be used for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.