torch_em.data.datasets.electron_microscopy.emneuron

EMNeuron is a dataset for neuron segmentation in EM. It contains multiple annotated volumes from 16 domain sources.

The dataset is hosted at https://huggingface.co/datasets/yanchaoz/EMNeuron. The dataset is published in https://papers.miccai.org/miccai-2024/677-Paper0518.html. Please cite this publication if you use the dataset in your research.

View Source

  1"""EMNeuron is a dataset for neuron segmentation in EM.
  2It contains multiple annotated volumes from 16 domain sources.
  3
  4The dataset is hosted at https://huggingface.co/datasets/yanchaoz/EMNeuron.
  5The dataset is published in https://papers.miccai.org/miccai-2024/677-Paper0518.html.
  6Please cite this publication if you use the dataset in your research.
  7"""
  8
  9import os
 10import shutil
 11from glob import glob
 12from natsort import natsorted
 13from typing import Union, Tuple, List, Literal
 14
 15from torch.utils.data import Dataset, DataLoader
 16
 17import torch_em
 18
 19from .. import util
 20
 21
 22def _clean_redundant_files(path):
 23    # The "InDistribution" directory is redundant.
 24    target_dir = os.path.join(path, "valid", "InDistribution", "InDistribution")
 25    if os.path.exists(target_dir):
 26        shutil.rmtree(target_dir)
 27
 28
 29def get_emneuron_data(path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False):
 30    """Get the EMNeuron data.
 31
 32    NOTE: The automatic download feature is currently not supported in `get_emneuron_data`.
 33    You must follow the steps mentioned to download the data:
 34    - Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
 35    - Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
 36    - Login / create your account to access the "Dataset Card".
 37    - Go to "Files" in the dataset repo and download a) `labeled.rar` and b) `valid.rar`.
 38    - Finally, provide the filepath to the folder where rar files are stored.
 39
 40    Args:
 41        path: Filepath to a folder where the downloaded data will be saved.
 42        split: The split of the data to be used for training.
 43        download: Whether to download the data if it is not present.
 44    """
 45    if download:
 46        raise NotImplementedError(
 47            "Automatic download is not supported for this data. Please read the docstring for more details."
 48        )
 49
 50    os.makedirs(path, exist_ok=True)
 51
 52    if split == "train":
 53        rar_path = os.path.join(path, "labeled.rar")
 54    elif split == "val":
 55        rar_path = os.path.join(path, "valid.rar")
 56    else:
 57        raise ValueError(f"'{split}' is not a valid split. Please choose either 'train' or 'val'.")
 58
 59    if os.path.exists(os.path.splitext(rar_path)[0]):
 60        return
 61
 62    util.unzip_rarfile(rar_path=rar_path, dst=path, remove=False, use_rarfile=False)
 63
 64    _clean_redundant_files(path)
 65
 66
 67def get_emneuron_paths(
 68    path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False
 69) -> List[str]:
 70    """Get paths to the EMNeuron data.
 71
 72    Args:
 73        path: Filepath to a folder where the downloaded data will be saved.
 74        split: The split of the data to be used for training.
 75        download: Whether to download the data if it is not present.
 76
 77    Returns:
 78        List of filepaths to the stored data.
 79    """
 80    get_emneuron_data(path, split, download)
 81    if split == "train":
 82        label_paths = natsorted(glob(os.path.join(path, "labeled", "*", "*_MaskIns.tif")))
 83        raw_paths = [os.path.join(os.path.dirname(p), os.path.basename(p).replace("_MaskIns", "")) for p in label_paths]
 84
 85    else:  # 'val' split
 86        raw_paths = natsorted(glob(os.path.join(path, "valid", "*", "*", "raw.tif")))
 87        label_paths = [
 88            os.path.join(os.path.dirname(p), "label_0.tif")
 89            if os.path.exists(os.path.join(os.path.dirname(p), "label_0.tif"))
 90            else os.path.join(os.path.dirname(p), "label.tif") for p in raw_paths
 91        ]
 92
 93    assert len(raw_paths) == len(label_paths)
 94    return raw_paths, label_paths
 95
 96
 97def get_emneuron_dataset(
 98    path: Union[os.PathLike, str],
 99    patch_shape: Tuple[int, ...],
100    split: Literal['train', 'val'],
101    download: bool = False,
102    **kwargs
103) -> Dataset:
104    """Get the dataset for neuron segmentation.
105
106    Args:
107        path: Filepath to a folder where the downloaded data will be saved.
108        batch_size: The batch size for training.
109        patch_shape: The patch shape to use for training.
110        split: The split of the data to be used for training.
111        download: Whether to download the data if it is not present.
112        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
113
114    Returns:
115        The segmentation dataset.
116    """
117    raw_paths, label_paths = get_emneuron_paths(path, split, download)
118
119    return torch_em.default_segmentation_dataset(
120        raw_paths=raw_paths,
121        raw_key=None,
122        label_paths=label_paths,
123        label_key=None,
124        patch_shape=patch_shape,
125        is_seg_dataset=True,
126        **kwargs
127    )
128
129
def get_emneuron_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the dataloader for neuron segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split of the data to be used for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    emneuron_dataset = get_emneuron_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset=emneuron_dataset, batch_size=batch_size, **dataloader_kwargs)

def get_emneuron_data( path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False): View Source

def get_emneuron_data(path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False):
    """Extract the manually downloaded EMNeuron data.

    NOTE: The automatic download feature is currently not supported in `get_emneuron_data`.
    You must follow the steps mentioned to download the data:
    - Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
    - Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
    - Login / create your account to access the "Dataset Card".
    - Go to "Files" in the dataset repo and download a) `labeled.rar` and b) `valid.rar`.
    - Finally, provide the filepath to the folder where rar files are stored.

    Args:
        path: Filepath to the folder where the downloaded rar files are stored. The data is extracted into it.
        split: The data split to extract. Either 'train' (`labeled.rar`) or 'val' (`valid.rar`).
        download: Whether to download the data if it is not present. Must be False, download is not supported.

    Raises:
        NotImplementedError: If `download` is True.
        ValueError: If `split` is neither 'train' nor 'val'.
        FileNotFoundError: If the data is not extracted yet and the rar file for the split is missing.
    """
    if download:
        raise NotImplementedError(
            "Automatic download is not supported for this data. Please read the docstring for more details."
        )

    # Validate the split before touching the filesystem, so that an invalid call has no side effects.
    if split == "train":
        rar_name = "labeled.rar"
    elif split == "val":
        rar_name = "valid.rar"
    else:
        raise ValueError(f"'{split}' is not a valid split. Please choose either 'train' or 'val'.")

    os.makedirs(path, exist_ok=True)
    rar_path = os.path.join(path, rar_name)

    # An existing folder named like the archive (without extension) is taken as already extracted data.
    if os.path.exists(os.path.splitext(rar_path)[0]):
        return

    # Fail with a clear message instead of handing a missing archive to the extraction helper.
    if not os.path.exists(rar_path):
        raise FileNotFoundError(
            f"Could not find '{rar_path}'. Automatic download is not supported for this data, "
            "please download the rar file manually as described in the docstring."
        )

    util.unzip_rarfile(rar_path=rar_path, dst=path, remove=False, use_rarfile=False)

    _clean_redundant_files(path)

Get the EMNeuron data.

NOTE: The automatic download feature is currently not supported in get_emneuron_data. You must follow the steps mentioned to download the data:

Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
Login / create your account to access the "Dataset Card".
Go to "Files" in the dataset repo and download a) labeled.rar and b) valid.rar.
Finally, provide the filepath to the folder where rar files are stored.

Arguments:

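path: Filepath to the folder where the manually downloaded rar files are stored. The data is extracted into this folder.
split: The data split to extract. Either 'train' (labeled.rar) or 'val' (valid.rar).
download: Whether to download the data if it is not present. Automatic download is not supported, so passing True raises a NotImplementedError.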

def get_emneuron_paths( path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_emneuron_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the EMNeuron data.

    Args:
        path: Filepath to the folder where the downloaded rar files are stored and extracted.
        split: The data split to use. Either 'train' or 'val'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the raw image data.
        List of filepaths to the label data.
    """
    get_emneuron_data(path, split, download)
    if split == "train":
        # The raw file sits next to its label file and has the same name without the "_MaskIns" marker.
        label_paths = natsorted(glob(os.path.join(path, "labeled", "*", "*_MaskIns.tif")))
        raw_paths = [os.path.join(os.path.dirname(p), os.path.basename(p).replace("_MaskIns", "")) for p in label_paths]

    else:  # 'val' split
        raw_paths = natsorted(glob(os.path.join(path, "valid", "*", "*", "raw.tif")))
        label_paths = []
        for raw_path in raw_paths:
            folder = os.path.dirname(raw_path)
            # Prefer "label_0.tif" and fall back to "label.tif" if it is not available.
            label_path = os.path.join(folder, "label_0.tif")
            if not os.path.exists(label_path):
                label_path = os.path.join(folder, "label.tif")
            label_paths.append(label_path)

    assert len(raw_paths) == len(label_paths)
    return raw_paths, label_paths

Get paths to the EMNeuron data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The split of the data to be used for training.
download: Whether to download the data if it is not present.

Returns:

A tuple of two lists: the filepaths to the raw image data and the filepaths to the corresponding label data.

def get_emneuron_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

 98def get_emneuron_dataset(
 99    path: Union[os.PathLike, str],
100    patch_shape: Tuple[int, ...],
101    split: Literal['train', 'val'],
102    download: bool = False,
103    **kwargs
104) -> Dataset:
105    """Get the dataset for neuron segmentation.
106
107    Args:
108        path: Filepath to a folder where the downloaded data will be saved.
109        batch_size: The batch size for training.
110        patch_shape: The patch shape to use for training.
111        split: The split of the data to be used for training.
112        download: Whether to download the data if it is not present.
113        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
114
115    Returns:
116        The segmentation dataset.
117    """
118    raw_paths, label_paths = get_emneuron_paths(path, split, download)
119
120    return torch_em.default_segmentation_dataset(
121        raw_paths=raw_paths,
122        raw_key=None,
123        label_paths=label_paths,
124        label_key=None,
125        patch_shape=patch_shape,
126        is_seg_dataset=True,
127        **kwargs
128    )

Get the dataset for neuron segmentation.

Arguments:

path: Filepath to the folder where the manually downloaded rar files are stored and extracted.
patch_shape: The patch shape to use for training.
split: The split of the data to be used for training.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_emneuron_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_emneuron_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the dataloader for neuron segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split of the data to be used for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    emneuron_dataset = get_emneuron_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset=emneuron_dataset, batch_size=batch_size, **dataloader_kwargs)

Get the dataloader for neuron segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The split of the data to be used for training.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.