torch_em.data.datasets.electron_microscopy.emneuron
EMNeuron is a dataset for neuron segmentation in EM. It contains multiple annotated volumes from 16 domain sources.
The dataset is hosted at https://huggingface.co/datasets/yanchaoz/EMNeuron. The dataset is published in https://papers.miccai.org/miccai-2024/677-Paper0518.html. Please cite this publication if you use the dataset in your research.
"""EMNeuron is a dataset for neuron segmentation in EM.
It contains multiple annotated volumes from 16 domain sources.

The dataset is hosted at https://huggingface.co/datasets/yanchaoz/EMNeuron.
The dataset is published in https://papers.miccai.org/miccai-2024/677-Paper0518.html.
Please cite this publication if you use the dataset in your research.
"""

import os
import shutil
from glob import glob
from natsort import natsorted
from typing import Union, Tuple, List, Literal

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


def _clean_redundant_files(path):
    """Remove the duplicated, nested "InDistribution" folder from the extracted validation data.

    Args:
        path: Filepath to the folder that contains the extracted data.
    """
    # The "InDistribution" directory is redundant.
    target_dir = os.path.join(path, "valid", "InDistribution", "InDistribution")
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)


def get_emneuron_data(path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False):
    """Get the EMNeuron data.

    NOTE: The automatic download feature is currently not supported in `get_emneuron_data`.
    You must follow the steps mentioned to download the data:
    - Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
    - Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
    - Login / create your account to access the "Dataset Card".
    - Go to "Files" in the dataset repo and download a) `labeled.rar` and b) `valid.rar`.
    - Finally, provide the filepath to the folder where rar files are stored.

    Args:
        path: Filepath to the folder where the manually downloaded rar files are stored.
            The archive for the requested split is extracted into this folder.
        split: The data split to prepare. Either 'train' (uses `labeled.rar`) or 'val' (uses `valid.rar`).
        download: Must be False, automatic download is not supported.

    Raises:
        NotImplementedError: If `download` is True.
        ValueError: If `split` is neither 'train' nor 'val'.
        FileNotFoundError: If the data is not extracted yet and the expected rar file is missing.
    """
    if download:
        raise NotImplementedError(
            "Automatic download is not supported for this data. Please read the docstring for more details."
        )

    # Validate the split first, so that an invalid request does not create any folder.
    if split == "train":
        rar_name = "labeled.rar"
    elif split == "val":
        rar_name = "valid.rar"
    else:
        raise ValueError(f"'{split}' is not a valid split. Please choose either 'train' or 'val'.")

    os.makedirs(path, exist_ok=True)
    rar_path = os.path.join(path, rar_name)

    # The archive unpacks into a folder named like the archive ('labeled' / 'valid').
    # If that folder is already there, the data was extracted before.
    if os.path.exists(os.path.splitext(rar_path)[0]):
        return

    if not os.path.exists(rar_path):
        raise FileNotFoundError(
            f"Could not find '{rar_path}'. Automatic download is not supported for this data, "
            "please download the rar files manually (see the docstring for details)."
        )

    util.unzip_rarfile(rar_path=rar_path, dst=path, remove=False, use_rarfile=False)

    _clean_redundant_files(path)


def get_emneuron_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the EMNeuron data.

    Args:
        path: Filepath to the folder where the rar files are stored and the data is extracted.
        split: The data split to use. Either 'train' or 'val'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the raw data.
        List of filepaths to the label data.
    """
    get_emneuron_data(path, split, download)

    if split == "train":
        # The raw volume has the same file name as its labels, without the "_MaskIns" suffix.
        label_paths = natsorted(glob(os.path.join(path, "labeled", "*", "*_MaskIns.tif")))
        raw_paths = [
            os.path.join(os.path.dirname(p), os.path.basename(p).replace("_MaskIns", "")) for p in label_paths
        ]

    else:  # 'val' split
        raw_paths = natsorted(glob(os.path.join(path, "valid", "*", "*", "raw.tif")))
        label_paths = []
        for raw_path in raw_paths:
            folder = os.path.dirname(raw_path)
            # Use 'label_0.tif' when it exists next to the raw data, otherwise fall back to 'label.tif'.
            label_path = os.path.join(folder, "label_0.tif")
            if not os.path.exists(label_path):
                label_path = os.path.join(folder, "label.tif")
            label_paths.append(label_path)

    assert len(raw_paths) == len(label_paths)
    return raw_paths, label_paths


def get_emneuron_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val'],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the dataset for neuron segmentation.

    Args:
        path: Filepath to the folder where the rar files are stored and the data is extracted.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'val'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_emneuron_paths(path, split, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )


def get_emneuron_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the dataloader for neuron segmentation.

    Args:
        path: Filepath to the folder where the rar files are stored and the data is extracted.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'val'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_emneuron_dataset(path=path, patch_shape=patch_shape, split=split, download=download, **ds_kwargs)
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
def
get_emneuron_data( path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False):
def get_emneuron_data(path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False):
    """Get the EMNeuron data.

    NOTE: The automatic download feature is currently not supported in `get_emneuron_data`.
    You must follow the steps mentioned to download the data:
    - Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
    - Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
    - Login / create your account to access the "Dataset Card".
    - Go to "Files" in the dataset repo and download a) `labeled.rar` and b) `valid.rar`.
    - Finally, provide the filepath to the folder where rar files are stored.

    Args:
        path: Filepath to the folder where the manually downloaded rar files are stored.
            The archive for the requested split is extracted into this folder.
        split: The data split to prepare. Either 'train' (uses `labeled.rar`) or 'val' (uses `valid.rar`).
        download: Must be False, automatic download is not supported.

    Raises:
        NotImplementedError: If `download` is True.
        ValueError: If `split` is neither 'train' nor 'val'.
        FileNotFoundError: If the data is not extracted yet and the expected rar file is missing.
    """
    if download:
        raise NotImplementedError(
            "Automatic download is not supported for this data. Please read the docstring for more details."
        )

    # Validate the split first, so that an invalid request does not create any folder.
    if split == "train":
        rar_name = "labeled.rar"
    elif split == "val":
        rar_name = "valid.rar"
    else:
        raise ValueError(f"'{split}' is not a valid split. Please choose either 'train' or 'val'.")

    os.makedirs(path, exist_ok=True)
    rar_path = os.path.join(path, rar_name)

    # The archive unpacks into a folder named like the archive ('labeled' / 'valid').
    # If that folder is already there, the data was extracted before.
    if os.path.exists(os.path.splitext(rar_path)[0]):
        return

    if not os.path.exists(rar_path):
        raise FileNotFoundError(
            f"Could not find '{rar_path}'. Automatic download is not supported for this data, "
            "please download the rar files manually (see the docstring for details)."
        )

    util.unzip_rarfile(rar_path=rar_path, dst=path, remove=False, use_rarfile=False)

    _clean_redundant_files(path)
Get the EMNeuron data.
NOTE: The automatic download feature is currently not supported in `get_emneuron_data`.
You must follow the steps mentioned to download the data:
- Go to the official GitHub repository: https://github.com/yanchaoz/SegNeuron.
- Access the dataset link (hosted at HuggingFace): https://huggingface.co/datasets/yanchaoz/EMNeuron.
- Login / create your account to access the "Dataset Card".
- Go to "Files" in the dataset repo and download a) `labeled.rar` and b) `valid.rar`.
- Finally, provide the filepath to the folder where rar files are stored.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split of the data to be used for training.
- download: Whether to download the data if it is not present.
def
get_emneuron_paths( path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False) -> List[str]:
def get_emneuron_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'val'], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the EMNeuron data.

    Args:
        path: Filepath to the folder where the rar files are stored and the data is extracted.
        split: The data split to use. Either 'train' or 'val'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the raw data.
        List of filepaths to the label data.
    """
    get_emneuron_data(path, split, download)

    if split == "train":
        # The raw volume has the same file name as its labels, without the "_MaskIns" suffix.
        label_paths = natsorted(glob(os.path.join(path, "labeled", "*", "*_MaskIns.tif")))
        raw_paths = [
            os.path.join(os.path.dirname(p), os.path.basename(p).replace("_MaskIns", "")) for p in label_paths
        ]

    else:  # 'val' split
        raw_paths = natsorted(glob(os.path.join(path, "valid", "*", "*", "raw.tif")))
        label_paths = []
        for raw_path in raw_paths:
            folder = os.path.dirname(raw_path)
            # Use 'label_0.tif' when it exists next to the raw data, otherwise fall back to 'label.tif'.
            label_path = os.path.join(folder, "label_0.tif")
            if not os.path.exists(label_path):
                label_path = os.path.join(folder, "label.tif")
            label_paths.append(label_path)

    assert len(raw_paths) == len(label_paths)
    return raw_paths, label_paths
Get paths to the EMNeuron data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split of the data to be used for training.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the raw data. List of filepaths to the label data.
def
get_emneuron_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'val'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_emneuron_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val'],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the dataset for neuron segmentation.

    Args:
        path: Filepath to the folder where the rar files are stored and the data is extracted.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'val'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_emneuron_paths(path, split, download)

    # The file paths are passed with `raw_key=None` and `label_key=None`, as in the original call.
    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )
Get the dataset for neuron segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The split of the data to be used for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_emneuron_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, ...], split: Literal['train', 'val'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_emneuron_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'val'],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the dataloader for EMNeuron neuron segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The split of the data to be used for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    emneuron_dataset = get_emneuron_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        download=download,
        **dataset_kwargs
    )

    loader = torch_em.get_data_loader(dataset=emneuron_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader
Get the dataloader for neuron segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The split of the data to be used for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.