torch_em.data.datasets.electron_microscopy.deepict
Dataset for segmentation of structures in Cryo ET. The DeePict dataset contains annotations for several structures in CryoET. The dataset implemented here currently only provides access to the actin annotations.
The dataset is part of the publication https://doi.org/10.1038/s41592-022-01746-2. Please cite it if you use this dataset in your research.
"""Dataset for segmentation of structures in Cryo ET.
The DeePict dataset contains annotations for several structures in CryoET.
The dataset implemented here currently only provides access to the actin annotations.

The dataset is part of the publication https://doi.org/10.1038/s41592-022-01746-2.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from shutil import rmtree
from typing import Tuple, Union, List

from torch.utils.data import Dataset, DataLoader

try:
    import mrcfile
except ImportError:
    mrcfile = None

import torch_em

from .. import util


ACTIN_ID = 10002


def _process_deepict_actin(input_path, output_path):
    """Convert the downloaded DeePict actin data into one `.h5` file per dataset.

    For each dataset the tomogram is stored under "raw", every annotation volume under
    "labels/original/<name>" and the merged actin mask under "labels/actin".
    Datasets whose output file already exists are skipped.

    Args:
        input_path: Folder with the downloaded data, containing one sub-folder per dataset.
        output_path: Folder where the processed files are written. It is created if missing.
    """
    from elf.io import open_file

    os.makedirs(output_path, exist_ok=True)

    # datasets = ["00004", "00011", "00012"]
    # There are issues with the 00011 dataset
    datasets = ["00004", "00012"]
    for dataset in datasets:
        ds_folder = os.path.join(input_path, dataset)
        assert os.path.exists(ds_folder)
        ds_out = os.path.join(output_path, f"{dataset}.h5")
        if os.path.exists(ds_out):
            continue

        # mrcfile is an optional dependency and only needed when data has to be converted.
        assert mrcfile is not None, "Please install mrcfile"

        tomo_folder = glob(os.path.join(ds_folder, "Tomograms", "VoxelSpacing*"))
        assert len(tomo_folder) == 1
        tomo_folder = tomo_folder[0]

        annotation_folder = os.path.join(tomo_folder, "Annotations")
        annotation_files = glob(os.path.join(annotation_folder, "*.zarr"))

        tomo_path = os.path.join(tomo_folder, "CanonicalTomogram", f"{dataset}.mrc")
        with mrcfile.open(tomo_path, "r") as f:
            data = f.data[:]

        annotations = {}
        for annotation in annotation_files:
            with open_file(annotation, "r") as f:
                annotation_data = f["0"][:].astype("uint8")
            assert annotation_data.shape == data.shape
            # The annotation name is the second "-"-separated token of the file name.
            annotation_name = os.path.basename(annotation).split("-")[1]
            annotations[annotation_name] = annotation_data

        with open_file(ds_out, "a") as f:
            f.create_dataset("raw", data=data, compression="gzip")
            for name, annotation in annotations.items():
                f.create_dataset(f"labels/original/{name}", data=annotation, compression="gzip")

            # Create combined annotations for actin
            # The in-place update is safe here: the original volumes were already written above.
            actin_seg = annotations["actin_deepict_training_prediction"]
            actin_seg2 = annotations["actin_ground_truth"]
            actin_seg[actin_seg2 == 1] = 1
            f.create_dataset("labels/actin", data=actin_seg, compression="gzip")


def get_deepict_actin_data(path: Union[os.PathLike, str], download: bool) -> str:
    """Download the DeePict actin dataset.

    If processing the downloaded data fails, the partially written output folder is removed
    again, so that a later call does not mistake it for a complete dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The path to the downloaded data.
    """
    # Check if the processed data is already present.
    dataset_path = os.path.join(path, "deepict_actin")
    if os.path.exists(dataset_path):
        return dataset_path

    # Otherwise download the data.
    dl_path = util.download_from_cryo_et_portal(path, ACTIN_ID, download)

    # And then process it.
    try:
        _process_deepict_actin(dl_path, dataset_path)
    except BaseException:
        # Processing creates the output folder before doing any work, and the check above
        # treats that folder as proof of a finished dataset. Remove it before re-raising.
        rmtree(dataset_path, ignore_errors=True)
        raise

    # Clean up the original data after processing.
    rmtree(dl_path)

    return dataset_path


def get_deepict_actin_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
    """Get paths to DeePict actin data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the stored data.
    """
    dataset_path = get_deepict_actin_data(path, download)
    data_paths = sorted(glob(os.path.join(dataset_path, "*.h5")))
    return data_paths


def get_deepict_actin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    label_key: str = "labels/actin",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the dataset for actin segmentation in Cryo ET data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        label_key: The key for the labels to load. By default this uses 'labels/actin',
            which holds the best version of actin ground-truth images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    data_paths = get_deepict_actin_paths(path, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )


def get_deepict_actin_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    label_key: str = "labels/actin",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the DataLoader for actin segmentation in CryoET data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        label_key: The key for the labels to load. By default this uses 'labels/actin',
            which holds the best version of actin ground-truth images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_deepict_actin_dataset(path, patch_shape, label_key=label_key, download=download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
ACTIN_ID =
10002
def
get_deepict_actin_data(path: Union[os.PathLike, str], download: bool) -> str:
def get_deepict_actin_data(path: Union[os.PathLike, str], download: bool) -> str:
    """Download the DeePict actin dataset.

    The processed data is stored in the sub-folder "deepict_actin" of `path`. If that folder
    already exists it is returned as is. Otherwise the data is downloaded, processed and the
    downloaded originals are removed. If processing fails, the partially written output folder
    is removed again, so that a later call does not mistake it for a complete dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The path to the downloaded data.
    """
    # Check if the processed data is already present.
    dataset_path = os.path.join(path, "deepict_actin")
    if os.path.exists(dataset_path):
        return dataset_path

    # Otherwise download the data.
    dl_path = util.download_from_cryo_et_portal(path, ACTIN_ID, download)

    # And then process it.
    try:
        _process_deepict_actin(dl_path, dataset_path)
    except BaseException:
        # Processing creates the output folder before doing any work, and the check above
        # treats that folder as proof of a finished dataset. Remove it before re-raising.
        rmtree(dataset_path, ignore_errors=True)
        raise

    # Clean up the original data after processing.
    rmtree(dl_path)

    return dataset_path
Download the DeePict actin dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The path to the downloaded data.
def
get_deepict_actin_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
def get_deepict_actin_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
    """Get paths to DeePict actin data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the stored data.
    """
    # Use the folder reported by the download helper rather than rebuilding its name here,
    # so the two functions cannot disagree about where the processed data lives.
    dataset_path = get_deepict_actin_data(path, download)
    return sorted(glob(os.path.join(dataset_path, "*.h5")))
Get paths to DeePict actin data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepaths to the stored data.
def
get_deepict_actin_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], label_key: str = 'labels/actin', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_deepict_actin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    label_key: str = "labels/actin",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the segmentation dataset for actin in Cryo ET tomograms.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        label_key: The key for the labels to load. The default, 'labels/actin',
            holds the best version of the actin ground-truth images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert len(patch_shape) == 3

    # Raw data and labels are read from the same files, under different keys.
    volume_paths = get_deepict_actin_paths(path, download)
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths, raw_key="raw",
        label_paths=volume_paths, label_key=label_key,
        patch_shape=patch_shape, is_seg_dataset=True,
        **kwargs
    )
    return dataset
Get the dataset for actin segmentation in Cryo ET data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- label_key: The key for the labels to load. By default this uses 'labels/actin', which holds the best version of actin ground-truth images.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_deepict_actin_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], batch_size: int, label_key: str = 'labels/actin', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_deepict_actin_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    label_key: str = "labels/actin",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the DataLoader for actin segmentation in CryoET data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        label_key: The key for the labels to load. The default, 'labels/actin',
            holds the best version of the actin ground-truth images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    actin_dataset = get_deepict_actin_dataset(
        path, patch_shape, label_key=label_key, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(actin_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the DataLoader for actin segmentation in CryoET data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- label_key: The key for the labels to load. By default this uses 'labels/actin', which holds the best version of actin ground-truth images.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.