torch_em.data.datasets.electron_microscopy.deepict

Dataset for segmentation of structures in Cryo ET. The DeePict dataset contains annotations for several structures in CryoET. The dataset implemented here currently only provides access to the actin annotations.

The dataset is part of the publication https://doi.org/10.1038/s41592-022-01746-2. Please cite it if you use this dataset in your research.

  1"""Dataset for segmentation of structures in Cryo ET.
  2The DeePict dataset contains annotations for several structures in CryoET.
  3The dataset implemented here currently only provides access to the actin annotations.
  4
  5The dataset is part of the publication https://doi.org/10.1038/s41592-022-01746-2.
  6Please cite it if you use this dataset in your research.
  7"""
  8
  9import os
 10from glob import glob
 11from shutil import rmtree
 12from typing import Tuple, Union, List
 13
 14from torch.utils.data import Dataset, DataLoader
 15
 16try:
 17    import mrcfile
 18except ImportError:
 19    mrcfile = None
 20
 21import torch_em
 22
 23from .. import util
 24
 25
 26ACTIN_ID = 10002  # Dataset ID handed to `util.download_from_cryo_et_portal` to fetch the DeePict actin data.
 27
 28
 29def _process_deepict_actin(input_path, output_path):
 30    from elf.io import open_file
 31
 32    os.makedirs(output_path, exist_ok=True)
 33
 34    # datasets = ["00004", "00011", "00012"]
 35    # There are issues with the 00011 dataset
 36    datasets = ["00004", "00012"]
 37    for dataset in datasets:
 38        ds_folder = os.path.join(input_path, dataset)
 39        assert os.path.exists(ds_folder)
 40        ds_out = os.path.join(output_path, f"{dataset}.h5")
 41        if os.path.exists(ds_out):
 42            continue
 43
 44        assert mrcfile is not None, "Plese install mrcfile"
 45
 46        tomo_folder = glob(os.path.join(ds_folder, "Tomograms", "VoxelSpacing*"))
 47        assert len(tomo_folder) == 1
 48        tomo_folder = tomo_folder[0]
 49
 50        annotation_folder = os.path.join(tomo_folder, "Annotations")
 51        annotion_files = glob(os.path.join(annotation_folder, "*.zarr"))
 52
 53        tomo_path = os.path.join(tomo_folder, "CanonicalTomogram", f"{dataset}.mrc")
 54        with mrcfile.open(tomo_path, "r") as f:
 55            data = f.data[:]
 56
 57        annotations = {}
 58        for annotation in annotion_files:
 59            with open_file(annotation, "r") as f:
 60                annotation_data = f["0"][:].astype("uint8")
 61            assert annotation_data.shape == data.shape
 62            annotation_name = os.path.basename(annotation).split("-")[1]
 63            annotations[annotation_name] = annotation_data
 64
 65        with open_file(ds_out, "a") as f:
 66            f.create_dataset("raw", data=data, compression="gzip")
 67            for name, annotation in annotations.items():
 68                f.create_dataset(f"labels/original/{name}", data=annotation, compression="gzip")
 69
 70            # Create combined annotations for actin
 71            actin_seg = annotations["actin_deepict_training_prediction"]
 72            actin_seg2 = annotations["actin_ground_truth"]
 73            actin_seg[actin_seg2 == 1] = 1
 74            f.create_dataset("labels/actin", data=actin_seg, compression="gzip")
 75
 76
 77def get_deepict_actin_data(path: Union[os.PathLike, str], download: bool) -> str:
 78    """Download the DeePict actin dataset.
 79
 80    Args:
 81        path: Filepath to a folder where the downloaded data will be saved.
 82        download: Whether to download the data if it is not present.
 83
 84    Returns:
 85        The path to the downloaded data.
 86    """
 87    # Check if the processed data is already present.
 88    dataset_path = os.path.join(path, "deepict_actin")
 89    if os.path.exists(dataset_path):
 90        return dataset_path
 91
 92    # Otherwise download the data.
 93    dl_path = util.download_from_cryo_et_portal(path, ACTIN_ID, download)
 94
 95    # And then process it.
 96    _process_deepict_actin(dl_path, dataset_path)
 97
 98    # Clean up the original data after processing.
 99    rmtree(dl_path)
100
101    return dataset_path
102
103
def get_deepict_actin_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
    """Return the sorted file paths of the processed DeePict actin data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the stored data.
    """
    # Make sure the processed data is available before collecting the files.
    get_deepict_actin_data(path, download)

    pattern = os.path.join(path, "deepict_actin", "*.h5")
    h5_files = glob(pattern)
    h5_files.sort()
    return h5_files
117
118
def get_deepict_actin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    label_key: str = "labels/actin",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the dataset for segmenting actin in Cryo ET tomograms.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        label_key: The key for the labels to load. By default this uses 'labels/actin',
            which holds the best version of actin ground-truth images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    assert len(patch_shape) == 3

    h5_paths = get_deepict_actin_paths(path, download)

    # Raw data and labels are stored in the same files, so the same paths are passed for both.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_paths, raw_key="raw",
        label_paths=h5_paths, label_key=label_key,
        patch_shape=patch_shape, is_seg_dataset=True,
        **kwargs
    )
    return dataset
152
153
def get_deepict_actin_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    label_key: str = "labels/actin",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the DataLoader for segmenting actin in CryoET tomograms.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        label_key: The key for the labels to load. By default this uses 'labels/actin',
            which holds the best version of actin ground-truth images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    split = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset_kwargs, dataloader_kwargs = split

    actin_dataset = get_deepict_actin_dataset(
        path, patch_shape, label_key=label_key, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(actin_dataset, batch_size, **dataloader_kwargs)
    return loader
ACTIN_ID = 10002  # Dataset ID handed to `util.download_from_cryo_et_portal` to fetch the DeePict actin data.
def get_deepict_actin_data(path: Union[os.PathLike, str], download: bool) -> str:
 78def get_deepict_actin_data(path: Union[os.PathLike, str], download: bool) -> str:
 79    """Download the DeePict actin dataset.
 80
 81    Args:
 82        path: Filepath to a folder where the downloaded data will be saved.
 83        download: Whether to download the data if it is not present.
 84
 85    Returns:
 86        The path to the downloaded data.
 87    """
 88    # Check if the processed data is already present.
 89    dataset_path = os.path.join(path, "deepict_actin")
 90    if os.path.exists(dataset_path):
 91        return dataset_path
 92
 93    # Otherwise download the data.
 94    dl_path = util.download_from_cryo_et_portal(path, ACTIN_ID, download)
 95
 96    # And then process it.
 97    _process_deepict_actin(dl_path, dataset_path)
 98
 99    # Clean up the original data after processing.
100    rmtree(dl_path)
101
102    return dataset_path

Download the DeePict actin dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The path to the downloaded data.

def get_deepict_actin_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
def get_deepict_actin_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
    """Return the sorted file paths of the processed DeePict actin data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepaths to the stored data.
    """
    # Make sure the processed data is available before collecting the files.
    get_deepict_actin_data(path, download)

    pattern = os.path.join(path, "deepict_actin", "*.h5")
    h5_files = glob(pattern)
    h5_files.sort()
    return h5_files

Get paths to DeePict actin data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:

The filepaths to the stored data.

def get_deepict_actin_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], label_key: str = 'labels/actin', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_deepict_actin_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    label_key: str = "labels/actin",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the dataset for segmenting actin in Cryo ET tomograms.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training. Must have three entries.
        label_key: The key for the labels to load. By default this uses 'labels/actin',
            which holds the best version of actin ground-truth images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
       The segmentation dataset.
    """
    assert len(patch_shape) == 3

    h5_paths = get_deepict_actin_paths(path, download)

    # Raw data and labels are stored in the same files, so the same paths are passed for both.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_paths, raw_key="raw",
        label_paths=h5_paths, label_key=label_key,
        patch_shape=patch_shape, is_seg_dataset=True,
        **kwargs
    )
    return dataset

Get the dataset for actin segmentation in Cryo ET data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • label_key: The key for the labels to load. By default this uses 'labels/actin', which holds the best version of actin ground-truth images.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_deepict_actin_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], batch_size: int, label_key: str = 'labels/actin', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_deepict_actin_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    batch_size: int,
    label_key: str = "labels/actin",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the DataLoader for segmenting actin in CryoET tomograms.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        label_key: The key for the labels to load. By default this uses 'labels/actin',
            which holds the best version of actin ground-truth images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    split = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset_kwargs, dataloader_kwargs = split

    actin_dataset = get_deepict_actin_dataset(
        path, patch_shape, label_key=label_key, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(actin_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the DataLoader for actin segmentation in CryoET data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • label_key: The key for the labels to load. By default this uses 'labels/actin', which holds the best version of actin ground-truth images.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.