torch_em.data.datasets.light_microscopy.segpc
The SegPC dataset contains annotations for cytoplasm and nucleus segmentation in microscopy images of multiple myeloma plasma cells.
This dataset is located at https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images. The dataset is from the publication https://doi.org/10.1016/j.media.2022.102677. Please cite it if you use this dataset for your research.
"""The SegPC dataset contains annotations for cytoplasm and nucleus segmentation in microscopy images
of multiple myeloma plasma cells.

This dataset is located at https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images. # noqa
The dataset is from the publication https://doi.org/10.1016/j.media.2022.102677.
Please cite it if you use this dataset for your research.
"""

import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
from typing import Union, Literal, Tuple, List

import numpy as np
import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


def get_segpc_data(path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False) -> str:
    """Locate (and, if necessary, unpack) the manually downloaded SegPC data.

    NOTE: Please download the dataset from https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images. # noqa

    Args:
        path: Filepath to a folder where the data should be manually downloaded for further processing.
            The archive is expected at '<path>/TCIA_SegPC_dataset.zip'.
        split: The data split to use. Either 'train' or 'validation'.
        download: Whether to download the data if it is not present.
            Automatic download is not supported; passing True raises an error unless the
            split folder is already unpacked.

    Returns:
        The filepath to the data, i.e. '<path>/TCIA_SegPC_dataset/<split>'.

    Raises:
        ValueError: If `split` is neither 'train' nor 'validation'.
        NotImplementedError: If the data is not unpacked yet and `download` is True.
        FileNotFoundError: If the data is not unpacked yet and the zip archive is missing.
    """
    # Validate the split first, so that an invalid request never triggers the archive extraction.
    if split not in ('train', 'validation'):
        if split == "test":
            raise ValueError("The 'test' split does not have labels.")
        raise ValueError(f"'{split}' is not a valid split.")

    data_dir = os.path.join(path, "TCIA_SegPC_dataset", split)
    if os.path.exists(data_dir):
        return data_dir

    if download:
        # Adjacent string literals are concatenated into a single message.
        raise NotImplementedError(
            "The dataset cannot be automatically downloaded. "
            "Please see 'get_segpc_data' in 'torch_em/data/datasets/light_microscopy/segpc.py' for details."
        )

    zip_path = os.path.join(path, "TCIA_SegPC_dataset.zip")
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"The manually downloaded zip file should be placed at '{path}'.")
    util.unzip(zip_path=zip_path, dst=path, remove=False)

    # Unzip the split-wise zip file, which sits next to the folder it unpacks to.
    split_root = Path(data_dir).parent
    util.unzip(zip_path=os.path.join(split_root, f"{split}.zip"), dst=split_root)

    return data_dir


def get_segpc_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False
) -> List[str]:
    """Get paths to the SegPC data.

    Each input image '<split>/x/<name>.bmp' is converted once into '<split>/preprocessed/<name>.h5',
    which holds the datasets 'raw', 'labels/nuclei' and 'labels/cells'.
    Files that already exist are not written again.

    Args:
        path: Filepath to a folder where the data is stored.
        split: The data split to use. Either 'train' or 'validation'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    data_dir = get_segpc_data(path, split, download)

    preprocessed_dir = os.path.join(data_dir, "preprocessed")
    os.makedirs(preprocessed_dir, exist_ok=True)

    # The per-object masks are stored in the 'y' folder next to the 'x' folder with the images.
    label_dir = os.path.join(data_dir, "y")

    volume_paths = []
    raw_paths = natsorted(glob(os.path.join(data_dir, "x", "*.bmp")))
    for rpath in tqdm(raw_paths, desc=f"Preprocessing '{split}' inputs"):
        image_name = Path(os.path.basename(rpath))
        volume_path = os.path.join(preprocessed_dir, image_name.with_suffix(".h5"))
        volume_paths.append(volume_path)
        if os.path.exists(volume_path):
            continue

        image = imageio.imread(rpath)

        # Build the label pattern from the label folder and the image name only, so that an 'x'
        # elsewhere in the path cannot corrupt it. Sort the matches to get reproducible instance ids.
        label_paths = natsorted(glob(os.path.join(label_dir, f"{image_name.stem}_*.bmp")))

        nuclei = np.zeros(image.shape[:2], dtype="uint32")
        cells = np.zeros(image.shape[:2], dtype="uint32")
        for i, lpath in enumerate(label_paths, start=1):
            label = imageio.imread(lpath)

            # Masks that are read with a channel axis are reduced to their first channel.
            if label.ndim == 3:
                label = label[..., 0]

            # Pixels with the value 40 are assigned to the nucleus of object 'i',
            # all non-zero pixels are assigned to the cell of object 'i'.
            nuclei[label == 40] = i
            cells[label > 0] = i

        # Imported lazily, so that h5py is only required when a file has to be written.
        import h5py
        with h5py.File(volume_path, "w") as f:
            # The image is stored channel-first; this requires a 3d array, presumably (H, W, C).
            f.create_dataset("raw", data=image.transpose(2, 0, 1), compression="gzip")
            f.create_dataset("labels/nuclei", data=nuclei, compression="gzip")
            f.create_dataset("labels/cells", data=cells, compression="gzip")

    return volume_paths


def get_segpc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'validation'],
    label_choice: Literal['nuclei', 'cells'] = "cells",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the SegPC dataset for plasma cell (and nuclei) segmentation.

    Args:
        path: Filepath to a folder where the data is stored.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'validation'.
        label_choice: The choice of labels. Either 'nuclei' or 'cells'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_segpc_paths(path, split, download)

    # Raw data and labels are read from the same preprocessed files, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        with_channels=True,
        is_seg_dataset=True,
        ndim=2,
        **kwargs
    )


def get_segpc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'validation'],
    label_choice: Literal['nuclei', 'cells'] = "cells",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the SegPC dataloader for plasma cell (and nuclei) segmentation.

    Args:
        path: Filepath to a folder where the data is stored.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'validation'.
        label_choice: The choice of labels. Either 'nuclei' or 'cells'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones that are passed on to the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_segpc_dataset(path, patch_shape, split, label_choice, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_segpc_data(path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False) -> str:
    """Locate (and, if necessary, unpack) the manually downloaded SegPC data.

    NOTE: Please download the dataset from https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images. # noqa

    Args:
        path: Filepath to a folder where the data should be manually downloaded for further processing.
            The archive is expected at '<path>/TCIA_SegPC_dataset.zip'.
        split: The data split to use. Either 'train' or 'validation'.
        download: Whether to download the data if it is not present.
            Automatic download is not supported; passing True raises an error unless the
            split folder is already unpacked.

    Returns:
        The filepath to the data, i.e. '<path>/TCIA_SegPC_dataset/<split>'.

    Raises:
        ValueError: If `split` is neither 'train' nor 'validation'.
        NotImplementedError: If the data is not unpacked yet and `download` is True.
        FileNotFoundError: If the data is not unpacked yet and the zip archive is missing.
    """
    # Validate the split first, so that an invalid request never triggers the archive extraction.
    if split not in ('train', 'validation'):
        if split == "test":
            raise ValueError("The 'test' split does not have labels.")
        raise ValueError(f"'{split}' is not a valid split.")

    data_dir = os.path.join(path, "TCIA_SegPC_dataset", split)
    if os.path.exists(data_dir):
        return data_dir

    if download:
        # Adjacent string literals are concatenated into a single message.
        raise NotImplementedError(
            "The dataset cannot be automatically downloaded. "
            "Please see 'get_segpc_data' in 'torch_em/data/datasets/light_microscopy/segpc.py' for details."
        )

    zip_path = os.path.join(path, "TCIA_SegPC_dataset.zip")
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"The manually downloaded zip file should be placed at '{path}'.")
    util.unzip(zip_path=zip_path, dst=path, remove=False)

    # Unzip the split-wise zip file, which sits next to the folder it unpacks to.
    split_root = Path(data_dir).parent
    util.unzip(zip_path=os.path.join(split_root, f"{split}.zip"), dst=split_root)

    return data_dir
Instruction to download SegPC data.
NOTE: Please download the dataset from https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images.
Arguments:
- path: Filepath to a folder where the data should be manually downloaded for further processing.
- split: The data split to use. Either 'train' or 'validation'.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the data.
def get_segpc_paths(
    path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False
) -> List[str]:
    """Get paths to the SegPC data.

    Each input image '<split>/x/<name>.bmp' is converted once into '<split>/preprocessed/<name>.h5',
    which holds the datasets 'raw', 'labels/nuclei' and 'labels/cells'.
    Files that already exist are not written again.

    Args:
        path: Filepath to a folder where the data is stored.
        split: The data split to use. Either 'train' or 'validation'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    data_dir = get_segpc_data(path, split, download)

    preprocessed_dir = os.path.join(data_dir, "preprocessed")
    os.makedirs(preprocessed_dir, exist_ok=True)

    # The per-object masks are stored in the 'y' folder next to the 'x' folder with the images.
    label_dir = os.path.join(data_dir, "y")

    volume_paths = []
    raw_paths = natsorted(glob(os.path.join(data_dir, "x", "*.bmp")))
    for rpath in tqdm(raw_paths, desc=f"Preprocessing '{split}' inputs"):
        image_name = Path(os.path.basename(rpath))
        volume_path = os.path.join(preprocessed_dir, image_name.with_suffix(".h5"))
        volume_paths.append(volume_path)
        if os.path.exists(volume_path):
            continue

        image = imageio.imread(rpath)

        # Build the label pattern from the label folder and the image name only, so that an 'x'
        # elsewhere in the path cannot corrupt it. Sort the matches to get reproducible instance ids.
        label_paths = natsorted(glob(os.path.join(label_dir, f"{image_name.stem}_*.bmp")))

        nuclei = np.zeros(image.shape[:2], dtype="uint32")
        cells = np.zeros(image.shape[:2], dtype="uint32")
        for i, lpath in enumerate(label_paths, start=1):
            label = imageio.imread(lpath)

            # Masks that are read with a channel axis are reduced to their first channel.
            if label.ndim == 3:
                label = label[..., 0]

            # Pixels with the value 40 are assigned to the nucleus of object 'i',
            # all non-zero pixels are assigned to the cell of object 'i'.
            nuclei[label == 40] = i
            cells[label > 0] = i

        # Imported lazily, so that h5py is only required when a file has to be written.
        import h5py
        with h5py.File(volume_path, "w") as f:
            # The image is stored channel-first; this requires a 3d array, presumably (H, W, C).
            f.create_dataset("raw", data=image.transpose(2, 0, 1), compression="gzip")
            f.create_dataset("labels/nuclei", data=nuclei, compression="gzip")
            f.create_dataset("labels/cells", data=cells, compression="gzip")

    return volume_paths
Get paths to the SegPC data.
Arguments:
- path: Filepath to a folder where the data is stored.
- split: The data split to use. Either 'train' or 'validation'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
def get_segpc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'validation'],
    label_choice: Literal['nuclei', 'cells'] = "cells",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the SegPC dataset for plasma cell (and nuclei) segmentation.

    Args:
        path: Filepath to a folder where the data is stored.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'validation'.
        label_choice: The choice of labels. Either 'nuclei' or 'cells'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_segpc_paths(path, split, download)

    # Raw data and labels are read from the same preprocessed files, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        with_channels=True,
        is_seg_dataset=True,
        ndim=2,
        **kwargs
    )
Get the SegPC dataset for plasma cell (and nuclei) segmentation.
Arguments:
- path: Filepath to a folder where the data is stored.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train' or 'validation'.
- label_choice: The choice of labels.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_segpc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'validation'],
    label_choice: Literal['nuclei', 'cells'] = "cells",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the SegPC dataloader for plasma cell (and nuclei) segmentation.

    Args:
        path: Filepath to a folder where the data is stored.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'validation'.
        label_choice: The choice of labels. Either 'nuclei' or 'cells'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones that are passed on to the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_segpc_dataset(path, patch_shape, split, label_choice, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the SegPC dataloader for plasma cell (and nuclei) segmentation.
Arguments:
- path: Filepath to a folder where the data is stored.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use. Either 'train' or 'validation'.
- label_choice: The choice of labels.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.