torch_em.data.datasets.histopathology.segpath
SegPath contains semantic segmentation masks for H&E stained histopathology images from diverse cancer tissues.
The dataset is located at https://dakomura.github.io/SegPath/ (across several Zenodo links). The dataset is from the publication https://doi.org/10.1016/j.patter.2023.100688. Please cite it if you use this dataset for your research.
"""SegPath contains semantic segmentation masks for H&E stained histopathology images from diverse cancer tissues.

The dataset is located at https://dakomura.github.io/SegPath/ (across several Zenodo links).
The dataset is from the publication https://doi.org/10.1016/j.patter.2023.100688.
Please cite it if you use this dataset for your research.
"""

import csv
import gzip
import os
import tarfile
from glob import glob
from typing import List, Literal, Optional, Tuple, Union

import torch
from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


# Download locations and local file names, keyed by the cell type name accepted by the public functions.
URLS = {
    "epithelium": {
        "data": "https://zenodo.org/api/records/7412731/files/panCK_Epithelium.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412731/files/panCK_fileinfo.csv/content",
        "data_name": "panCK_Epithelium.tar.gz",
        "metadata_name": "panCK_fileinfo.csv",
    },
    "smooth_muscle": {
        "data": "https://zenodo.org/api/records/7412732/files/aSMA_SmoothMuscle.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412732/files/aSMA_fileinfo.csv/content",
        "data_name": "aSMA_SmoothMuscle.tar.gz",
        "metadata_name": "aSMA_fileinfo.csv",
    },
    "red_blood_cells": {
        "data": "https://zenodo.org/api/records/7412580/files/CD235a_RBC.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412580/files/CD235a_fileinfo.csv/content",
        "data_name": "CD235a_RBC.tar.gz",
        "metadata_name": "CD235a_fileinfo.csv",
    },
    "leukocytes": {
        "data": "https://zenodo.org/api/records/7412739/files/CD45RB_Leukocyte.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412739/files/CD45RB_fileinfo.csv/content",
        "data_name": "CD45RB_Leukocyte.tar.gz",
        "metadata_name": "CD45RB_fileinfo.csv",
    },
    "lymphocytes": {
        "data": "https://zenodo.org/api/records/7412529/files/CD3CD20_Lymphocyte.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412529/files/CD3CD20_fileinfo.csv/content",
        "data_name": "CD3CD20_Lymphocyte.tar.gz",
        "metadata_name": "CD3CD20_fileinfo.csv",
    },
    "endothelium": {
        "data": "https://zenodo.org/api/records/7412512/files/ERG_Endothelium.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412512/files/ERG_fileinfo.csv/content",
        "data_name": "ERG_Endothelium.tar.gz",
        "metadata_name": "ERG_fileinfo.csv",
    },
    "plasma_cells": {
        "data": "https://zenodo.org/api/records/7412500/files/MIST1_PlasmaCell.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412500/files/MIST1_fileinfo.csv/content",
        "data_name": "MIST1_PlasmaCell.tar.gz",
        "metadata_name": "MIST1_fileinfo.csv",
    },
    "myeloid_cells": {
        "data": "https://zenodo.org/api/records/7412690/files/MNDA_MyeloidCell.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412690/files/MNDA_fileinfo.csv/content",
        "data_name": "MNDA_MyeloidCell.tar.gz",
        "metadata_name": "MNDA_fileinfo.csv",
    },
}

# Filename suffixes that pair an H&E image with its segmentation mask.
_IMAGE_SUFFIX = "_HE.png"
_MASK_SUFFIX = "_mask.png"


def _to_cell_types(cell_types):
    """Normalize the `cell_types` argument to a sequence of valid cell type names.

    `None` selects every key of `URLS`, a single string is wrapped into a list,
    and any name that is not a key of `URLS` raises a `ValueError`.
    """
    if cell_types is None:
        return list(URLS)
    if isinstance(cell_types, str):
        cell_types = [cell_types]
    invalid_cell_types = set(cell_types) - set(URLS)
    if invalid_cell_types:
        raise ValueError(
            f"Invalid cell type choices: {sorted(invalid_cell_types)}. Choose from {sorted(URLS)}."
        )
    return cell_types


def _is_gzip(path):
    """Return True if the file at `path` starts with the two gzip magic bytes."""
    with open(path, "rb") as f:
        return f.read(2) == b"\x1f\x8b"


def _to_mask_path(image_path):
    """Return the mask path for an image path that ends with `_HE.png`.

    Only the trailing suffix is swapped, so an occurrence of `_HE.png` elsewhere
    in the path (e.g. in a folder name) is left untouched.
    """
    return image_path[:-len(_IMAGE_SUFFIX)] + _MASK_SUFFIX


def _extract_data(path):
    """Extract the tar archive at `path` into the folder that contains it.

    Nothing is done if a folder named like the archive (without its two extensions,
    e.g. `.tar.gz`) already exists next to it.

    Raises:
        RuntimeError: If a member name of the archive resolves outside of the extraction folder.
    """
    data_folder = os.path.splitext(os.path.splitext(os.path.basename(path))[0])[0]
    extract_root = os.path.dirname(path)
    if os.path.exists(os.path.join(extract_root, data_folder)):
        return

    # Resolved once; it does not change while iterating over the archive members.
    abs_root = os.path.abspath(extract_root)
    with tarfile.open(path) as f:
        for member in f.getmembers():
            member_path = os.path.abspath(os.path.join(extract_root, member.name))
            if os.path.commonpath([abs_root, member_path]) != abs_root:
                raise RuntimeError(f"Unsafe path in tar archive: {member.name}")

        # The check above only looks at member names. Where the interpreter provides
        # extraction filters, the "data" filter additionally rejects unsafe link and
        # special-file members; passing it explicitly also avoids the DeprecationWarning
        # that Python 3.12 and 3.13 emit when no filter is given.
        if hasattr(tarfile, "data_filter"):
            f.extractall(extract_root, filter="data")
        else:
            f.extractall(extract_root)


def get_segpath_data(
    path: Union[os.PathLike, str],
    cell_types: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> None:
    """Download the SegPath data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        cell_types: The cell types to download. By default all cell types are downloaded.
        download: Whether to download the data if it is not present.
            If False, the folder is created but nothing is downloaded or extracted.
    """
    os.makedirs(path, exist_ok=True)
    if not download:
        return

    for cell_type in _to_cell_types(cell_types):
        source = URLS[cell_type]
        data_path = os.path.join(path, source["data_name"])
        metadata_path = os.path.join(path, source["metadata_name"])
        data_folder = os.path.splitext(os.path.splitext(source["data_name"])[0])[0]
        extracted_path = os.path.join(path, data_folder)

        util.download_source(metadata_path, source["metadata"], download, checksum=None)

        # The archive is only requested when its extracted folder is not there yet.
        if not os.path.exists(extracted_path):
            util.download_source(data_path, source["data"], download, checksum=None)
            _extract_data(data_path)


def _get_paths_from_metadata(path, cell_type, split):
    """Collect image and mask paths for one cell type from its metadata CSV.

    Rows are skipped if they belong to another split (when `split` is given), if the
    filename does not end with `_HE.png`, or if the image or its mask is missing on disk.
    """
    source = URLS[cell_type]
    metadata_path = os.path.join(path, source["metadata_name"])
    image_paths, label_paths = [], []

    # The metadata file may be stored gzip-compressed; choose the opener from its magic bytes.
    open_file = gzip.open if _is_gzip(metadata_path) else open
    # newline="" is what the csv module asks for, so that it handles line endings itself.
    with open_file(metadata_path, mode="rt", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            if split is not None and row["train_val_test"] != split:
                continue

            filename = row["filename"]
            if not filename.endswith(_IMAGE_SUFFIX):
                continue

            image_path = os.path.join(path, filename)
            label_path = os.path.join(path, _to_mask_path(filename))
            if not os.path.exists(image_path) or not os.path.exists(label_path):
                continue

            image_paths.append(image_path)
            label_paths.append(label_path)

    return image_paths, label_paths


def _get_paths_from_files(path, cell_type, split):
    """Collect image and mask paths for one cell type by globbing its extracted folder.

    This is the fallback when no metadata CSV is available. It cannot select a split,
    so a `RuntimeError` is raised if `split` is not None. Images without a mask are dropped.
    """
    if split is not None:
        raise RuntimeError(
            "The SegPath metadata CSV is required for split selection, but it could not be found. "
            "Please download the metadata with `download=True` or place it into the dataset folder."
        )

    data_name = os.path.splitext(os.path.splitext(URLS[cell_type]["data_name"])[0])[0]
    image_paths, label_paths = [], []
    for image_path in sorted(glob(os.path.join(path, data_name, "*" + _IMAGE_SUFFIX))):
        label_path = _to_mask_path(image_path)
        if os.path.exists(label_path):
            image_paths.append(image_path)
            label_paths.append(label_path)

    return image_paths, label_paths


def get_segpath_paths(
    path: Union[os.PathLike, str],
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the SegPath data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` or one of the `cell_types` is not a valid choice.
        RuntimeError: If no image and mask pair is found for the requested settings.
    """
    if split is not None and split not in ("train", "val", "test"):
        raise ValueError(f"'{split}' is not a valid split choice.")

    cell_types = _to_cell_types(cell_types)
    get_segpath_data(path, cell_types, download)

    image_paths, label_paths = [], []
    for cell_type in cell_types:
        metadata_path = os.path.join(path, URLS[cell_type]["metadata_name"])
        # Prefer the metadata CSV; fall back to globbing the extracted folder without it.
        if os.path.exists(metadata_path):
            this_image_paths, this_label_paths = _get_paths_from_metadata(path, cell_type, split)
        else:
            this_image_paths, this_label_paths = _get_paths_from_files(path, cell_type, split)

        image_paths.extend(this_image_paths)
        label_paths.extend(this_label_paths)

    if not image_paths:
        raise RuntimeError("Could not find any SegPath images and masks for the requested settings.")

    return image_paths, label_paths


def get_segpath_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the SegPath dataset for semantic segmentation in H&E stained histopathology images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        resize_inputs: Whether to resize the input images.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, label_paths = get_segpath_paths(path, cell_types, split, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        is_seg_dataset=False,
        **kwargs
    )


def get_segpath_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the SegPath dataloader.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        resize_inputs: Whether to resize the input images.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_segpath_dataset(
        path=path, patch_shape=patch_shape, cell_types=cell_types, split=split, download=download,
        label_dtype=label_dtype, resize_inputs=resize_inputs, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
# Download locations and local file names, keyed by cell type name.
URLS = {
    "epithelium": {
        "data": "https://zenodo.org/api/records/7412731/files/panCK_Epithelium.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412731/files/panCK_fileinfo.csv/content",
        "data_name": "panCK_Epithelium.tar.gz",
        "metadata_name": "panCK_fileinfo.csv",
    },
    "smooth_muscle": {
        "data": "https://zenodo.org/api/records/7412732/files/aSMA_SmoothMuscle.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412732/files/aSMA_fileinfo.csv/content",
        "data_name": "aSMA_SmoothMuscle.tar.gz",
        "metadata_name": "aSMA_fileinfo.csv",
    },
    "red_blood_cells": {
        "data": "https://zenodo.org/api/records/7412580/files/CD235a_RBC.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412580/files/CD235a_fileinfo.csv/content",
        "data_name": "CD235a_RBC.tar.gz",
        "metadata_name": "CD235a_fileinfo.csv",
    },
    "leukocytes": {
        "data": "https://zenodo.org/api/records/7412739/files/CD45RB_Leukocyte.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412739/files/CD45RB_fileinfo.csv/content",
        "data_name": "CD45RB_Leukocyte.tar.gz",
        "metadata_name": "CD45RB_fileinfo.csv",
    },
    "lymphocytes": {
        "data": "https://zenodo.org/api/records/7412529/files/CD3CD20_Lymphocyte.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412529/files/CD3CD20_fileinfo.csv/content",
        "data_name": "CD3CD20_Lymphocyte.tar.gz",
        "metadata_name": "CD3CD20_fileinfo.csv",
    },
    "endothelium": {
        "data": "https://zenodo.org/api/records/7412512/files/ERG_Endothelium.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412512/files/ERG_fileinfo.csv/content",
        "data_name": "ERG_Endothelium.tar.gz",
        "metadata_name": "ERG_fileinfo.csv",
    },
    "plasma_cells": {
        "data": "https://zenodo.org/api/records/7412500/files/MIST1_PlasmaCell.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412500/files/MIST1_fileinfo.csv/content",
        "data_name": "MIST1_PlasmaCell.tar.gz",
        "metadata_name": "MIST1_fileinfo.csv",
    },
    "myeloid_cells": {
        "data": "https://zenodo.org/api/records/7412690/files/MNDA_MyeloidCell.tar.gz/content",
        "metadata": "https://zenodo.org/api/records/7412690/files/MNDA_fileinfo.csv/content",
        "data_name": "MNDA_MyeloidCell.tar.gz",
        "metadata_name": "MNDA_fileinfo.csv",
    },
}
def
get_segpath_data( path: Union[os.PathLike, str], cell_types: Union[List[str], str, NoneType] = None, download: bool = False) -> None:
def get_segpath_data(
    path: Union[os.PathLike, str],
    cell_types: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> None:
    """Fetch and unpack the SegPath archives for the requested cell types.

    The target folder is always created. Everything else only happens when `download` is set:
    for each cell type the metadata CSV is requested via `util.download_source`, and the data
    archive is requested and extracted only if its extracted folder does not exist yet.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        cell_types: The cell types to download. By default all cell types are downloaded.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)

    if download:
        for name in _to_cell_types(cell_types):
            entry = URLS[name]

            # The metadata file is requested for every cell type, independent of the archive state.
            util.download_source(
                os.path.join(path, entry["metadata_name"]), entry["metadata"], download, checksum=None
            )

            # The extracted folder is named like the archive without its two extensions (".tar.gz").
            archive_name = entry["data_name"]
            folder_name, _ = os.path.splitext(os.path.splitext(archive_name)[0])
            if os.path.exists(os.path.join(path, folder_name)):
                continue

            archive_path = os.path.join(path, archive_name)
            util.download_source(archive_path, entry["data"], download, checksum=None)
            _extract_data(archive_path)
Download the SegPath data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- cell_types: The cell types to download. By default all cell types are downloaded.
- download: Whether to download the data if it is not present.
def
get_segpath_paths( path: Union[os.PathLike, str], cell_types: Union[List[str], str, NoneType] = None, split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_segpath_paths(
    path: Union[os.PathLike, str],
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Collect the image and mask filepaths of the SegPath data.

    For each cell type the paths are read from its metadata CSV when that file exists in `path`,
    otherwise they are gathered from the files on disk.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is neither None nor one of "train", "val", "test".
        RuntimeError: If no images are found for the requested settings.
    """
    valid_splits = ("train", "val", "test")
    if not (split is None or split in valid_splits):
        raise ValueError(f"'{split}' is not a valid split choice.")

    cell_types = _to_cell_types(cell_types)
    get_segpath_data(path, cell_types, download)

    image_paths, label_paths = [], []
    for cell_type in cell_types:
        has_metadata = os.path.exists(os.path.join(path, URLS[cell_type]["metadata_name"]))
        # Both collectors share the same signature, so pick one and call it uniformly.
        collect = _get_paths_from_metadata if has_metadata else _get_paths_from_files
        found_images, found_labels = collect(path, cell_type, split)
        image_paths += found_images
        label_paths += found_labels

    if len(image_paths) == 0:
        raise RuntimeError("Could not find any SegPath images and masks for the requested settings.")

    return image_paths, label_paths
Get paths to the SegPath data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- cell_types: The cell types to use. By default all cell types are used.
- split: The split to use. Either "train", "val", "test" or None for all images.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_segpath_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], cell_types: Union[List[str], str, NoneType] = None, split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False, label_dtype: torch.dtype = torch.int64, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_segpath_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Build the SegPath dataset for semantic segmentation in H&E stained histopathology images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        resize_inputs: Whether to resize the input images.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, mask_paths = get_segpath_paths(path, cell_types, split, download)

    if resize_inputs:
        # The helper returns updated dataset kwargs together with the patch shape to use from here on.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    fixed_kwargs = dict(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        is_seg_dataset=False,
    )
    return torch_em.default_segmentation_dataset(**fixed_kwargs, **kwargs)
Get the SegPath dataset for semantic segmentation in H&E stained histopathology images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- cell_types: The cell types to use. By default all cell types are used.
- split: The split to use. Either "train", "val", "test" or None for all images.
- download: Whether to download the data if it is not present.
- label_dtype: The datatype of labels.
- resize_inputs: Whether to resize the input images.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_segpath_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], batch_size: int, cell_types: Union[List[str], str, NoneType] = None, split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False, label_dtype: torch.dtype = torch.int64, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_segpath_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Build the SegPath dataloader.

    The extra keyword arguments are divided with `util.split_kwargs` into those passed on to
    `get_segpath_dataset` and those passed on to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        resize_inputs: Whether to resize the input images.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    segpath_dataset = get_segpath_dataset(
        path=path,
        patch_shape=patch_shape,
        cell_types=cell_types,
        split=split,
        download=download,
        label_dtype=label_dtype,
        resize_inputs=resize_inputs,
        **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(segpath_dataset, batch_size, **loader_kwargs)
    return loader
Get the SegPath dataloader.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- cell_types: The cell types to use. By default all cell types are used.
- split: The split to use. Either "train", "val", "test" or None for all images.
- download: Whether to download the data if it is not present.
- label_dtype: The datatype of labels.
- resize_inputs: Whether to resize the input images.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.