torch_em.data.datasets.histopathology.segpath

SegPath contains semantic segmentation masks for H&E stained histopathology images from diverse cancer tissues.

The dataset is located at https://dakomura.github.io/SegPath/ (across several Zenodo links). The dataset is from the publication https://doi.org/10.1016/j.patter.2023.100688. Please cite it if you use this dataset for your research.

  1"""SegPath contains semantic segmentation masks for H&E stained histopathology images from diverse cancer tissues.
  2
  3The dataset is located at https://dakomura.github.io/SegPath/ (across several Zenodo links).
  4The dataset is from the publication https://doi.org/10.1016/j.patter.2023.100688.
  5Please cite it if you use this dataset for your research.
  6"""
  7
  8import csv
  9import gzip
 10import os
 11import tarfile
 12from glob import glob
 13from typing import List, Literal, Optional, Tuple, Union
 14
 15import torch
 16from torch.utils.data import Dataset, DataLoader
 17
 18import torch_em
 19
 20from .. import util
 21
 22
 23URLS = {
 24    "epithelium": {
 25        "data": "https://zenodo.org/api/records/7412731/files/panCK_Epithelium.tar.gz/content",
 26        "metadata": "https://zenodo.org/api/records/7412731/files/panCK_fileinfo.csv/content",
 27        "data_name": "panCK_Epithelium.tar.gz",
 28        "metadata_name": "panCK_fileinfo.csv",
 29    },
 30    "smooth_muscle": {
 31        "data": "https://zenodo.org/api/records/7412732/files/aSMA_SmoothMuscle.tar.gz/content",
 32        "metadata": "https://zenodo.org/api/records/7412732/files/aSMA_fileinfo.csv/content",
 33        "data_name": "aSMA_SmoothMuscle.tar.gz",
 34        "metadata_name": "aSMA_fileinfo.csv",
 35    },
 36    "red_blood_cells": {
 37        "data": "https://zenodo.org/api/records/7412580/files/CD235a_RBC.tar.gz/content",
 38        "metadata": "https://zenodo.org/api/records/7412580/files/CD235a_fileinfo.csv/content",
 39        "data_name": "CD235a_RBC.tar.gz",
 40        "metadata_name": "CD235a_fileinfo.csv",
 41    },
 42    "leukocytes": {
 43        "data": "https://zenodo.org/api/records/7412739/files/CD45RB_Leukocyte.tar.gz/content",
 44        "metadata": "https://zenodo.org/api/records/7412739/files/CD45RB_fileinfo.csv/content",
 45        "data_name": "CD45RB_Leukocyte.tar.gz",
 46        "metadata_name": "CD45RB_fileinfo.csv",
 47    },
 48    "lymphocytes": {
 49        "data": "https://zenodo.org/api/records/7412529/files/CD3CD20_Lymphocyte.tar.gz/content",
 50        "metadata": "https://zenodo.org/api/records/7412529/files/CD3CD20_fileinfo.csv/content",
 51        "data_name": "CD3CD20_Lymphocyte.tar.gz",
 52        "metadata_name": "CD3CD20_fileinfo.csv",
 53    },
 54    "endothelium": {
 55        "data": "https://zenodo.org/api/records/7412512/files/ERG_Endothelium.tar.gz/content",
 56        "metadata": "https://zenodo.org/api/records/7412512/files/ERG_fileinfo.csv/content",
 57        "data_name": "ERG_Endothelium.tar.gz",
 58        "metadata_name": "ERG_fileinfo.csv",
 59    },
 60    "plasma_cells": {
 61        "data": "https://zenodo.org/api/records/7412500/files/MIST1_PlasmaCell.tar.gz/content",
 62        "metadata": "https://zenodo.org/api/records/7412500/files/MIST1_fileinfo.csv/content",
 63        "data_name": "MIST1_PlasmaCell.tar.gz",
 64        "metadata_name": "MIST1_fileinfo.csv",
 65    },
 66    "myeloid_cells": {
 67        "data": "https://zenodo.org/api/records/7412690/files/MNDA_MyeloidCell.tar.gz/content",
 68        "metadata": "https://zenodo.org/api/records/7412690/files/MNDA_fileinfo.csv/content",
 69        "data_name": "MNDA_MyeloidCell.tar.gz",
 70        "metadata_name": "MNDA_fileinfo.csv",
 71    },
 72}
 73
 74
 75def _to_cell_types(cell_types):
 76    if cell_types is None:
 77        return list(URLS)
 78    if isinstance(cell_types, str):
 79        cell_types = [cell_types]
 80    invalid_cell_types = set(cell_types) - set(URLS)
 81    if invalid_cell_types:
 82        raise ValueError(
 83            f"Invalid cell type choices: {sorted(invalid_cell_types)}. Choose from {sorted(URLS)}."
 84        )
 85    return cell_types
 86
 87
 88def _is_gzip(path):
 89    with open(path, "rb") as f:
 90        return f.read(2) == b"\x1f\x8b"
 91
 92
 93def _extract_data(path):
 94    data_folder = os.path.splitext(os.path.splitext(os.path.basename(path))[0])[0]
 95    extract_path = os.path.join(os.path.dirname(path), data_folder)
 96    if os.path.exists(extract_path):
 97        return
 98
 99    extract_root = os.path.dirname(path)
100    with tarfile.open(path) as f:
101        for member in f.getmembers():
102            member_path = os.path.abspath(os.path.join(extract_root, member.name))
103            if os.path.commonpath([os.path.abspath(extract_root), member_path]) != os.path.abspath(extract_root):
104                raise RuntimeError(f"Unsafe path in tar archive: {member.name}")
105        f.extractall(extract_root)
106
107
def get_segpath_data(
    path: Union[os.PathLike, str],
    cell_types: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> None:
    """Download and extract the SegPath data for the selected cell types.

    The target folder is always created. Without `download` nothing else happens.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        cell_types: The cell types to download. By default all cell types are downloaded.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    if download:
        for cell_type in _to_cell_types(cell_types):
            entry = URLS[cell_type]
            archive_name, csv_name = entry["data_name"], entry["metadata_name"]

            # The metadata CSV is requested for every selected cell type.
            util.download_source(os.path.join(path, csv_name), entry["metadata"], download, checksum=None)

            # The archive is only fetched and unpacked while its extracted folder
            # (archive name without the two extensions) is missing.
            stem, _ = os.path.splitext(archive_name)
            folder_name, _ = os.path.splitext(stem)
            if os.path.exists(os.path.join(path, folder_name)):
                continue

            archive_path = os.path.join(path, archive_name)
            util.download_source(archive_path, entry["data"], download, checksum=None)
            _extract_data(archive_path)
136
137
def _get_paths_from_metadata(path, cell_type, split):
    """Collect the image and mask paths of one cell type from its metadata CSV.

    The columns "filename" and "train_val_test" are read from the CSV. Only rows whose
    filename ends with "_HE.png" are used; the mask path is the same filename with the
    suffix "_mask.png". Rows for which the image or the mask does not exist below `path`
    are skipped.

    Args:
        path: Filepath to the folder that contains the metadata CSV and the extracted data.
        cell_type: The cell type, a key of `URLS`.
        split: The value of the "train_val_test" column to select, or None to use all rows.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    image_suffix, label_suffix = "_HE.png", "_mask.png"
    metadata_path = os.path.join(path, URLS[cell_type]["metadata_name"])
    image_paths, label_paths = [], []

    # The file content, not its name, decides whether the CSV is read through gzip.
    open_file = gzip.open if _is_gzip(metadata_path) else open
    # newline="" leaves the newline handling to the csv module, as its documentation asks for.
    with open_file(metadata_path, mode="rt", newline="") as f:
        for row in csv.DictReader(f):
            if split is not None and row["train_val_test"] != split:
                continue

            filename = row["filename"]
            if not filename.endswith(image_suffix):
                continue

            image_path = os.path.join(path, filename)
            # Swap only the trailing suffix; str.replace would also rewrite an earlier
            # occurrence of "_HE.png" in the filename.
            label_path = os.path.join(path, filename[:-len(image_suffix)] + label_suffix)
            if os.path.exists(image_path) and os.path.exists(label_path):
                image_paths.append(image_path)
                label_paths.append(label_path)

    return image_paths, label_paths
163
164
165def _get_paths_from_files(path, cell_type, split):
166    if split is not None:
167        raise RuntimeError(
168            "The SegPath metadata CSV is required for split selection, but it could not be found. "
169            "Please download the metadata with `download=True` or place it into the dataset folder."
170        )
171
172    data_name = os.path.splitext(os.path.splitext(URLS[cell_type]["data_name"])[0])[0]
173    image_paths = sorted(glob(os.path.join(path, data_name, "*_HE.png")))
174    label_paths = [image_path.replace("_HE.png", "_mask.png") for image_path in image_paths]
175    paired_paths = [
176        (image_path, label_path) for image_path, label_path in zip(image_paths, label_paths)
177        if os.path.exists(label_path)
178    ]
179    if not paired_paths:
180        return [], []
181
182    image_paths, label_paths = zip(*paired_paths)
183    return list(image_paths), list(label_paths)
184
185
def get_segpath_paths(
    path: Union[os.PathLike, str],
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get the filepaths of the SegPath images and masks.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of the valid choices.
        RuntimeError: If no image and mask pair is found for the requested settings.
    """
    valid_splits = ("train", "val", "test")
    if not (split is None or split in valid_splits):
        raise ValueError(f"'{split}' is not a valid split choice.")

    selected = _to_cell_types(cell_types)
    get_segpath_data(path, selected, download)

    all_images, all_labels = [], []
    for cell_type in selected:
        # Use the metadata CSV of the cell type if it is available, otherwise list the files on disk.
        has_metadata = os.path.exists(os.path.join(path, URLS[cell_type]["metadata_name"]))
        collect = _get_paths_from_metadata if has_metadata else _get_paths_from_files
        images, labels = collect(path, cell_type, split)
        all_images += images
        all_labels += labels

    if len(all_images) == 0:
        raise RuntimeError("Could not find any SegPath images and masks for the requested settings.")

    return all_images, all_labels
225
226
def get_segpath_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the SegPath dataset for semantic segmentation in H&E stained histopathology images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        resize_inputs: Whether to resize the input images.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, mask_paths = get_segpath_paths(path, cell_types, split, download)

    if resize_inputs:
        # The helper returns the updated keyword arguments and the patch shape to use afterwards.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # The images and masks are passed as lists of files without internal keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths, raw_key=None,
        label_paths=mask_paths, label_key=None,
        patch_shape=patch_shape, label_dtype=label_dtype,
        is_seg_dataset=False, **kwargs
    )
    return dataset
270
271
def get_segpath_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the SegPath dataloader for semantic segmentation in H&E stained histopathology images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        resize_inputs: Whether to resize the input images.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset from the ones that go to the DataLoader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    segpath_dataset = get_segpath_dataset(
        path=path,
        patch_shape=patch_shape,
        cell_types=cell_types,
        split=split,
        download=download,
        label_dtype=label_dtype,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(segpath_dataset, batch_size, **dataloader_kwargs)
    return loader
# Download location of a single file that belongs to a Zenodo record.
_ZENODO_FILE_URL = "https://zenodo.org/api/records/{record}/files/{name}/content"

# One entry per SegPath cell type: (cell type, Zenodo record id, marker prefix, archive file name).
_SEGPATH_RECORDS = (
    ("epithelium", 7412731, "panCK", "panCK_Epithelium.tar.gz"),
    ("smooth_muscle", 7412732, "aSMA", "aSMA_SmoothMuscle.tar.gz"),
    ("red_blood_cells", 7412580, "CD235a", "CD235a_RBC.tar.gz"),
    ("leukocytes", 7412739, "CD45RB", "CD45RB_Leukocyte.tar.gz"),
    ("lymphocytes", 7412529, "CD3CD20", "CD3CD20_Lymphocyte.tar.gz"),
    ("endothelium", 7412512, "ERG", "ERG_Endothelium.tar.gz"),
    ("plasma_cells", 7412500, "MIST1", "MIST1_PlasmaCell.tar.gz"),
    ("myeloid_cells", 7412690, "MNDA", "MNDA_MyeloidCell.tar.gz"),
)

# Maps each cell type to the URLs ("data", "metadata") and the local file names
# ("data_name", "metadata_name") of its image archive and its metadata CSV.
URLS = {
    cell_type: {
        "data": _ZENODO_FILE_URL.format(record=record, name=archive),
        "metadata": _ZENODO_FILE_URL.format(record=record, name=f"{marker}_fileinfo.csv"),
        "data_name": archive,
        "metadata_name": f"{marker}_fileinfo.csv",
    }
    for cell_type, record, marker, archive in _SEGPATH_RECORDS
}
def get_segpath_data( path: Union[os.PathLike, str], cell_types: Union[List[str], str, NoneType] = None, download: bool = False) -> None:
def get_segpath_data(
    path: Union[os.PathLike, str],
    cell_types: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> None:
    """Download and extract the SegPath data for the selected cell types.

    The target folder is always created. Without `download` nothing else happens.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        cell_types: The cell types to download. By default all cell types are downloaded.
        download: Whether to download the data if it is not present.
    """
    os.makedirs(path, exist_ok=True)
    if download:
        for cell_type in _to_cell_types(cell_types):
            entry = URLS[cell_type]
            archive_name, csv_name = entry["data_name"], entry["metadata_name"]

            # The metadata CSV is requested for every selected cell type.
            util.download_source(os.path.join(path, csv_name), entry["metadata"], download, checksum=None)

            # The archive is only fetched and unpacked while its extracted folder
            # (archive name without the two extensions) is missing.
            stem, _ = os.path.splitext(archive_name)
            folder_name, _ = os.path.splitext(stem)
            if os.path.exists(os.path.join(path, folder_name)):
                continue

            archive_path = os.path.join(path, archive_name)
            util.download_source(archive_path, entry["data"], download, checksum=None)
            _extract_data(archive_path)

Download the SegPath data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • cell_types: The cell types to download. By default all cell types are downloaded.
  • download: Whether to download the data if it is not present.
def get_segpath_paths( path: Union[os.PathLike, str], cell_types: Union[List[str], str, NoneType] = None, split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_segpath_paths(
    path: Union[os.PathLike, str],
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get the filepaths of the SegPath images and masks.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` is not one of the valid choices.
        RuntimeError: If no image and mask pair is found for the requested settings.
    """
    valid_splits = ("train", "val", "test")
    if not (split is None or split in valid_splits):
        raise ValueError(f"'{split}' is not a valid split choice.")

    selected = _to_cell_types(cell_types)
    get_segpath_data(path, selected, download)

    all_images, all_labels = [], []
    for cell_type in selected:
        # Use the metadata CSV of the cell type if it is available, otherwise list the files on disk.
        has_metadata = os.path.exists(os.path.join(path, URLS[cell_type]["metadata_name"]))
        collect = _get_paths_from_metadata if has_metadata else _get_paths_from_files
        images, labels = collect(path, cell_type, split)
        all_images += images
        all_labels += labels

    if len(all_images) == 0:
        raise RuntimeError("Could not find any SegPath images and masks for the requested settings.")

    return all_images, all_labels

Get paths to the SegPath data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • cell_types: The cell types to use. By default all cell types are used.
  • split: The split to use. Either "train", "val", "test" or None for all images.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_segpath_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], cell_types: Union[List[str], str, NoneType] = None, split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False, label_dtype: torch.dtype = torch.int64, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_segpath_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the SegPath dataset for semantic segmentation in H&E stained histopathology images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        resize_inputs: Whether to resize the input images.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, mask_paths = get_segpath_paths(path, cell_types, split, download)

    if resize_inputs:
        # The helper returns the updated keyword arguments and the patch shape to use afterwards.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # The images and masks are passed as lists of files without internal keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths, raw_key=None,
        label_paths=mask_paths, label_key=None,
        patch_shape=patch_shape, label_dtype=label_dtype,
        is_seg_dataset=False, **kwargs
    )
    return dataset

Get the SegPath dataset for semantic segmentation in H&E stained histopathology images.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • cell_types: The cell types to use. By default all cell types are used.
  • split: The split to use. Either "train", "val", "test" or None for all images.
  • download: Whether to download the data if it is not present.
  • label_dtype: The datatype of labels.
  • resize_inputs: Whether to resize the input images.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_segpath_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], batch_size: int, cell_types: Union[List[str], str, NoneType] = None, split: Optional[Literal['train', 'val', 'test']] = None, download: bool = False, label_dtype: torch.dtype = torch.int64, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_segpath_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    cell_types: Optional[Union[str, List[str]]] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the SegPath dataloader for semantic segmentation in H&E stained histopathology images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        cell_types: The cell types to use. By default all cell types are used.
        split: The split to use. Either "train", "val", "test" or None for all images.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        resize_inputs: Whether to resize the input images.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments of the dataset from the ones that go to the DataLoader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    segpath_dataset = get_segpath_dataset(
        path=path,
        patch_shape=patch_shape,
        cell_types=cell_types,
        split=split,
        download=download,
        label_dtype=label_dtype,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(segpath_dataset, batch_size, **dataloader_kwargs)
    return loader

Get the SegPath dataloader.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • cell_types: The cell types to use. By default all cell types are used.
  • split: The split to use. Either "train", "val", "test" or None for all images.
  • download: Whether to download the data if it is not present.
  • label_dtype: The datatype of labels.
  • resize_inputs: Whether to resize the input images.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.