torch_em.data.datasets.light_microscopy.segpc

The SegPC dataset contains annotations for cytoplasm and nucleus segmentation in microscopy images of multiple myeloma plasma cells.

This dataset is located at https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images. The dataset is from the publication https://doi.org/10.1016/j.media.2022.102677. Please cite it if you use this dataset for your research.

View Source

  1"""The SegPC dataset contains annotations for cytoplasm and nucleus segmentation in microscopy images
  2of multiple myeloma plasma cells.
  3
  4This dataset is located at https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images.  # noqa
  5The dataset is from the publication https://doi.org/10.1016/j.media.2022.102677.
  6Please cite it if you use this dataset for your research.
  7"""
  8
  9import os
 10from glob import glob
 11from tqdm import tqdm
 12from pathlib import Path
 13from natsort import natsorted
 14from typing import Union, Literal, Tuple, List
 15
 16import numpy as np
 17import imageio.v3 as imageio
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21import torch_em
 22
 23from .. import util
 24
 25
 26def get_segpc_data(path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False) -> str:
 27    """Instruction to download SegPC data.
 28
 29    NOTE: Please download the dataset from https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images.  # noqa
 30
 31    Args:
 32        path: Filepath to a folder where the data should be manually downloaded for further processing.
 33        split: The data split to use. Either 'train' or 'validation'.
 34        download: Whether to download the data if it is not present.
 35
 36    Returns:
 37        The filepath to the data.
 38    """
 39    data_dir = os.path.join(path, "TCIA_SegPC_dataset", split)
 40    if os.path.exists(data_dir):
 41        return data_dir
 42
 43    if download:
 44        raise NotImplementedError(
 45            "The dataset cannot be automatically downloaded. ",
 46            "Please see 'get_segpc_data' in 'torch_em/data/datasets/light_microscopy/segpc.py for details."
 47        )
 48
 49    zip_path = os.path.join(path, "TCIA_SegPC_dataset.zip")
 50    os.path.exists(zip_path), f"The manually downloaded zip file should be placed at '{path}'."
 51    util.unzip(zip_path=zip_path, dst=path, remove=False)
 52
 53    # Unzip the split-wise zip files.
 54    if split not in ['train', 'validation']:
 55        if split == "test":
 56            raise ValueError("The 'test' split does not have labels.")
 57        raise ValueError(f"'{split}' is not a valid split.")
 58
 59    util.unzip(zip_path=os.path.join(Path(data_dir).parent, f"{split}.zip"), dst=Path(data_dir).parent)
 60
 61    return data_dir
 62
 63
 64def get_segpc_paths(
 65    path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False
 66) -> List[str]:
 67    """Get paths to the SegPC data.
 68
 69    Args:
 70        path: Filepath to a folder where the data is stored.
 71        split: The data split to use. Either 'train' or 'validation'.
 72        download: Whether to download the data if it is not present.
 73
 74    Returns:
 75        List of filepaths for the input data.
 76    """
 77    data_dir = get_segpc_data(path, split, download)
 78
 79    preprocessed_dir = os.path.join(data_dir, "preprocessed")
 80    os.makedirs(preprocessed_dir, exist_ok=True)
 81
 82    volume_paths = []
 83    raw_paths = natsorted(glob(os.path.join(data_dir, "x", "*.bmp")))
 84    for rpath in tqdm(raw_paths, desc=f"Preprocessing '{split}' inputs"):
 85        volume_path = os.path.join(preprocessed_dir, Path(os.path.basename(rpath)).with_suffix(".h5"))
 86        volume_paths.append(volume_path)
 87        if os.path.exists(volume_path):
 88            continue
 89
 90        image = imageio.imread(rpath)
 91
 92        label_paths = glob(rpath.replace("x", "y").replace(".bmp", "_*.bmp"))
 93
 94        nuclei = np.zeros(image.shape[:2], dtype="uint32")
 95        cells = np.zeros(image.shape[:2], dtype="uint32")
 96        for i, lpath in enumerate(label_paths, start=1):
 97            label = imageio.imread(lpath)
 98
 99            if label.ndim == 3:
100                label = label[..., 0]
101
102            nuclei[label == 40] = i
103            cells[label > 0] = i
104
105        import h5py
106        with h5py.File(volume_path, "w") as f:
107            f.create_dataset("raw", data=image.transpose(2, 0, 1), compression="gzip")
108            f.create_dataset("labels/nuclei", data=nuclei, compression="gzip")
109            f.create_dataset("labels/cells", data=cells, compression="gzip")
110
111    return volume_paths
112
113
def get_segpc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'validation'],
    label_choice: Literal['nuclei', 'cells'] = "cells",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the SegPC dataset for plasma cell (and nuclei) segmentation.

    Args:
        path: Filepath to a folder where the data is stored.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'validation'.
        label_choice: The choice of labels. Either 'nuclei' or 'cells'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If `label_choice` is neither 'nuclei' nor 'cells'.
    """
    # Fail early with a clear message instead of a missing-key error once the data is read.
    if label_choice not in ("nuclei", "cells"):
        raise ValueError(f"'{label_choice}' is not a valid label choice. Choose either 'nuclei' or 'cells'.")

    volume_paths = get_segpc_paths(path, split, download)

    # Raw data and labels are stored in the same HDF5 files, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        with_channels=True,
        is_seg_dataset=True,
        ndim=2,
        **kwargs
    )
148
149
def get_segpc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'validation'],
    label_choice: Literal['nuclei', 'cells'] = "cells",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the SegPC dataloader for plasma cell (and nuclei) segmentation.

    Args:
        path: Filepath to a folder where the data is stored.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'validation'.
        label_choice: The choice of labels. Either 'nuclei' or 'cells'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments understood by the dataset from those meant for the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_segpc_dataset(path, patch_shape, split, label_choice, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

def get_segpc_data( path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False) -> str: View Source

def get_segpc_data(path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False) -> str:
    """Locate (and, if needed, unpack) the manually downloaded SegPC data.

    NOTE: The dataset cannot be downloaded automatically. Please download it from https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images  # noqa
    and place the archive 'TCIA_SegPC_dataset.zip' inside `path`.

    Args:
        path: Filepath to a folder where the data should be manually downloaded for further processing.
        split: The data split to use. Either 'train' or 'validation'.
        download: Whether to download the data if it is not present.
            Automatic download is not supported, so passing True raises an error
            whenever the split folder does not exist yet.

    Returns:
        The filepath to the data, i.e. '<path>/TCIA_SegPC_dataset/<split>'.

    Raises:
        NotImplementedError: If `download` is True and the data is not already extracted.
        ValueError: If `split` is neither 'train' nor 'validation'.
        FileNotFoundError: If 'TCIA_SegPC_dataset.zip' is not present in `path`.
    """
    data_dir = os.path.join(path, "TCIA_SegPC_dataset", split)
    if os.path.exists(data_dir):
        # Already extracted: nothing to do.
        return data_dir

    if download:
        # Adjacent string literals are concatenated into ONE message
        # (a comma here would pass two separate exception arguments).
        raise NotImplementedError(
            "The dataset cannot be automatically downloaded. "
            "Please see 'get_segpc_data' in 'torch_em/data/datasets/light_microscopy/segpc.py' for details."
        )

    # Validate the split before touching the archive so that an invalid request has no side effects.
    if split not in ('train', 'validation'):
        if split == "test":
            raise ValueError("The 'test' split does not have labels.")
        raise ValueError(f"'{split}' is not a valid split.")

    zip_path = os.path.join(path, "TCIA_SegPC_dataset.zip")
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"The manually downloaded zip file should be placed at '{path}'.")

    # The top-level archive is kept on disk (remove=False); it contains one zip file per split.
    util.unzip(zip_path=zip_path, dst=path, remove=False)

    # Unzip the split-wise zip file next to where the split folder is expected.
    split_root = Path(data_dir).parent
    util.unzip(zip_path=os.path.join(split_root, f"{split}.zip"), dst=split_root)

    return data_dir

Instructions for downloading the SegPC data.

NOTE: Please download the dataset from https://ieee-dataport.org/open-access/segpc-2021-segmentation-multiple-myeloma-plasma-cells-microscopic-images.

Arguments:

path: Filepath to a folder where the data should be manually downloaded for further processing.
split: The data split to use. Either 'train' or 'validation'.
download: Whether to download the data if it is not present.

Returns:

The filepath to the data.

def get_segpc_paths( path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False) -> List[str]: View Source

 65def get_segpc_paths(
 66    path: Union[os.PathLike, str], split: Literal['train', 'validation'], download: bool = False
 67) -> List[str]:
 68    """Get paths to the SegPC data.
 69
 70    Args:
 71        path: Filepath to a folder where the data is stored.
 72        split: The data split to use. Either 'train' or 'validation'.
 73        download: Whether to download the data if it is not present.
 74
 75    Returns:
 76        List of filepaths for the input data.
 77    """
 78    data_dir = get_segpc_data(path, split, download)
 79
 80    preprocessed_dir = os.path.join(data_dir, "preprocessed")
 81    os.makedirs(preprocessed_dir, exist_ok=True)
 82
 83    volume_paths = []
 84    raw_paths = natsorted(glob(os.path.join(data_dir, "x", "*.bmp")))
 85    for rpath in tqdm(raw_paths, desc=f"Preprocessing '{split}' inputs"):
 86        volume_path = os.path.join(preprocessed_dir, Path(os.path.basename(rpath)).with_suffix(".h5"))
 87        volume_paths.append(volume_path)
 88        if os.path.exists(volume_path):
 89            continue
 90
 91        image = imageio.imread(rpath)
 92
 93        label_paths = glob(rpath.replace("x", "y").replace(".bmp", "_*.bmp"))
 94
 95        nuclei = np.zeros(image.shape[:2], dtype="uint32")
 96        cells = np.zeros(image.shape[:2], dtype="uint32")
 97        for i, lpath in enumerate(label_paths, start=1):
 98            label = imageio.imread(lpath)
 99
100            if label.ndim == 3:
101                label = label[..., 0]
102
103            nuclei[label == 40] = i
104            cells[label > 0] = i
105
106        import h5py
107        with h5py.File(volume_path, "w") as f:
108            f.create_dataset("raw", data=image.transpose(2, 0, 1), compression="gzip")
109            f.create_dataset("labels/nuclei", data=nuclei, compression="gzip")
110            f.create_dataset("labels/cells", data=cells, compression="gzip")
111
112    return volume_paths

Get paths to the SegPC data.

Arguments:

path: Filepath to a folder where the data is stored.
split: The data split to use. Either 'train' or 'validation'.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the input data.

def get_segpc_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val'], label_choice: Literal['nuclei', 'cells'] = 'cells', download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_segpc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal['train', 'validation'],
    label_choice: Literal['nuclei', 'cells'] = "cells",
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the SegPC dataset for plasma cell (and nuclei) segmentation.

    Args:
        path: Filepath to a folder where the data is stored.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'validation'.
        label_choice: The choice of labels. Either 'nuclei' or 'cells'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.

    Raises:
        ValueError: If `label_choice` is neither 'nuclei' nor 'cells'.
    """
    # Fail early with a clear message instead of a missing-key error once the data is read.
    if label_choice not in ("nuclei", "cells"):
        raise ValueError(f"'{label_choice}' is not a valid label choice. Choose either 'nuclei' or 'cells'.")

    volume_paths = get_segpc_paths(path, split, download)

    # Raw data and labels are stored in the same HDF5 files, under different keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        with_channels=True,
        is_seg_dataset=True,
        ndim=2,
        **kwargs
    )

Get the SegPC dataset for plasma cell (and nuclei) segmentation.

Arguments:

path: Filepath to a folder where the data is stored.
patch_shape: The patch shape to use for training.
split: The data split to use. Either 'train' or 'validation'.
label_choice: The choice of labels.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_segpc_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val'], label_choice: Literal['nuclei', 'cells'] = 'cells', download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_segpc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal['train', 'validation'],
    label_choice: Literal['nuclei', 'cells'] = "cells",
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the SegPC dataloader for plasma cell (and nuclei) segmentation.

    Args:
        path: Filepath to a folder where the data is stored.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'validation'.
        label_choice: The choice of labels. Either 'nuclei' or 'cells'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments understood by the dataset from those meant for the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_segpc_dataset(path, patch_shape, split, label_choice, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the SegPC dataloader for plasma cell (and nuclei) segmentation.

Arguments:

path: Filepath to a folder where the data is stored.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The data split to use. Either 'train' or 'validation'.
label_choice: The choice of labels.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.