torch_em.data.datasets.histopathology.cpm

The CPM dataset contains annotations for nucleus segmentation in H&E stained histopathology images of different tissue types.

NOTE: You must download the files manually.

The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK.
The restructuring details are mentioned by the authors here: https://github.com/vqdang/hover_net/issues/5#issuecomment-508431862.

This dataset is from the publication https://doi.org/10.3389/fbioe.2019.00053. Please cite it if you use this dataset for your research.

View Source

  1"""The CPM dataset contains annotations for nucleus segmentation in
  2H&E stained histopathology images for different tissue images.
  3
  4NOTE: You must download the files manually.
  51. The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK.
  62. The restructuring details are mentioned by the authors here: https://github.com/vqdang/hover_net/issues/5#issuecomment-508431862.
  7
  8This dataset is from the publication https://doi.org/10.3389/fbioe.2019.00053.
  9Please cite it if you use this dataset for your research.
 10"""  # noqa
 11
 12import os
 13from glob import glob
 14from tqdm import tqdm
 15from natsort import natsorted
 16from typing import Union, Literal, Optional, Tuple, List
 17
 18import json
 19import pandas as pd
 20from scipy.io import loadmat
 21import imageio.v3 as imageio
 22from sklearn.model_selection import train_test_split
 23
 24from torch.utils.data import Dataset, DataLoader
 25
 26import torch_em
 27
 28from .. import util
 29
 30
 31URL = {
 32    "cpm15": "https://drive.google.com/drive/folders/11ko-GcDsPpA9GBHuCtl_jNzWQl6qY_-I?usp=drive_link",
 33    "cpm17": "https://drive.google.com/drive/folders/1sJ4nmkif6j4s2FOGj8j6i_Ye7z9w0TfA?usp=drive_link",
 34}
 35
 36
 37def _create_split_csv(path, split):
 38    csv_path = os.path.join(path, 'cpm15_split.csv')
 39    if os.path.exists(csv_path):
 40        df = pd.read_csv(csv_path)
 41        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))  # ensures all items from column in list.
 42        split_list = df.iloc[0][split]
 43
 44    else:
 45        print(f"Creating a new split file at '{csv_path}'.")
 46        image_names = [
 47            os.path.basename(image).split(".")[0] for image in glob(os.path.join(path, 'cpm15', 'Images', '*.png'))
 48        ]
 49
 50        train_ids, test_ids = train_test_split(image_names, test_size=0.25)  # 20% split for test.
 51        train_ids, val_ids = train_test_split(train_ids, test_size=0.20)  # 15% split for val.
 52        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}
 53
 54        df = pd.DataFrame.from_dict([split_ids])
 55        df.to_csv(csv_path)
 56        split_list = split_ids[split]
 57
 58    return split_list
 59
 60
 61def get_cpm_data(path: Union[os.PathLike, str], data_choice: Literal['cpm15', 'cpm17'], download: bool = False) -> str:
 62    """Obtain the CPM data.
 63
 64    NOTE: The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK.
 65    Visit the drive link -> select the dataset(s) of choice -> right click and 'Download' the folder as zipfile.
 66
 67    Args:
 68        path: Filepath to a folder where the data is downloaded for further processing.
 69        data_choice: The choice of data.
 70        download: Whether to download the data if it is not present.
 71
 72    Returns:
 73        Filepath where the data has been manually downloaded and later preprocessed.
 74    """
 75    if data_choice not in ['cpm15', 'cpm17']:
 76        raise ValueError(f"'{data_choice}' is not a valid data choice.")
 77
 78    data_dir = os.path.join(path, data_choice)
 79    if os.path.exists(data_dir):
 80        return data_dir
 81
 82    if download:
 83        raise NotImplementedError(
 84            "The dataset cannot be automatically downloaded. "
 85            "Please see 'get_cpm_data' in 'torch_em/data/datasets/histopathology/cpm.py' for details."
 86        )
 87
 88    os.makedirs(path, exist_ok=True)
 89    zip_path = glob(os.path.join(path, f"{data_choice}*.zip"))
 90    if len(zip_path) == 0:
 91        raise AssertionError(
 92            f"zip file for '{data_choice}' dataset is not found. Please download it from '{URL[data_choice]}'."
 93        )
 94
 95    zip_path = zip_path[0]
 96    util.unzip(zip_path=zip_path, dst=path, remove=False)
 97
 98    return data_dir
 99
100
def get_cpm_paths(
    path: Union[os.PathLike, str],
    data_choice: Literal['cpm15', 'cpm17'],
    split: Literal["train", "val", "test"],
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CPM data.

    The labels are provided as '.mat' files. On the first call, the 'inst_map' entry of each
    label file is written to a tif file ('<name>_instance_labels.tif') next to the '.mat' file.
    Later calls reuse these tif files.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
        split: The choice of data split. For 'cpm15' the split is read from (or created in) a csv file
            in `path`. For 'cpm17' only 'train' and 'test' exist, matching the folders of the dataset.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        AssertionError: If 'val' is requested for 'cpm17', if no images are found,
            or if the number of images and labels differs.
    """
    data_dir = get_cpm_data(path, data_choice, download)

    if data_choice == "cpm15":
        raw_dir, label_dir = "Images", "Labels"
        split_list = _create_split_csv(path, split)

        raw_paths = [os.path.join(data_dir, raw_dir, f"{fname}.png") for fname in split_list]
        label_mat_paths = [os.path.join(data_dir, label_dir, f"{fname}.mat") for fname in split_list]

    else:
        assert split in ['train', 'test'], 'Explicit val split does not exist for cpm17.'
        raw_dir, label_dir = f"{split}/Images", f"{split}/Labels"
        # `natsorted` already returns a list, so no further copy is needed.
        raw_paths = natsorted(glob(os.path.join(data_dir, raw_dir, "*.png")))
        label_mat_paths = natsorted(glob(os.path.join(data_dir, label_dir, "*.mat")))

    label_paths = []
    for mpath in tqdm(label_mat_paths, desc="Preprocessing labels"):
        # Exchange only the file extension. A plain `str.replace` would also rewrite any
        # '.mat' that occurs in a folder name and point to a location that does not exist.
        label_path = os.path.splitext(mpath)[0] + "_instance_labels.tif"
        label_paths.append(label_path)
        if os.path.exists(label_path):
            continue  # The labels have already been converted in a previous call.

        label = loadmat(mpath)["inst_map"]
        imageio.imwrite(label_path, label, compression="zlib")

    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0

    return raw_paths, label_paths
147
148
def get_cpm_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Literal['cpm15', 'cpm17']] = None,
    split: Literal["train", "val", "test"] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CPM dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
            The value is forwarded to `get_cpm_data`, which rejects anything else (including None).
        split: The choice of data split, forwarded to `get_cpm_paths`.
            A 'val' split is only available for 'cpm15'.
        resize_inputs: Whether to resize the inputs. If True, `kwargs` and `patch_shape`
            are updated via `util.update_kwargs_for_resize_trafo` before the dataset is created.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, instance_paths = get_cpm_paths(path, data_choice, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=instance_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )
    return dataset
190
191
def get_cpm_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Literal['cpm15', 'cpm17']] = None,
    split: Literal["train", "val", "test"] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CPM dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
        split: The choice of data split. A 'val' split is only available for 'cpm15'.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones that belong to the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cpm_dataset(
        path=path,
        patch_shape=patch_shape,
        data_choice=data_choice,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader

URL = {'cpm15': 'https://drive.google.com/drive/folders/11ko-GcDsPpA9GBHuCtl_jNzWQl6qY_-I?usp=drive_link', 'cpm17': 'https://drive.google.com/drive/folders/1sJ4nmkif6j4s2FOGj8j6i_Ye7z9w0TfA?usp=drive_link'}

def get_cpm_data( path: Union[os.PathLike, str], data_choice: Literal['cpm15', 'cpm17'], download: bool = False) -> str: View Source

def get_cpm_data(path: Union[os.PathLike, str], data_choice: Literal['cpm15', 'cpm17'], download: bool = False) -> str:
    """Locate the CPM data on disk, unpacking the manually downloaded zipfile if required.

    NOTE: The data cannot be downloaded automatically. It is located at
    https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK.
    Visit the drive link -> select the dataset(s) of choice -> right click and 'Download' the folder as zipfile.
    The zipfile is expected in `path` with a name that matches '<data_choice>*.zip'.

    Args:
        path: Filepath to the folder that holds the downloaded zipfile and the extracted data.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
        download: Whether to download the data if it is not present.
            Automatic download is not supported, so True raises an error if the data folder is missing.

    Returns:
        Filepath to the data folder of the chosen dataset, i.e. `path` joined with `data_choice`.

    Raises:
        ValueError: If `data_choice` is neither 'cpm15' nor 'cpm17'.
        NotImplementedError: If the data folder is missing and `download` is True.
        AssertionError: If the data folder is missing and no matching zipfile is found in `path`.
    """
    if data_choice not in ('cpm15', 'cpm17'):
        raise ValueError(f"'{data_choice}' is not a valid data choice.")

    extracted_dir = os.path.join(path, data_choice)

    # Everything below is only needed as long as the data has not been extracted yet.
    if not os.path.exists(extracted_dir):
        if download:
            raise NotImplementedError(
                "The dataset cannot be automatically downloaded. "
                "Please see 'get_cpm_data' in 'torch_em/data/datasets/histopathology/cpm.py' for details."
            )

        os.makedirs(path, exist_ok=True)
        archives = glob(os.path.join(path, f"{data_choice}*.zip"))
        if not archives:
            raise AssertionError(
                f"zip file for '{data_choice}' dataset is not found. Please download it from '{URL[data_choice]}'."
            )

        # The first matching archive is unpacked into `path`; the zipfile itself is kept.
        util.unzip(zip_path=archives[0], dst=path, remove=False)

    return extracted_dir

Obtain the CPM data.

NOTE: The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK. Visit the drive link -> select the dataset(s) of choice -> right click and 'Download' the folder as zipfile.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
data_choice: The choice of data.
download: Whether to download the data if it is not present.

Returns:

Filepath where the data has been manually downloaded and later preprocessed.

def get_cpm_paths( path: Union[os.PathLike, str], data_choice: Literal['cpm15', 'cpm17'], split: Literal['train', 'val', 'test'], download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_cpm_paths(
    path: Union[os.PathLike, str],
    data_choice: Literal['cpm15', 'cpm17'],
    split: Literal["train", "val", "test"],
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CPM data.

    The labels are provided as '.mat' files. On the first call, the 'inst_map' entry of each
    label file is written to a tif file ('<name>_instance_labels.tif') next to the '.mat' file.
    Later calls reuse these tif files.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
        split: The choice of data split. For 'cpm15' the split is read from (or created in) a csv file
            in `path`. For 'cpm17' only 'train' and 'test' exist, matching the folders of the dataset.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        AssertionError: If 'val' is requested for 'cpm17', if no images are found,
            or if the number of images and labels differs.
    """
    data_dir = get_cpm_data(path, data_choice, download)

    if data_choice == "cpm15":
        raw_dir, label_dir = "Images", "Labels"
        split_list = _create_split_csv(path, split)

        raw_paths = [os.path.join(data_dir, raw_dir, f"{fname}.png") for fname in split_list]
        label_mat_paths = [os.path.join(data_dir, label_dir, f"{fname}.mat") for fname in split_list]

    else:
        assert split in ['train', 'test'], 'Explicit val split does not exist for cpm17.'
        raw_dir, label_dir = f"{split}/Images", f"{split}/Labels"
        # `natsorted` already returns a list, so no further copy is needed.
        raw_paths = natsorted(glob(os.path.join(data_dir, raw_dir, "*.png")))
        label_mat_paths = natsorted(glob(os.path.join(data_dir, label_dir, "*.mat")))

    label_paths = []
    for mpath in tqdm(label_mat_paths, desc="Preprocessing labels"):
        # Exchange only the file extension. A plain `str.replace` would also rewrite any
        # '.mat' that occurs in a folder name and point to a location that does not exist.
        label_path = os.path.splitext(mpath)[0] + "_instance_labels.tif"
        label_paths.append(label_path)
        if os.path.exists(label_path):
            continue  # The labels have already been converted in a previous call.

        label = loadmat(mpath)["inst_map"]
        imageio.imwrite(label_path, label, compression="zlib")

    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0

    return raw_paths, label_paths

Get paths to the CPM data.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
data_choice: The choice of data.
split: The choice of data split.
download: Whether to download the data if it is not present.

Returns:

List of filepaths to the image data. List of filepaths to the label data.

def get_cpm_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], data_choice: Optional[Literal['cpm15', 'cpm17']] = None, split: Literal['train', 'val', 'test'] = None, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_cpm_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Literal['cpm15', 'cpm17']] = None,
    split: Literal["train", "val", "test"] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CPM dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
            The value is forwarded to `get_cpm_data`, which rejects anything else (including None).
        split: The choice of data split, forwarded to `get_cpm_paths`.
            A 'val' split is only available for 'cpm15'.
        resize_inputs: Whether to resize the inputs. If True, `kwargs` and `patch_shape`
            are updated via `util.update_kwargs_for_resize_trafo` before the dataset is created.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, instance_paths = get_cpm_paths(path, data_choice, split, download)

    if resize_inputs:
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=instance_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )
    return dataset

Get the CPM dataset for nucleus segmentation.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
patch_shape: The patch shape to use for training.
data_choice: The choice of data.
split: The choice of data split.
resize_inputs: Whether to resize the inputs.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_cpm_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], data_choice: Optional[Literal['cpm15', 'cpm17']] = None, split: Literal['train', 'val', 'test'] = None, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_cpm_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Literal['cpm15', 'cpm17']] = None,
    split: Literal["train", "val", "test"] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CPM dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
        split: The choice of data split. A 'val' split is only available for 'cpm15'.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the ones that belong to the DataLoader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cpm_dataset(
        path=path,
        patch_shape=patch_shape,
        data_choice=data_choice,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader

Get the CPM dataloader for nucleus segmentation.

Arguments:

path: Filepath to a folder where the data is downloaded for further processing.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
data_choice: The choice of data.
split: The choice of data split.
resize_inputs: Whether to resize the inputs.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader