torch_em.data.datasets.histopathology.cpm
The CPM dataset contains annotations for nucleus segmentation in H&E stained histopathology images from different tissues.
NOTE: You must download the files manually.
- The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK.
- The restructuring details are mentioned by the authors here: https://github.com/vqdang/hover_net/issues/5#issuecomment-508431862.
This dataset is from the publication https://doi.org/10.3389/fbioe.2019.00053. Please cite it if you use this dataset for your research.
"""The CPM dataset contains annotations for nucleus segmentation in
H&E stained histopathology images for different tissue images.

NOTE: You must download the files manually.
1. The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK.
2. The restructuring details are mentioned by the authors here: https://github.com/vqdang/hover_net/issues/5#issuecomment-508431862.

This dataset is from the publication https://doi.org/10.3389/fbioe.2019.00053.
Please cite it if you use this dataset for your research.
"""  # noqa

import os
from glob import glob
from tqdm import tqdm
from natsort import natsorted
from typing import Union, Literal, Optional, Tuple, List

import json
import pandas as pd
from scipy.io import loadmat
import imageio.v3 as imageio
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = {
    "cpm15": "https://drive.google.com/drive/folders/11ko-GcDsPpA9GBHuCtl_jNzWQl6qY_-I?usp=drive_link",
    "cpm17": "https://drive.google.com/drive/folders/1sJ4nmkif6j4s2FOGj8j6i_Ye7z9w0TfA?usp=drive_link",
}


def _create_split_csv(path, split):
    """Return the cpm15 image names that belong to `split`.

    The partition is stored in 'cpm15_split.csv' inside `path` (one column per split name,
    a single row holding the list of image names). If the file exists it is reused,
    otherwise a new random partition is drawn and written to that file.

    Args:
        path: Folder that contains the 'cpm15' data folder and the split file.
        split: The name of the split column to return ('train', 'val' or 'test').

    Returns:
        List of image names (file names without extension) in the requested split.
    """
    csv_path = os.path.join(path, 'cpm15_split.csv')
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        # Each cell holds the string representation of a python list (single quotes),
        # so the quotes are swapped to make it parseable as JSON.
        split_list = json.loads(df[split].iloc[0].replace("'", '"'))

    else:
        print(f"Creating a new split file at '{csv_path}'.")
        # 'splitext' only drops the extension, so names that contain dots stay intact.
        image_names = [
            os.path.splitext(os.path.basename(image))[0]
            for image in glob(os.path.join(path, 'cpm15', 'Images', '*.png'))
        ]

        train_ids, test_ids = train_test_split(image_names, test_size=0.25)  # 25% of all images for test.
        train_ids, val_ids = train_test_split(train_ids, test_size=0.20)  # 20% of the rest (15% overall) for val.
        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}

        df = pd.DataFrame.from_dict([split_ids])
        df.to_csv(csv_path)
        split_list = split_ids[split]

    return split_list


def get_cpm_data(path: Union[os.PathLike, str], data_choice: Literal['cpm15', 'cpm17'], download: bool = False) -> str:
    """Obtain the CPM data.

    NOTE: The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK.
    Visit the drive link -> select the dataset(s) of choice -> right click and 'Download' the folder as zipfile.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        data_choice: The choice of data.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data has been manually downloaded and later preprocessed.

    Raises:
        ValueError: If `data_choice` is neither 'cpm15' nor 'cpm17'.
        NotImplementedError: If the data is not present and `download` is set.
        AssertionError: If no zip file for the chosen dataset is found in `path`.
        RuntimeError: If extracting the zip file did not create the expected data folder.
    """
    if data_choice not in ['cpm15', 'cpm17']:
        raise ValueError(f"'{data_choice}' is not a valid data choice.")

    data_dir = os.path.join(path, data_choice)
    if os.path.exists(data_dir):
        return data_dir

    if download:
        raise NotImplementedError(
            "The dataset cannot be automatically downloaded. "
            "Please see 'get_cpm_data' in 'torch_em/data/datasets/histopathology/cpm.py' for details."
        )

    os.makedirs(path, exist_ok=True)
    # Sorted, so that the same archive is picked on every run if several match.
    zip_paths = sorted(glob(os.path.join(path, f"{data_choice}*.zip")))
    if not zip_paths:
        raise AssertionError(
            f"zip file for '{data_choice}' dataset is not found. Please download it from '{URL[data_choice]}'."
        )

    util.unzip(zip_path=zip_paths[0], dst=path, remove=False)

    # Fail here with a clear message instead of returning a folder that does not exist.
    if not os.path.exists(data_dir):
        raise RuntimeError(
            f"Extracting '{zip_paths[0]}' did not create the expected folder '{data_dir}'."
        )

    return data_dir


def get_cpm_paths(
    path: Union[os.PathLike, str],
    data_choice: Literal['cpm15', 'cpm17'],
    split: Literal["train", "val", "test"],
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CPM data.

    The labels are shipped as '.mat' files. On first use the 'inst_map' entry of each
    file is written next to it as '<name>_instance_labels.tif'; existing tif files are reused.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        data_choice: The choice of data.
        split: The choice of data split. 'val' is only available for 'cpm15'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        ValueError: If `split` is not a valid split name for 'cpm15'.
        AssertionError: If `split` is not 'train' or 'test' for 'cpm17',
            or if no images are found or the number of images and labels differ.
    """
    data_dir = get_cpm_data(path, data_choice, download)

    if data_choice == "cpm15":
        # Validate before '_create_split_csv' is called, which may write the split file.
        if split not in ("train", "val", "test"):
            raise ValueError(f"'{split}' is not a valid split choice.")

        raw_dir, label_dir = "Images", "Labels"
        split_list = _create_split_csv(path, split)

        raw_paths = [os.path.join(data_dir, raw_dir, f"{fname}.png") for fname in split_list]
        label_mat_paths = [os.path.join(data_dir, label_dir, f"{fname}.mat") for fname in split_list]

    else:
        # Raised explicitly (instead of 'assert') so the check also runs under 'python -O'.
        if split not in ('train', 'test'):
            raise AssertionError('Explicit val split does not exist for cpm17.')

        raw_dir, label_dir = f"{split}/Images", f"{split}/Labels"
        raw_paths = natsorted(glob(os.path.join(data_dir, raw_dir, "*.png")))
        label_mat_paths = natsorted(glob(os.path.join(data_dir, label_dir, "*.mat")))

    label_paths = []
    for mpath in tqdm(label_mat_paths, desc="Preprocessing labels"):
        # Only swap the file extension; a plain string replace would also rewrite
        # any '.mat' that happens to occur in a folder name.
        label_path = os.path.splitext(mpath)[0] + "_instance_labels.tif"
        label_paths.append(label_path)
        if os.path.exists(label_path):
            continue

        label = loadmat(mpath)["inst_map"]
        imageio.imwrite(label_path, label, compression="zlib")

    if not raw_paths or len(raw_paths) != len(label_paths):
        raise AssertionError(
            f"Found {len(raw_paths)} images and {len(label_paths)} labels in '{data_dir}'."
        )

    return raw_paths, label_paths


def get_cpm_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Literal['cpm15', 'cpm17']] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CPM dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of data.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_cpm_paths(path, data_choice, split, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )


def get_cpm_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Literal['cpm15', 'cpm17']] = None,
    split: Optional[Literal["train", "val", "test"]] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CPM dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of data.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cpm_dataset(path, patch_shape, data_choice, split, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_cpm_data(path: Union[os.PathLike, str], data_choice: Literal['cpm15', 'cpm17'], download: bool = False) -> str:
    """Obtain the CPM data.

    NOTE: The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK.
    Visit the drive link -> select the dataset(s) of choice -> right click and 'Download' the folder as zipfile.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        data_choice: The choice of data.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data has been manually downloaded and later preprocessed.

    Raises:
        ValueError: If `data_choice` is neither 'cpm15' nor 'cpm17'.
        NotImplementedError: If the data is not present and `download` is set.
        AssertionError: If no zip file for the chosen dataset is found in `path`.
        RuntimeError: If extracting the zip file did not create the expected data folder.
    """
    if data_choice not in ['cpm15', 'cpm17']:
        raise ValueError(f"'{data_choice}' is not a valid data choice.")

    data_dir = os.path.join(path, data_choice)
    if os.path.exists(data_dir):
        return data_dir

    if download:
        raise NotImplementedError(
            "The dataset cannot be automatically downloaded. "
            "Please see 'get_cpm_data' in 'torch_em/data/datasets/histopathology/cpm.py' for details."
        )

    os.makedirs(path, exist_ok=True)
    # Sorted, so that the same archive is picked on every run if several match.
    zip_paths = sorted(glob(os.path.join(path, f"{data_choice}*.zip")))
    if not zip_paths:
        raise AssertionError(
            f"zip file for '{data_choice}' dataset is not found. Please download it from '{URL[data_choice]}'."
        )

    util.unzip(zip_path=zip_paths[0], dst=path, remove=False)

    # Fail here with a clear message instead of returning a folder that does not exist.
    if not os.path.exists(data_dir):
        raise RuntimeError(
            f"Extracting '{zip_paths[0]}' did not create the expected folder '{data_dir}'."
        )

    return data_dir
Obtain the CPM data.
NOTE: The dataset is located at https://drive.google.com/drive/folders/1l55cv3DuY-f7-JotDN7N5nbNnjbLWchK. Visit the drive link -> select the dataset(s) of choice -> right click and 'Download' the folder as zipfile.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- data_choice: The choice of data.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data has been manually downloaded and later preprocessed.
def get_cpm_paths(
    path: Union[os.PathLike, str],
    data_choice: Literal['cpm15', 'cpm17'],
    split: Literal["train", "val", "test"],
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CPM data.

    The labels are shipped as '.mat' files. On first use the 'inst_map' entry of each
    file is written next to it as '<name>_instance_labels.tif'; existing tif files are reused.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        data_choice: The choice of data.
        split: The choice of data split. 'val' is only available for 'cpm15'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        ValueError: If `split` is not a valid split name for 'cpm15'.
        AssertionError: If `split` is not 'train' or 'test' for 'cpm17',
            or if no images are found or the number of images and labels differ.
    """
    data_dir = get_cpm_data(path, data_choice, download)

    if data_choice == "cpm15":
        # Validate before '_create_split_csv' is called, which may write the split file.
        if split not in ("train", "val", "test"):
            raise ValueError(f"'{split}' is not a valid split choice.")

        raw_dir, label_dir = "Images", "Labels"
        split_list = _create_split_csv(path, split)

        raw_paths = [os.path.join(data_dir, raw_dir, f"{fname}.png") for fname in split_list]
        label_mat_paths = [os.path.join(data_dir, label_dir, f"{fname}.mat") for fname in split_list]

    else:
        # Raised explicitly (instead of 'assert') so the check also runs under 'python -O'.
        if split not in ('train', 'test'):
            raise AssertionError('Explicit val split does not exist for cpm17.')

        raw_dir, label_dir = f"{split}/Images", f"{split}/Labels"
        raw_paths = natsorted(glob(os.path.join(data_dir, raw_dir, "*.png")))
        label_mat_paths = natsorted(glob(os.path.join(data_dir, label_dir, "*.mat")))

    label_paths = []
    for mpath in tqdm(label_mat_paths, desc="Preprocessing labels"):
        # Only swap the file extension; a plain string replace would also rewrite
        # any '.mat' that happens to occur in a folder name.
        label_path = os.path.splitext(mpath)[0] + "_instance_labels.tif"
        label_paths.append(label_path)
        if os.path.exists(label_path):
            continue

        label = loadmat(mpath)["inst_map"]
        imageio.imwrite(label_path, label, compression="zlib")

    if not raw_paths or len(raw_paths) != len(label_paths):
        raise AssertionError(
            f"Found {len(raw_paths)} images and {len(label_paths)} labels in '{data_dir}'."
        )

    return raw_paths, label_paths
Get paths to the CPM data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- data_choice: The choice of data.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the image data. List of filepaths to the label data.
def get_cpm_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    data_choice: Optional[Literal['cpm15', 'cpm17']] = None,
    split: Literal["train", "val", "test"] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the CPM nucleus segmentation dataset.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, mask_paths = get_cpm_paths(path, data_choice, split, download)

    if resize_inputs:
        # The images are RGB, which is passed on to the resize transformation.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        with_channels=True,
        ndim=2,
        **kwargs
    )
    return dataset
Get the CPM dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- data_choice: The choice of data.
- split: The choice of data split.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_cpm_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    data_choice: Optional[Literal['cpm15', 'cpm17']] = None,
    split: Literal["train", "val", "test"] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the CPM dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        data_choice: The choice of data, either 'cpm15' or 'cpm17'.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    cpm_dataset = get_cpm_dataset(
        path, patch_shape, data_choice, split, resize_inputs, download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(cpm_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the CPM dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- data_choice: The choice of data.
- split: The choice of data split.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader