torch_em.data.datasets.histopathology.conic
The CONIC dataset contains annotations for nucleus segmentation in histopathology images in H&E stained colon tissue.
This dataset is from the publication https://doi.org/10.1016/j.media.2023.103047. Please cite it if you use this dataset for your research.
1"""The CONIC dataset contains annotations for nucleus segmentation 2in histopathology images in H&E stained colon tissue. 3 4This dataset is from the publication https://doi.org/10.1016/j.media.2023.103047. 5Please cite it if you use this dataset for your research. 6""" 7 8import os 9from glob import glob 10from tqdm import tqdm 11from typing import Tuple, Union, List, Literal 12 13import numpy as np 14import pandas as pd 15 16from torch.utils.data import Dataset, DataLoader 17 18import torch_em 19 20from torch_em.data.datasets import util 21from sklearn.model_selection import StratifiedShuffleSplit 22 23 24URL = "https://drive.google.com/drive/folders/1il9jG7uA4-ebQ_lNmXbbF2eOK9uNwheb?usp=sharing" 25 26 27def _create_split_list(path, split): 28 # source: HoVerNet repo: https://github.com/vqdang/hover_net/blob/conic/generate_split.py. 29 # We take the FOLD_IDX = 0 as used for the baseline model 30 31 split_csv = os.path.join(path, "split.csv") 32 33 if os.path.exists(split_csv): 34 split_df = pd.read_csv(split_csv) 35 else: 36 SEED = 5 37 info = pd.read_csv(os.path.join(path, "patch_info.csv")) 38 file_names = np.squeeze(info.to_numpy()).tolist() 39 40 img_sources = [v.split('-')[0] for v in file_names] 41 img_sources = np.unique(img_sources) 42 43 cohort_sources = [v.split('_')[0] for v in img_sources] 44 _, cohort_sources = np.unique(cohort_sources, return_inverse=True) 45 46 num_trials = 10 47 splitter = StratifiedShuffleSplit(n_splits=num_trials, train_size=0.8, test_size=0.2, random_state=SEED) 48 49 splits = {} 50 split_generator = splitter.split(img_sources, cohort_sources) 51 for train_indices, valid_indices in split_generator: 52 train_cohorts = img_sources[train_indices] 53 valid_cohorts = img_sources[valid_indices] 54 55 assert np.intersect1d(train_cohorts, valid_cohorts).size == 0 56 57 train_names = [ 58 file_name for file_name in file_names for source in train_cohorts if source == file_name.split('-')[0] 59 ] 60 valid_names = [ 61 file_name for file_name in file_names for source in valid_cohorts if source == file_name.split('-')[0] 62 ] 63 64 train_names = np.unique(train_names) 65 valid_names = np.unique(valid_names) 66 print(f'Train: {len(train_names):04d} - Valid: {len(valid_names):04d}') 67 68 assert np.intersect1d(train_names, valid_names).size == 0 69 70 train_indices = [file_names.index(v) for v in train_names] 71 valid_indices = [file_names.index(v) for v in valid_names] 72 73 while len(train_indices) > len(valid_indices): 74 valid_indices.append(np.nan) 75 76 splits['train'] = train_indices 77 splits['test'] = valid_indices 78 break 79 80 split_df = pd.DataFrame(splits) 81 split_df.to_csv(split_csv, index=False) 82 83 split_list = [int(v) for v in split_df[split].dropna()] 84 return split_list 85 86 87def _extract_images(split, path): 88 89 split_list = _create_split_list(path, split) 90 91 images = np.load(os.path.join(path, "images.npy")) 92 labels = np.load(os.path.join(path, "labels.npy")) 93 94 instance_masks = [] 95 raw = [] 96 semantic_masks = [] 97 98 for idx, (image, label) in tqdm( 99 enumerate(zip(images, labels)), desc=f"Extracting '{split}' data", total=images.shape[0] 100 ): 101 if idx not in split_list: 102 continue 103 104 semantic_masks.append(label[:, :, 1]) 105 instance_masks.append(label[:, :, 0]) 106 raw.append(image) 107 108 raw = np.stack(raw).transpose(3, 0, 1, 2) # B, H, W, C --> C, B, H, W 109 instance_masks = np.stack(instance_masks) 110 semantic_masks = np.stack(semantic_masks) 111 112 import h5py 113 with h5py.File(os.path.join(path, 
f"{split}.h5"), "a") as f: 114 f.create_dataset("raw", data=raw, compression="gzip") 115 f.create_dataset("labels/instances", data=instance_masks, compression="gzip") 116 f.create_dataset("labels/semantic", data=semantic_masks, compression="gzip") 117 118 119def get_conic_data(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False) -> str: 120 """Download the CONIC dataset for nucleus segmentation. 121 122 Args: 123 path: Filepath to a folder where the downloaded data will be saved. 124 split: The choice of data split. 125 download: Whether to download the data if it is not present. 126 127 Returns: 128 Filepath where the data is download for further processing. 129 """ 130 if split not in ['train', 'test']: 131 raise ValueError(f"'{split}' is not a valid split.") 132 133 data_dir = os.path.join(path, "data") 134 if os.path.exists(data_dir) and glob(os.path.join(data_dir, "*.h5")): 135 return data_dir 136 137 os.makedirs(path, exist_ok=True) 138 139 # Download the files from google drive. 140 util.download_source_gdrive(path=data_dir, url=URL, download=download, download_type="folder", quiet=False) 141 142 # Extract and preprocess images for all splits 143 for _split in ['train', 'test']: 144 _extract_images(_split, data_dir) 145 146 return data_dir 147 148 149def get_conic_paths( 150 path: Union[os.PathLike], split: Literal["train", "test"], download: bool = False 151) -> List[str]: 152 """Get paths to the CONIC data. 153 154 Args: 155 path: Filepath to a folder where the downloaded data will be saved. 156 split: The choice of data splits. 157 download: Whether to download the data if it is not present. 158 159 Returns: 160 List of filepaths for the stored data. 161 """ 162 data_dir = get_conic_data(path, split, download) 163 return os.path.join(data_dir, f"{split}.h5") 164 165 166def get_conic_dataset( 167 path: Union[os.PathLike, str], 168 patch_shape: Tuple[int, int], 169 split: Literal["train", "test"], 170 label_choice: Literal["instances", "semantic"] = "instances", 171 resize_inputs: bool = False, 172 download: bool = False, 173 **kwargs 174) -> Dataset: 175 """Get the CONIC dataset for nucleus segmentation. 176 177 Args: 178 path: Filepath to a folder where the downloaded data will be saved. 179 patch_shape: The patch shape to use for training. 180 split: The choice of data split. 181 resize_inputs: Whether to resize the input images. 182 download: Whether to download the data if it is not present. 183 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 184 185 Returns: 186 The segmentation dataset. 
187 """ 188 data_paths = get_conic_paths(path, split, download) 189 190 if resize_inputs: 191 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True} 192 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 193 kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs 194 ) 195 196 return torch_em.default_segmentation_dataset( 197 raw_paths=data_paths, 198 raw_key="raw", 199 label_paths=data_paths, 200 label_key=f"labels/{label_choice}", 201 patch_shape=patch_shape, 202 ndim=2, 203 with_channels=True, 204 **kwargs 205 ) 206 207 208def get_conic_loader( 209 path: Union[os.PathLike, str], 210 batch_size: int, 211 patch_shape: Tuple[int, int], 212 split: Literal["train", "test"], 213 label_choice: Literal["instances", "semantic"] = "instances", 214 resize_inputs: bool = False, 215 download: bool = False, 216 **kwargs 217) -> DataLoader: 218 """Get the CONIC dataloader for nucleus segmentation. 219 220 Args: 221 path: Filepath to a folder where the downloaded data will be saved. 222 batch_size: The batch size for training. 223 patch_shape: The patch shape to use for training. 224 split: The choice of data split. 225 resize_inputs: Whether to resize the inputs. 226 download: Whether to download the data if it is not present. 227 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 228 229 Returns: 230 The DataLoader. 231 """ 232 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 233 ds = get_conic_dataset(path, patch_shape, split, label_choice, resize_inputs, download, **ds_kwargs) 234 return torch_em.get_data_loader(ds, batch_size, **loader_kwargs)
URL = 'https://drive.google.com/drive/folders/1il9jG7uA4-ebQ_lNmXbbF2eOK9uNwheb?usp=sharing'
def get_conic_data(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> str:
Download the CONIC dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is downloaded for further processing.
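A usage sketch, assuming a working Google Drive download in the environment; the target folder ./conic_data is a placeholder:

from torch_em.data.datasets.histopathology.conic import get_conic_data

# Downloads the data (if requested) and converts it into train.h5 / test.h5.
data_dir = get_conic_data(path="./conic_data", split="train", download=True)
print(data_dir)  # the 'data' subfolder that holds the HDF5 files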
def get_conic_paths(path: os.PathLike, split: Literal['train', 'test'], download: bool = False) -> List[str]:
Get paths to the CONIC data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the stored data.
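Note that, as the source shows, the function returns the single HDF5 file for the requested split. A sketch, assuming the data was already downloaded to the placeholder folder ./conic_data:

from torch_em.data.datasets.histopathology.conic import get_conic_paths

h5_path = get_conic_paths("./conic_data", split="test", download=False)
print(h5_path)  # e.g. ./conic_data/data/test.h5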
def get_conic_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
Get the CONIC dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- label_choice: The choice of labels, either 'instances' or 'semantic'.
- resize_inputs: Whether to resize the input images.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
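A construction sketch; the path is a placeholder and the patch shape is only illustrative:

from torch_em.data.datasets.histopathology.conic import get_conic_dataset

dataset = get_conic_dataset(
    path="./conic_data",
    patch_shape=(256, 256),
    split="train",
    label_choice="instances",
    download=True,
)
raw, labels = dataset[0]
print(raw.shape, labels.shape)  # one image patch and its instance labels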
def get_conic_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'], label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
Get the CONIC dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- label_choice: The choice of labels, either 'instances' or 'semantic'.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
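A training-loader sketch under the same assumptions (placeholder path, illustrative patch shape and batch size). Keyword arguments that belong to the PyTorch DataLoader, e.g. num_workers, are split off internally and forwarded to it:

from torch_em.data.datasets.histopathology.conic import get_conic_loader

loader = get_conic_loader(
    path="./conic_data",
    batch_size=8,
    patch_shape=(256, 256),
    split="train",
    label_choice="semantic",
    download=True,
    num_workers=2,  # forwarded to the PyTorch DataLoader
)
x, y = next(iter(loader))
print(x.shape, y.shape)  # a batch of image patches and matching label patches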