torch_em.data.datasets.histopathology.conic

The CONIC dataset contains annotations for nucleus segmentation in histopathology images of H&E-stained colon tissue.

This dataset is from the publication https://doi.org/10.1016/j.media.2023.103047. Please cite it if you use this dataset for your research.

"""The CONIC dataset contains annotations for nucleus segmentation
in histopathology images of H&E-stained colon tissue.

This dataset is from the publication https://doi.org/10.1016/j.media.2023.103047.
Please cite it if you use this dataset for your research.
"""

import os
from glob import glob
from tqdm import tqdm
from typing import Tuple, Union, Literal

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

from torch.utils.data import Dataset, DataLoader

import torch_em

from torch_em.data.datasets import util


URL = "https://drive.google.com/drive/folders/1il9jG7uA4-ebQ_lNmXbbF2eOK9uNwheb?usp=sharing"


def _create_split_list(path, split):
    # Source: the HoVerNet repo: https://github.com/vqdang/hover_net/blob/conic/generate_split.py.
    # We take FOLD_IDX = 0, as used for the baseline model.

    split_csv = os.path.join(path, "split.csv")

    if os.path.exists(split_csv):
        split_df = pd.read_csv(split_csv)
    else:
        SEED = 5
        info = pd.read_csv(os.path.join(path, "patch_info.csv"))
        file_names = np.squeeze(info.to_numpy()).tolist()

        img_sources = [v.split('-')[0] for v in file_names]
        img_sources = np.unique(img_sources)

        cohort_sources = [v.split('_')[0] for v in img_sources]
        _, cohort_sources = np.unique(cohort_sources, return_inverse=True)

        num_trials = 10
        splitter = StratifiedShuffleSplit(n_splits=num_trials, train_size=0.8, test_size=0.2, random_state=SEED)

        splits = {}
        split_generator = splitter.split(img_sources, cohort_sources)
        for train_indices, valid_indices in split_generator:
            train_cohorts = img_sources[train_indices]
            valid_cohorts = img_sources[valid_indices]

            assert np.intersect1d(train_cohorts, valid_cohorts).size == 0

            train_names = [
                file_name for file_name in file_names for source in train_cohorts if source == file_name.split('-')[0]
            ]
            valid_names = [
                file_name for file_name in file_names for source in valid_cohorts if source == file_name.split('-')[0]
            ]

            train_names = np.unique(train_names)
            valid_names = np.unique(valid_names)
            print(f'Train: {len(train_names):04d} - Valid: {len(valid_names):04d}')

            assert np.intersect1d(train_names, valid_names).size == 0

            train_indices = [file_names.index(v) for v in train_names]
            valid_indices = [file_names.index(v) for v in valid_names]

            # Pad the shorter column with NaN so that both columns fit into one dataframe.
            while len(train_indices) > len(valid_indices):
                valid_indices.append(np.nan)

            splits['train'] = train_indices
            splits['test'] = valid_indices
            break

        split_df = pd.DataFrame(splits)
        split_df.to_csv(split_csv, index=False)

    split_list = [int(v) for v in split_df[split].dropna()]
    return split_list


def _extract_images(split, path):
    split_list = _create_split_list(path, split)

    images = np.load(os.path.join(path, "images.npy"))
    labels = np.load(os.path.join(path, "labels.npy"))

    instance_masks = []
    raw = []
    semantic_masks = []

    for idx, (image, label) in tqdm(
        enumerate(zip(images, labels)), desc=f"Extracting '{split}' data", total=images.shape[0]
    ):
        if idx not in split_list:
            continue

        semantic_masks.append(label[:, :, 1])
        instance_masks.append(label[:, :, 0])
        raw.append(image)

    raw = np.stack(raw).transpose(3, 0, 1, 2)  # B, H, W, C --> C, B, H, W
    instance_masks = np.stack(instance_masks)
    semantic_masks = np.stack(semantic_masks)

    import h5py
    with h5py.File(os.path.join(path, f"{split}.h5"), "a") as f:
        f.create_dataset("raw", data=raw, compression="gzip")
        f.create_dataset("labels/instances", data=instance_masks, compression="gzip")
        f.create_dataset("labels/semantic", data=semantic_masks, compression="gzip")


def get_conic_data(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False) -> str:
    """Download the CONIC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is downloaded for further processing.
    """
    if split not in ['train', 'test']:
        raise ValueError(f"'{split}' is not a valid split.")

    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir) and glob(os.path.join(data_dir, "*.h5")):
        return data_dir

    os.makedirs(path, exist_ok=True)

    # Download the files from google drive.
    util.download_source_gdrive(path=data_dir, url=URL, download=download, download_type="folder", quiet=False)

    # Extract and preprocess images for all splits.
    for _split in ['train', 'test']:
        _extract_images(_split, data_dir)

    return data_dir


def get_conic_paths(
    path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False
) -> str:
    """Get paths to the CONIC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the stored data for the requested split.
    """
    data_dir = get_conic_data(path, split, download)
    return os.path.join(data_dir, f"{split}.h5")


def get_conic_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    label_choice: Literal["instances", "semantic"] = "instances",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CONIC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        label_choice: The choice of label type. Either 'instances' or 'semantic'.
        resize_inputs: Whether to resize the input images.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    data_paths = get_conic_paths(path, split, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key="raw",
        label_paths=data_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        **kwargs
    )


def get_conic_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    label_choice: Literal["instances", "semantic"] = "instances",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CONIC dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        label_choice: The choice of label type. Either 'instances' or 'semantic'.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds = get_conic_dataset(path, patch_shape, split, label_choice, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(ds, batch_size, **loader_kwargs)
URL = 'https://drive.google.com/drive/folders/1il9jG7uA4-ebQ_lNmXbbF2eOK9uNwheb?usp=sharing'
def get_conic_data(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> str:

Download the CONIC dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is downloaded for further processing.
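
Example (a minimal usage sketch; the target folder below is illustrative, and the download step assumes the Google Drive folder is reachable from your environment):

  from torch_em.data.datasets.histopathology.conic import get_conic_data

  # Downloads the raw numpy arrays and converts them into one HDF5 file per split
  # ("train.h5" and "test.h5") inside "<path>/data".
  data_dir = get_conic_data(path="./data/conic", split="train", download=True)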

def get_conic_paths(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False) -> str:

Get paths to the CONIC data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
Returns:

Filepath to the stored data for the requested split.
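
Example (a short sketch of how the returned path maps to the on-disk layout; the folder is illustrative):

  from torch_em.data.datasets.histopathology.conic import get_conic_paths

  # Returns the path to the pre-processed HDF5 file for the requested split,
  # e.g. "./data/conic/data/train.h5".
  train_h5 = get_conic_paths(path="./data/conic", split="train", download=True)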

def get_conic_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the CONIC dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • label_choice: The choice of label type. Either 'instances' or 'semantic'.
  • resize_inputs: Whether to resize the input images.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.
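
Example (a minimal sketch; patch shape and folder are illustrative values, and the exact sample shapes depend on the transforms passed via kwargs):

  from torch_em.data.datasets.histopathology.conic import get_conic_dataset

  # 2D patches with instance labels; the raw data are RGB H&E patches.
  ds = get_conic_dataset(
      path="./data/conic",
      patch_shape=(256, 256),
      split="train",
      label_choice="instances",
      download=True,
  )
  x, y = ds[0]  # image patch and corresponding instance labels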

def get_conic_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'], label_choice: Literal['instances', 'semantic'] = 'instances', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the CONIC dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • label_choice: The choice of label type. Either 'instances' or 'semantic'.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.
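
Example (a minimal sketch; batch size, patch shape and folder are illustrative, and num_workers / shuffle are forwarded to the PyTorch DataLoader):

  from torch_em.data.datasets.histopathology.conic import get_conic_loader

  loader = get_conic_loader(
      path="./data/conic",
      batch_size=8,
      patch_shape=(256, 256),
      split="train",
      label_choice="instances",
      download=True,
      num_workers=2,
      shuffle=True,
  )
  for x, y in loader:  # image batch and label batch
      break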