torch_em.data.datasets.histopathology.consep
The CoNSeP dataset contains annotations for nucleus segmentation in H&E stained histopathology images from multiple tissue regions.
NOTE: The source of this dataset is an open-source version hosted on Kaggle: https://www.kaggle.com/datasets/rftexas/tiled-consep-224x224px
This dataset is from the publication https://doi.org/10.1016/j.media.2019.101563. Please cite it if you use this dataset for your research.
1"""The CoNSeP dataset contains annotations for nucleus segmentation in 2H&E stained histopathology images for multi-tissue regions. 3 4NOTE: The source of this dataset is an open-source version hosted on Kaggle: 5- https://www.kaggle.com/datasets/rftexas/tiled-consep-224x224px 6 7This dataset is from the publication https://doi.org/10.1016/j.media.2019.101563. 8Please cite it if you use this dataset for your research. 9""" 10 11import os 12from glob import glob 13from tqdm import tqdm 14from pathlib import Path 15from natsort import natsorted 16from typing import Union, Tuple, List, Literal 17 18import h5py 19import imageio.v3 as imageio 20import numpy as np 21import torch_em 22 23from elf.segmentation.stitching import stitch_tiled_segmentation 24from scipy.io import loadmat 25from skimage.measure import label as connected_components 26from torch.utils.data import Dataset, DataLoader 27 28from .. import util 29 30 31def _preprocess_image(raw_paths, label_paths, output_path): 32 33 # Find the start and stop coordinates for all tiles by parsing their filenames. 34 tile_coordinates = [] 35 for path in raw_paths: 36 tile_coords = tuple(int(coord) for coord in Path(path).stem.split("_")[2:]) 37 tile_coordinates.append(tile_coords) 38 39 # Find the dimension of the image as the maximum of the tile coordinates. 40 h = max(coord[1] for coord in tile_coordinates) 41 w = max(coord[3] for coord in tile_coordinates) 42 shape = (h, w) 43 44 # Stitch together the image data. 45 raw = np.zeros(shape + (3,), dtype="uint8") 46 for path, coords in zip(raw_paths, tile_coordinates): 47 tile_data = imageio.imread(path) 48 y1, y2, x1, x2 = coords 49 raw[y1:y2, x1:x2] = tile_data 50 51 # Stitch together the label data. 52 # First, we load the labels and apply an offset so that we have unique ids. 53 # Also, some parts of the labels are over-lapping and we make sure to only write it once. 54 offset = 0 55 labels = np.zeros(shape, dtype="uint32") 56 written = np.zeros(shape, dtype=bool) 57 for path, coords in zip(label_paths, tile_coordinates): 58 y1, y2, x1, x2 = coords 59 60 tile_labels = loadmat(path)["instance_map"] 61 tile_labels = connected_components(tile_labels).astype("uint32") 62 63 # Find the mask where we have labels in this tile, and where data was already written. 64 tile_mask = tile_labels != 0 65 tile_not_written = ~written[y1:y2, x1:x2] 66 67 # And intersect them. 68 tile_mask = np.logical_and(tile_mask, tile_not_written) 69 70 # Add up the offset to this tile, unless it is empty. 71 if tile_mask.sum() > 0: 72 tile_labels[tile_mask] += offset 73 offset = int(tile_labels.max()) 74 75 # Write out what has been written and the labels. 76 written[y1:y2, x1:x2][tile_mask] = 1 77 labels[y1:y2, x1:x2][tile_mask] = tile_labels[tile_mask] 78 79 # Stitch the labels together. 
80 tile_shape = (224, 224) 81 stitched_labels = stitch_tiled_segmentation(labels, tile_shape=tile_shape, overlap=1, verbose=False) 82 83 with h5py.File(output_path, "w") as f: 84 f.create_dataset("raw", data=raw.transpose(2, 0, 1), compression="gzip") 85 f.create_dataset("labels", data=stitched_labels, compression="gzip") 86 87 88def _preprocess_data(data_dir, split): 89 preprocessed_dir = os.path.join(data_dir, "preprocessed", split) 90 os.makedirs(preprocessed_dir, exist_ok=True) 91 92 n_images = 28 if split == "train" else 15 93 for image_id in tqdm(range(1, n_images), desc="Preprocessing inputs"): 94 output_path = os.path.join(preprocessed_dir, f"{image_id}.h5") 95 if os.path.exists(output_path): 96 continue 97 98 raw_paths = natsorted(glob(os.path.join(data_dir, "tiles", f"{split}_{image_id}_*.png"))) 99 label_paths = [p.replace("tiles", "labels").replace(".png", ".mat") for p in raw_paths] 100 _preprocess_image(raw_paths, label_paths, output_path) 101 102 103def get_consep_data(path: Union[os.PathLike, str], download: bool = False) -> str: 104 """Download the CoNSeP dataset. 105 106 Args: 107 path: Filepath to a folder where the data is downloaded for further processing. 108 download: Whether to download the data if it is not present. 109 110 Returns: 111 Filepath where the data is downloaded and preprocessed. 112 """ 113 data_dir = os.path.join(path, "data", "consep") 114 if os.path.exists(data_dir): 115 return data_dir 116 117 os.makedirs(path, exist_ok=True) 118 119 util.download_source_kaggle(path=path, dataset_name="rftexas/tiled-consep-224x224px", download=download) 120 util.unzip(zip_path=os.path.join( 121 path, "tiled-consep-224x224px.zip"), dst=os.path.join(path, "data"), remove=False 122 ) 123 124 return data_dir 125 126 127def get_consep_paths( 128 path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False 129) -> List[str]: 130 """Get paths to the CoNSeP data. 131 132 Args: 133 path: Filepath to a folder where the data is downloaded for further processing. 134 split: The choice of data split. 135 download: Whether to download the data if it is not present. 136 137 Returns: 138 List of filepaths for the input data. 139 """ 140 data_dir = get_consep_data(path, download) 141 142 _preprocess_data(data_dir, "train") 143 _preprocess_data(data_dir, "test") 144 145 if split not in ["train", "test"]: 146 raise ValueError(f"'{split}' is not a valid split.") 147 148 paths = natsorted(glob(os.path.join(data_dir, "preprocessed", split, "*.h5"))) 149 return paths 150 151 152def get_consep_dataset( 153 path: Union[os.PathLike, str], 154 patch_shape: Tuple[int, int], 155 split: Literal["train", "test"], 156 download: bool = False, 157 **kwargs 158) -> Dataset: 159 """Get the CoNSeP dataset for nucleus segmentation. 160 161 Args: 162 path: Filepath to a folder where the data is downloaded for further processing. 163 patch_shape: The patch shape to use for training. 164 split: The choice of data split. 165 download: Whether to download the data if it is not present. 166 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 167 168 Returns: 169 The segmentation dataset. 
170 """ 171 volume_paths = get_consep_paths(path, split, download) 172 173 return torch_em.default_segmentation_dataset( 174 raw_paths=volume_paths, 175 raw_key="raw", 176 label_paths=volume_paths, 177 label_key="labels", 178 is_seg_dataset=True, 179 patch_shape=patch_shape, 180 with_channels=True, 181 ndim=2, 182 **kwargs 183 ) 184 185 186def get_consep_loader( 187 path: Union[os.PathLike, str], 188 batch_size: int, 189 patch_shape: Tuple[int, int], 190 split: Literal["train", "test"], 191 download: bool = False, 192 **kwargs 193) -> DataLoader: 194 """Get the CoNSeP dataloader for nucleus segmentation. 195 196 Args: 197 path: Filepath to a folder where the data is downloaded for further processing. 198 batch_size: The batch size for training. 199 patch_shape: The patch shape to use for training. 200 split: The choice of data split. 201 download: Whether to download the data if it is not present. 202 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 203 204 Returns: 205 The DataLoader. 206 """ 207 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 208 dataset = get_consep_dataset(path, patch_shape, split, download, **ds_kwargs) 209 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_consep_data(path: Union[os.PathLike, str], download: bool = False) -> str:
Download the CoNSeP dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the data is downloaded and preprocessed.
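A minimal usage sketch (the target path is illustrative; the download goes through the Kaggle API, so it assumes configured Kaggle credentials):

```python
from torch_em.data.datasets.histopathology.consep import get_consep_data

# Downloads and unpacks the Kaggle dataset if it is not present,
# then returns the folder with the unpacked data.
data_dir = get_consep_data(path="./consep", download=True)
print(data_dir)  # ./consep/data/consep
```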
def get_consep_paths(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False) -> List[str]:
Get paths to the CoNSeP data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
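A sketch of inspecting the returned files (the path is illustrative; the keys and layout follow the preprocessing code above, which stores the image channel-first):

```python
import h5py

from torch_em.data.datasets.histopathology.consep import get_consep_paths

# Triggers download and preprocessing if needed, then returns the HDF5 file paths.
paths = get_consep_paths(path="./consep", split="train", download=True)

# Each file holds the stitched image ("raw") and the instance labels ("labels").
with h5py.File(paths[0], "r") as f:
    print(f["raw"].shape)     # e.g. (3, H, W)
    print(f["labels"].shape)  # e.g. (H, W)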
def get_consep_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal["train", "test"], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
Get the CoNSeP dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
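A minimal usage sketch (the path and patch shape are illustrative; the exact sample shapes depend on the `torch_em` dataset configuration):

```python
from torch_em.data.datasets.histopathology.consep import get_consep_dataset

dataset = get_consep_dataset(path="./consep", patch_shape=(512, 512), split="train", download=True)

# Each sample is a (raw, labels) pair cropped to the requested patch shape.
raw, labels = dataset[0]
print(raw.shape, labels.shape)
```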
def get_consep_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal["train", "test"], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
Get the CoNSeP dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.
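A minimal usage sketch (the path, batch size, and patch shape are illustrative). Since `split_kwargs` separates dataset from loader arguments, DataLoader options such as `shuffle` can be passed directly:

```python
from torch_em.data.datasets.histopathology.consep import get_consep_loader

# "shuffle" is not a dataset argument, so it is forwarded to the PyTorch DataLoader.
loader = get_consep_loader(
    path="./consep", batch_size=2, patch_shape=(512, 512), split="train", download=True, shuffle=True
)

raw, labels = next(iter(loader))
print(raw.shape, labels.shape)  # e.g. torch.Size([2, 3, 512, 512]), torch.Size([2, 1, 512, 512])
```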