torch_em.data.datasets.histopathology.tnbc
The TNBC dataset contains annotations for nucleus segmentation in H&E stained histopathology images.
The dataset is located at https://doi.org/10.5281/zenodo.1175282. Please cite it if you use this dataset for your research.
"""The TNBC dataset contains annotations for nucleus segmentation
in H&E stained histopathology images.

The dataset is located at https://doi.org/10.5281/zenodo.1175282.
Please cite it if you use this dataset for your research.
"""

import os
import shutil
from glob import glob
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
from typing import Union, Tuple, List, Literal

import json
import pandas as pd
import imageio.v3 as imageio
from sklearn.model_selection import train_test_split
from skimage.measure import label as connected_components

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://zenodo.org/records/1175282/files/TNBC_NucleiSegmentation.zip"
CHECKSUM = "da708c3a988f4ad4b9bbb9283b387faf703f0bc0e5e689927306bd27ea13a57f"


def _create_split_csv(path, data_dir, split):
    """Return the sample names that belong to `split`, creating the split file if needed.

    The split is stored in 'tnbc_split.csv' inside `path`. If the file exists it is reused,
    otherwise a new random train / val / test split over all '*.h5' files in `data_dir`
    is drawn and written to that file.

    Args:
        path: Folder that holds (or will hold) 'tnbc_split.csv'.
        data_dir: Folder with the preprocessed '*.h5' files.
        split: Name of the split column to return ("train", "val" or "test").

    Returns:
        The list of file names (without extension) for the requested split.
    """
    csv_path = os.path.join(path, 'tnbc_split.csv')
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        # The cell holds a list written out as text with single quotes;
        # swap the quotes so that it can be parsed as JSON into a real list.
        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))
        split_list = df.iloc[0][split]

    else:
        print(f"Creating a new split file at '{csv_path}'.")
        image_names = [
            os.path.basename(image).split(".")[0] for image in glob(os.path.join(data_dir, '*.h5'))
        ]

        train_ids, test_ids = train_test_split(image_names, test_size=0.2)  # 20% for test split.
        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% for val split.
        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}

        # One row, one column per split; each cell holds the full list of names.
        df = pd.DataFrame.from_dict([split_ids])
        df.to_csv(csv_path, index=False)

        split_list = split_ids[split]

    return split_list


def _preprocess_images(path):
    """Convert the extracted png images and masks into one h5 file per image.

    Each file in the 'preprocessed' folder contains the datasets 'raw' (channel-first image),
    'labels/semantic' (the mask as stored on disk) and 'labels/instances'
    (connected components of the mask). The extracted archive folders are removed afterwards.

    Args:
        path: Folder into which the archive was extracted.

    Raises:
        RuntimeError: If no images are found or the number of images and masks differs.
    """
    import h5py

    raw_paths = natsorted(glob(os.path.join(path, "TNBC_NucleiSegmentation", "Slide_*", "*.png")))
    label_paths = natsorted(glob(os.path.join(path, "TNBC_NucleiSegmentation", "GT_*", "*.png")))

    # Check the pairing before anything is written: 'zip' would silently drop or mispair files,
    # and an existing 'preprocessed' folder is treated as a finished download later on.
    if not raw_paths or len(raw_paths) != len(label_paths):
        raise RuntimeError(
            f"Expected matching images and masks, found {len(raw_paths)} images and {len(label_paths)} masks."
        )

    preprocessed_dir = os.path.join(path, "preprocessed")
    os.makedirs(preprocessed_dir, exist_ok=True)

    for rpath, lpath in tqdm(zip(raw_paths, label_paths), desc="Preprocessing images", total=len(raw_paths)):
        raw = imageio.imread(rpath)
        if raw.ndim == 3 and raw.shape[-1] == 4:
            raw = raw[..., :-1]  # remove 4th alpha channel (seems like an empty channel).

        raw = raw.transpose(2, 0, 1)  # channels last -> channels first.
        label = imageio.imread(lpath)

        vol_path = os.path.join(preprocessed_dir, f"{Path(lpath).stem}.h5")

        with h5py.File(vol_path, "w") as f:
            f.create_dataset("raw", shape=raw.shape, data=raw, compression="gzip")
            f.create_dataset("labels/semantic", shape=label.shape, data=label, compression="gzip")
            f.create_dataset(
                "labels/instances", shape=label.shape, data=connected_components(label), compression="gzip"
            )

    shutil.rmtree(os.path.join(path, "TNBC_NucleiSegmentation"))
    # The '__MACOSX' folder is only archive metadata and may be missing after extraction;
    # its absence must not fail an otherwise finished preprocessing run.
    shutil.rmtree(os.path.join(path, "__MACOSX"), ignore_errors=True)


def get_tnbc_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the TNBC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the folder with the preprocessed data.
    """
    data_dir = os.path.join(path, "preprocessed")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "TNBC_NucleiSegmentation.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
    util.unzip(zip_path=zip_path, dst=path)

    _preprocess_images(path)

    return data_dir


def get_tnbc_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the TNBC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the preprocessed image data.
    """
    data_dir = get_tnbc_data(path, download)
    split_list = _create_split_csv(path, data_dir, split)
    volume_paths = [os.path.join(data_dir, f"{fname}.h5") for fname in split_list]
    return volume_paths


def get_tnbc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the TNBC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    label_choice = "instances"  # semantic / instances

    volume_paths = get_tnbc_paths(path, split, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        with_channels=True,
        **kwargs
    )


def get_tnbc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the TNBC dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_tnbc_dataset(path, patch_shape, split, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
'https://zenodo.org/records/1175282/files/TNBC_NucleiSegmentation.zip'
CHECKSUM =
'da708c3a988f4ad4b9bbb9283b387faf703f0bc0e5e689927306bd27ea13a57f'
def
get_tnbc_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_tnbc_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make the preprocessed TNBC data available and return its location.

    If the 'preprocessed' folder already exists inside `path` it is returned as is.
    Otherwise the archive is fetched, unpacked into `path` and converted by `_preprocess_images`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the 'preprocessed' folder inside `path`.
    """
    preprocessed_dir = os.path.join(path, "preprocessed")

    if not os.path.exists(preprocessed_dir):
        os.makedirs(path, exist_ok=True)

        archive = os.path.join(path, "TNBC_NucleiSegmentation.zip")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive, dst=path)

        _preprocess_images(path)

    return preprocessed_dir
Download the TNBC dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the downloaded data.
def
get_tnbc_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:
def get_tnbc_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the TNBC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the preprocessed image data (one '.h5' file per sample of the split).
    """
    data_dir = get_tnbc_data(path, download)
    # The split file lists sample names without extension; map them to the h5 files.
    split_list = _create_split_csv(path, data_dir, split)
    return [os.path.join(data_dir, f"{fname}.h5") for fname in split_list]
Get paths to the TNBC data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the preprocessed image data.
def
get_tnbc_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_tnbc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the TNBC segmentation dataset for the requested split.

    Raw data and labels are read from the same h5 files: the image from the key 'raw'
    and the labels from 'labels/instances'.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_tnbc_paths(path, split, download)

    if resize_inputs:
        # The helper returns updated dataset kwargs and the patch shape to use with them.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    label_kind = "instances"  # semantic / instances
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key=f"labels/{label_kind}",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        with_channels=True,
        **kwargs
    )
    return dataset
Get the TNBC dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_tnbc_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_tnbc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the TNBC dataloader for nucleus segmentation.

    The extra keyword arguments are divided between the dataset and the loader
    by `util.split_kwargs`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    tnbc_dataset = get_tnbc_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(tnbc_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the TNBC dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.