torch_em.data.datasets.histopathology.tnbc

The TNBC dataset contains annotations for nucleus segmentation in H&E stained histopathology images.

The dataset is located at https://doi.org/10.5281/zenodo.1175282. Please cite it if you use this dataset for your research.
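
A minimal usage sketch, assuming torch_em is installed and "./tnbc" is used as the (arbitrary) download folder:

from torch_em.data.datasets.histopathology.tnbc import get_tnbc_loader

# Download (if needed), preprocess the data and build a training loader of 2D patches.
loader = get_tnbc_loader(
    path="./tnbc", batch_size=2, patch_shape=(512, 512), split="train", download=True
)

for raw, labels in loader:
    # raw: RGB image patches in channels-first layout; labels: nucleus instance patches.
    print(raw.shape, labels.shape)
    break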

  1"""The TNBC dataset contains annotations for nucleus segmentation
  2in H&E stained histopathology images.
  3
  4The dataset is located at https://doi.org/10.5281/zenodo.1175282.
  5Please cite it if you use this dataset for your research.
  6"""
  7
  8import os
  9import shutil
 10from glob import glob
 11from tqdm import tqdm
 12from pathlib import Path
 13from natsort import natsorted
 14from typing import Union, Tuple, List, Literal
 15
 16import json
 17import pandas as pd
 18import imageio.v3 as imageio
 19from sklearn.model_selection import train_test_split
 20from skimage.measure import label as connected_components
 21
 22from torch.utils.data import Dataset, DataLoader
 23
 24import torch_em
 25
 26from .. import util
 27
 28
 29URL = "https://zenodo.org/records/1175282/files/TNBC_NucleiSegmentation.zip"
 30CHECKSUM = "da708c3a988f4ad4b9bbb9283b387faf703f0bc0e5e689927306bd27ea13a57f"
 31
 32
def _create_split_csv(path, data_dir, split):
    """Create or load a csv file which assigns the image names to the train / val / test splits."""
    csv_path = os.path.join(path, 'tnbc_split.csv')
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        # Parse the stored string back into a list, so that all items from the column are in a list.
        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))
        split_list = df.iloc[0][split]

    else:
        print(f"Creating a new split file at '{csv_path}'.")
        image_names = [
            os.path.basename(image).split(".")[0] for image in glob(os.path.join(data_dir, '*.h5'))
        ]

        train_ids, test_ids = train_test_split(image_names, test_size=0.2)  # 20% of the data for the test split.
        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% of the remainder for the val split.
        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}

        df = pd.DataFrame.from_dict([split_ids])
        df.to_csv(csv_path, index=False)

        split_list = split_ids[split]

    return split_list


def _preprocess_images(path):
    """Convert each image and its label mask into an h5 file with raw data, semantic and instance labels."""
    import h5py

    raw_paths = natsorted(glob(os.path.join(path, "TNBC_NucleiSegmentation", "Slide_*", "*.png")))
    label_paths = natsorted(glob(os.path.join(path, "TNBC_NucleiSegmentation", "GT_*", "*.png")))

    preprocessed_dir = os.path.join(path, "preprocessed")
    os.makedirs(preprocessed_dir, exist_ok=True)

    for rpath, lpath in tqdm(zip(raw_paths, label_paths), desc="Preprocessing images", total=len(raw_paths)):
        raw = imageio.imread(rpath)
        if raw.ndim == 3 and raw.shape[-1] == 4:
            raw = raw[..., :-1]  # Remove the 4th (alpha) channel, which seems to be empty.

        raw = raw.transpose(2, 0, 1)  # Move the channel axis to the front.
        label = imageio.imread(lpath)

        vol_path = os.path.join(preprocessed_dir, f"{Path(lpath).stem}.h5")

        with h5py.File(vol_path, "w") as f:
            f.create_dataset("raw", shape=raw.shape, data=raw, compression="gzip")
            f.create_dataset("labels/semantic", shape=label.shape, data=label, compression="gzip")
            f.create_dataset(
                "labels/instances", shape=label.shape, data=connected_components(label), compression="gzip"
            )

    # Clean up the extracted archive contents and the MacOS metadata folder.
    shutil.rmtree(os.path.join(path, "TNBC_NucleiSegmentation"))
    shutil.rmtree(os.path.join(path, "__MACOSX"))


def get_tnbc_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the TNBC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the preprocessed data.
    """
    data_dir = os.path.join(path, "preprocessed")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "TNBC_NucleiSegmentation.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
    util.unzip(zip_path=zip_path, dst=path)

    _preprocess_images(path)

    return data_dir


def get_tnbc_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the TNBC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the preprocessed image data.
    """
    data_dir = get_tnbc_data(path, download)
    split_list = _create_split_csv(path, data_dir, split)
    volume_paths = [os.path.join(data_dir, f"{fname}.h5") for fname in split_list]
    return volume_paths


def get_tnbc_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the TNBC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    label_choice = "instances"  # The choice of labels: "semantic" or "instances".

    volume_paths = get_tnbc_paths(path, split, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key=f"labels/{label_choice}",
        patch_shape=patch_shape,
        is_seg_dataset=True,
        with_channels=True,
        **kwargs
    )


def get_tnbc_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the TNBC dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_tnbc_dataset(path, patch_shape, split, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL = 'https://zenodo.org/records/1175282/files/TNBC_NucleiSegmentation.zip'
CHECKSUM = 'da708c3a988f4ad4b9bbb9283b387faf703f0bc0e5e689927306bd27ea13a57f'
def get_tnbc_data(path: Union[os.PathLike, str], download: bool = False) -> str:

Download the TNBC dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:
  The filepath to the preprocessed data.
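
For example, the download and preprocessing step can be run on its own (the target folder "./tnbc" is an arbitrary choice):

from torch_em.data.datasets.histopathology.tnbc import get_tnbc_data

data_dir = get_tnbc_data(path="./tnbc", download=True)
# 'data_dir' points to "./tnbc/preprocessed", which contains one h5 file per image.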

def get_tnbc_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]:

Get paths to the TNBC data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • download: Whether to download the data if it is not present.
Returns:
  List of filepaths to the preprocessed image data.
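
A short sketch of fetching the validation paths; note that the split assignment is randomized when first created and then fixed in a 'tnbc_split.csv' file inside the data folder:

from torch_em.data.datasets.histopathology.tnbc import get_tnbc_paths

val_paths = get_tnbc_paths(path="./tnbc", split="val", download=True)
print(len(val_paths), val_paths[:2])  # the h5 files assigned to the val split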

def get_tnbc_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the TNBC dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  The segmentation dataset.
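
A minimal sketch of creating the dataset (the folder and patch shape are arbitrary choices); like other torch_em segmentation datasets, indexing yields a (raw, labels) pair:

from torch_em.data.datasets.histopathology.tnbc import get_tnbc_dataset

dataset = get_tnbc_dataset(
    path="./tnbc", patch_shape=(512, 512), split="train", download=True
)
raw, labels = dataset[0]  # one channels-first RGB patch with its instance labels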

def get_tnbc_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the TNBC dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  The DataLoader.
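
Since the keyword arguments are split internally, dataset and DataLoader arguments can be mixed freely. A sketch with assumed parameter values:

from torch_em.data.datasets.histopathology.tnbc import get_tnbc_loader

loader = get_tnbc_loader(
    path="./tnbc", batch_size=4, patch_shape=(256, 256), split="test",
    download=True,
    num_workers=2, shuffle=True,  # these two are forwarded to the PyTorch DataLoader
)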