torch_em.data.datasets.histopathology.lynsec

The LyNSeC dataset contains annotations for nucleus segmentation in IHC and H&E stained lymphoma tissue images.

The dataset is located at https://doi.org/10.5281/zenodo.8065174. This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2024.107978. Please cite it if you use this dataset in your research.

  1"""The LyNSeC dataset contains annotations for nucleus segmentation
  2in IHC and H&E stained lymphoma tissue images.
  3
  4The dataset is located at https://doi.org/10.5281/zenodo.8065174.
  5This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2024.107978.
  6Please cite it if you use this dataset in your research.
  7"""
  8
  9import os
 10from glob import glob
 11from tqdm import tqdm
 12from pathlib import Path
 13from natsort import natsorted
 14from typing import Union, Tuple, List, Optional, Literal
 15
 16import json
 17import numpy as np
 18import pandas as pd
 19import imageio.v3 as imageio
 20from sklearn.model_selection import train_test_split
 21
 22import torch_em
 23
 24from torch.utils.data import Dataset, DataLoader
 25
 26from .. import util
 27
 28
 29URL = "https://zenodo.org/records/8065174/files/lynsec.zip"
 30CHECKSUM = "14b9b5a9c39cb41afc7f31de5a995cefff0947c215e14ab9c7a463f32fbbf4b6"
 31
 32
def _create_split_csv(path, data_dir, split, choice):
    assert split in ["train", "val", "test"], "Please choose a valid split."

    csv_path = os.path.join(path, f"lynsec_{choice}_split.csv")
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))  # Parse the column entries into lists.
        split_list = df.iloc[0][split]

    else:
        print(f"Creating a new split file at '{csv_path}'.")
        image_names = [
            os.path.basename(image).split(".")[0] for image in glob(os.path.join(data_dir, choice, 'images', '*.tif'))
        ]

        # Create random splits per dataset.
        train_ids, test_ids = train_test_split(image_names, test_size=0.2)  # 20% for the test split.
        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% of the remaining data for the val split.
        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}

        df = pd.DataFrame.from_dict([split_ids])
        df.to_csv(csv_path, index=False)
        split_list = split_ids[split]

    return split_list

def _preprocess_dataset(data_dir):
    data_dirs = natsorted(glob(os.path.join(data_dir, "lynsec*")))
    for _dir in data_dirs:
        if os.path.basename(_dir) == "lynsec 1":
            target_dir = "ihc"
        else:
            target_dir = "h&e"

        image_dir = os.path.join(data_dir, target_dir, "images")
        label_dir = os.path.join(data_dir, target_dir, "labels")
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(label_dir, exist_ok=True)

        paths = natsorted(glob(os.path.join(_dir, "*.npy")))
        for fpath in tqdm(paths, desc="Preprocessing inputs"):
            fname = Path(fpath).stem
            darray = np.load(fpath)

            raw = darray[..., :3]
            labels = darray[..., 3]

            if target_dir == "h&e" and fname in [f"{i}_l2" for i in range(35)]:  # These images have mismatching labels.
                continue

            imageio.imwrite(os.path.join(image_dir, f"{fname}.tif"), raw, compression="zlib")
            imageio.imwrite(os.path.join(label_dir, f"{fname}.tif"), labels, compression="zlib")

def get_lynsec_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the LyNSeC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the downloaded data.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(data_dir, exist_ok=True)

    zip_path = os.path.join(path, "lynsec.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
    util.unzip(zip_path=zip_path, dst=data_dir)

    _preprocess_dataset(data_dir)

    return data_dir

def get_lynsec_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the LyNSeC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        choice: The choice of dataset.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.
    """
    data_dir = get_lynsec_data(path, download)

    if choice is None:
        choice = "*"

    raw_paths = natsorted(glob(os.path.join(data_dir, choice, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(data_dir, choice, "labels", "*.tif")))

    if split is not None:
        if choice == "*":  # If the user did not choose a dataset, we create splits for both datasets.
            split_list = _create_split_csv(path, data_dir, split, "h&e")
            split_list.extend(_create_split_csv(path, data_dir, split, "ihc"))
        else:
            split_list = _create_split_csv(path, data_dir, split, choice)

        # Filter the paths which are valid for the chosen split.
        raw_paths = [p for p in raw_paths if os.path.basename(p).split(".")[0] in split_list]
        label_paths = [p for p in label_paths if os.path.basename(p).split(".")[0] in split_list]

    return raw_paths, label_paths

def get_lynsec_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the LyNSeC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        choice: The choice of dataset.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_lynsec_paths(path, split, choice, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )

def get_lynsec_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the LyNSeC dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        choice: The choice of dataset.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_lynsec_dataset(path, patch_shape, split, choice, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

URL = 'https://zenodo.org/records/8065174/files/lynsec.zip'
CHECKSUM = '14b9b5a9c39cb41afc7f31de5a995cefff0947c215e14ab9c7a463f32fbbf4b6'
def get_lynsec_data(path: Union[os.PathLike, str], download: bool = False) -> str:

Download the LyNSeC dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
Returns:
  The filepath to the downloaded data.
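
A minimal usage sketch (the target folder below is a placeholder; on the first call the data is downloaded, unzipped and preprocessed):

    from torch_em.data.datasets.histopathology.lynsec import get_lynsec_data

    # Download (if needed) and preprocess the data; returns the folder containing the data.
    data_dir = get_lynsec_data("./lynsec", download=True)
    print(data_dir)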

def get_lynsec_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'val', 'test']] = None, choice: Optional[Literal['ihc', 'h&e']] = None, download: bool = False) -> Tuple[List[str], List[str]]:

Get paths to the LyNSeC data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • choice: The choice of dataset.
  • download: Whether to download the data if it is not present.
Returns:
  List of filepaths to the image data.
  List of filepaths to the label data.
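
For illustration, a sketch that collects the image and label paths for the H&E training split (the folder path is a placeholder):

    from torch_em.data.datasets.histopathology.lynsec import get_lynsec_paths

    # Downloads the data if needed and filters the paths for the "train" split of the H&E dataset.
    raw_paths, label_paths = get_lynsec_paths("./lynsec", split="train", choice="h&e", download=True)
    print(len(raw_paths), len(label_paths))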

def get_lynsec_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, choice: Optional[Literal['ihc', 'h&e']] = None, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the LyNSeC dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • choice: The choice of dataset.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
  The segmentation dataset.
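
A short sketch of building the dataset; the folder path and the patch shape of (512, 512) are example values:

    from torch_em.data.datasets.histopathology.lynsec import get_lynsec_dataset

    # Dataset over the IHC training split with 512 x 512 patches.
    dataset = get_lynsec_dataset("./lynsec", patch_shape=(512, 512), split="train", choice="ihc", download=True)
    print(len(dataset))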

def get_lynsec_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, choice: Optional[Literal['ihc', 'h&e']] = None, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the LyNSeC dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • choice: The choice of dataset.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
  The DataLoader.
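
A minimal training-loader sketch; the folder path, batch size and patch shape are example values, and leaving `choice` unset uses both the IHC and H&E images:

    from torch_em.data.datasets.histopathology.lynsec import get_lynsec_loader

    # DataLoader over the training split of both datasets with 512 x 512 patches.
    loader = get_lynsec_loader("./lynsec", batch_size=2, patch_shape=(512, 512), split="train", download=True)

    # Each batch yields an image tensor and a label tensor.
    x, y = next(iter(loader))
    print(x.shape, y.shape)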