torch_em.data.datasets.histopathology.cryonuseg

The CryoNuSeg dataset contains annotations for nucleus segmentation in cryosectioned H&E stained histological images of 10 different organs.

This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2021.104349. Please cite it if you use this dataset for your research.

View Source

  1"""The CryoNuSeg dataset contains annotations for nucleus segmentation
  2in cryosectioned H&E stained histological images of 10 different organs.
  3
  4This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2021.104349.
  5Please cite it if you use this dataset for your research.
  6"""
  7
  8import os
  9from glob import glob
 10from natsort import natsorted
 11from typing import Union, Tuple, Literal, List
 12
 13import json
 14import pandas as pd
 15from sklearn.model_selection import train_test_split
 16
 17from torch.utils.data import Dataset, DataLoader
 18
 19import torch_em
 20
 21from .. import util
 22
 23
 24def _create_split_csv(path, data_dir, split):
 25    csv_path = os.path.join(path, 'cryonuseg_split.csv')
 26    if os.path.exists(csv_path):
 27        df = pd.read_csv(csv_path)
 28        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))  # ensures all items from column in list.
 29        split_list = df.iloc[0][split]
 30
 31    else:
 32        print(f"Creating a new split file at '{csv_path}'.")
 33        image_names = [
 34            os.path.basename(image).split(".")[0] for image in glob(os.path.join(path, data_dir, '*.tif'))
 35        ]
 36
 37        # Create random splits per dataset.
 38        train_ids, test_ids = train_test_split(image_names, test_size=0.2)  # 20% for test split.
 39        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% for val split.
 40        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}
 41
 42        df = pd.DataFrame.from_dict([split_ids])
 43        df.to_csv(csv_path, index=False)
 44
 45        split_list = split_ids[split]
 46
 47    return split_list
 48
 49
 50def get_cryonuseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 51    """Download the CryoNuSeg dataset for nucleus segmentation.
 52
 53    Args:
 54        path: Filepath to a folder where the downloaded data will be saved.
 55        download: Whether to download the data if it is not present.
 56
 57    Returns:
 58        The folder where the data is downloaded and preprocessed.
 59    """
 60    data_dir = os.path.join(path, r"tissue images")
 61    if os.path.exists(os.path.join(path, r"tissue images")):
 62        return data_dir
 63
 64    os.makedirs(path, exist_ok=True)
 65    util.download_source_kaggle(
 66        path=path, dataset_name="ipateam/segmentation-of-nuclei-in-cryosectioned-he-images", download=download
 67    )
 68
 69    zip_path = os.path.join(path, "segmentation-of-nuclei-in-cryosectioned-he-images.zip")
 70    util.unzip(zip_path=zip_path, dst=path)
 71
 72    return data_dir
 73
 74
 75def get_cryonuseg_paths(
 76    path: Union[os.PathLike, str],
 77    split: Literal["train", "val", "test"],
 78    rater_choice: Literal["b1", "b2", "b3"] = "b1",
 79    download: bool = False,
 80) -> Tuple[List[str], List[str]]:
 81    """Get paths to the CryoNuSeg data.
 82
 83    Args:
 84        path: Filepath to a folder where the downloaded data will be saved.
 85        split: The choice of data split.
 86        rater: The choice of annotator.
 87        download: Whether to download the data if it is not present.
 88
 89    Returns:
 90        List of filepaths to the image data.
 91        List of filepaths to the label data.
 92    """
 93    data_dir = get_cryonuseg_data(path, download)
 94
 95    if rater_choice == "b1":
 96        label_dir = r"Annotator 1 (biologist)/"
 97    elif rater_choice == "b2":
 98        label_dir = r"Annotator 1 (biologist second round of manual marks up)/" * 2
 99    elif rater_choice == "b3":
100        label_dir = r"Annotator 2 (bioinformatician)/" * 2
101    else:
102        raise ValueError(f"'{rater_choice}' is not a valid rater choice.")
103
104    # Point to the instance labels folder
105    label_dir += r"label masks modify"
106    split_list = _create_split_csv(path, label_dir, split)
107
108    # Get the raw and label paths
109    label_paths = natsorted([os.path.join(path, label_dir, f'{fname}.tif') for fname in split_list])
110    raw_paths = natsorted([os.path.join(data_dir, f'{fname}.tif') for fname in split_list])
111
112    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0
113
114    return raw_paths, label_paths
115
116
def get_cryonuseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    rater: Literal["b1", "b2", "b3"] = "b1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the CryoNuSeg dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of annotator.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_cryonuseg_paths(path=path, split=split, rater_choice=rater, download=download)

    if resize_inputs:
        # The resize helper returns the updated dataset kwargs together with the patch shape to use.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # Images and labels are passed as lists of files without an internal key.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
157
158
def get_cryonuseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    rater: Literal["b1", "b2", "b3"] = "b1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the CryoNuSeg dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of annotator.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    ds = get_cryonuseg_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        rater=rater,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(ds, batch_size, **dataloader_kwargs)

def get_cryonuseg_data(path: Union[os.PathLike, str], download: bool = False) -> str: View Source

51def get_cryonuseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
52    """Download the CryoNuSeg dataset for nucleus segmentation.
53
54    Args:
55        path: Filepath to a folder where the downloaded data will be saved.
56        download: Whether to download the data if it is not present.
57
58    Returns:
59        The folder where the data is downloaded and preprocessed.
60    """
61    data_dir = os.path.join(path, r"tissue images")
62    if os.path.exists(os.path.join(path, r"tissue images")):
63        return data_dir
64
65    os.makedirs(path, exist_ok=True)
66    util.download_source_kaggle(
67        path=path, dataset_name="ipateam/segmentation-of-nuclei-in-cryosectioned-he-images", download=download
68    )
69
70    zip_path = os.path.join(path, "segmentation-of-nuclei-in-cryosectioned-he-images.zip")
71    util.unzip(zip_path=zip_path, dst=path)
72
73    return data_dir

Download the CryoNuSeg dataset for nucleus segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
download: Whether to download the data if it is not present.

Returns:

The path to the 'tissue images' folder inside path, which holds the image data after download and extraction.

def get_cryonuseg_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], rater_choice: Literal['b1', 'b2', 'b3'] = 'b1', download: bool = False) -> Tuple[List[str], List[str]]: View Source

 76def get_cryonuseg_paths(
 77    path: Union[os.PathLike, str],
 78    split: Literal["train", "val", "test"],
 79    rater_choice: Literal["b1", "b2", "b3"] = "b1",
 80    download: bool = False,
 81) -> Tuple[List[str], List[str]]:
 82    """Get paths to the CryoNuSeg data.
 83
 84    Args:
 85        path: Filepath to a folder where the downloaded data will be saved.
 86        split: The choice of data split.
 87        rater_choice: The choice of annotator.
 88        download: Whether to download the data if it is not present.
 89
 90    Returns:
 91        List of filepaths to the image data.
 92        List of filepaths to the label data.
 93    """
 94    data_dir = get_cryonuseg_data(path, download)
 95
 96    if rater_choice == "b1":
 97        label_dir = r"Annotator 1 (biologist)/"
 98    elif rater_choice == "b2":
 99        label_dir = r"Annotator 1 (biologist second round of manual marks up)/" * 2
100    elif rater_choice == "b3":
101        label_dir = r"Annotator 2 (bioinformatician)/" * 2
102    else:
103        raise ValueError(f"'{rater_choice}' is not a valid rater choice.")
104
105    # Point to the instance labels folder
106    label_dir += r"label masks modify"
107    split_list = _create_split_csv(path, label_dir, split)
108
109    # Get the raw and label paths
110    label_paths = natsorted([os.path.join(path, label_dir, f'{fname}.tif') for fname in split_list])
111    raw_paths = natsorted([os.path.join(data_dir, f'{fname}.tif') for fname in split_list])
112
113    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0
114
115    return raw_paths, label_paths

Get paths to the CryoNuSeg data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The choice of data split.
rater_choice: The choice of annotator.
download: Whether to download the data if it is not present.

Returns:

A tuple of two lists: the filepaths to the image data and the filepaths to the label data.

def get_cryonuseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], rater: Literal['b1', 'b2', 'b3'] = 'b1', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

118def get_cryonuseg_dataset(
119    path: Union[os.PathLike, str],
120    patch_shape: Tuple[int, int],
121    split: Literal["train", "val", "test"],
122    rater: Literal["b1", "b2", "b3"] = "b1",
123    resize_inputs: bool = False,
124    download: bool = False,
125    **kwargs
126) -> Dataset:
127    """Get the CryoNuSeg dataset for nucleus segmentation.
128
129    Args:
130        path: Filepath to a folder where the downloaded data will be saved.
131        patch_shape: The patch shape to use for training.
132        split: The choice of data split.
133        rater: The choice of annotator.
134        resize_inputs: Whether to resize the inputs.
135        download: Whether to download the data if it is not present.
136        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
137
138    Returns:
139        The segmentation dataset.
140    """
141    raw_paths, label_paths = get_cryonuseg_paths(path, split, rater, download)
142
143    if resize_inputs:
144        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
145        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
146            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
147        )
148
149    return torch_em.default_segmentation_dataset(
150        raw_paths=raw_paths,
151        raw_key=None,
152        label_paths=label_paths,
153        label_key=None,
154        is_seg_dataset=False,
155        patch_shape=patch_shape,
156        **kwargs
157    )

Get the CryoNuSeg dataset for nucleus segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The choice of data split.
rater: The choice of annotator.
resize_inputs: Whether to resize the inputs.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_cryonuseg_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], rater: Literal['b1', 'b2', 'b3'] = 'b1', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

160def get_cryonuseg_loader(
161    path: Union[os.PathLike, str],
162    batch_size: int,
163    patch_shape: Tuple[int, int],
164    split: Literal["train", "val", "test"],
165    rater: Literal["b1", "b2", "b3"] = "b1",
166    resize_inputs: bool = False,
167    download: bool = False,
168    **kwargs
169) -> DataLoader:
170    """Get the CryoNuSeg dataloader for nucleus segmentation.
171
172    Args:
173        path: Filepath to a folder where the downloaded data will be saved.
174        batch_size: The batch size for training.
175        patch_shape: The patch shape to use for training.
176        split: The choice of data split.
177        rater: The choice of annotator.
178        resize_inputs: Whether to resize the inputs.
179        download: Whether to download the data if it is not present.
180        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
181
182    Returns:
183        The DataLoader.
184    """
185    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
186    dataset = get_cryonuseg_dataset(path, patch_shape, split, rater, resize_inputs, download, **ds_kwargs)
187    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the CryoNuSeg dataloader for nucleus segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
split: The choice of data split.
rater: The choice of annotator.
resize_inputs: Whether to resize the inputs.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.