torch_em.data.datasets.histopathology.lynsec
The LyNSeC dataset contains annotations for nucleus segmentation in IHC and H&E stained lymphoma tissue images.
The dataset is located at https://doi.org/10.5281/zenodo.8065174. This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2024.107978. Please cite it if you use this dataset in your research.
"""The LyNSeC dataset contains annotations for nucleus segmentation
in IHC and H&E stained lymphoma tissue images.

The dataset is located at https://doi.org/10.5281/zenodo.8065174.
This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2024.107978.
Please cite it if you use this dataset in your research.
"""

import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
from typing import Union, Tuple, List, Optional, Literal

import json
import numpy as np
import pandas as pd
import imageio.v3 as imageio
from sklearn.model_selection import train_test_split

import torch_em

from torch.utils.data import Dataset, DataLoader

from .. import util


URL = "https://zenodo.org/records/8065174/files/lynsec.zip"
CHECKSUM = "14b9b5a9c39cb41afc7f31de5a995cefff0947c215e14ab9c7a463f32fbbf4b6"


def _create_split_csv(path, data_dir, split, choice):
    """Return the image names that belong to `split` for the stain `choice`.

    The split is stored in `<path>/lynsec_<choice>_split.csv`. If that file exists it is
    read back; otherwise a new random split (80% / 20% train-test, then 15% of the
    training part for validation) is drawn from the image names found in
    `<data_dir>/<choice>/images` and written to the file, so later calls reuse it.

    Args:
        path: Folder in which the split file is stored.
        data_dir: Folder that contains the preprocessed per-stain data.
        split: One of "train", "val" or "test".
        choice: The stain sub-folder ("ihc" or "h&e") for which the split is made.

    Returns:
        The list of image names (file names without extension) in the requested split.
    """
    assert split in ["train", "val", "test"], "Please choose a valid split."

    csv_path = os.path.join(path, f"lynsec_{choice}_split.csv")
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        # Each cell holds the string form of a python list; convert it back to a real list.
        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))
        split_list = df.iloc[0][split]

    else:
        print(f"Creating a new split file at '{csv_path}'.")
        image_names = [
            os.path.basename(image).split(".")[0] for image in glob(os.path.join(data_dir, choice, 'images', '*.tif'))
        ]

        # Create random splits per dataset.
        train_ids, test_ids = train_test_split(image_names, test_size=0.2)  # 20% for test split.
        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% for val split.
        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}

        df = pd.DataFrame.from_dict([split_ids])
        df.to_csv(csv_path, index=False)
        split_list = split_ids[split]

    return split_list


def _preprocess_dataset(data_dir):
    """Convert the extracted `.npy` files into per-stain image and label TIFFs.

    Every `lynsec*` folder inside `data_dir` is processed: the folder named "lynsec 1" is
    written to `<data_dir>/ihc`, all others to `<data_dir>/h&e`. For each array the first
    three channels are stored as the image and the fourth channel as the label.

    Args:
        data_dir: Folder into which the archive was extracted.
    """
    # H&E images with these names have mismatching labels and are left out.
    # Built once here instead of once per file.
    mismatching_names = {f"{i}_l2" for i in range(35)}

    data_dirs = natsorted(glob(os.path.join(data_dir, "lynsec*")))
    for _dir in data_dirs:
        if os.path.basename(_dir) == "lynsec 1":
            target_dir = "ihc"
        else:
            target_dir = "h&e"

        image_dir = os.path.join(data_dir, target_dir, "images")
        label_dir = os.path.join(data_dir, target_dir, "labels")
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(label_dir, exist_ok=True)

        paths = natsorted(glob(os.path.join(_dir, "*.npy")))
        for fpath in tqdm(paths, desc="Preprocessing inputs"):
            fname = Path(fpath).stem

            # Decide on skipping before touching the file, so excluded arrays are never loaded.
            if target_dir == "h&e" and fname in mismatching_names:
                continue

            darray = np.load(fpath)

            raw = darray[..., :3]
            labels = darray[..., 3]

            imageio.imwrite(os.path.join(image_dir, f"{fname}.tif"), raw, compression="zlib")
            imageio.imwrite(os.path.join(label_dir, f"{fname}.tif"), labels, compression="zlib")


def get_lynsec_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the LyNSeC dataset for nucleus segmentation.

    The archive is stored at `<path>/lynsec.zip`, extracted into `<path>/data` and then
    preprocessed into per-stain image / label TIFFs. If `<path>/data` already exists it
    is returned directly without downloading or preprocessing again.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the downloaded data.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    # Only the parent folder is created before the download. Creating `data_dir` here
    # would leave an empty folder behind if the download fails, and the early return
    # above would then treat the dataset as available on every later call.
    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "lynsec.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    os.makedirs(data_dir, exist_ok=True)
    util.unzip(zip_path=zip_path, dst=data_dir)

    _preprocess_dataset(data_dir)

    return data_dir


def get_lynsec_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the LyNSeC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split. If None, all images are returned.
        choice: The choice of dataset. If None, both 'ihc' and 'h&e' are used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.
    """
    data_dir = get_lynsec_data(path, download)

    if choice is None:
        choice = "*"

    raw_paths = natsorted(glob(os.path.join(data_dir, choice, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(data_dir, choice, "labels", "*.tif")))

    if split is not None:
        # If the user did not choose a dataset, we make splits for both datasets.
        choices = ["h&e", "ihc"] if choice == "*" else [choice]

        # The split names are kept per stain, so that a name from one stain can never
        # select an equally named file of the other stain. Sets give O(1) lookups.
        split_ids = {name: set(_create_split_csv(path, data_dir, split, name)) for name in choices}

        def _in_split(file_path):
            # Files are stored as '<data_dir>/<stain>/<images|labels>/<name>.tif'.
            stain = os.path.basename(os.path.dirname(os.path.dirname(file_path)))
            return os.path.basename(file_path).split(".")[0] in split_ids.get(stain, ())

        # Filter paths which are valid for the chosen split.
        raw_paths = [p for p in raw_paths if _in_split(p)]
        label_paths = [p for p in label_paths if _in_split(p)]

    return raw_paths, label_paths


def get_lynsec_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the LyNSeC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        choice: The choice of dataset.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_lynsec_paths(path, split, choice, download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )


def get_lynsec_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the LyNSeC dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        choice: The choice of dataset.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_lynsec_dataset(path, patch_shape, split, choice, resize_inputs, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
'https://zenodo.org/records/8065174/files/lynsec.zip'
CHECKSUM =
'14b9b5a9c39cb41afc7f31de5a995cefff0947c215e14ab9c7a463f32fbbf4b6'
def
get_lynsec_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_lynsec_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the LyNSeC dataset for nucleus segmentation.

    The archive is stored at `<path>/lynsec.zip`, extracted into `<path>/data` and then
    preprocessed. If `<path>/data` already exists it is returned directly without
    downloading or preprocessing again.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the downloaded data.
    """
    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    # Only the parent folder is created before the download. Creating `data_dir` here
    # would leave an empty folder behind if the download fails, and the early return
    # above would then treat the dataset as available on every later call.
    os.makedirs(path, exist_ok=True)

    zip_path = os.path.join(path, "lynsec.zip")
    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)

    os.makedirs(data_dir, exist_ok=True)
    util.unzip(zip_path=zip_path, dst=data_dir)

    _preprocess_dataset(data_dir)

    return data_dir
Download the LyNSeC dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the downloaded data.
def
get_lynsec_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'val', 'test']] = None, choice: Optional[Literal['ihc', 'h&e']] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_lynsec_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the LyNSeC data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split. If None, all images are returned.
        choice: The choice of dataset. If None, both 'ihc' and 'h&e' are used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.
    """
    data_dir = get_lynsec_data(path, download)

    if choice is None:
        choice = "*"

    raw_paths = natsorted(glob(os.path.join(data_dir, choice, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(data_dir, choice, "labels", "*.tif")))

    if split is not None:
        # If the user did not choose a dataset, we make splits for both datasets.
        choices = ["h&e", "ihc"] if choice == "*" else [choice]

        # The split names are kept per stain, so that a name from one stain can never
        # select an equally named file of the other stain. Sets give O(1) lookups.
        split_ids = {name: set(_create_split_csv(path, data_dir, split, name)) for name in choices}

        def _in_split(file_path):
            # Files are stored as '<data_dir>/<stain>/<images|labels>/<name>.tif'.
            stain = os.path.basename(os.path.dirname(os.path.dirname(file_path)))
            return os.path.basename(file_path).split(".")[0] in split_ids.get(stain, ())

        # Filter paths which are valid for the chosen split.
        raw_paths = [p for p in raw_paths if _in_split(p)]
        label_paths = [p for p in label_paths if _in_split(p)]

    return raw_paths, label_paths
Get paths to the LyNSeC data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- choice: The choice of dataset.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the image data. List of filepaths to the label data.
def
get_lynsec_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, choice: Optional[Literal['ihc', 'h&e']] = None, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_lynsec_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the LyNSeC segmentation dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        choice: The choice of dataset.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_files, mask_files = get_lynsec_paths(path=path, split=split, choice=choice, download=download)

    if resize_inputs:
        # The resize transform is configured for RGB inputs at the requested patch shape.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # Settings fixed by this dataset; user supplied kwargs are passed alongside them.
    fixed_settings = dict(
        raw_paths=image_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
    )
    return torch_em.default_segmentation_dataset(**fixed_settings, **kwargs)
Get the LyNSeC dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- choice: The choice of dataset.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
.
Returns:
The segmentation dataset.
def
get_lynsec_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, choice: Optional[Literal['ihc', 'h&e']] = None, resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_lynsec_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    choice: Optional[Literal['ihc', 'h&e']] = None,
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the LyNSeC dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        choice: The choice of dataset.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    lynsec_dataset = get_lynsec_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        choice=choice,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(lynsec_dataset, batch_size, **dataloader_kwargs)
Get the LyNSeC dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- choice: The choice of dataset.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset
or for the PyTorch DataLoader.
Returns:
The DataLoader.