torch_em.data.datasets.histopathology.cryonuseg
The CryoNuSeg dataset contains annotations for nucleus segmentation in cryosectioned H&E stained histological images of 10 different organs.
This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2021.104349. Please cite it if you use this dataset for your research.
"""The CryoNuSeg dataset contains annotations for nucleus segmentation
in cryosectioned H&E stained histological images of 10 different organs.

This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2021.104349.
Please cite it if you use this dataset for your research.
"""

import os
from glob import glob
from natsort import natsorted
from typing import Union, Tuple, Literal, List

import json
import pandas as pd
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


def _create_split_csv(path, data_dir, split):
    """Return the image names belonging to `split`, creating the split file on first use.

    The splits are stored in `<path>/cryonuseg_split.csv` as a single row with the
    columns "train", "val" and "test". If the file exists it is reused, otherwise the
    '*.tif' files in `<path>/<data_dir>` are split randomly and the result is written to disk.

    Args:
        path: Root folder of the dataset; the split file is stored directly inside it.
        data_dir: Folder (relative to `path`) whose '*.tif' files define the image names.
        split: The name of the split to return ("train", "val" or "test").

    Returns:
        The list of image names (without file extension) of the requested split.
    """
    csv_path = os.path.join(path, 'cryonuseg_split.csv')
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        # Each cell holds the string form of a Python list (e.g. "['a', 'b']").
        # Swapping the quote characters turns it into valid JSON.
        return json.loads(df.iloc[0][split].replace("'", '"'))

    print(f"Creating a new split file at '{csv_path}'.")
    # Use splitext so that only the file extension is dropped, even if a name contains dots.
    image_names = [
        os.path.splitext(os.path.basename(image))[0] for image in glob(os.path.join(path, data_dir, '*.tif'))
    ]

    # Create random splits per dataset. No random state is passed, so the assignment is
    # random the first time and stays fixed afterwards because it is read back from the csv.
    train_ids, test_ids = train_test_split(image_names, test_size=0.2)  # 20% for test split.
    train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% for val split.
    split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}

    pd.DataFrame.from_dict([split_ids]).to_csv(csv_path, index=False)

    return split_ids[split]


def get_cryonuseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CryoNuSeg dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The folder where the data is downloaded and preprocessed.
    """
    data_dir = os.path.join(path, "tissue images")
    if os.path.exists(data_dir):
        # The archive was already downloaded and extracted.
        return data_dir

    os.makedirs(path, exist_ok=True)
    util.download_source_kaggle(
        path=path, dataset_name="ipateam/segmentation-of-nuclei-in-cryosectioned-he-images", download=download
    )

    zip_path = os.path.join(path, "segmentation-of-nuclei-in-cryosectioned-he-images.zip")
    util.unzip(zip_path=zip_path, dst=path)

    return data_dir


def get_cryonuseg_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    rater_choice: Literal["b1", "b2", "b3"] = "b1",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the CryoNuSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        rater_choice: The choice of annotator.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        ValueError: If `rater_choice` is not one of "b1", "b2" or "b3".
    """
    # Resolve the annotator folder first, so that an invalid choice fails before any download.
    # For "b2" and "b3" the folder name is repeated twice in the relative path.
    if rater_choice == "b1":
        label_dir = "Annotator 1 (biologist)/"
    elif rater_choice == "b2":
        label_dir = "Annotator 1 (biologist second round of manual marks up)/" * 2
    elif rater_choice == "b3":
        label_dir = "Annotator 2 (bioinformatician)/" * 2
    else:
        raise ValueError(f"'{rater_choice}' is not a valid rater choice.")

    # Point to the instance labels folder
    label_dir += "label masks modify"

    data_dir = get_cryonuseg_data(path, download)
    split_list = _create_split_csv(path, label_dir, split)

    # Get the raw and label paths
    label_paths = natsorted([os.path.join(path, label_dir, f'{fname}.tif') for fname in split_list])
    raw_paths = natsorted([os.path.join(data_dir, f'{fname}.tif') for fname in split_list])

    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0, \
        f"No image and label pairs were found for the '{split}' split."

    return raw_paths, label_paths


def get_cryonuseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    rater: Literal["b1", "b2", "b3"] = "b1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CryoNuSeg dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of annotator.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, label_paths = get_cryonuseg_paths(path, split, rater_choice=rater, download=download)

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )


def get_cryonuseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    rater: Literal["b1", "b2", "b3"] = "b1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CryoNuSeg dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of annotator.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cryonuseg_dataset(
        path, patch_shape, split, rater=rater, resize_inputs=resize_inputs, download=download, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_cryonuseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_cryonuseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the CryoNuSeg dataset for nucleus segmentation.

    If the "tissue images" folder already exists inside `path`, nothing is downloaded
    and the folder is returned right away.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.

    Returns:
        The folder where the data is downloaded and preprocessed.
    """
    data_dir = os.path.join(path, "tissue images")
    if os.path.exists(data_dir):
        # The archive was already downloaded and extracted.
        return data_dir

    os.makedirs(path, exist_ok=True)
    util.download_source_kaggle(
        path=path, dataset_name="ipateam/segmentation-of-nuclei-in-cryosectioned-he-images", download=download
    )

    zip_path = os.path.join(path, "segmentation-of-nuclei-in-cryosectioned-he-images.zip")
    util.unzip(zip_path=zip_path, dst=path)

    return data_dir
Download the CryoNuSeg dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
Returns:
The folder where the data is downloaded and preprocessed.
def get_cryonuseg_paths(path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], rater_choice: Literal['b1', 'b2', 'b3'] = 'b1', download: bool = False) -> Tuple[List[str], List[str]]:
def get_cryonuseg_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    rater_choice: Literal["b1", "b2", "b3"] = "b1",
    download: bool = False,
) -> Tuple[List[str], List[str]]:
    """Get paths to the CryoNuSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        rater_choice: The choice of annotator.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        ValueError: If `rater_choice` is not one of "b1", "b2" or "b3".
    """
    # Resolve the annotator folder first, so that an invalid choice fails before any download.
    # For "b2" and "b3" the folder name is repeated twice in the relative path.
    if rater_choice == "b1":
        label_dir = "Annotator 1 (biologist)/"
    elif rater_choice == "b2":
        label_dir = "Annotator 1 (biologist second round of manual marks up)/" * 2
    elif rater_choice == "b3":
        label_dir = "Annotator 2 (bioinformatician)/" * 2
    else:
        raise ValueError(f"'{rater_choice}' is not a valid rater choice.")

    # Point to the instance labels folder
    label_dir += "label masks modify"

    data_dir = get_cryonuseg_data(path, download)
    split_list = _create_split_csv(path, label_dir, split)

    # Get the raw and label paths
    label_paths = natsorted([os.path.join(path, label_dir, f'{fname}.tif') for fname in split_list])
    raw_paths = natsorted([os.path.join(data_dir, f'{fname}.tif') for fname in split_list])

    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0, \
        f"No image and label pairs were found for the '{split}' split."

    return raw_paths, label_paths
Get paths to the CryoNuSeg data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- rater_choice: The choice of annotator.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the image data. List of filepaths to the label data.
def get_cryonuseg_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], rater: Literal['b1', 'b2', 'b3'] = 'b1', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_cryonuseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    rater: Literal["b1", "b2", "b3"] = "b1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the CryoNuSeg dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of annotator.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # `rater` is forwarded to the `rater_choice` parameter of `get_cryonuseg_paths`.
    raw_paths, label_paths = get_cryonuseg_paths(path, split, rater_choice=rater, download=download)

    if resize_inputs:
        # The images are treated as RGB by the resize transformation.
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        is_seg_dataset=False,
        patch_shape=patch_shape,
        **kwargs
    )
Get the CryoNuSeg dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- rater: The choice of annotator.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_cryonuseg_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], rater: Literal['b1', 'b2', 'b3'] = 'b1', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_cryonuseg_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    rater: Literal["b1", "b2", "b3"] = "b1",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CryoNuSeg dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        rater: The choice of annotator.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments meant for the dataset from those meant for the loader.
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_cryonuseg_dataset(
        path, patch_shape, split, rater=rater, resize_inputs=resize_inputs, download=download, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the CryoNuSeg dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- rater: The choice of annotator.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.