torch_em.data.datasets.histopathology.bcss
This dataset contains annotations for tissue region segmentation in breast cancer histopathology images.
NOTE: The tissue labels contain multiple semantic classes. Their label indices are:
- 0: outside_roi (~background)
- 1: tumor
- 2: stroma
- 3: lymphocytic_infiltrate
- 4: necrosis_or_debris
- 5: glandular_secretions
- 6: blood
- 7: exclude
- 8: metaplasia_NOS
- 9: fat
- 10: plasma_cells
- 11: other_immune_infiltrate
- 12: mucoid_material
- 13: normal_acinus_or_duct
- 14: lymphatics
- 15: undetermined
- 16: nerve
- 17: skin_adnexa
- 18: blood_vessel
- 19: angioinvasion
- 20: dcis
- 21: other
This dataset is from https://bcsegmentation.grand-challenge.org/BCSS/. Please cite this paper (https://doi.org/10.1093/bioinformatics/btz083) if you use this dataset for a publication.
"""This dataset contains annotations for tissue region segmentation in
breast cancer histopathology images.

NOTE: There are multiple semantic instances in tissue labels. Below mentioned are their respective index details:
    - 0: outside_roi (~background)
    - 1: tumor
    - 2: stroma
    - 3: lymphocytic_infiltrate
    - 4: necrosis_or_debris
    - 5: glandular_secretions
    - 6: blood
    - 7: exclude
    - 8: metaplasia_NOS
    - 9: fat
    - 10: plasma_cells
    - 11: other_immune_infiltrate
    - 12: mucoid_material
    - 13: normal_acinus_or_duct
    - 14: lymphatics
    - 15: undetermined
    - 16: nerve
    - 17: skin_adnexa
    - 18: blood_vessel
    - 19: angioinvasion
    - 20: dcis
    - 21: other

This dataset is from https://bcsegmentation.grand-challenge.org/BCSS/.
Please cite this paper (https://doi.org/10.1093/bioinformatics/btz083) if you use this dataset for a publication.
"""

import os
import shutil
from glob import glob
from pathlib import Path
from typing import Union, Optional, List, Tuple

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = "https://drive.google.com/drive/folders/1zqbdkQF8i5cEmZOGmbdQm-EP8dRYtvss?usp=sharing"


# TODO
CHECKSUM = None


# File stems (without extension) of the images that are assigned to the test split.
TEST_LIST = [
    "TCGA-A2-A0SX-DX1_xmin53791_ymin56683_MPP-0.2500", "TCGA-BH-A0BG-DX1_xmin64019_ymin24975_MPP-0.2500",
    "TCGA-AR-A1AI-DX1_xmin38671_ymin10616_MPP-0.2500", "TCGA-E2-A574-DX1_xmin54962_ymin47475_MPP-0.2500",
    "TCGA-GM-A3XL-DX1_xmin29910_ymin15820_MPP-0.2500", "TCGA-E2-A14X-DX1_xmin88836_ymin66393_MPP-0.2500",
    "TCGA-A2-A04P-DX1_xmin104246_ymin48517_MPP-0.2500", "TCGA-E2-A14N-DX1_xmin21383_ymin66838_MPP-0.2500",
    "TCGA-EW-A1OV-DX1_xmin126026_ymin65132_MPP-0.2500", "TCGA-S3-AA15-DX1_xmin55486_ymin28926_MPP-0.2500",
    "TCGA-LL-A5YO-DX1_xmin36631_ymin44396_MPP-0.2500", "TCGA-GI-A2C9-DX1_xmin20882_ymin11843_MPP-0.2500",
    "TCGA-BH-A0BW-DX1_xmin42346_ymin30843_MPP-0.2500", "TCGA-E2-A1B6-DX1_xmin16266_ymin50634_MPP-0.2500",
    "TCGA-AO-A0J2-DX1_xmin33561_ymin14515_MPP-0.2500"
]


def _download_bcss_dataset(path, download):
    """Current recommendation:
        - download the folder from URL manually
        - use the consortium's git repo to download the dataset (https://github.com/PathologyDataScience/BCSS)

    Raises:
        NotImplementedError: Always; the automatic download is disabled for now.
    """
    raise NotImplementedError("Please download the dataset using the drive link / git repo directly")

    # NOTE: the call below is intentionally unreachable and kept for reference until the FIXME is resolved.
    # FIXME: limitation for the installation below:
    #   - only downloads first 50 files - due to `gdown`'s download folder function
    #   - (optional) clone their git repo to download their data
    util.download_source_gdrive(path=path, url=URL, download=download, checksum=CHECKSUM, download_type="folder")


def _get_image_and_label_paths(path):
    """Collect the sorted image and mask filepaths from either of the two supported raw data layouts.

    Raises:
        ValueError: If neither of the expected image directories exists below `path`.
    """
    # when downloading the files from `URL`, the input images are stored under `rgbs_colorNormalized`
    # when getting the files from the git repo's command line feature, the input images are stored under `images`
    if os.path.exists(os.path.join(path, "images")):
        image_paths = sorted(glob(os.path.join(path, "images", "*")))
        label_paths = sorted(glob(os.path.join(path, "masks", "*")))
    elif os.path.exists(os.path.join(path, "0_Public-data-Amgad2019_0.25MPP", "rgbs_colorNormalized")):
        image_paths = sorted(glob(os.path.join(path, "0_Public-data-Amgad2019_0.25MPP", "rgbs_colorNormalized", "*")))
        label_paths = sorted(glob(os.path.join(path, "0_Public-data-Amgad2019_0.25MPP", "masks", "*")))
    else:
        raise ValueError(
            "Please check the image directory. "
            "If downloaded from gdrive, it's named \"rgbs_colorNormalized\", if from github it's named \"images\""
        )

    return image_paths, label_paths


def get_bcss_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the BCSS dataset and sort it into `train` and `test` folders.

    If both `<path>/train` and `<path>/test` already exist, nothing is done.
    Otherwise every image / mask pair is copied to `<path>/test` when the image stem
    is listed in `TEST_LIST`, and to `<path>/train` otherwise.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    if download:
        _download_bcss_dataset(path, download)

    # Only the existence of the two split folders is checked, not their contents.
    if all(os.path.exists(os.path.join(path, name)) for name in ("train", "test")):
        return

    raw_files, mask_files = _get_image_and_label_paths(path)

    targets = {
        name: (os.path.join(path, name, "images"), os.path.join(path, name, "masks"))
        for name in ("train", "test")
    }
    for image_dir, mask_dir in targets.values():
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(mask_dir, exist_ok=True)

    for raw_file, mask_file in zip(raw_files, mask_files):
        split_name = "test" if Path(raw_file).stem in TEST_LIST else "train"
        image_dir, mask_dir = targets[split_name]
        # The originals are copied (not moved), so the raw download stays untouched.
        shutil.copy(src=raw_file, dst=os.path.join(image_dir, os.path.basename(raw_file)))
        shutil.copy(src=mask_file, dst=os.path.join(mask_dir, os.path.basename(mask_file)))


def get_bcsss_paths(
    path: Union[os.PathLike, str], split: Optional[str] = None, val_fraction: float = 0.2, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the BCSS data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
            If None, the files of the `train` and `test` folders are returned together.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_bcss_data(path, download)

    if split is None:
        # Only look into the generated split folders. A bare `*` wildcard would also match
        # `0_Public-data-Amgad2019_0.25MPP/masks` of the gdrive layout and duplicate the labels.
        split_names = ("train", "test")
        image_paths = sorted(
            p for name in split_names for p in glob(os.path.join(path, name, "images", "*"))
        )
        label_paths = sorted(
            p for name in split_names for p in glob(os.path.join(path, name, "masks", "*"))
        )
    else:
        assert split in ["train", "val", "test"], "Please choose from the available `train` / `val` / `test` splits"
        if split == "test":
            image_paths = sorted(glob(os.path.join(path, "test", "images", "*")))
            label_paths = sorted(glob(os.path.join(path, "test", "masks", "*")))
        else:
            image_paths = sorted(glob(os.path.join(path, "train", "images", "*")))
            label_paths = sorted(glob(os.path.join(path, "train", "masks", "*")))

            # 'train' and 'val' are both carved out of the train folder; the fixed seed keeps them disjoint
            # and reproducible across calls.
            (train_image_paths, val_image_paths,
             train_label_paths, val_label_paths) = train_test_split(
                image_paths, label_paths, test_size=val_fraction, random_state=42
            )

            image_paths = train_image_paths if split == "train" else val_image_paths
            label_paths = train_label_paths if split == "train" else val_label_paths

    assert len(image_paths) == len(label_paths)

    return image_paths, label_paths


def get_bcss_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Optional[str] = None,
    val_fraction: float = 0.2,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    **kwargs
) -> Dataset:
    """Get the BCSS dataset for breast cancer tissue segmentation in histopathology.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, label_paths = get_bcsss_paths(path, split, val_fraction, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        is_seg_dataset=False,
        **kwargs
    )


def get_bcss_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Optional[str] = None,
    val_fraction: float = 0.2,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    **kwargs
) -> DataLoader:
    """Get the BCSS dataloader for breast cancer tissue segmentation in histopathology.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_bcss_dataset(
        path, patch_shape, split, val_fraction, download=download, label_dtype=label_dtype, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_bcss_data(path: Union[os.PathLike, str], download: bool = False):
    """Download the BCSS dataset and sort it into `train` and `test` folders.

    If both `<path>/train` and `<path>/test` already exist, nothing is done.
    Otherwise every image / mask pair is copied to `<path>/test` when the image stem
    is listed in `TEST_LIST`, and to `<path>/train` otherwise.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    if download:
        _download_bcss_dataset(path, download)

    # Only the existence of the two split folders is checked, not their contents.
    if all(os.path.exists(os.path.join(path, name)) for name in ("train", "test")):
        return

    raw_files, mask_files = _get_image_and_label_paths(path)

    # Map each split name to its (image folder, mask folder) pair.
    targets = {
        name: (os.path.join(path, name, "images"), os.path.join(path, name, "masks"))
        for name in ("train", "test")
    }
    for image_dir, mask_dir in targets.values():
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(mask_dir, exist_ok=True)

    for raw_file, mask_file in zip(raw_files, mask_files):
        split_name = "test" if Path(raw_file).stem in TEST_LIST else "train"
        image_dir, mask_dir = targets[split_name]
        # The originals are copied (not moved), so the raw download stays untouched.
        shutil.copy(src=raw_file, dst=os.path.join(image_dir, os.path.basename(raw_file)))
        shutil.copy(src=mask_file, dst=os.path.join(mask_dir, os.path.basename(mask_file)))
Download the BCSS dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- download: Whether to download the data if it is not present.
def get_bcsss_paths(
    path: Union[os.PathLike, str], split: Optional[str] = None, val_fraction: float = 0.2, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the BCSS data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
            If None, the files of the `train` and `test` folders are returned together.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_bcss_data(path, download)

    if split is None:
        # Only look into the generated split folders. A bare `*` wildcard would also match
        # `0_Public-data-Amgad2019_0.25MPP/masks` of the gdrive layout and duplicate the labels,
        # which then trips the length check at the end of this function.
        split_names = ("train", "test")
        image_paths = sorted(
            p for name in split_names for p in glob(os.path.join(path, name, "images", "*"))
        )
        label_paths = sorted(
            p for name in split_names for p in glob(os.path.join(path, name, "masks", "*"))
        )
    else:
        assert split in ["train", "val", "test"], "Please choose from the available `train` / `val` / `test` splits"
        if split == "test":
            image_paths = sorted(glob(os.path.join(path, "test", "images", "*")))
            label_paths = sorted(glob(os.path.join(path, "test", "masks", "*")))
        else:
            image_paths = sorted(glob(os.path.join(path, "train", "images", "*")))
            label_paths = sorted(glob(os.path.join(path, "train", "masks", "*")))

            # 'train' and 'val' are both carved out of the train folder; the fixed seed keeps the
            # partition identical across calls, so the two splits stay disjoint.
            (train_image_paths, val_image_paths,
             train_label_paths, val_label_paths) = train_test_split(
                image_paths, label_paths, test_size=val_fraction, random_state=42
            )

            image_paths = train_image_paths if split == "train" else val_image_paths
            label_paths = train_label_paths if split == "train" else val_label_paths

    assert len(image_paths) == len(label_paths)

    return image_paths, label_paths
Get paths to the BCSS data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use for the dataset. Either 'train', 'val' or 'test'.
- val_fraction: The fraction of data to be considered for validation split.
- download: Whether to download the data if it is not present.
Returns:
A tuple of two lists: the filepaths for the image data and the filepaths for the label data.
def get_bcss_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Optional[str] = None,
    val_fraction: float = 0.2,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    **kwargs
) -> Dataset:
    """Build the BCSS segmentation dataset (breast cancer tissue regions in histopathology).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, mask_files = get_bcsss_paths(path, split, val_fraction, download)

    # Arguments fixed by this dataset; `raw_key` / `label_key` are None because
    # the inputs are passed as individual file paths.
    fixed_args = dict(
        raw_paths=raw_files,
        raw_key=None,
        label_paths=mask_files,
        label_key=None,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        is_seg_dataset=False,
    )
    return torch_em.default_segmentation_dataset(**fixed_args, **kwargs)
Get the BCSS dataset for breast cancer tissue segmentation in histopathology.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The split to use for the dataset. Either 'train', 'val' or 'test'.
- val_fraction: The fraction of data to be considered for validation split.
- download: Whether to download the data if it is not present.
- label_dtype: The datatype of labels.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_bcss_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Optional[str] = None,
    val_fraction: float = 0.2,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    **kwargs
) -> DataLoader:
    """Build the BCSS dataloader (breast cancer tissue regions in histopathology).

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`;
            whatever `util.split_kwargs` does not assign to the dataset is forwarded
            to `torch_em.get_data_loader`.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from the remaining (loader) arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    bcss_dataset = get_bcss_dataset(
        path,
        patch_shape,
        split,
        val_fraction,
        download=download,
        label_dtype=label_dtype,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(bcss_dataset, batch_size, **dataloader_kwargs)
Get the BCSS dataloader for breast cancer tissue segmentation in histopathology.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- split: The split to use for the dataset. Either 'train', 'val' or 'test'.
- val_fraction: The fraction of data to be considered for validation split.
- download: Whether to download the data if it is not present.
- label_dtype: The datatype of labels.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The DataLoader.