torch_em.data.datasets.histopathology.monusac
This dataset consists of annotations for nucleus segmentation in H&E stained tissue images derived from four different organs.
This dataset comes from https://monusac-2020.grand-challenge.org/Data/.
This dataset is from the publication https://doi.org/10.1109/TMI.2021.3085712. Please cite it if you use this dataset in your research.
1"""This dataset consists annotations for nucleus segmentation in 2H&E stained tissue images derived from four different organs. 3 4This dataset comes from https://monusac-2020.grand-challenge.org/Data/. 5 6This dataset is from the publication https://doi.org/10.1109/TMI.2021.3085712. 7Please cite it if you use this dataset in your research. 8""" 9 10import os 11import shutil 12from glob import glob 13from tqdm import tqdm 14from pathlib import Path 15from typing import Optional, List, Union, Literal, Tuple 16 17import imageio.v3 as imageio 18 19from torch.utils.data import Dataset, DataLoader 20 21import torch_em 22 23from .. import util 24 25 26URL = { 27 "train": "https://drive.google.com/uc?export=download&id=1lxMZaAPSpEHLSxGA9KKMt_r-4S8dwLhq", 28 "test": "https://drive.google.com/uc?export=download&id=1G54vsOdxWY1hG7dzmkeK3r0xz9s-heyQ" 29} 30 31 32CHECKSUM = { 33 "train": "5b7cbeb34817a8f880d3fddc28391e48d3329a91bf3adcbd131ea149a725cd92", 34 "test": "bcbc38f6bf8b149230c90c29f3428cc7b2b76f8acd7766ce9fc908fc896c2674" 35} 36 37# here's the description: https://drive.google.com/file/d/1kdOl3s6uQBRv0nToSIf1dPuceZunzL4N/view 38ORGAN_SPLITS = { 39 "train": { 40 "lung": [ 41 "TCGA-55-1594", "TCGA-69-7760", "TCGA-69-A59K", "TCGA-73-4668", "TCGA-78-7220", 42 "TCGA-86-7713", "TCGA-86-8672", "TCGA-L4-A4E5", "TCGA-MP-A4SY", "TCGA-MP-A4T7" 43 ], 44 "kidney": [ 45 "TCGA-5P-A9K0", "TCGA-B9-A44B", "TCGA-B9-A8YI", "TCGA-DW-7841", "TCGA-EV-5903", "TCGA-F9-A97G", 46 "TCGA-G7-A8LD", "TCGA-MH-A560", "TCGA-P4-AAVK", "TCGA-SX-A7SR", "TCGA-UZ-A9PO", "TCGA-UZ-A9PU" 47 ], 48 "breast": [ 49 "TCGA-A2-A0CV", "TCGA-A2-A0ES", "TCGA-B6-A0WZ", "TCGA-BH-A18T", "TCGA-D8-A1X5", 50 "TCGA-E2-A154", "TCGA-E9-A22B", "TCGA-E9-A22G", "TCGA-EW-A6SD", "TCGA-S3-AA11" 51 ], 52 "prostate": [ 53 "TCGA-EJ-5495", "TCGA-EJ-5505", "TCGA-EJ-5517", "TCGA-G9-6342", "TCGA-G9-6499", 54 "TCGA-J4-A67Q", "TCGA-J4-A67T", "TCGA-KK-A59X", "TCGA-KK-A6E0", "TCGA-KK-A7AW", 55 "TCGA-V1-A8WL", "TCGA-V1-A9O9", "TCGA-X4-A8KQ", "TCGA-YL-A9WY" 56 ] 57 }, 58 "test": { 59 "lung": [ 60 "TCGA-49-6743", "TCGA-50-6591", "TCGA-55-7570", "TCGA-55-7573", 61 "TCGA-73-4662", "TCGA-78-7152", "TCGA-MP-A4T7" 62 ], 63 "kidney": [ 64 "TCGA-2Z-A9JG", "TCGA-2Z-A9JN", "TCGA-DW-7838", "TCGA-DW-7963", 65 "TCGA-F9-A8NY", "TCGA-IZ-A6M9", "TCGA-MH-A55W" 66 ], 67 "breast": ["TCGA-A2-A04X", "TCGA-A2-A0ES", "TCGA-D8-A3Z6", "TCGA-E2-A108", "TCGA-EW-A6SB"], 68 "prostate": ["TCGA-G9-6356", "TCGA-G9-6367", "TCGA-VP-A87E", "TCGA-VP-A87H", "TCGA-X4-A8KS", "TCGA-YL-A9WL"] 69 }, 70} 71 72 73def _check_channel_consistency(path, split): 74 "The provided tif images have RGBA channels, check and remove the alpha channel" 75 all_image_path = glob(os.path.join(path, "images", split, "*.tif")) 76 for image_path in all_image_path: 77 image = imageio.imread(image_path) 78 if image.ndim == 3 and image.shape[-1] == 4: # NOTE: There are images without an alpha channel. 
79 rgb_image = image[..., :-1] # get rid of the alpha channel 80 imageio.imwrite(image_path, rgb_image) 81 82 83def _process_monusac(path, split): 84 util.unzip(os.path.join(path, f"monusac_{split}.zip"), path) 85 86 # assorting the images into expected dir; 87 # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir 88 root_img_save_dir = os.path.join(path, "images", split) 89 root_label_save_dir = os.path.join(path, "labels", split) 90 91 os.makedirs(root_img_save_dir, exist_ok=True) 92 os.makedirs(root_label_save_dir, exist_ok=True) 93 94 all_patient_dir = sorted(glob(os.path.join(path, "MoNuSAC*", "*"))) 95 96 for patient_dir in tqdm(all_patient_dir, desc=f"Converting {split} inputs for all patients"): 97 all_img_dir = sorted(glob(os.path.join(patient_dir, "*.tif"))) 98 all_xml_label_dir = sorted(glob(os.path.join(patient_dir, "*.xml"))) 99 100 if len(all_img_dir) != len(all_xml_label_dir): 101 _convert_missing_tif_from_svs(patient_dir) 102 all_img_dir = sorted(glob(os.path.join(patient_dir, "*.tif"))) 103 104 assert len(all_img_dir) == len(all_xml_label_dir) 105 106 for img_path, xml_label_path in zip(all_img_dir, all_xml_label_dir): 107 desired_label_shape = imageio.imread(img_path).shape[:-1] 108 109 img_id = os.path.split(img_path)[-1] 110 dst = os.path.join(root_img_save_dir, img_id) 111 shutil.move(src=img_path, dst=dst) 112 113 _label = util.generate_labeled_array_from_xml(shape=desired_label_shape, xml_file=xml_label_path) 114 _fileid = img_id.split(".")[0] 115 imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label) 116 117 shutil.rmtree(glob(os.path.join(path, "MoNuSAC*"))[0]) 118 119 120def _convert_missing_tif_from_svs(patient_dir): 121 """This function activates when we see some missing tiff inputs (and converts svs to tiff) 122 123 Cause: Happens only in the test split, maybe while converting the data, some were missed 124 Fix: We have the original svs scans. We convert the svs scans to tiff 125 """ 126 all_svs_dir = sorted(glob(os.path.join(patient_dir, "*.svs"))) 127 for svs_path in all_svs_dir: 128 save_tif_path = os.path.splitext(svs_path)[0] + ".tif" 129 if not os.path.exists(save_tif_path): 130 img_array = util.convert_svs_to_array(svs_path) 131 # the array from svs scans are supposed to be RGB images 132 assert img_array.shape[-1] == 3 133 imageio.imwrite(save_tif_path, img_array) 134 135 136def get_patient_id(path, split_wrt="-01Z-00-"): 137 """Gets us the patient id in the expected format 138 Input Names: "TCGA-<XX>-<XXXX>-01z-00-DX<X>-(<X>, <00X>).tif" (example: TCGA-2Z-A9JG-01Z-00-DX1_1.tif) 139 Expected: "TCGA-<XX>-<XXXX>" (example: TCGA-2Z-A9JG) 140 """ 141 patient_image_id = Path(path).stem 142 patient_id = patient_image_id.split(split_wrt)[0] 143 return patient_id 144 145 146def get_monusac_data(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False): 147 """Download the MoNuSAC dataset. 148 149 Args: 150 path: Filepath to a folder where the downloaded data will be saved. 151 split: The split to use for the dataset. Either 'train' or 'test'. 152 download: Whether to download the data if it is not present. 
153 """ 154 assert split in ["train", "test"], "Please choose from train/test" 155 156 # check if we have extracted the images and labels already 157 im_path = os.path.join(path, "images", split) 158 label_path = os.path.join(path, "labels", split) 159 if os.path.exists(im_path) and os.path.exists(label_path): 160 return 161 162 os.makedirs(path, exist_ok=True) 163 zip_path = os.path.join(path, f"monusac_{split}.zip") 164 util.download_source_gdrive(zip_path, URL[split], download=download, checksum=CHECKSUM[split]) 165 166 _process_monusac(path, split) 167 168 _check_channel_consistency(path, split) 169 170 171def get_monusac_paths( 172 path: Union[os.PathLike, str], 173 split: Literal['train', 'val'], 174 organ_type: Optional[List[str]] = None, 175 download: bool = False 176) -> Tuple[List[str], List[str]]: 177 """Get paths to MoNuSAC data. 178 179 Args: 180 path: Filepath to a folder where the downloaded data will be saved. 181 split: The split to use for the dataset. Either 'train' or 'test'. 182 organ_type: The choice of organ type. 183 download: Whether to download the data if it is not present. 184 185 Returns: 186 List of filepaths to the image data. 187 List of filepaths to the label data. 188 """ 189 get_monusac_data(path, split, download) 190 191 image_paths = sorted(glob(os.path.join(path, "images", split, "*"))) 192 label_paths = sorted(glob(os.path.join(path, "labels", split, "*"))) 193 194 if organ_type is not None: 195 # get all patients for multiple organ selection 196 all_organ_splits = sum([ORGAN_SPLITS[split][o] for o in organ_type], []) 197 198 image_paths = [_path for _path in image_paths if get_patient_id(_path) in all_organ_splits] 199 label_paths = [_path for _path in label_paths if get_patient_id(_path) in all_organ_splits] 200 201 assert len(image_paths) == len(label_paths) 202 203 return image_paths, label_paths 204 205 206def get_monusac_dataset( 207 path: Union[os.PathLike, str], 208 patch_shape: Tuple[int, ...], 209 split: Literal['train', 'test'], 210 organ_type: Optional[List[str]] = None, 211 download: bool = False, 212 offsets: Optional[List[List[int]]] = None, 213 boundaries: bool = False, 214 binary: bool = False, 215 resize_inputs: bool = False, 216 **kwargs 217) -> Dataset: 218 """Get the MoNuSAC dataset for nucleus segmentation in H&E stained tissue images. 219 220 Args: 221 path: Filepath to a folder where the downloaded data will be saved. 222 patch_shape: The patch shape to use for training. 223 split: The split to use for the dataset. Either 'train' or 'test'. 224 organ_type: The choice of organ type. 225 download: Whether to download the data if it is not present. 226 offsets: Offset values for affinity computation used as target. 227 boundaries: Whether to compute boundaries as the target. 228 binary: Whether to use a binary segmentation target. 229 resize_inputs: Whether to resize the inputs. 230 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 231 232 Returns: 233 The segmentation dataset. 
234 """ 235 image_paths, label_paths = get_monusac_paths(path, split, organ_type, download) 236 237 kwargs, _ = util.add_instance_label_transform( 238 kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets 239 ) 240 241 if resize_inputs: 242 resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True} 243 kwargs, patch_shape = util.update_kwargs_for_resize_trafo( 244 kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs 245 ) 246 247 return torch_em.default_segmentation_dataset( 248 raw_paths=image_paths, 249 raw_key=None, 250 label_paths=label_paths, 251 label_key=None, 252 patch_shape=patch_shape, 253 is_seg_dataset=False, 254 **kwargs 255 ) 256 257 258def get_monusac_loader( 259 path: Union[os.PathLike, str], 260 patch_shape: Tuple[int, ...], 261 batch_size: int, 262 split: Literal['train', 'test'], 263 organ_type: Optional[List[str]] = None, 264 download: bool = False, 265 offsets: Optional[List[List[int]]] = None, 266 boundaries: bool = False, 267 binary: bool = False, 268 resize_inputs: bool = False, 269 **kwargs 270) -> DataLoader: 271 """Get the MoNuSAC dataloader for nucleus segmentation in H&E stained tissue images. 272 273 Args: 274 path: Filepath to a folder where the downloaded data will be saved. 275 patch_shape: The patch shape to use for training. 276 batch_size: The batch size for training. 277 split: The split to use for the dataset. Either 'train' or 'test'. 278 organ_type: The choice of organ type. 279 download: Whether to download the data if it is not present. 280 offsets: Offset values for affinity computation used as target. 281 boundaries: Whether to compute boundaries as the target. 282 binary: Whether to use a binary segmentation target. 283 resize_inputs: Whether to resize the inputs. 284 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 285 286 Returns: 287 The DataLoader 288 """ 289 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 290 dataset = get_monusac_dataset( 291 path, patch_shape, split, organ_type, download, offsets, boundaries, binary, resize_inputs, **ds_kwargs 292 ) 293 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_patient_id(path, split_wrt="-01Z-00-"):
Returns the patient id in the expected format.
Input names: "TCGA-<XX>-<XXXX>-01Z-00-DX<X>-(<X>, <00X>).tif" (example: TCGA-2Z-A9JG-01Z-00-DX1_1.tif)
Expected: "TCGA-<XX>-<XXXX>" (example: TCGA-2Z-A9JG)
def get_monusac_data(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False):
Download the MoNuSAC dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use for the dataset. Either 'train' or 'test'.
- download: Whether to download the data if it is not present.
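A minimal usage sketch, assuming a hypothetical target folder ./data/monusac; after the call, the processed images and labels are plain tif files:

import os
from glob import glob

import imageio.v3 as imageio

from torch_em.data.datasets.histopathology.monusac import get_monusac_data

# download and unpack the train split if it is not present yet
get_monusac_data("./data/monusac", split="train", download=True)

# the processed data lives in images/train and labels/train
image_paths = sorted(glob(os.path.join("./data/monusac", "images", "train", "*.tif")))
label_paths = sorted(glob(os.path.join("./data/monusac", "labels", "train", "*.tif")))

image, label = imageio.imread(image_paths[0]), imageio.imread(label_paths[0])
# alpha channels have been removed, so the image is RGB and the label matches its spatial shape
assert image.shape[-1] == 3 and image.shape[:2] == label.shape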
def get_monusac_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
Get paths to MoNuSAC data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use for the dataset. Either 'train' or 'test'.
- organ_type: The choice of organ type.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the image data.
List of filepaths to the label data.
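For example, to restrict the paths to selected organs (the path is a hypothetical example; the organ names follow the ORGAN_SPLITS keys in the source above):

from torch_em.data.datasets.histopathology.monusac import get_monusac_paths

image_paths, label_paths = get_monusac_paths(
    "./data/monusac", split="train", organ_type=["lung", "kidney"], download=True
)
print(len(image_paths), len(label_paths))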
def get_monusac_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
Get the MoNuSAC dataset for nucleus segmentation in H&E stained tissue images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The split to use for the dataset. Either 'train' or 'test'.
- organ_type: The choice of organ type.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- resize_inputs: Whether to resize the inputs.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
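A short sketch of creating the dataset with boundary targets; the path and patch shape are example choices, not requirements:

from torch_em.data.datasets.histopathology.monusac import get_monusac_dataset

dataset = get_monusac_dataset(
    "./data/monusac", patch_shape=(512, 512), split="train",
    boundaries=True, download=True
)
print(len(dataset))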
def get_monusac_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
Get the MoNuSAC dataloader for nucleus segmentation in H&E stained tissue images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- split: The split to use for the dataset. Either 'train' or 'test'.
- organ_type: The choice of organ type.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- resize_inputs: Whether to resize the inputs.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
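A minimal sketch for loading training data; the path, batch size, and patch shape are example choices:

from torch_em.data.datasets.histopathology.monusac import get_monusac_loader

loader = get_monusac_loader(
    "./data/monusac", patch_shape=(512, 512), batch_size=2, split="train", download=True
)
x, y = next(iter(loader))
print(x.shape, y.shape)  # raw image patches and the corresponding label patches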