torch_em.data.datasets.histopathology.monuseg
This dataset contains annotations for nucleus segmentation in H&E stained tissue images derived from different organs.
This dataset comes from https://monuseg.grand-challenge.org/Data/.
Please cite the relevant publications from the challenge if you use this dataset in your research.
"""This dataset contains annotations for nucleus segmentation in
H&E stained tissue images derived from different organs.

This dataset comes from https://monuseg.grand-challenge.org/Data/.

Please cite the relevant publications from the challenge
if you use this dataset in your research.
"""

import os
import shutil
from tqdm import tqdm
from glob import glob
from pathlib import Path
from typing import List, Optional, Union, Tuple, Literal

import imageio.v3 as imageio

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = {
    "train": "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA",
    "test": "https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw"
}

CHECKSUM = {
    "train": "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742",
    "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a"
}

# Here is the description: https://drive.google.com/file/d/1xYyQ31CHFRnvTCTuuHdconlJCMk2SK7Z/view?usp=sharing
ORGAN_SPLITS = {
    "breast": [
        "TCGA-A7-A13E-01Z-00-DX1", "TCGA-A7-A13F-01Z-00-DX1", "TCGA-AR-A1AK-01Z-00-DX1",
        "TCGA-AR-A1AS-01Z-00-DX1", "TCGA-E2-A1B5-01Z-00-DX1", "TCGA-E2-A14V-01Z-00-DX1"
    ],
    "kidney": [
        "TCGA-B0-5711-01Z-00-DX1", "TCGA-HE-7128-01Z-00-DX1", "TCGA-HE-7129-01Z-00-DX1",
        "TCGA-HE-7130-01Z-00-DX1", "TCGA-B0-5710-01Z-00-DX1", "TCGA-B0-5698-01Z-00-DX1"
    ],
    "liver": [
        "TCGA-18-5592-01Z-00-DX1", "TCGA-38-6178-01Z-00-DX1", "TCGA-49-4488-01Z-00-DX1",
        "TCGA-50-5931-01Z-00-DX1", "TCGA-21-5784-01Z-00-DX1", "TCGA-21-5786-01Z-00-DX1"
    ],
    "prostate": [
        "TCGA-G9-6336-01Z-00-DX1", "TCGA-G9-6348-01Z-00-DX1", "TCGA-G9-6356-01Z-00-DX1",
        "TCGA-G9-6363-01Z-00-DX1", "TCGA-CH-5767-01Z-00-DX1", "TCGA-G9-6362-01Z-00-DX1"
    ],
    "bladder": ["TCGA-DK-A2I6-01A-01-TS1", "TCGA-G2-A2EK-01A-02-TSB"],
    "colon": ["TCGA-AY-A8YK-01A-01-TS1", "TCGA-NH-A8F7-01A-01-TS1"],
    "stomach": ["TCGA-KB-A93J-01A-01-TS1", "TCGA-RD-A8N9-01A-01-TS1"]
}


def _process_monuseg(path, split):
    """Unpack the downloaded archive of one split and bring it into the expected layout.

    Images are moved to `<path>/images/<split>`. Each xml annotation is converted to a label
    array with the spatial shape of its image and written as zlib-compressed tif to
    `<path>/labels/<split>`. The extracted source folders are removed afterwards.

    Args:
        path: Folder that contains `monuseg_<split>.zip`.
        split: The split to process. Either 'train' or 'test'.

    Raises:
        RuntimeError: If the number of images and xml annotations in the archive differ.
    """
    util.unzip(os.path.join(path, f"monuseg_{split}.zip"), path)

    # assorting the images into expected dir;
    # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir
    root_img_save_dir = os.path.join(path, "images", split)
    root_label_save_dir = os.path.join(path, "labels", split)

    os.makedirs(root_img_save_dir, exist_ok=True)
    os.makedirs(root_label_save_dir, exist_ok=True)

    if split == "train":
        all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
        all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
    else:
        all_img_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.tif")))
        all_xml_label_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.xml")))

    # Images and annotations are paired purely by their sorted order, so a count mismatch would
    # silently drop or mis-pair samples. This must hold even when assertions are disabled (`-O`).
    if len(all_img_dir) != len(all_xml_label_dir):
        raise RuntimeError(
            f"Found {len(all_img_dir)} images but {len(all_xml_label_dir)} annotations for the {split} split."
        )

    for img_path, xml_label_path in tqdm(
        zip(all_img_dir, all_xml_label_dir),
        desc=f"Converting {split} split to the expected format",
        total=len(all_img_dir)
    ):
        # drop the trailing axis of the image shape (assumes channel-last images)
        desired_label_shape = imageio.imread(img_path).shape[:-1]

        img_id = os.path.split(img_path)[-1]
        dst = os.path.join(root_img_save_dir, img_id)
        shutil.move(src=img_path, dst=dst)

        _label = util.generate_labeled_array_from_xml(shape=desired_label_shape, xml_file=xml_label_path)
        _fileid = img_id.split(".")[0]
        imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label, compression="zlib")

    # Clean up the extracted folders. Only directories are removed: a missing folder must not raise,
    # and a non-directory match of the pattern must never be passed to `rmtree`.
    leftovers = glob(os.path.join(path, "MoNuSeg*"))
    if split == "train":
        leftovers.extend(glob(os.path.join(path, "__MACOSX")))
    for leftover in leftovers:
        if os.path.isdir(leftover):
            shutil.rmtree(leftover)


def get_monuseg_data(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False):
    """Download the MoNuSeg dataset.

    Nothing is done if both `<path>/images/<split>` and `<path>/labels/<split>` already exist.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Raises:
        ValueError: If `split` is neither 'train' nor 'test'.
    """
    # explicit exception instead of `assert`, so that the check also runs with `python -O`
    if split not in ("train", "test"):
        raise ValueError("The split choices in MoNuSeg dataset are train/test, please choose from them")

    # check if we have extracted the images and labels already
    im_path = os.path.join(path, "images", split)
    label_path = os.path.join(path, "labels", split)
    if os.path.exists(im_path) and os.path.exists(label_path):
        return

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, f"monuseg_{split}.zip")
    util.download_source_gdrive(zip_path, URL[split], download=download, checksum=CHECKSUM[split])

    _process_monuseg(path, split)


def get_monuseg_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the MoNuSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type(s), given as keys of `ORGAN_SPLITS`.
            A single organ name is also accepted. Only supported for the 'train' split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        ValueError: If `organ_type` is passed for the 'test' split.
        KeyError: If an entry of `organ_type` is not a key of `ORGAN_SPLITS`.
    """
    # we don't have organ splits in the test dataset; reject this before any download is triggered
    if split == "test" and organ_type is not None:
        raise ValueError("The test split does not have any organ informations, please pass `organ_type=None`")

    get_monuseg_data(path, split, download)

    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))

    if split == "train" and organ_type is not None:
        # a bare string would otherwise be iterated character by character
        if isinstance(organ_type, str):
            organ_type = [organ_type]

        # get all patients for multiple organ selection
        selected_ids = {image_id for organ in organ_type for image_id in ORGAN_SPLITS[organ]}

        image_paths = [_path for _path in image_paths if Path(_path).stem in selected_ids]
        label_paths = [_path for _path in label_paths if Path(_path).stem in selected_ids]

    return image_paths, label_paths


def get_monuseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Get the MoNuSeg dataset for nucleus segmentation in H&E stained tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, label_paths = get_monuseg_paths(path, split, organ_type, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
        )

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )


def get_monuseg_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Get the MoNuSeg dataloader for nucleus segmentation in H&E stained tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_monuseg_dataset(
        path, patch_shape, split, organ_type, download, offsets, boundaries, binary, resize_inputs, **ds_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
{'train': 'https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA', 'test': 'https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw'}
CHECKSUM =
{'train': '25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742', 'test': '13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a'}
ORGAN_SPLITS =
{'breast': ['TCGA-A7-A13E-01Z-00-DX1', 'TCGA-A7-A13F-01Z-00-DX1', 'TCGA-AR-A1AK-01Z-00-DX1', 'TCGA-AR-A1AS-01Z-00-DX1', 'TCGA-E2-A1B5-01Z-00-DX1', 'TCGA-E2-A14V-01Z-00-DX1'], 'kidney': ['TCGA-B0-5711-01Z-00-DX1', 'TCGA-HE-7128-01Z-00-DX1', 'TCGA-HE-7129-01Z-00-DX1', 'TCGA-HE-7130-01Z-00-DX1', 'TCGA-B0-5710-01Z-00-DX1', 'TCGA-B0-5698-01Z-00-DX1'], 'liver': ['TCGA-18-5592-01Z-00-DX1', 'TCGA-38-6178-01Z-00-DX1', 'TCGA-49-4488-01Z-00-DX1', 'TCGA-50-5931-01Z-00-DX1', 'TCGA-21-5784-01Z-00-DX1', 'TCGA-21-5786-01Z-00-DX1'], 'prostate': ['TCGA-G9-6336-01Z-00-DX1', 'TCGA-G9-6348-01Z-00-DX1', 'TCGA-G9-6356-01Z-00-DX1', 'TCGA-G9-6363-01Z-00-DX1', 'TCGA-CH-5767-01Z-00-DX1', 'TCGA-G9-6362-01Z-00-DX1'], 'bladder': ['TCGA-DK-A2I6-01A-01-TS1', 'TCGA-G2-A2EK-01A-02-TSB'], 'colon': ['TCGA-AY-A8YK-01A-01-TS1', 'TCGA-NH-A8F7-01A-01-TS1'], 'stomach': ['TCGA-KB-A93J-01A-01-TS1', 'TCGA-RD-A8N9-01A-01-TS1']}
def
get_monuseg_data( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False):
def get_monuseg_data(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False):
    """Download the MoNuSeg dataset.

    Nothing is done if both `<path>/images/<split>` and `<path>/labels/<split>` already exist.
    Otherwise the archive of the split is fetched (subject to `download`) and converted
    to that folder layout.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.

    Raises:
        ValueError: If `split` is neither 'train' nor 'test'.
    """
    # Explicit exception instead of `assert`: assertions are stripped with `python -O`, in which
    # case an invalid split would only surface later as an opaque KeyError on `URL[split]`.
    if split not in ("train", "test"):
        raise ValueError("The split choices in MoNuSeg dataset are train/test, please choose from them")

    # check if we have extracted the images and labels already
    im_path = os.path.join(path, "images", split)
    label_path = os.path.join(path, "labels", split)
    if os.path.exists(im_path) and os.path.exists(label_path):
        return

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, f"monuseg_{split}.zip")
    util.download_source_gdrive(zip_path, URL[split], download=download, checksum=CHECKSUM[split])

    _process_monuseg(path, split)
Download the MoNuSeg dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use for the dataset. Either 'train' or 'test'.
- download: Whether to download the data if it is not present.
def
get_monuseg_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'], organ_type: Optional[List[str]] = None, download: bool = False) -> Tuple[List[str], List[str]]:
def get_monuseg_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the MoNuSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type(s), given as keys of `ORGAN_SPLITS`.
            A single organ name is also accepted. Only supported for the 'train' split.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        ValueError: If `organ_type` is passed for the 'test' split.
        KeyError: If an entry of `organ_type` is not a key of `ORGAN_SPLITS`.
    """
    # we don't have organ splits in the test dataset; reject this invalid combination
    # up front, before any download or extraction is triggered
    if split == "test" and organ_type is not None:
        raise ValueError("The test split does not have any organ informations, please pass `organ_type=None`")

    get_monuseg_data(path, split, download)

    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))

    if split == "train" and organ_type is not None:
        # a bare string would otherwise be iterated character by character
        if isinstance(organ_type, str):
            organ_type = [organ_type]

        # get all patients for multiple organ selection (a set gives constant-time membership tests)
        selected_ids = {image_id for organ in organ_type for image_id in ORGAN_SPLITS[organ]}

        image_paths = [_path for _path in image_paths if Path(_path).stem in selected_ids]
        label_paths = [_path for _path in label_paths if Path(_path).stem in selected_ids]

    return image_paths, label_paths
Get paths to the MoNuSeg data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The split to use for the dataset. Either 'train' or 'test'.
- organ_type: The choice of organ type.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths to the image data. List of filepaths to the label data.
def
get_monuseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'test'], organ_type: Optional[List[str]] = None, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_monuseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Build the MoNuSeg dataset for nucleus segmentation in H&E stained tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_monuseg_paths(path, split, organ_type, download)

    # let the shared helper extend the dataset kwargs according to the requested target type
    kwargs, _ = util.add_instance_label_transform(
        kwargs,
        add_binary_target=True,
        binary=binary,
        boundaries=boundaries,
        offsets=offsets,
    )

    if resize_inputs:
        # the resize settings use the requested patch shape and mark the inputs as RGB
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # images and labels are read from individual files, hence no dataset keys are given
    fixed_arguments = dict(
        raw_paths=raw_files,
        raw_key=None,
        label_paths=label_files,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
    )
    return torch_em.default_segmentation_dataset(**fixed_arguments, **kwargs)
Get the MoNuSeg dataset for nucleus segmentation in H&E stained tissue images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The split to use for the dataset. Either 'train' or 'test'.
- organ_type: The choice of organ type.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- resize_inputs: Whether to resize the inputs.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_monuseg_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], batch_size: int, split: Literal['train', 'test'], organ_type: Optional[List[str]] = None, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_monuseg_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Build the MoNuSeg dataloader for nucleus segmentation in H&E stained tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader
    """
    # separate the keyword arguments meant for the dataset from those meant for the loader
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    monuseg_dataset = get_monuseg_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        organ_type=organ_type,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )

    return torch_em.get_data_loader(monuseg_dataset, batch_size, **dataloader_kwargs)
Get the MoNuSeg dataloader for nucleus segmentation in H&E stained tissue images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- split: The split to use for the dataset. Either 'train' or 'test'.
- organ_type: The choice of organ type.
- download: Whether to download the data if it is not present.
- offsets: Offset values for affinity computation used as target.
- boundaries: Whether to compute boundaries as the target.
- binary: Whether to use a binary segmentation target.
- resize_inputs: Whether to resize the inputs.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader