torch_em.data.datasets.histopathology.monuseg
import os
import shutil
from tqdm import tqdm
from glob import glob
from pathlib import Path
from typing import List, Optional

import imageio.v3 as imageio

import torch_em
from torch_em.data.datasets import util


# Download links of the zip archives, keyed by split.
URL = {
    "train": "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA",
    "test": "https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw"
}

# Checksums passed to the download helper, keyed by split.
CHECKSUM = {
    "train": "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742",
    "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a"
}

# here's the description: https://drive.google.com/file/d/1xYyQ31CHFRnvTCTuuHdconlJCMk2SK7Z/view?usp=sharing
# Maps an organ name to the image ids (file names without extension) that are
# kept when filtering the train split by organ.
ORGAN_SPLITS = {
    "breast": ["TCGA-A7-A13E-01Z-00-DX1", "TCGA-A7-A13F-01Z-00-DX1", "TCGA-AR-A1AK-01Z-00-DX1",
               "TCGA-AR-A1AS-01Z-00-DX1", "TCGA-E2-A1B5-01Z-00-DX1", "TCGA-E2-A14V-01Z-00-DX1"],
    "kidney": ["TCGA-B0-5711-01Z-00-DX1", "TCGA-HE-7128-01Z-00-DX1", "TCGA-HE-7129-01Z-00-DX1",
               "TCGA-HE-7130-01Z-00-DX1", "TCGA-B0-5710-01Z-00-DX1", "TCGA-B0-5698-01Z-00-DX1"],
    "liver": ["TCGA-18-5592-01Z-00-DX1", "TCGA-38-6178-01Z-00-DX1", "TCGA-49-4488-01Z-00-DX1",
              "TCGA-50-5931-01Z-00-DX1", "TCGA-21-5784-01Z-00-DX1", "TCGA-21-5786-01Z-00-DX1"],
    "prostate": ["TCGA-G9-6336-01Z-00-DX1", "TCGA-G9-6348-01Z-00-DX1", "TCGA-G9-6356-01Z-00-DX1",
                 "TCGA-G9-6363-01Z-00-DX1", "TCGA-CH-5767-01Z-00-DX1", "TCGA-G9-6362-01Z-00-DX1"],
    "bladder": ["TCGA-DK-A2I6-01A-01-TS1", "TCGA-G2-A2EK-01A-02-TSB"],
    "colon": ["TCGA-AY-A8YK-01A-01-TS1", "TCGA-NH-A8F7-01A-01-TS1"],
    "stomach": ["TCGA-KB-A93J-01A-01-TS1", "TCGA-RD-A8N9-01A-01-TS1"]
}


def _download_monuseg(path, download, split):
    """Make sure the images and labels of `split` are available below `path`.

    Returns immediately when both `<path>/images/<split>` and
    `<path>/labels/<split>` exist. Otherwise the archive of the split is
    fetched to `<path>/monuseg_<split>.zip` and converted by `_process_monuseg`.

    Args:
        path: Root folder of the dataset; created if it does not exist.
        download: Forwarded to `util.download_source_gdrive`.
        split: Either "train" or "test".

    Raises:
        ValueError: If `split` is neither "train" nor "test".
    """
    # Raise explicitly instead of asserting: assert statements are removed under `python -O`.
    if split not in ("train", "test"):
        raise ValueError("The split choices in MoNuSeg dataset are train/test, please choose from them")

    # check if we have extracted the images and labels already
    # NOTE(review): `_process_monuseg` creates both folders before it converts anything, so an
    # interrupted conversion would be treated as complete here - confirm whether that needs handling.
    im_path = os.path.join(path, "images", split)
    label_path = os.path.join(path, "labels", split)
    if os.path.exists(im_path) and os.path.exists(label_path):
        return

    os.makedirs(path, exist_ok=True)
    zip_path = os.path.join(path, f"monuseg_{split}.zip")
    util.download_source_gdrive(zip_path, URL[split], download=download, checksum=CHECKSUM[split])

    _process_monuseg(path, split)


def _process_monuseg(path, split):
    """Unzip the archive of `split` and sort its content into the expected layout.

    Images are moved to `<path>/images/<split>`; for every image a label file
    `<path>/labels/<split>/<image id>.tif` is written from the matching xml
    annotation. The extracted source folders are removed afterwards.

    Args:
        path: Root folder of the dataset, containing `monuseg_<split>.zip`.
        split: Either "train" or "test"; selects where images and annotations are searched.
    """
    util.unzip(os.path.join(path, f"monuseg_{split}.zip"), path)

    # assorting the images into expected dir;
    # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir
    root_img_save_dir = os.path.join(path, "images", split)
    root_label_save_dir = os.path.join(path, "labels", split)

    os.makedirs(root_img_save_dir, exist_ok=True)
    os.makedirs(root_label_save_dir, exist_ok=True)

    if split == "train":
        all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
        all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
    else:
        all_img_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.tif")))
        all_xml_label_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.xml")))

    # Images and annotations are paired purely by their sorted order below.
    assert len(all_img_dir) == len(all_xml_label_dir), \
        f"Found {len(all_img_dir)} images but {len(all_xml_label_dir)} annotation files"

    for img_path, xml_label_path in tqdm(zip(all_img_dir, all_xml_label_dir),
                                         desc=f"Converting {split} split to the expected format",
                                         total=len(all_img_dir)):
        # Drop the last axis of the image shape (presumably the channel axis - confirm),
        # so that the label array covers the remaining image axes.
        desired_label_shape = imageio.imread(img_path).shape[:-1]

        img_id = os.path.split(img_path)[-1]
        dst = os.path.join(root_img_save_dir, img_id)
        shutil.move(src=img_path, dst=dst)

        _label = util.generate_labeled_array_from_xml(shape=desired_label_shape, xml_file=xml_label_path)
        _fileid = img_id.split(".")[0]
        imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)

    # Remove the extracted source folders. Only directories are removed: glob matching is
    # case-insensitive on Windows, so "MoNuSeg*" could also match the "monuseg_<split>.zip"
    # archive if it is still present, and `shutil.rmtree` fails on a regular file.
    for extracted_dir in glob(os.path.join(path, "MoNuSeg*")):
        if os.path.isdir(extracted_dir):
            shutil.rmtree(extracted_dir)

    if split == "train":
        # The metadata folder only exists if the extraction produced it;
        # do not fail after a successful conversion when it is missing.
        macosx_dir = os.path.join(path, "__MACOSX")
        if os.path.isdir(macosx_dir):
            shutil.rmtree(macosx_dir)


def get_monuseg_dataset(
    path, patch_shape, split, organ_type: Optional[List[str]] = None, download=False,
    offsets=None, boundaries=False, binary=False, **kwargs
):
    """Dataset from https://monuseg.grand-challenge.org/Data/

    Args:
        path: Root folder of the dataset; data is downloaded / converted into it if missing.
        patch_shape: Forwarded to `torch_em.default_segmentation_dataset`.
        split: Either "train" or "test".
        organ_type: Optional list of keys of `ORGAN_SPLITS`; restricts the train split to the
            images listed for these organs. Must be None for the test split.
        download: Forwarded to the download helper.
        offsets: Forwarded to `util.add_instance_label_transform`.
        boundaries: Forwarded to `util.add_instance_label_transform`.
        binary: Forwarded to `util.add_instance_label_transform`.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The result of `torch_em.default_segmentation_dataset` for the selected images and labels.

    Raises:
        ValueError: If `organ_type` is given for the test split, or `split` is invalid.
        KeyError: If `organ_type` contains a name that is not a key of `ORGAN_SPLITS`.
    """
    if split == "test" and organ_type is not None:
        # we don't have organ splits in the test dataset;
        # report this before spending time on download and conversion
        raise ValueError("The test split does not have any organ informations, please pass `organ_type=None`")

    _download_monuseg(path, download, split)

    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))

    if split == "train" and organ_type is not None:
        # get all patients for multiple organ selection (a set gives constant-time membership tests)
        selected_ids = {_id for _o in organ_type for _id in ORGAN_SPLITS[_o]}

        image_paths = [_path for _path in image_paths if Path(_path).stem in selected_ids]
        label_paths = [_path for _path in label_paths if Path(_path).stem in selected_ids]

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )
    return torch_em.default_segmentation_dataset(
        image_paths, None, label_paths, None, patch_shape, is_seg_dataset=False, **kwargs
    )


def get_monuseg_loader(
    path, patch_shape, batch_size, split, organ_type=None, download=False, offsets=None, boundaries=False, binary=False,
    **kwargs
):
    """Data loader for the MoNuSeg data, built on top of `get_monuseg_dataset`.

    `kwargs` are divided by `util.split_kwargs` into arguments for
    `torch_em.default_segmentation_dataset` and arguments for `torch_em.get_data_loader`.
    All other parameters are passed on to `get_monuseg_dataset`, except `batch_size`,
    which goes to `torch_em.get_data_loader`.

    Returns:
        The result of `torch_em.get_data_loader` for the dataset.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(
        torch_em.default_segmentation_dataset, **kwargs
    )
    dataset = get_monuseg_dataset(
        path, patch_shape, split, organ_type=organ_type, download=download,
        offsets=offsets, boundaries=boundaries, binary=binary, **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader
# Download links of the zip archives, keyed by split.
URL = {
    "train": "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA",
    "test": "https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw",
}

# Checksums of the archives, keyed by split (64 hex characters each; presumably SHA-256 - confirm).
CHECKSUM = {
    "train": "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742",
    "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a",
}

# Maps an organ name to a list of image ids.
ORGAN_SPLITS = {
    "breast": [
        "TCGA-A7-A13E-01Z-00-DX1", "TCGA-A7-A13F-01Z-00-DX1", "TCGA-AR-A1AK-01Z-00-DX1",
        "TCGA-AR-A1AS-01Z-00-DX1", "TCGA-E2-A1B5-01Z-00-DX1", "TCGA-E2-A14V-01Z-00-DX1",
    ],
    "kidney": [
        "TCGA-B0-5711-01Z-00-DX1", "TCGA-HE-7128-01Z-00-DX1", "TCGA-HE-7129-01Z-00-DX1",
        "TCGA-HE-7130-01Z-00-DX1", "TCGA-B0-5710-01Z-00-DX1", "TCGA-B0-5698-01Z-00-DX1",
    ],
    "liver": [
        "TCGA-18-5592-01Z-00-DX1", "TCGA-38-6178-01Z-00-DX1", "TCGA-49-4488-01Z-00-DX1",
        "TCGA-50-5931-01Z-00-DX1", "TCGA-21-5784-01Z-00-DX1", "TCGA-21-5786-01Z-00-DX1",
    ],
    "prostate": [
        "TCGA-G9-6336-01Z-00-DX1", "TCGA-G9-6348-01Z-00-DX1", "TCGA-G9-6356-01Z-00-DX1",
        "TCGA-G9-6363-01Z-00-DX1", "TCGA-CH-5767-01Z-00-DX1", "TCGA-G9-6362-01Z-00-DX1",
    ],
    "bladder": ["TCGA-DK-A2I6-01A-01-TS1", "TCGA-G2-A2EK-01A-02-TSB"],
    "colon": ["TCGA-AY-A8YK-01A-01-TS1", "TCGA-NH-A8F7-01A-01-TS1"],
    "stomach": ["TCGA-KB-A93J-01A-01-TS1", "TCGA-RD-A8N9-01A-01-TS1"],
}
def
get_monuseg_dataset( path, patch_shape, split, organ_type: Optional[List[str]] = None, download=False, offsets=None, boundaries=False, binary=False, **kwargs):
def get_monuseg_dataset(
    path, patch_shape, split, organ_type: Optional[List[str]] = None, download=False,
    offsets=None, boundaries=False, binary=False, **kwargs
):
    """Dataset from https://monuseg.grand-challenge.org/Data/

    Args:
        path: Root folder of the dataset; passed to `_download_monuseg`.
        patch_shape: Forwarded to `torch_em.default_segmentation_dataset`.
        split: Name of the split; the code distinguishes "train" and "test".
        organ_type: Optional list of keys of `ORGAN_SPLITS`; restricts the train split to the
            images listed for these organs. Must be None for the test split.
        download: Forwarded to `_download_monuseg`.
        offsets: Forwarded to `util.add_instance_label_transform`.
        boundaries: Forwarded to `util.add_instance_label_transform`.
        binary: Forwarded to `util.add_instance_label_transform`.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The result of `torch_em.default_segmentation_dataset` for the selected images and labels.

    Raises:
        ValueError: If `organ_type` is given for the test split.
        KeyError: If `organ_type` contains a name that is not a key of `ORGAN_SPLITS`.
    """
    if split == "test" and organ_type is not None:
        # we don't have organ splits in the test dataset;
        # report this before spending time on download and conversion
        raise ValueError("The test split does not have any organ informations, please pass `organ_type=None`")

    _download_monuseg(path, download, split)

    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))

    if split == "train" and organ_type is not None:
        # get all patients for multiple organ selection (a set gives constant-time membership tests)
        selected_ids = {_id for _o in organ_type for _id in ORGAN_SPLITS[_o]}

        image_paths = [_path for _path in image_paths if Path(_path).stem in selected_ids]
        label_paths = [_path for _path in label_paths if Path(_path).stem in selected_ids]

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )
    return torch_em.default_segmentation_dataset(
        image_paths, None, label_paths, None, patch_shape, is_seg_dataset=False, **kwargs
    )
Dataset from https://monuseg.grand-challenge.org/Data/
def
get_monuseg_loader( path, patch_shape, batch_size, split, organ_type=None, download=False, offsets=None, boundaries=False, binary=False, **kwargs):
def get_monuseg_loader(
    path, patch_shape, batch_size, split, organ_type=None, download=False, offsets=None, boundaries=False, binary=False,
    **kwargs
):
    """Data loader for the MoNuSeg data, built on top of `get_monuseg_dataset`.

    `kwargs` are divided by `util.split_kwargs` into arguments for
    `torch_em.default_segmentation_dataset` and arguments for `torch_em.get_data_loader`.
    All other parameters are passed on to `get_monuseg_dataset`, except `batch_size`,
    which goes to `torch_em.get_data_loader`.

    Returns:
        The result of `torch_em.get_data_loader` for the dataset.
    """
    # Separate the dataset arguments from the loader arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    monuseg_dataset = get_monuseg_dataset(
        path,
        patch_shape,
        split,
        organ_type=organ_type,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(monuseg_dataset, batch_size, **dataloader_kwargs)