torch_em.data.datasets.histopathology.monuseg

  1import os
  2import shutil
  3from tqdm import tqdm
  4from glob import glob
  5from pathlib import Path
  6from typing import List, Optional
  7
  8import imageio.v3 as imageio
  9
 10import torch_em
 11from torch_em.data.datasets import util
 12
 13
 14URL = {
 15    "train": "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA",
 16    "test": "https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw"
 17}
 18
 19CHECKSUM = {
 20    "train": "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742",
 21    "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a"
 22}
 23
 24# here's the description: https://drive.google.com/file/d/1xYyQ31CHFRnvTCTuuHdconlJCMk2SK7Z/view?usp=sharing
 25ORGAN_SPLITS = {
 26    "breast": ["TCGA-A7-A13E-01Z-00-DX1", "TCGA-A7-A13F-01Z-00-DX1", "TCGA-AR-A1AK-01Z-00-DX1",
 27               "TCGA-AR-A1AS-01Z-00-DX1", "TCGA-E2-A1B5-01Z-00-DX1", "TCGA-E2-A14V-01Z-00-DX1"],
 28    "kidney": ["TCGA-B0-5711-01Z-00-DX1", "TCGA-HE-7128-01Z-00-DX1", "TCGA-HE-7129-01Z-00-DX1",
 29               "TCGA-HE-7130-01Z-00-DX1", "TCGA-B0-5710-01Z-00-DX1", "TCGA-B0-5698-01Z-00-DX1"],
 30    "liver": ["TCGA-18-5592-01Z-00-DX1", "TCGA-38-6178-01Z-00-DX1", "TCGA-49-4488-01Z-00-DX1",
 31              "TCGA-50-5931-01Z-00-DX1", "TCGA-21-5784-01Z-00-DX1", "TCGA-21-5786-01Z-00-DX1"],
 32    "prostate": ["TCGA-G9-6336-01Z-00-DX1", "TCGA-G9-6348-01Z-00-DX1", "TCGA-G9-6356-01Z-00-DX1",
 33                 "TCGA-G9-6363-01Z-00-DX1", "TCGA-CH-5767-01Z-00-DX1", "TCGA-G9-6362-01Z-00-DX1"],
 34    "bladder": ["TCGA-DK-A2I6-01A-01-TS1", "TCGA-G2-A2EK-01A-02-TSB"],
 35    "colon": ["TCGA-AY-A8YK-01A-01-TS1", "TCGA-NH-A8F7-01A-01-TS1"],
 36    "stomach": ["TCGA-KB-A93J-01A-01-TS1", "TCGA-RD-A8N9-01A-01-TS1"]
 37}
 38
 39
 40def _download_monuseg(path, download, split):
 41    assert split in ["train", "test"], "The split choices in MoNuSeg datset are train/test, please choose from them"
 42
 43    # check if we have extracted the images and labels already
 44    im_path = os.path.join(path, "images", split)
 45    label_path = os.path.join(path, "labels", split)
 46    if os.path.exists(im_path) and os.path.exists(label_path):
 47        return
 48
 49    os.makedirs(path, exist_ok=True)
 50    zip_path = os.path.join(path, f"monuseg_{split}.zip")
 51    util.download_source_gdrive(zip_path, URL[split], download=download, checksum=CHECKSUM[split])
 52
 53    _process_monuseg(path, split)
 54
 55
 56def _process_monuseg(path, split):
 57    util.unzip(os.path.join(path, f"monuseg_{split}.zip"), path)
 58
 59    # assorting the images into expected dir;
 60    # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir
 61    root_img_save_dir = os.path.join(path, "images", split)
 62    root_label_save_dir = os.path.join(path, "labels", split)
 63
 64    os.makedirs(root_img_save_dir, exist_ok=True)
 65    os.makedirs(root_label_save_dir, exist_ok=True)
 66
 67    if split == "train":
 68        all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
 69        all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
 70    else:
 71        all_img_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.tif")))
 72        all_xml_label_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.xml")))
 73
 74    assert len(all_img_dir) == len(all_xml_label_dir)
 75
 76    for img_path, xml_label_path in tqdm(zip(all_img_dir, all_xml_label_dir),
 77                                         desc=f"Converting {split} split to the expected format",
 78                                         total=len(all_img_dir)):
 79        desired_label_shape = imageio.imread(img_path).shape[:-1]
 80
 81        img_id = os.path.split(img_path)[-1]
 82        dst = os.path.join(root_img_save_dir, img_id)
 83        shutil.move(src=img_path, dst=dst)
 84
 85        _label = util.generate_labeled_array_from_xml(shape=desired_label_shape, xml_file=xml_label_path)
 86        _fileid = img_id.split(".")[0]
 87        imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)
 88
 89    shutil.rmtree(glob(os.path.join(path, "MoNuSeg*"))[0])
 90    if split == "train":
 91        shutil.rmtree(glob(os.path.join(path, "__MACOSX"))[0])
 92
 93
 94def get_monuseg_dataset(
 95    path, patch_shape, split, organ_type: Optional[List[str]] = None, download=False,
 96    offsets=None, boundaries=False, binary=False, **kwargs
 97):
 98    """Dataset from https://monuseg.grand-challenge.org/Data/
 99    """
100    _download_monuseg(path, download, split)
101
102    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
103    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))
104
105    if split == "train" and organ_type is not None:
106        # get all patients for multiple organ selection
107        all_organ_splits = sum([ORGAN_SPLITS[_o] for _o in organ_type], [])
108
109        image_paths = [_path for _path in image_paths if Path(_path).stem in all_organ_splits]
110        label_paths = [_path for _path in label_paths if Path(_path).stem in all_organ_splits]
111
112    elif split == "test" and organ_type is not None:
113        # we don't have organ splits in the test dataset
114        raise ValueError("The test split does not have any organ informations, please pass `organ_type=None`")
115
116    kwargs, _ = util.add_instance_label_transform(
117        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
118    )
119    return torch_em.default_segmentation_dataset(
120        image_paths, None, label_paths, None, patch_shape, is_seg_dataset=False, **kwargs
121    )
122
123
def get_monuseg_loader(
    path, patch_shape, batch_size, split, organ_type=None, download=False, offsets=None, boundaries=False, binary=False,
    **kwargs
):
    """Data loader for the MoNuSeg dataset, see https://monuseg.grand-challenge.org/Data/

    ``util.split_kwargs`` separates ``kwargs`` into two groups: the first one is passed on
    to ``get_monuseg_dataset``, the second one to ``torch_em.get_data_loader``.
    All other arguments, except ``batch_size``, are handed to ``get_monuseg_dataset``.

    Returns:
        The loader created by ``torch_em.get_data_loader``.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    # build the dataset first, then wrap it into the loader
    monuseg_dataset = get_monuseg_dataset(
        path,
        patch_shape,
        split,
        organ_type=organ_type,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(monuseg_dataset, batch_size, **dataloader_kwargs)
# NOTE(review): the three assignments below repeat the constants defined earlier in this file.
# Google Drive download links of the MoNuSeg archives, keyed by split name.
URL = {'train': 'https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA', 'test': 'https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw'}
# Expected checksum of each archive, keyed by split name.
CHECKSUM = {'train': '25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742', 'test': '13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a'}
# Image identifiers grouped by organ.
ORGAN_SPLITS = {'breast': ['TCGA-A7-A13E-01Z-00-DX1', 'TCGA-A7-A13F-01Z-00-DX1', 'TCGA-AR-A1AK-01Z-00-DX1', 'TCGA-AR-A1AS-01Z-00-DX1', 'TCGA-E2-A1B5-01Z-00-DX1', 'TCGA-E2-A14V-01Z-00-DX1'], 'kidney': ['TCGA-B0-5711-01Z-00-DX1', 'TCGA-HE-7128-01Z-00-DX1', 'TCGA-HE-7129-01Z-00-DX1', 'TCGA-HE-7130-01Z-00-DX1', 'TCGA-B0-5710-01Z-00-DX1', 'TCGA-B0-5698-01Z-00-DX1'], 'liver': ['TCGA-18-5592-01Z-00-DX1', 'TCGA-38-6178-01Z-00-DX1', 'TCGA-49-4488-01Z-00-DX1', 'TCGA-50-5931-01Z-00-DX1', 'TCGA-21-5784-01Z-00-DX1', 'TCGA-21-5786-01Z-00-DX1'], 'prostate': ['TCGA-G9-6336-01Z-00-DX1', 'TCGA-G9-6348-01Z-00-DX1', 'TCGA-G9-6356-01Z-00-DX1', 'TCGA-G9-6363-01Z-00-DX1', 'TCGA-CH-5767-01Z-00-DX1', 'TCGA-G9-6362-01Z-00-DX1'], 'bladder': ['TCGA-DK-A2I6-01A-01-TS1', 'TCGA-G2-A2EK-01A-02-TSB'], 'colon': ['TCGA-AY-A8YK-01A-01-TS1', 'TCGA-NH-A8F7-01A-01-TS1'], 'stomach': ['TCGA-KB-A93J-01A-01-TS1', 'TCGA-RD-A8N9-01A-01-TS1']}
def get_monuseg_dataset(
    path, patch_shape, split, organ_type: Optional[List[str]] = None, download=False,
    offsets=None, boundaries=False, binary=False, **kwargs
):
    """Dataset from https://monuseg.grand-challenge.org/Data/

    Args:
        path: Root folder of the dataset; images are read from ``<path>/images/<split>``
            and labels from ``<path>/labels/<split>``.
        patch_shape: Forwarded to ``torch_em.default_segmentation_dataset``.
        split: Either "train" or "test".
        organ_type: Organ name(s) (keys of ``ORGAN_SPLITS``) to restrict the train split to.
            A single name may also be passed as a plain string. Must be None for the test split.
        download: Forwarded to the download helper.
        offsets: Forwarded to ``util.add_instance_label_transform``.
        boundaries: Forwarded to ``util.add_instance_label_transform``.
        binary: Forwarded to ``util.add_instance_label_transform``.
        kwargs: Further keyword arguments for ``torch_em.default_segmentation_dataset``.

    Returns:
        The dataset created by ``torch_em.default_segmentation_dataset``.

    Raises:
        ValueError: If ``organ_type`` is given for the test split.
        KeyError: If ``organ_type`` contains a name that is not a key of ``ORGAN_SPLITS``.
    """
    if organ_type is not None:
        # fail before any download / extraction is triggered
        if split == "test":
            # we don't have organ splits in the test dataset
            raise ValueError("The test split does not have any organ informations, please pass `organ_type=None`")
        # a bare string would otherwise be iterated character by character
        if isinstance(organ_type, str):
            organ_type = [organ_type]

    _download_monuseg(path, download, split)

    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))

    if split == "train" and organ_type is not None:
        # get all patients for multiple organ selection
        selected_ids = {_id for _o in organ_type for _id in ORGAN_SPLITS[_o]}

        image_paths = [_path for _path in image_paths if Path(_path).stem in selected_ids]
        label_paths = [_path for _path in label_paths if Path(_path).stem in selected_ids]

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )
    return torch_em.default_segmentation_dataset(
        image_paths, None, label_paths, None, patch_shape, is_seg_dataset=False, **kwargs
    )
def get_monuseg_loader(
    path, patch_shape, batch_size, split, organ_type=None, download=False, offsets=None, boundaries=False, binary=False,
    **kwargs
):
    """Data loader for the MoNuSeg dataset, see https://monuseg.grand-challenge.org/Data/

    ``util.split_kwargs`` separates ``kwargs`` into two groups: the first one is passed on
    to ``get_monuseg_dataset``, the second one to ``torch_em.get_data_loader``.
    All other arguments, except ``batch_size``, are handed to ``get_monuseg_dataset``.

    Returns:
        The loader created by ``torch_em.get_data_loader``.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    # build the dataset first, then wrap it into the loader
    monuseg_dataset = get_monuseg_dataset(
        path,
        patch_shape,
        split,
        organ_type=organ_type,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        **dataset_kwargs,
    )
    return torch_em.get_data_loader(monuseg_dataset, batch_size, **dataloader_kwargs)