torch_em.data.datasets.histopathology.monuseg

This dataset contains annotations for nucleus segmentation in H&E stained tissue images derived from different organs.

This dataset comes from https://monuseg.grand-challenge.org/Data/.

Please cite the relevant publications from the challenge if you use this dataset in your research.

View Source

  1"""This dataset contains annotations for nucleus segmentation in
  2H&E stained tissue images derived from different organs.
  3
  4This dataset comes from https://monuseg.grand-challenge.org/Data/.
  5
  6Please cite the relevant publications from the challenge
  7if you use this dataset in your research.
  8"""
  9
 10import os
 11import shutil
 12from tqdm import tqdm
 13from glob import glob
 14from pathlib import Path
 15from typing import List, Optional, Union, Tuple, Literal
 16
 17import imageio.v3 as imageio
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21import torch_em
 22
 23from .. import util
 24
 25
 26URL = {
 27    "train": "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA",
 28    "test": "https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw"
 29}
 30
 31CHECKSUM = {
 32    "train": "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742",
 33    "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a"
 34}
 35
 36# Here is the description: https://drive.google.com/file/d/1xYyQ31CHFRnvTCTuuHdconlJCMk2SK7Z/view?usp=sharing
 37ORGAN_SPLITS = {
 38    "breast": [
 39        "TCGA-A7-A13E-01Z-00-DX1", "TCGA-A7-A13F-01Z-00-DX1", "TCGA-AR-A1AK-01Z-00-DX1",
 40        "TCGA-AR-A1AS-01Z-00-DX1", "TCGA-E2-A1B5-01Z-00-DX1", "TCGA-E2-A14V-01Z-00-DX1"
 41    ],
 42    "kidney": [
 43        "TCGA-B0-5711-01Z-00-DX1", "TCGA-HE-7128-01Z-00-DX1", "TCGA-HE-7129-01Z-00-DX1",
 44        "TCGA-HE-7130-01Z-00-DX1", "TCGA-B0-5710-01Z-00-DX1", "TCGA-B0-5698-01Z-00-DX1"
 45    ],
 46    "liver": [
 47        "TCGA-18-5592-01Z-00-DX1", "TCGA-38-6178-01Z-00-DX1", "TCGA-49-4488-01Z-00-DX1",
 48        "TCGA-50-5931-01Z-00-DX1", "TCGA-21-5784-01Z-00-DX1", "TCGA-21-5786-01Z-00-DX1"
 49    ],
 50    "prostate": [
 51        "TCGA-G9-6336-01Z-00-DX1", "TCGA-G9-6348-01Z-00-DX1", "TCGA-G9-6356-01Z-00-DX1",
 52        "TCGA-G9-6363-01Z-00-DX1", "TCGA-CH-5767-01Z-00-DX1", "TCGA-G9-6362-01Z-00-DX1"
 53    ],
 54    "bladder": ["TCGA-DK-A2I6-01A-01-TS1", "TCGA-G2-A2EK-01A-02-TSB"],
 55    "colon": ["TCGA-AY-A8YK-01A-01-TS1", "TCGA-NH-A8F7-01A-01-TS1"],
 56    "stomach": ["TCGA-KB-A93J-01A-01-TS1", "TCGA-RD-A8N9-01A-01-TS1"]
 57}
 58
 59
 60def _process_monuseg(path, split):
 61    util.unzip(os.path.join(path, f"monuseg_{split}.zip"), path)
 62
 63    # assorting the images into expected dir;
 64    # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir
 65    root_img_save_dir = os.path.join(path, "images", split)
 66    root_label_save_dir = os.path.join(path, "labels", split)
 67
 68    os.makedirs(root_img_save_dir, exist_ok=True)
 69    os.makedirs(root_label_save_dir, exist_ok=True)
 70
 71    if split == "train":
 72        all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
 73        all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
 74    else:
 75        all_img_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.tif")))
 76        all_xml_label_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.xml")))
 77
 78    assert len(all_img_dir) == len(all_xml_label_dir)
 79
 80    for img_path, xml_label_path in tqdm(
 81        zip(all_img_dir, all_xml_label_dir),
 82        desc=f"Converting {split} split to the expected format",
 83        total=len(all_img_dir)
 84    ):
 85        desired_label_shape = imageio.imread(img_path).shape[:-1]
 86
 87        img_id = os.path.split(img_path)[-1]
 88        dst = os.path.join(root_img_save_dir, img_id)
 89        shutil.move(src=img_path, dst=dst)
 90
 91        _label = util.generate_labeled_array_from_xml(shape=desired_label_shape, xml_file=xml_label_path)
 92        _fileid = img_id.split(".")[0]
 93        imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label, compression="zlib")
 94
 95    shutil.rmtree(glob(os.path.join(path, "MoNuSeg*"))[0])
 96    if split == "train":
 97        shutil.rmtree(glob(os.path.join(path, "__MACOSX"))[0])
 98
 99
def get_monuseg_data(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False):
    """Make sure the MoNuSeg data of one split is available below `path`.

    Returns immediately if `<path>/images/<split>` and `<path>/labels/<split>` both exist.
    Otherwise the archive of the split is fetched and converted to that layout.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
    """
    assert split in ["train", "test"], "The split choices in MoNuSeg datset are train/test, please choose from them"

    # Both output folders present means the split was extracted before.
    expected_dirs = [os.path.join(path, sub_dir, split) for sub_dir in ("images", "labels")]
    if all(os.path.exists(folder) for folder in expected_dirs):
        return

    os.makedirs(path, exist_ok=True)
    archive = os.path.join(path, f"monuseg_{split}.zip")
    util.download_source_gdrive(archive, URL[split], download=download, checksum=CHECKSUM[split])

    _process_monuseg(path, split)
121
122
def get_monuseg_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the MoNuSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type. Only supported for the 'train' split;
            the names must be keys of `ORGAN_SPLITS`.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        ValueError: If `organ_type` is given for the 'test' split.
        KeyError: If `organ_type` contains a name that is not in `ORGAN_SPLITS`.
    """
    # Reject the unsupported combination before any data is downloaded or extracted.
    if split == "test" and organ_type is not None:
        # we don't have organ splits in the test dataset
        raise ValueError("The test split does not have any organ informations, please pass `organ_type=None`")

    get_monuseg_data(path, split, download)

    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))

    if split == "train" and organ_type is not None:
        # Collect the image ids of all requested organs; a set keeps the lookups below cheap.
        selected_ids = {img_id for _o in organ_type for img_id in ORGAN_SPLITS[_o]}

        image_paths = [_path for _path in image_paths if Path(_path).stem in selected_ids]
        label_paths = [_path for _path in label_paths if Path(_path).stem in selected_ids]

    return image_paths, label_paths
158
159
def get_monuseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Create the MoNuSeg dataset for nucleus segmentation in H&E stained tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, seg_paths = get_monuseg_paths(path, split, organ_type, download)

    # Only the updated keyword arguments are used; the second return value is discarded.
    kwargs, _unused = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    if resize_inputs:
        # The resize helper hands back updated kwargs and the patch shape to use for the dataset.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=seg_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset
210
211
def get_monuseg_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Create the MoNuSeg dataloader for nucleus segmentation in H&E stained tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader
    """
    # Separate the keyword arguments meant for the dataset from the remaining loader arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_monuseg_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        organ_type=organ_type,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

URL = {'train': 'https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA', 'test': 'https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw'}

CHECKSUM = {'train': '25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742', 'test': '13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a'}

ORGAN_SPLITS = {'breast': ['TCGA-A7-A13E-01Z-00-DX1', 'TCGA-A7-A13F-01Z-00-DX1', 'TCGA-AR-A1AK-01Z-00-DX1', 'TCGA-AR-A1AS-01Z-00-DX1', 'TCGA-E2-A1B5-01Z-00-DX1', 'TCGA-E2-A14V-01Z-00-DX1'], 'kidney': ['TCGA-B0-5711-01Z-00-DX1', 'TCGA-HE-7128-01Z-00-DX1', 'TCGA-HE-7129-01Z-00-DX1', 'TCGA-HE-7130-01Z-00-DX1', 'TCGA-B0-5710-01Z-00-DX1', 'TCGA-B0-5698-01Z-00-DX1'], 'liver': ['TCGA-18-5592-01Z-00-DX1', 'TCGA-38-6178-01Z-00-DX1', 'TCGA-49-4488-01Z-00-DX1', 'TCGA-50-5931-01Z-00-DX1', 'TCGA-21-5784-01Z-00-DX1', 'TCGA-21-5786-01Z-00-DX1'], 'prostate': ['TCGA-G9-6336-01Z-00-DX1', 'TCGA-G9-6348-01Z-00-DX1', 'TCGA-G9-6356-01Z-00-DX1', 'TCGA-G9-6363-01Z-00-DX1', 'TCGA-CH-5767-01Z-00-DX1', 'TCGA-G9-6362-01Z-00-DX1'], 'bladder': ['TCGA-DK-A2I6-01A-01-TS1', 'TCGA-G2-A2EK-01A-02-TSB'], 'colon': ['TCGA-AY-A8YK-01A-01-TS1', 'TCGA-NH-A8F7-01A-01-TS1'], 'stomach': ['TCGA-KB-A93J-01A-01-TS1', 'TCGA-RD-A8N9-01A-01-TS1']}

def get_monuseg_data( path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False): View Source

def get_monuseg_data(path: Union[os.PathLike, str], split: Literal['train', 'test'], download: bool = False):
    """Make sure the MoNuSeg data of one split is available below `path`.

    Returns immediately if `<path>/images/<split>` and `<path>/labels/<split>` both exist.
    Otherwise the archive of the split is fetched and converted to that layout.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        download: Whether to download the data if it is not present.
    """
    assert split in ["train", "test"], "The split choices in MoNuSeg datset are train/test, please choose from them"

    # Both output folders present means the split was extracted before.
    expected_dirs = [os.path.join(path, sub_dir, split) for sub_dir in ("images", "labels")]
    if all(os.path.exists(folder) for folder in expected_dirs):
        return

    os.makedirs(path, exist_ok=True)
    archive = os.path.join(path, f"monuseg_{split}.zip")
    util.download_source_gdrive(archive, URL[split], download=download, checksum=CHECKSUM[split])

    _process_monuseg(path, split)

Download the MoNuSeg dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The split to use for the dataset. Either 'train' or 'test'.
download: Whether to download the data if it is not present.

def get_monuseg_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'], organ_type: Optional[List[str]] = None, download: bool = False) -> Tuple[List[str], List[str]]: View Source

def get_monuseg_paths(
    path: Union[os.PathLike, str],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the MoNuSeg data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type. Only supported for the 'train' split;
            the names must be keys of `ORGAN_SPLITS`.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths to the image data.
        List of filepaths to the label data.

    Raises:
        ValueError: If `organ_type` is given for the 'test' split.
        KeyError: If `organ_type` contains a name that is not in `ORGAN_SPLITS`.
    """
    # Reject the unsupported combination before any data is downloaded or extracted.
    if split == "test" and organ_type is not None:
        # we don't have organ splits in the test dataset
        raise ValueError("The test split does not have any organ informations, please pass `organ_type=None`")

    get_monuseg_data(path, split, download)

    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))

    if split == "train" and organ_type is not None:
        # Collect the image ids of all requested organs; a set keeps the lookups below cheap.
        selected_ids = {img_id for _o in organ_type for img_id in ORGAN_SPLITS[_o]}

        image_paths = [_path for _path in image_paths if Path(_path).stem in selected_ids]
        label_paths = [_path for _path in label_paths if Path(_path).stem in selected_ids]

    return image_paths, label_paths

Get paths to the MoNuSeg data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The split to use for the dataset. Either 'train' or 'test'.
organ_type: The choice of organ type.
download: Whether to download the data if it is not present.

Returns:

List of filepaths to the image data. List of filepaths to the label data.

def get_monuseg_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Literal['train', 'test'], organ_type: Optional[List[str]] = None, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_monuseg_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> Dataset:
    """Create the MoNuSeg dataset for nucleus segmentation in H&E stained tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, seg_paths = get_monuseg_paths(path, split, organ_type, download)

    # Only the updated keyword arguments are used; the second return value is discarded.
    kwargs, _unused = util.add_instance_label_transform(
        kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
    )

    if resize_inputs:
        # The resize helper hands back updated kwargs and the patch shape to use for the dataset.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=seg_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset

Get the MoNuSeg dataset for nucleus segmentation in H&E stained tissue images.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
split: The split to use for the dataset. Either 'train' or 'test'.
organ_type: The choice of organ type.
download: Whether to download the data if it is not present.
offsets: Offset values for affinity computation used as target.
boundaries: Whether to compute boundaries as the target.
binary: Whether to use a binary segmentation target.
resize_inputs: Whether to resize the inputs.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_monuseg_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], batch_size: int, split: Literal['train', 'test'], organ_type: Optional[List[str]] = None, download: bool = False, offsets: Optional[List[List[int]]] = None, boundaries: bool = False, binary: bool = False, resize_inputs: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_monuseg_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Literal['train', 'test'],
    organ_type: Optional[List[str]] = None,
    download: bool = False,
    offsets: Optional[List[List[int]]] = None,
    boundaries: bool = False,
    binary: bool = False,
    resize_inputs: bool = False,
    **kwargs
) -> DataLoader:
    """Create the MoNuSeg dataloader for nucleus segmentation in H&E stained tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to use for the dataset. Either 'train' or 'test'.
        organ_type: The choice of organ type.
        download: Whether to download the data if it is not present.
        offsets: Offset values for affinity computation used as target.
        boundaries: Whether to compute boundaries as the target.
        binary: Whether to use a binary segmentation target.
        resize_inputs: Whether to resize the inputs.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader
    """
    # Separate the keyword arguments meant for the dataset from the remaining loader arguments.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_monuseg_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        organ_type=organ_type,
        download=download,
        offsets=offsets,
        boundaries=boundaries,
        binary=binary,
        resize_inputs=resize_inputs,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

Get the MoNuSeg dataloader for nucleus segmentation in H&E stained tissue images.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
batch_size: The batch size for training.
split: The split to use for the dataset. Either 'train' or 'test'.
organ_type: The choice of organ type.
download: Whether to download the data if it is not present.
offsets: Offset values for affinity computation used as target.
boundaries: Whether to compute boundaries as the target.
binary: Whether to use a binary segmentation target.
resize_inputs: Whether to resize the inputs.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader