torch_em.data.datasets.medical.montgomery

The Montgomery dataset contains annotations for lung segmentation in chest x-ray images.

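The database is located at https://data.lhncbc.nlm.nih.gov/public/Tuberculosis-Chest-X-ray-Datasets/Montgomery-County-CXR-Set/MontgomerySet/index.html. This dataset is from the publications:

- https://doi.org/10.1109/TMI.2013.2284099
- https://doi.org/10.1109/tmi.2013.2290491

Please cite them if you use this dataset for your research.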

  1"""The Montgomery dataset contains annotations for lung segmentation
  2in chest x-ray images.
  3
  4The database is located at
  5https://data.lhncbc.nlm.nih.gov/public/Tuberculosis-Chest-X-ray-Datasets/Montgomery-County-CXR-Set/MontgomerySet/index.html.
  6This dataset is from the publications:
  7- https://doi.org/10.1109/TMI.2013.2284099
  8- https://doi.org/10.1109/tmi.2013.2290491
  9Please cite them if you use this dataset for your research.
 10"""
 11
 12import os
 13from glob import glob
 14from tqdm import tqdm
 15from typing import Union, Tuple, List
 16
 17import imageio.v3 as imageio
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21import torch_em
 22
 23from .. import util
 24
 25
# Download location of the zipped Montgomery chest x-ray collection (used by `get_montgomery_data`).
 26URL = "http://openi.nlm.nih.gov/imgs/collections/NLM-MontgomeryCXRSet.zip"
# Expected checksum of the downloaded zip; handed to `util.download_source` as `checksum`.
# NOTE(review): 64 hex digits, presumably SHA-256 - confirm against `util.download_source`.
 27CHECKSUM = "54601e952315d8f67383e9202a6e145997ade429f54f7e0af44b4e158714f424"
 28
 29
 30def get_montgomery_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 31    """Download the Montgomery dataset.
 32
 33    Args:
 34        path: Filepath to a folder where the data is downloaded for further processing.
 35        download: Whether to download the data if it is not present.
 36
 37    Returns:
 38        Filepath where the data is downloaded.
 39    """
 40    data_dir = os.path.join(path, "MontgomerySet")
 41    if os.path.exists(data_dir):
 42        return data_dir
 43
 44    os.makedirs(path, exist_ok=True)
 45
 46    zip_path = os.path.join(path, "NLM-MontgomeryCXRSet.zip")
 47    util.download_source(path=zip_path, url=URL, download=download, checksum=CHECKSUM)
 48    util.unzip(zip_path=zip_path, dst=path)
 49
 50    return data_dir
 51
 52
 53def get_montgomery_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
 54    """Get paths to the Montgomery data.
 55
 56    Args:
 57        path: Filepath to a folder where the data is downloaded for further processing.
 58        download: Whether to download the data if it is not present.
 59
 60    Returns:
 61        List of filepaths for the image data.
 62        List of filepaths for the label data.
 63    """
 64    data_dir = get_montgomery_data(path=path, download=download)
 65    gt_dir = os.path.join(data_dir, "ManualMask", "gt")
 66
 67    image_paths = sorted(glob(os.path.join(data_dir, "CXR_png", "*.png")))
 68
 69    if os.path.exists(gt_dir):
 70        gt_paths = sorted(glob(os.path.join(gt_dir, "*.png")))
 71        if len(image_paths) == len(gt_paths):
 72            return image_paths, gt_paths
 73
 74    else:
 75        os.makedirs(gt_dir, exist_ok=True)
 76
 77    lmask_dir = os.path.join(data_dir, "ManualMask", "leftMask")
 78    rmask_dir = os.path.join(data_dir, "ManualMask", "rightMask")
 79    gt_paths = []
 80    for image_path in tqdm(image_paths, desc="Merging left and right lung halves"):
 81        image_id = os.path.split(image_path)[-1]
 82
 83        # merge the left and right lung halves into one gt file
 84        gt = imageio.imread(os.path.join(lmask_dir, image_id))
 85        gt += imageio.imread(os.path.join(rmask_dir, image_id))
 86        gt = gt.astype("uint8")
 87
 88        gt_path = os.path.join(gt_dir, image_id)
 89
 90        imageio.imwrite(gt_path, gt)
 91        gt_paths.append(gt_path)
 92
 93    return image_paths, gt_paths
 94
 95
 96def get_montgomery_dataset(
 97    path: Union[os.PathLike, str],
 98    patch_shape: Tuple[int, int],
 99    resize_inputs: bool = True,
100    download: bool = False,
101    **kwargs
102) -> Dataset:
103    """Get the Montgomery dataset for lung segmentation.
104
105    Args:
106        path: Filepath to a folder where the data is downloaded for further processing.
107        patch_shape: The patch shape to use for training.
108        resize_inputs: Whether to resize the inputs to the patch shape.
109        download: Whether to download the data if it is not present.
110        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
111
112    Returns:
113        The segmentation dataset.
114    """
115    image_paths, gt_paths = get_montgomery_paths(path, download)
116
117    if resize_inputs:
118        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False}
119        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
120            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
121        )
122
123    return torch_em.default_segmentation_dataset(
124        raw_paths=image_paths,
125        raw_key=None,
126        label_paths=gt_paths,
127        label_key=None,
128        patch_shape=patch_shape,
129        **kwargs
130    )
131
132
def get_montgomery_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    resize_inputs: bool = True,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the Montgomery dataloader for lung segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size, forwarded to `torch_em.get_data_loader`.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_montgomery_dataset(path, patch_shape, resize_inputs, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )
# Download location of the zipped Montgomery chest x-ray collection (used by `get_montgomery_data`).
URL = 'http://openi.nlm.nih.gov/imgs/collections/NLM-MontgomeryCXRSet.zip'
# Expected checksum of the downloaded zip; handed to `util.download_source` as `checksum`.
# NOTE(review): 64 hex digits, presumably SHA-256 - confirm against `util.download_source`.
CHECKSUM = '54601e952315d8f67383e9202a6e145997ade429f54f7e0af44b4e158714f424'
def get_montgomery_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_montgomery_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the Montgomery dataset is available locally.

    If the `MontgomerySet` folder is not yet present under `path`, the zip archive is fetched
    with `util.download_source` and extracted into `path` with `util.unzip`. If the folder
    already exists, nothing is downloaded.

    Args:
        path: Filepath to the folder where the data is stored. It is created if it does not exist.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath to the `MontgomerySet` folder inside `path`.
    """
    data_dir = os.path.join(path, "MontgomerySet")

    # Only touch the network / file system when the extracted folder is missing.
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)

        archive = os.path.join(path, "NLM-MontgomeryCXRSet.zip")
        util.download_source(path=archive, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive, dst=path)

    return data_dir

Download the Montgomery dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is downloaded.

def get_montgomery_paths( path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
def get_montgomery_paths(path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
    """Get paths to the Montgomery data.

    The left and right lung masks are read from `ManualMask/leftMask` and `ManualMask/rightMask`
    and merged into a single mask per image, which is written to `ManualMask/gt` under the file
    name of the image. Later calls reuse the merged masks if there is exactly one per image;
    otherwise all of them are regenerated.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    data_dir = get_montgomery_data(path=path, download=download)
    gt_dir = os.path.join(data_dir, "ManualMask", "gt")

    image_paths = sorted(glob(os.path.join(data_dir, "CXR_png", "*.png")))

    if os.path.exists(gt_dir):
        gt_paths = sorted(glob(os.path.join(gt_dir, "*.png")))
        # Merged masks share the image file names, so both sorted lists line up.
        # A count mismatch (e.g. after an interrupted merge) falls through and redoes the merge.
        if len(image_paths) == len(gt_paths):
            return image_paths, gt_paths

    os.makedirs(gt_dir, exist_ok=True)

    lmask_dir = os.path.join(data_dir, "ManualMask", "leftMask")
    rmask_dir = os.path.join(data_dir, "ManualMask", "rightMask")
    gt_paths = []
    for image_path in tqdm(image_paths, desc="Merging left and right lung halves"):
        image_id = os.path.basename(image_path)

        left_mask = imageio.imread(os.path.join(lmask_dir, image_id))
        right_mask = imageio.imread(os.path.join(rmask_dir, image_id))

        # Merge the two lung halves with a bitwise OR instead of an in-place addition:
        # adding integer arrays wraps around wherever the masks overlap (255 + 255 -> 254 for uint8),
        # whereas OR keeps the foreground value. For non-overlapping masks both give the same result.
        gt = (left_mask | right_mask).astype("uint8")

        gt_path = os.path.join(gt_dir, image_id)
        imageio.imwrite(gt_path, gt)
        gt_paths.append(gt_path)

    return image_paths, gt_paths

Get paths to the Montgomery data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_montgomery_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], resize_inputs: bool = True, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
 97def get_montgomery_dataset(
 98    path: Union[os.PathLike, str],
 99    patch_shape: Tuple[int, int],
100    resize_inputs: bool = True,
101    download: bool = False,
102    **kwargs
103) -> Dataset:
104    """Get the Montgomery dataset for lung segmentation.
105
106    Args:
107        path: Filepath to a folder where the data is downloaded for further processing.
108        patch_shape: The patch shape to use for training.
109        resize_inputs: Whether to resize the inputs to the patch shape.
110        download: Whether to download the data if it is not present.
111        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
112
113    Returns:
114        The segmentation dataset.
115    """
116    image_paths, gt_paths = get_montgomery_paths(path, download)
117
118    if resize_inputs:
119        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False}
120        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
121            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
122        )
123
124    return torch_em.default_segmentation_dataset(
125        raw_paths=image_paths,
126        raw_key=None,
127        label_paths=gt_paths,
128        label_key=None,
129        patch_shape=patch_shape,
130        **kwargs
131    )

Get the Montgomery dataset for lung segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • resize_inputs: Whether to resize the inputs to the patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_montgomery_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], batch_size: int, resize_inputs: bool = True, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_montgomery_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    resize_inputs: bool = True,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the Montgomery dataloader for lung segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size, forwarded to `torch_em.get_data_loader`.
        resize_inputs: Whether to resize the inputs to the patch shape.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_montgomery_dataset(path, patch_shape, resize_inputs, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )

Get the Montgomery dataloader for lung segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size, forwarded to torch_em.get_data_loader.
  • resize_inputs: Whether to resize the inputs to the patch shape.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.