torch_em.data.datasets.histopathology.bcss

This dataset contains annotations for tissue region segmentation in breast cancer histopathology images.

NOTE: The tissue labels contain multiple semantic classes. The label ids and their corresponding classes are:
- 0: outside_roi (~background)
- 1: tumor
- 2: stroma
- 3: lymphocytic_infiltrate
- 4: necrosis_or_debris
- 5: glandular_secretions
- 6: blood
- 7: exclude
- 8: metaplasia_NOS
- 9: fat
- 10: plasma_cells
- 11: other_immune_infiltrate
- 12: mucoid_material
- 13: normal_acinus_or_duct
- 14: lymphatics
- 15: undetermined
- 16: nerve
- 17: skin_adnexa
- 18: blood_vessel
- 19: angioinvasion
- 20: dcis
- 21: other

This dataset is from https://bcsegmentation.grand-challenge.org/BCSS/. Please cite this paper (https://doi.org/10.1093/bioinformatics/btz083) if you use this dataset for a publication.

  1"""This dataset contains annotations for tissue region segmentation in
  2breast cancer histopathology images.
  3
NOTE: The tissue labels contain multiple semantic classes. The label ids and their corresponding classes are:
  5    - 0: outside_roi (~background)
  6    - 1: tumor
  7    - 2: stroma
  8    - 3: lymphocytic_infiltrate
  9    - 4: necrosis_or_debris
 10    - 5: glandular_secretions
 11    - 6: blood
 12    - 7: exclude
 13    - 8: metaplasia_NOS
 14    - 9: fat
 15    - 10: plasma_cells
 16    - 11: other_immune_infiltrate
 17    - 12: mucoid_material
 18    - 13: normal_acinus_or_duct
 19    - 14: lymphatics
 20    - 15: undetermined
 21    - 16: nerve
 22    - 17: skin_adnexa
 23    - 18: blood_vessel
 24    - 19: angioinvasion
 25    - 20: dcis
 26    - 21: other
 27
 28This dataset is from https://bcsegmentation.grand-challenge.org/BCSS/.
 29Please cite this paper (https://doi.org/10.1093/bioinformatics/btz083) if you use this dataset for a publication.
 30"""
 31
 32import os
 33import shutil
 34from glob import glob
 35from pathlib import Path
 36from typing import Union, Optional, List, Tuple
 37
 38from sklearn.model_selection import train_test_split
 39
 40import torch
 41from torch.utils.data import Dataset, DataLoader
 42
 43import torch_em
 44
 45from .. import util
 46
 47
# Google Drive folder that hosts the BCSS data.
URL = "https://drive.google.com/drive/folders/1zqbdkQF8i5cEmZOGmbdQm-EP8dRYtvss?usp=sharing"


# TODO: add the checksum for the download; `None` is passed on as the checksum for now.
CHECKSUM = None


# Filename stems (i.e. filenames without extension) of the samples that `get_bcss_data` puts into
# the test split. All samples that are not listed here end up in the train split.
TEST_LIST = [
    "TCGA-A2-A0SX-DX1_xmin53791_ymin56683_MPP-0.2500", "TCGA-BH-A0BG-DX1_xmin64019_ymin24975_MPP-0.2500",
    "TCGA-AR-A1AI-DX1_xmin38671_ymin10616_MPP-0.2500", "TCGA-E2-A574-DX1_xmin54962_ymin47475_MPP-0.2500",
    "TCGA-GM-A3XL-DX1_xmin29910_ymin15820_MPP-0.2500", "TCGA-E2-A14X-DX1_xmin88836_ymin66393_MPP-0.2500",
    "TCGA-A2-A04P-DX1_xmin104246_ymin48517_MPP-0.2500", "TCGA-E2-A14N-DX1_xmin21383_ymin66838_MPP-0.2500",
    "TCGA-EW-A1OV-DX1_xmin126026_ymin65132_MPP-0.2500", "TCGA-S3-AA15-DX1_xmin55486_ymin28926_MPP-0.2500",
    "TCGA-LL-A5YO-DX1_xmin36631_ymin44396_MPP-0.2500", "TCGA-GI-A2C9-DX1_xmin20882_ymin11843_MPP-0.2500",
    "TCGA-BH-A0BW-DX1_xmin42346_ymin30843_MPP-0.2500", "TCGA-E2-A1B6-DX1_xmin16266_ymin50634_MPP-0.2500",
    "TCGA-AO-A0J2-DX1_xmin33561_ymin14515_MPP-0.2500"
]
 65
 66
 67def _download_bcss_dataset(path, download):
 68    """Current recommendation:
 69        - download the folder from URL manually
 70        - use the consortium's git repo to download the dataset (https://github.com/PathologyDataScience/BCSS)
 71    """
 72    raise NotImplementedError("Please download the dataset using the drive link / git repo directly")
 73
 74    # FIXME: limitation for the installation below:
 75    #   - only downloads first 50 files - due to `gdown`'s download folder function
 76    #   - (optional) clone their git repo to download their data
 77    util.download_source_gdrive(path=path, url=URL, download=download, checksum=CHECKSUM, download_type="folder")
 78
 79
 80def _get_image_and_label_paths(path):
 81    # when downloading the files from `URL`, the input images are stored under `rgbs_colorNormalized`
 82    # when getting the files from the git repo's command line feature, the input images are stored under `images`
 83    if os.path.exists(os.path.join(path, "images")):
 84        image_paths = sorted(glob(os.path.join(path, "images", "*")))
 85        label_paths = sorted(glob(os.path.join(path, "masks", "*")))
 86    elif os.path.exists(os.path.join(path, "0_Public-data-Amgad2019_0.25MPP", "rgbs_colorNormalized")):
 87        image_paths = sorted(glob(os.path.join(path, "0_Public-data-Amgad2019_0.25MPP", "rgbs_colorNormalized", "*")))
 88        label_paths = sorted(glob(os.path.join(path, "0_Public-data-Amgad2019_0.25MPP", "masks", "*")))
 89    else:
 90        raise ValueError(
 91            "Please check the image directory. "
 92            "If downloaded from gdrive, it's named \"rgbs_colorNormalized\", if from github it's named \"images\""
 93        )
 94
 95    return image_paths, label_paths
 96
 97
 98def get_bcss_data(path: Union[os.PathLike, str], download: bool = False):
 99    """Download the BCSS dataset.
100
101    Args:
102        path: Filepath to a folder where the downloaded data will be saved.
103        download: Whether to download the data if it is not present.
104    """
105    if download:
106        _download_bcss_dataset(path, download)
107
108    if os.path.exists(os.path.join(path, "train")) and os.path.exists(os.path.join(path, "test")):
109        return
110
111    all_image_paths, all_label_paths = _get_image_and_label_paths(path)
112
113    train_img_dir, train_lab_dir = os.path.join(path, "train", "images"), os.path.join(path, "train", "masks")
114    test_img_dir, test_lab_dir = os.path.join(path, "test", "images"), os.path.join(path, "test", "masks")
115    os.makedirs(train_img_dir, exist_ok=True)
116    os.makedirs(train_lab_dir, exist_ok=True)
117    os.makedirs(test_img_dir, exist_ok=True)
118    os.makedirs(test_lab_dir, exist_ok=True)
119
120    for image_path, label_path in zip(all_image_paths, all_label_paths):
121        img_idx, label_idx = os.path.split(image_path)[-1], os.path.split(label_path)[-1]
122        if Path(image_path).stem in TEST_LIST:
123            # move image and label to test
124            dst_img_path, dst_lab_path = os.path.join(test_img_dir, img_idx), os.path.join(test_lab_dir, label_idx)
125            shutil.copy(src=image_path, dst=dst_img_path)
126            shutil.copy(src=label_path, dst=dst_lab_path)
127        else:
128            # move image and label to train
129            dst_img_path, dst_lab_path = os.path.join(train_img_dir, img_idx), os.path.join(train_lab_dir, label_idx)
130            shutil.copy(src=image_path, dst=dst_img_path)
131            shutil.copy(src=label_path, dst=dst_lab_path)
132
133
def get_bcsss_paths(
    path: Union[os.PathLike, str], split: Optional[str] = None, val_fraction: float = 0.2, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the BCSS data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
            If None, the data of the train and the test folder is returned together.
        val_fraction: The fraction of data to be considered for validation split.
            The validation data is taken from the train folder, using a fixed random state.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_bcss_data(path, download)

    def _list_folder(folder):
        images = glob(os.path.join(path, folder, "images", "*"))
        labels = glob(os.path.join(path, folder, "masks", "*"))
        return images, labels

    if split is None:
        # Only collect from the "train" and "test" folders created by `get_bcss_data`.
        # A wildcard over all subfolders of `path` also matches the raw "masks" folder of the gdrive download
        # ("0_Public-data-Amgad2019_0.25MPP/masks"), while the corresponding raw images are stored in
        # "rgbs_colorNormalized" and are not matched, so the number of labels would exceed the number of images.
        image_paths, label_paths = [], []
        for folder in ("train", "test"):
            folder_images, folder_labels = _list_folder(folder)
            image_paths.extend(folder_images)
            label_paths.extend(folder_labels)
        image_paths.sort()
        label_paths.sort()
    else:
        assert split in ["train", "val", "test"], "Please choose from the available `train` / `val` / `test` splits"
        if split == "test":
            image_paths, label_paths = _list_folder("test")
            image_paths.sort()
            label_paths.sort()
        else:
            image_paths, label_paths = _list_folder("train")
            image_paths.sort()
            label_paths.sort()

            # the fixed random state ensures that 'train' and 'val' are complementary across calls
            (train_image_paths, val_image_paths,
             train_label_paths, val_label_paths) = train_test_split(
                image_paths, label_paths, test_size=val_fraction, random_state=42
            )

            image_paths = train_image_paths if split == "train" else val_image_paths
            label_paths = train_label_paths if split == "train" else val_label_paths

    assert len(image_paths) == len(label_paths)

    return image_paths, label_paths
174
175
def get_bcss_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Optional[str] = None,
    val_fraction: float = 0.2,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    **kwargs
) -> Dataset:
    """Get the BCSS dataset for breast cancer tissue segmentation in histopathology.

    The image and label filepaths are looked up via `get_bcsss_paths` and then
    passed to `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, mask_paths = get_bcsss_paths(path=path, split=split, val_fraction=val_fraction, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset
211
212
def get_bcss_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Optional[str] = None,
    val_fraction: float = 0.2,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    **kwargs
) -> DataLoader:
    """Get the BCSS dataloader for breast cancer tissue segmentation in histopathology.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        kwargs: Additional keyword arguments. They are divided by `util.split_kwargs` into the arguments
            that are passed on to `get_bcss_dataset` (and from there to `torch_em.default_segmentation_dataset`)
            and the arguments that are passed on to `torch_em.get_data_loader`.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_bcss_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        val_fraction=val_fraction,
        download=download,
        label_dtype=label_dtype,
        **dataset_kwargs
    )

    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader
URL = 'https://drive.google.com/drive/folders/1zqbdkQF8i5cEmZOGmbdQm-EP8dRYtvss?usp=sharing'
CHECKSUM = None
TEST_LIST = ['TCGA-A2-A0SX-DX1_xmin53791_ymin56683_MPP-0.2500', 'TCGA-BH-A0BG-DX1_xmin64019_ymin24975_MPP-0.2500', 'TCGA-AR-A1AI-DX1_xmin38671_ymin10616_MPP-0.2500', 'TCGA-E2-A574-DX1_xmin54962_ymin47475_MPP-0.2500', 'TCGA-GM-A3XL-DX1_xmin29910_ymin15820_MPP-0.2500', 'TCGA-E2-A14X-DX1_xmin88836_ymin66393_MPP-0.2500', 'TCGA-A2-A04P-DX1_xmin104246_ymin48517_MPP-0.2500', 'TCGA-E2-A14N-DX1_xmin21383_ymin66838_MPP-0.2500', 'TCGA-EW-A1OV-DX1_xmin126026_ymin65132_MPP-0.2500', 'TCGA-S3-AA15-DX1_xmin55486_ymin28926_MPP-0.2500', 'TCGA-LL-A5YO-DX1_xmin36631_ymin44396_MPP-0.2500', 'TCGA-GI-A2C9-DX1_xmin20882_ymin11843_MPP-0.2500', 'TCGA-BH-A0BW-DX1_xmin42346_ymin30843_MPP-0.2500', 'TCGA-E2-A1B6-DX1_xmin16266_ymin50634_MPP-0.2500', 'TCGA-AO-A0J2-DX1_xmin33561_ymin14515_MPP-0.2500']
def get_bcss_data(path: Union[os.PathLike, str], download: bool = False):
 99def get_bcss_data(path: Union[os.PathLike, str], download: bool = False):
100    """Download the BCSS dataset.
101
102    Args:
103        path: Filepath to a folder where the downloaded data will be saved.
104        download: Whether to download the data if it is not present.
105    """
106    if download:
107        _download_bcss_dataset(path, download)
108
109    if os.path.exists(os.path.join(path, "train")) and os.path.exists(os.path.join(path, "test")):
110        return
111
112    all_image_paths, all_label_paths = _get_image_and_label_paths(path)
113
114    train_img_dir, train_lab_dir = os.path.join(path, "train", "images"), os.path.join(path, "train", "masks")
115    test_img_dir, test_lab_dir = os.path.join(path, "test", "images"), os.path.join(path, "test", "masks")
116    os.makedirs(train_img_dir, exist_ok=True)
117    os.makedirs(train_lab_dir, exist_ok=True)
118    os.makedirs(test_img_dir, exist_ok=True)
119    os.makedirs(test_lab_dir, exist_ok=True)
120
121    for image_path, label_path in zip(all_image_paths, all_label_paths):
122        img_idx, label_idx = os.path.split(image_path)[-1], os.path.split(label_path)[-1]
123        if Path(image_path).stem in TEST_LIST:
124            # move image and label to test
125            dst_img_path, dst_lab_path = os.path.join(test_img_dir, img_idx), os.path.join(test_lab_dir, label_idx)
126            shutil.copy(src=image_path, dst=dst_img_path)
127            shutil.copy(src=label_path, dst=dst_lab_path)
128        else:
129            # move image and label to train
130            dst_img_path, dst_lab_path = os.path.join(train_img_dir, img_idx), os.path.join(train_lab_dir, label_idx)
131            shutil.copy(src=image_path, dst=dst_img_path)
132            shutil.copy(src=label_path, dst=dst_lab_path)

Download the BCSS dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • download: Whether to download the data if it is not present.
def get_bcsss_paths( path: Union[os.PathLike, str], split: Optional[str] = None, val_fraction: float = 0.2, download: bool = False) -> Tuple[List[str], List[str]]:
def get_bcsss_paths(
    path: Union[os.PathLike, str], split: Optional[str] = None, val_fraction: float = 0.2, download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the BCSS data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
            If None, the data of the train and the test folder is returned together.
        val_fraction: The fraction of data to be considered for validation split.
            The validation data is taken from the train folder, using a fixed random state.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    get_bcss_data(path, download)

    def _list_folder(folder):
        images = glob(os.path.join(path, folder, "images", "*"))
        labels = glob(os.path.join(path, folder, "masks", "*"))
        return images, labels

    if split is None:
        # Only collect from the "train" and "test" folders created by `get_bcss_data`.
        # A wildcard over all subfolders of `path` also matches the raw "masks" folder of the gdrive download
        # ("0_Public-data-Amgad2019_0.25MPP/masks"), while the corresponding raw images are stored in
        # "rgbs_colorNormalized" and are not matched, so the number of labels would exceed the number of images.
        image_paths, label_paths = [], []
        for folder in ("train", "test"):
            folder_images, folder_labels = _list_folder(folder)
            image_paths.extend(folder_images)
            label_paths.extend(folder_labels)
        image_paths.sort()
        label_paths.sort()
    else:
        assert split in ["train", "val", "test"], "Please choose from the available `train` / `val` / `test` splits"
        if split == "test":
            image_paths, label_paths = _list_folder("test")
            image_paths.sort()
            label_paths.sort()
        else:
            image_paths, label_paths = _list_folder("train")
            image_paths.sort()
            label_paths.sort()

            # the fixed random state ensures that 'train' and 'val' are complementary across calls
            (train_image_paths, val_image_paths,
             train_label_paths, val_label_paths) = train_test_split(
                image_paths, label_paths, test_size=val_fraction, random_state=42
            )

            image_paths = train_image_paths if split == "train" else val_image_paths
            label_paths = train_label_paths if split == "train" else val_label_paths

    assert len(image_paths) == len(label_paths)

    return image_paths, label_paths

Get paths to the BCSS data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The split to use for the dataset. Either 'train', 'val' or 'test'.
  • val_fraction: The fraction of data to be considered for validation split.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_bcss_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], split: Optional[str] = None, val_fraction: float = 0.2, download: bool = False, label_dtype: torch.dtype = torch.int64, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_bcss_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Optional[str] = None,
    val_fraction: float = 0.2,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    **kwargs
) -> Dataset:
    """Get the BCSS dataset for breast cancer tissue segmentation in histopathology.

    The image and label filepaths are looked up via `get_bcsss_paths` and then
    passed to `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_paths, mask_paths = get_bcsss_paths(path=path, split=split, val_fraction=val_fraction, download=download)

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=mask_paths,
        label_key=None,
        patch_shape=patch_shape,
        label_dtype=label_dtype,
        is_seg_dataset=False,
        **kwargs
    )
    return dataset

Get the BCSS dataset for breast cancer tissue segmentation in histopathology.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The split to use for the dataset. Either 'train', 'val' or 'test'.
  • val_fraction: The fraction of data to be considered for validation split.
  • download: Whether to download the data if it is not present.
  • label_dtype: The datatype of labels.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_bcss_loader( path: Union[os.PathLike, str], patch_shape: Tuple[int, ...], batch_size: int, split: Optional[str] = None, val_fraction: float = 0.2, download: bool = False, label_dtype: torch.dtype = torch.int64, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_bcss_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    batch_size: int,
    split: Optional[str] = None,
    val_fraction: float = 0.2,
    download: bool = False,
    label_dtype: torch.dtype = torch.int64,
    **kwargs
) -> DataLoader:
    """Get the BCSS dataloader for breast cancer tissue segmentation in histopathology.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The split to use for the dataset. Either 'train', 'val' or 'test'.
        val_fraction: The fraction of data to be considered for validation split.
        download: Whether to download the data if it is not present.
        label_dtype: The datatype of labels.
        kwargs: Additional keyword arguments. They are divided by `util.split_kwargs` into the arguments
            that are passed on to `get_bcss_dataset` (and from there to `torch_em.default_segmentation_dataset`)
            and the arguments that are passed on to `torch_em.get_data_loader`.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_bcss_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        val_fraction=val_fraction,
        download=download,
        label_dtype=label_dtype,
        **dataset_kwargs
    )

    loader = torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)
    return loader

Get the BCSS dataloader for breast cancer tissue segmentation in histopathology.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • batch_size: The batch size for training.
  • split: The split to use for the dataset. Either 'train', 'val' or 'test'.
  • val_fraction: The fraction of data to be considered for validation split.
  • download: Whether to download the data if it is not present.
  • label_dtype: The datatype of labels.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the data loader created by torch_em.get_data_loader.
Returns:

The DataLoader.