torch_em.data.datasets.histopathology.pcns

The PCNS dataset contains manual annotations for nucleus instance segmentation in H&E stained histopathology images of fourteen cancer types from TCGA.

The dataset contains 1,365 manually annotated patches of 400x400 pixels at 40x magnification, covering BLCA, BRCA, CESC, COAD, GBM, LUAD, LUSC, PAAD, PRAD, READ, SKCM, STAD, UCEC and UVM cancer types. Annotations were created by three human annotators correcting Mask R-CNN predictions.

NOTE: This dataset requires manual download. Please download 'manual_segmentation_data.tar.gz' from the TCIA collection page at https://www.cancerimagingarchive.net/analysis-result/pan-cancer-nuclei-seg/ (direct Box link: https://stonybrookmedicine.app.box.com/v/cnn-nuclear-segmentations-2019/file/586046955275) and place it in the 'path' directory you pass to the dataset functions.

NOTE: The automatic segmentation results of 5,060 WSIs (~665 GB) are available via the same TCIA collection; to download them, use the IBM Aspera Connect plugin from the TCIA page. The Aspera manifests cover 10 cancer types (BLCA, BRCA, CESC, GBM, LUAD, LUSC, PAAD, PRAD, SKCM, UCEC) with per-WSI polygon CSV files under '{cancer_type}_polygon/' subdirectories.

The dataset is located at https://doi.org/10.7937/TCIA.2019.4A4DKP9U. This dataset is from the publication https://doi.org/10.1038/s41597-020-0528-1. Please cite it if you use this dataset in your research.

  1"""The PCNS dataset contains manual annotations for nucleus instance segmentation
  2in H&E stained histopathology images of fourteen cancer types from TCGA.
  3
  4The dataset contains 1,365 manually annotated patches of 400x400 pixels at 40x
  5magnification, covering BLCA, BRCA, CESC, COAD, GBM, LUAD, LUSC, PAAD, PRAD,
  6READ, SKCM, STAD, UCEC and UVM cancer types. Annotations were created by three
  7human annotators correcting Mask R-CNN predictions.
  8
  9NOTE: This dataset requires manual download. Please download
 10'manual_segmentation_data.tar.gz' from the TCIA collection page at
 11https://www.cancerimagingarchive.net/analysis-result/pan-cancer-nuclei-seg/
 12(direct Box link: https://stonybrookmedicine.app.box.com/v/cnn-nuclear-segmentations-2019/file/586046955275)
 13and place it in the 'path' directory you pass to the dataset functions.
 14
 15NOTE: The automatic segmentation results of 5,060 WSIs (~665 GB) are available via the same TCIA
 16collection; to download them, use the IBM Aspera Connect plugin from the TCIA page. The Aspera manifests
 17cover 10 cancer types (BLCA, BRCA, CESC, GBM, LUAD, LUSC, PAAD, PRAD, SKCM, UCEC)
 18with per-WSI polygon CSV files under '{cancer_type}_polygon/' subdirectories.
 19
 20The dataset is located at https://doi.org/10.7937/TCIA.2019.4A4DKP9U.
 21This dataset is from the publication https://doi.org/10.1038/s41597-020-0528-1.
 22Please cite it if you use this dataset in your research.
 23"""
 24
 25import io
 26import json
 27import os
 28import shutil
 29import tarfile
 30from glob import glob
 31from typing import List, Literal, Optional, Tuple, Union
 32
 33from tqdm import tqdm
 34
 35import h5py
 36import numpy as np
 37import pandas as pd
 38import imageio.v3 as imageio
 39from sklearn.model_selection import train_test_split
 40
 41from torch.utils.data import DataLoader, Dataset
 42
 43import torch_em
 44
 45from .. import util
 46
 47
 48CROSSWALK_URL = (
 49    "https://www.cancerimagingarchive.net/wp-content/uploads/"
 50    "Pan-Cancer-Nuclei-Seg_1365patches_to_TCGA-ID_readme.txt"
 51)
 52
 53BOX_URL = "https://stonybrookmedicine.app.box.com/v/cnn-nuclear-segmentations-2019/file/586046955275"
 54
 55CANCER_TYPES = [
 56    "blca", "brca", "cesc", "coad", "gbm", "luad", "lusc",
 57    "paad", "prad", "read", "skcm", "stad", "ucec", "uvm",
 58]
 59
 60
 61def _load_crosswalk(path: str) -> pd.DataFrame:
 62    crosswalk_path = os.path.join(path, "pcns_crosswalk.txt")
 63    if not os.path.exists(crosswalk_path):
 64        util.download_source(path=crosswalk_path, url=CROSSWALK_URL, download=True)
 65
 66    with open(crosswalk_path, "rb") as f:
 67        raw = f.read()
 68
 69    if raw[:2] == b"\x1f\x8b":
 70        import gzip
 71        content = gzip.decompress(raw).decode("utf-8")
 72    else:
 73        content = raw.decode("utf-8")
 74
 75    lines = content.split("\n")
 76    csv_start = next((i for i, line in enumerate(lines) if line.startswith("Patch-ID,")), None)
 77    if csv_start is None:
 78        raise RuntimeError("Failed to parse the PCNS crosswalk file. Re-download it and try again.")
 79
 80    df = pd.read_csv(io.StringIO("\n".join(lines[csv_start:])))
 81    df = df.dropna(subset=["Patch-ID", "CancerType"])
 82    df["Patch-ID"] = df["Patch-ID"].astype(int)
 83    df["CancerType"] = df["CancerType"].str.lower()
 84    return df
 85
 86
 87def _create_split_csv(path: str, all_patch_ids: List[int], split: str) -> List[int]:
 88    csv_path = os.path.join(path, "pcns_split.csv")
 89    if os.path.exists(csv_path):
 90        df = pd.read_csv(csv_path)
 91        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))
 92        return df.iloc[0][split]
 93
 94    print(f"Creating a new split file at '{csv_path}'.")
 95    train_ids, test_ids = train_test_split(all_patch_ids, test_size=0.2)
 96    split_ids = {"train": sorted(train_ids), "test": sorted(test_ids)}
 97    pd.DataFrame.from_dict([split_ids]).to_csv(csv_path, index=False)
 98    return split_ids[split]
 99
100
def _create_samples(path: str, extract_dir: str, crosswalk_df: pd.DataFrame) -> str:
    """Convert the extracted PNG patches into one H5 file per patch.

    Every '*_crop.png' found below `extract_dir` that has a matching
    '*_labeled_mask_corrected.png' is written to '{patch_id}.h5' with:
    - raw: the first three image channels, stored channel-first.
    - labels/instances: the corrected mask as int32.
    - labels/common0, labels/common1, labels/common2: the '*_labeled_mask_common{k}.png'
      masks as int32, zero-filled when the file does not exist.
    - attrs['cancer_type']: cancer type from the crosswalk, 'unknown' if the patch is not listed.
    - attrs['has_common']: True only if all three common masks exist.

    The files are first written to a staging folder that is renamed to
    'preprocessed_data' once all samples were created. This way an interrupted run
    never leaves a partially filled 'preprocessed_data' folder behind, which would
    otherwise be mistaken for a complete result on the next call.

    Args:
        path: Folder in which the 'preprocessed_data' directory is created.
        extract_dir: Folder containing the extracted contents of the tarball.
        crosswalk_df: Crosswalk table with the columns 'Patch-ID' and 'CancerType'.

    Returns:
        The filepath to the 'preprocessed_data' directory.
    """
    preprocessed_dir = os.path.join(path, "preprocessed_data")
    if os.path.exists(preprocessed_dir):
        return preprocessed_dir

    # Start from a clean staging folder; leftovers stem from an interrupted run.
    staging_dir = preprocessed_dir + ".partial"
    if os.path.exists(staging_dir):
        shutil.rmtree(staging_dir)
    os.makedirs(staging_dir)

    crop_suffix = "_crop.png"

    def _sibling(image_path: str, suffix: str) -> str:
        # Swap only the trailing '_crop.png' so that folder names are never altered.
        return image_path[:-len(crop_suffix)] + suffix

    crop_paths = {
        int(os.path.basename(p).split("_crop")[0]): p
        for p in glob(os.path.join(extract_dir, "**", "*" + crop_suffix), recursive=True)
    }

    ct_map = {
        int(pid): str(ct).lower()
        for pid, ct in zip(crosswalk_df["Patch-ID"], crosswalk_df["CancerType"])
    }

    # Only patches with a corrected mask can be used.
    valid_ids = [
        pid for pid, image_path in crop_paths.items()
        if os.path.exists(_sibling(image_path, "_labeled_mask_corrected.png"))
    ]

    for patch_id in tqdm(sorted(valid_ids), desc="Creating PCNS samples"):
        image_path = crop_paths[patch_id]
        mask_path = _sibling(image_path, "_labeled_mask_corrected.png")

        # Drop a potential alpha channel and move the channel axis to the front.
        raw = imageio.imread(image_path)[..., :3].transpose(2, 0, 1)
        mask = imageio.imread(mask_path).astype(np.int32)
        h, w = mask.shape

        h5_path = os.path.join(staging_dir, f"{patch_id}.h5")
        with h5py.File(h5_path, "w") as f:
            f.create_dataset("raw", data=raw, compression="gzip")
            f.create_dataset("labels/instances", data=mask, compression="gzip")

            has_all = True
            for k in range(3):
                common_path = _sibling(image_path, f"_labeled_mask_common{k}.png")
                if os.path.exists(common_path):
                    common_mask = imageio.imread(common_path).astype(np.int32)
                else:
                    # Keep the layout uniform across samples by storing an empty mask.
                    common_mask = np.zeros((h, w), dtype=np.int32)
                    has_all = False
                f.create_dataset(f"labels/common{k}", data=common_mask, compression="gzip")

            f.attrs["cancer_type"] = ct_map.get(patch_id, "unknown")
            f.attrs["has_common"] = has_all

    # Publish the result only after every sample was written successfully.
    os.replace(staging_dir, preprocessed_dir)
    return preprocessed_dir
146
147
def get_pcns_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Locate and extract the PCNS dataset, then build per-sample H5 files.

    The dataset requires manual download. Download 'manual_segmentation_data.tar.gz'
    from https://www.cancerimagingarchive.net/analysis-result/pan-cancer-nuclei-seg/
    and place it in the 'path' directory before calling this function.

    After preprocessing the final layout under 'path' is:
    - manual_segmentation_data.tar.gz
    - pcns_crosswalk.txt
    - pcns_split.csv
    - preprocessed_data/{patch_id}.h5 (one per patch)

    Each sample H5 stores:
    - raw: (3, H, W) uint8 RGB image
    - labels/instances: (H, W) int32 corrected instance mask
    - labels/common0/1/2: (H, W) int32 per-annotator masks (zero-filled if absent)
    - attrs['cancer_type']: str cancer type code
    - attrs['has_common']: bool, True for the 27 patches with per-annotator annotations

    Args:
        path: Filepath to the folder where the tarball was placed and data will be extracted.
        download: Never triggers a download, because the dataset cannot be downloaded
            automatically. It only selects the wording of the error that is raised
            when the tarball is missing.

    Returns:
        The filepath to the preprocessed_data directory containing per-sample H5 files.

    Raises:
        RuntimeError: If the data was not preprocessed yet and the tarball is missing.
    """
    path = os.path.normpath(path)
    preprocessed_dir = os.path.join(path, "preprocessed_data")

    if os.path.exists(preprocessed_dir):
        return preprocessed_dir

    tar_path = os.path.join(path, "manual_segmentation_data.tar.gz")
    # Only complain if the tarball is really missing: a tarball that is already in
    # place can be processed no matter which value 'download' has.
    if not os.path.exists(tar_path):
        if download:
            raise RuntimeError(
                "The PCNS dataset cannot be downloaded automatically. "
                f"Please download 'manual_segmentation_data.tar.gz' manually from {BOX_URL} "
                f"and place it at '{tar_path}'."
            )
        raise RuntimeError(
            "The PCNS dataset requires manual download. "
            f"Please download 'manual_segmentation_data.tar.gz' from {BOX_URL} "
            f"and place it at '{tar_path}'."
        )

    extract_dir = os.path.join(path, "_raw")
    os.makedirs(extract_dir, exist_ok=True)
    print(f"Extracting PCNS data to '{extract_dir}'...")
    try:
        with tarfile.open(tar_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would be written outside of 'extract_dir'.
                tar.extractall(path=extract_dir, filter="data")
            else:
                # Python versions without extraction filters.
                tar.extractall(path=extract_dir)

        crosswalk_df = _load_crosswalk(path)
        _create_samples(path, extract_dir, crosswalk_df)
    except BaseException:
        # 'preprocessed_dir' did not exist when this call started, so anything in it
        # is an incomplete result that must not be picked up by the next call.
        shutil.rmtree(preprocessed_dir, ignore_errors=True)
        raise
    finally:
        # The extracted PNG files are not needed once the H5 files exist (or on failure).
        shutil.rmtree(extract_dir, ignore_errors=True)

    return preprocessed_dir
207
208
def get_pcns_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    cancer_type: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Collect the per-sample H5 files that belong to a split.

    Args:
        path: Filepath to the folder where the data is located.
        split: The data split to use. Either 'train' or 'test'.
        cancer_type: A single cancer type or a list of cancer types to restrict the
            selection to (case-insensitive). If None, the samples of all types are used.
            Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc',
            'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
        download: Passed on to `get_pcns_data`. The dataset cannot be downloaded automatically.

    Returns:
        Sorted-by-patch-id list of filepaths to the existing H5 files of the selection.

    Raises:
        ValueError: If `split` or one of the cancer types is invalid.
        RuntimeError: If no sample file matches the selection.
    """
    if split not in ("train", "test"):
        raise ValueError(f"'{split}' is not a valid split. Choose from 'train' or 'test'.")

    preprocessed_dir = get_pcns_data(path, download)
    crosswalk_df = _load_crosswalk(path)

    # The split is defined over all patch ids listed in the crosswalk.
    selected_ids = set(_create_split_csv(path, crosswalk_df["Patch-ID"].tolist(), split))

    if cancer_type is not None:
        requested = [cancer_type] if isinstance(cancer_type, str) else cancer_type
        cancer_type = [name.lower() for name in requested]
        invalid = [name for name in cancer_type if name not in CANCER_TYPES]
        if invalid:
            raise ValueError(f"Invalid cancer type(s): {invalid}. Choose from {CANCER_TYPES}.")
        matches_type = crosswalk_df["CancerType"].isin(cancer_type)
        selected_ids &= set(crosswalk_df[matches_type]["Patch-ID"].tolist())

    # Ids without a preprocessed file are skipped silently.
    volume_paths = []
    for pid in sorted(selected_ids):
        candidate = os.path.join(preprocessed_dir, f"{pid}.h5")
        if os.path.exists(candidate):
            volume_paths.append(candidate)

    if len(volume_paths) == 0:
        raise RuntimeError(
            f"No samples found for split='{split}', cancer_type={cancer_type!r}. "
            "Ensure the data was extracted and preprocessed correctly."
        )

    return volume_paths
260
261
def get_pcns_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    cancer_type: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the PCNS segmentation dataset for nucleus instance segmentation.

    Images are read from the 'raw' key and labels from the 'labels/instances' key
    of the per-sample H5 files.

    Args:
        path: Filepath to the folder where the data is located.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        cancer_type: The cancer type(s) to load. If None, all fourteen types are used.
            Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc',
            'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
        download: Passed on to `get_pcns_paths`. The dataset cannot be downloaded automatically.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    sample_paths = get_pcns_paths(path, split, cancer_type, download)

    # Image and label data live in the same H5 files, under different keys.
    fixed_kwargs = dict(
        raw_paths=sample_paths,
        raw_key="raw",
        label_paths=sample_paths,
        label_key="labels/instances",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
    )
    return torch_em.default_segmentation_dataset(**fixed_kwargs, **kwargs)
296
297
def get_pcns_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    cancer_type: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the PCNS dataloader for nucleus instance segmentation.

    Args:
        path: Filepath to the folder where the data is located.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        cancer_type: The cancer type(s) to load. If None, all fourteen types are used.
            Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc',
            'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
        download: Passed on to `get_pcns_dataset`. The dataset cannot be downloaded automatically.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    split_result = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds_kwargs, loader_kwargs = split_result

    dataset = get_pcns_dataset(
        path, patch_shape, split, cancer_type, download, **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader
# Readme hosted by TCIA that contains the patch crosswalk table.
CROSSWALK_URL = (
    "https://www.cancerimagingarchive.net/wp-content/uploads/"
    "Pan-Cancer-Nuclei-Seg_1365patches_to_TCGA-ID_readme.txt"
)
# Box page from which 'manual_segmentation_data.tar.gz' has to be fetched by hand.
BOX_URL = "https://stonybrookmedicine.app.box.com/v/cnn-nuclear-segmentations-2019/file/586046955275"
# Lower-case cancer type codes accepted by the `cancer_type` argument.
CANCER_TYPES = [
    "blca", "brca", "cesc", "coad", "gbm", "luad", "lusc",
    "paad", "prad", "read", "skcm", "stad", "ucec", "uvm",
]
def get_pcns_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_pcns_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Locate and extract the PCNS dataset, then build per-sample H5 files.

    The dataset requires manual download. Download 'manual_segmentation_data.tar.gz'
    from https://www.cancerimagingarchive.net/analysis-result/pan-cancer-nuclei-seg/
    and place it in the 'path' directory before calling this function.

    After preprocessing the final layout under 'path' is:
    - manual_segmentation_data.tar.gz
    - pcns_crosswalk.txt
    - pcns_split.csv
    - preprocessed_data/{patch_id}.h5 (one per patch)

    Each sample H5 stores:
    - raw: (3, H, W) uint8 RGB image
    - labels/instances: (H, W) int32 corrected instance mask
    - labels/common0/1/2: (H, W) int32 per-annotator masks (zero-filled if absent)
    - attrs['cancer_type']: str cancer type code
    - attrs['has_common']: bool, True for the 27 patches with per-annotator annotations

    Args:
        path: Filepath to the folder where the tarball was placed and data will be extracted.
        download: Never triggers a download, because the dataset cannot be downloaded
            automatically. It only selects the wording of the error that is raised
            when the tarball is missing.

    Returns:
        The filepath to the preprocessed_data directory containing per-sample H5 files.

    Raises:
        RuntimeError: If the data was not preprocessed yet and the tarball is missing.
    """
    path = os.path.normpath(path)
    preprocessed_dir = os.path.join(path, "preprocessed_data")

    if os.path.exists(preprocessed_dir):
        return preprocessed_dir

    tar_path = os.path.join(path, "manual_segmentation_data.tar.gz")
    # Only complain if the tarball is really missing: a tarball that is already in
    # place can be processed no matter which value 'download' has.
    if not os.path.exists(tar_path):
        if download:
            raise RuntimeError(
                "The PCNS dataset cannot be downloaded automatically. "
                f"Please download 'manual_segmentation_data.tar.gz' manually from {BOX_URL} "
                f"and place it at '{tar_path}'."
            )
        raise RuntimeError(
            "The PCNS dataset requires manual download. "
            f"Please download 'manual_segmentation_data.tar.gz' from {BOX_URL} "
            f"and place it at '{tar_path}'."
        )

    extract_dir = os.path.join(path, "_raw")
    os.makedirs(extract_dir, exist_ok=True)
    print(f"Extracting PCNS data to '{extract_dir}'...")
    try:
        with tarfile.open(tar_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would be written outside of 'extract_dir'.
                tar.extractall(path=extract_dir, filter="data")
            else:
                # Python versions without extraction filters.
                tar.extractall(path=extract_dir)

        crosswalk_df = _load_crosswalk(path)
        _create_samples(path, extract_dir, crosswalk_df)
    except BaseException:
        # 'preprocessed_dir' did not exist when this call started, so anything in it
        # is an incomplete result that must not be picked up by the next call.
        shutil.rmtree(preprocessed_dir, ignore_errors=True)
        raise
    finally:
        # The extracted PNG files are not needed once the H5 files exist (or on failure).
        shutil.rmtree(extract_dir, ignore_errors=True)

    return preprocessed_dir

Locate and extract the PCNS dataset, then build per-sample H5 files.

The dataset requires manual download. Download 'manual_segmentation_data.tar.gz' from https://www.cancerimagingarchive.net/analysis-result/pan-cancer-nuclei-seg/ and place it in the 'path' directory before calling this function.

After preprocessing the final layout under 'path' is:

  • manual_segmentation_data.tar.gz
  • pcns_crosswalk.txt
  • pcns_split.csv
  • preprocessed_data/{patch_id}.h5 (one per patch)

Each sample H5 stores:

  • raw: (3, H, W) uint8 RGB image
  • labels/instances: (H, W) int32 corrected instance mask
  • labels/common0/1/2: (H, W) int32 per-annotator masks (zero-filled if absent)
  • attrs['cancer_type']: str cancer type code
  • attrs['has_common']: bool, True for the 27 patches with per-annotator annotations
Arguments:
  • path: Filepath to the folder where the tarball was placed and data will be extracted.
  • download: Unused. The dataset cannot be downloaded automatically.
Returns:

The filepath to the preprocessed_data directory containing per-sample H5 files.

def get_pcns_paths( path: Union[os.PathLike, str], split: Literal['train', 'test'], cancer_type: Union[List[str], str, NoneType] = None, download: bool = False) -> List[str]:
def get_pcns_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "test"],
    cancer_type: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Collect the per-sample H5 files that belong to a split.

    Args:
        path: Filepath to the folder where the data is located.
        split: The data split to use. Either 'train' or 'test'.
        cancer_type: A single cancer type or a list of cancer types to restrict the
            selection to (case-insensitive). If None, the samples of all types are used.
            Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc',
            'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
        download: Passed on to `get_pcns_data`. The dataset cannot be downloaded automatically.

    Returns:
        Sorted-by-patch-id list of filepaths to the existing H5 files of the selection.

    Raises:
        ValueError: If `split` or one of the cancer types is invalid.
        RuntimeError: If no sample file matches the selection.
    """
    if split not in ("train", "test"):
        raise ValueError(f"'{split}' is not a valid split. Choose from 'train' or 'test'.")

    preprocessed_dir = get_pcns_data(path, download)
    crosswalk_df = _load_crosswalk(path)

    # The split is defined over all patch ids listed in the crosswalk.
    selected_ids = set(_create_split_csv(path, crosswalk_df["Patch-ID"].tolist(), split))

    if cancer_type is not None:
        requested = [cancer_type] if isinstance(cancer_type, str) else cancer_type
        cancer_type = [name.lower() for name in requested]
        invalid = [name for name in cancer_type if name not in CANCER_TYPES]
        if invalid:
            raise ValueError(f"Invalid cancer type(s): {invalid}. Choose from {CANCER_TYPES}.")
        matches_type = crosswalk_df["CancerType"].isin(cancer_type)
        selected_ids &= set(crosswalk_df[matches_type]["Patch-ID"].tolist())

    # Ids without a preprocessed file are skipped silently.
    volume_paths = []
    for pid in sorted(selected_ids):
        candidate = os.path.join(preprocessed_dir, f"{pid}.h5")
        if os.path.exists(candidate):
            volume_paths.append(candidate)

    if len(volume_paths) == 0:
        raise RuntimeError(
            f"No samples found for split='{split}', cancer_type={cancer_type!r}. "
            "Ensure the data was extracted and preprocessed correctly."
        )

    return volume_paths

Get the paths to the per-sample H5 files for the requested split.

Arguments:
  • path: Filepath to the folder where the data is located.
  • split: The data split to use. Either 'train' or 'test'.
  • cancer_type: The cancer type(s) to load. If None, all fourteen types are used. Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc', 'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
  • download: Unused. The dataset cannot be downloaded automatically.
Returns:

List of filepaths to the per-sample H5 files for the requested split and cancer type.

def get_pcns_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'test'], cancer_type: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_pcns_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    cancer_type: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the PCNS segmentation dataset for nucleus instance segmentation.

    Images are read from the 'raw' key and labels from the 'labels/instances' key
    of the per-sample H5 files.

    Args:
        path: Filepath to the folder where the data is located.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        cancer_type: The cancer type(s) to load. If None, all fourteen types are used.
            Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc',
            'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
        download: Passed on to `get_pcns_paths`. The dataset cannot be downloaded automatically.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    sample_paths = get_pcns_paths(path, split, cancer_type, download)

    # Image and label data live in the same H5 files, under different keys.
    fixed_kwargs = dict(
        raw_paths=sample_paths,
        raw_key="raw",
        label_paths=sample_paths,
        label_key="labels/instances",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
    )
    return torch_em.default_segmentation_dataset(**fixed_kwargs, **kwargs)

Get the PCNS dataset for nucleus instance segmentation.

Arguments:
  • path: Filepath to the folder where the data is located.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train' or 'test'.
  • cancer_type: The cancer type(s) to load. If None, all fourteen types are used. Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc', 'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
  • download: Unused. The dataset cannot be downloaded automatically.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_pcns_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'test'], cancer_type: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_pcns_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "test"],
    cancer_type: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the PCNS dataloader for nucleus instance segmentation.

    Args:
        path: Filepath to the folder where the data is located.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train' or 'test'.
        cancer_type: The cancer type(s) to load. If None, all fourteen types are used.
            Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc',
            'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
        download: Passed on to `get_pcns_dataset`. The dataset cannot be downloaded automatically.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from those meant for the loader.
    split_result = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    ds_kwargs, loader_kwargs = split_result

    dataset = get_pcns_dataset(
        path, patch_shape, split, cancer_type, download, **ds_kwargs
    )
    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
    return loader

Get the PCNS dataloader for nucleus instance segmentation.

Arguments:
  • path: Filepath to the folder where the data is located.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The data split to use. Either 'train' or 'test'.
  • cancer_type: The cancer type(s) to load. If None, all fourteen types are used. Valid values: 'blca', 'brca', 'cesc', 'coad', 'gbm', 'luad', 'lusc', 'paad', 'prad', 'read', 'skcm', 'stad', 'ucec', 'uvm'.
  • download: Unused. The dataset cannot be downloaded automatically.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.