torch_em.data.datasets.histopathology.janowczyk

The Janowczyk dataset contains annotations for nucleus, epithelium and tubule segmentation in H&E stained histopathology images for breast cancer.

NOTE:

  • The nuclei are sparsely annotated instances for ER+ breast cancer images.
  • The epithelium and tubule are dense semantic annotations for breast cancer images.

The dataset is located at https://andrewjanowczyk.com/deep-learning/. This dataset is from the publication https://doi.org/10.4103/2153-3539.186902. Please cite it if you use this dataset for your research.

  1"""The Janowczyk dataset contains annotations for nucleus, epithelium and tubule segmentation
  2in H&E stained histopathology images for breast cancer.
  3
  4NOTE:
  5- The nuclei are sparsely annotated instances for ER+ breast cancer images.
  6- The epithelium and tubule are dense semantic annotations for breast cancer images.
  7
  8The dataset is located at https://andrewjanowczyk.com/deep-learning/.
  9This dataset is from the publication https://doi.org/10.4103/2153-3539.186902.
 10Please cite it if you use this dataset for your research.
 11"""
 12
 13import os
 14from glob import glob
 15from tqdm import tqdm
 16from natsort import natsorted
 17from typing import Union, Tuple, Literal, List, Optional
 18
 19import json
 20import pandas as pd
 21import imageio.v3 as imageio
 22from sklearn.model_selection import train_test_split
 23from skimage.measure import label as connected_components
 24
 25from torch.utils.data import Dataset, DataLoader
 26
 27import torch_em
 28
 29from .. import util
 30
 31
 # Download location of the archive for each supported annotation type.
 32URL = {
 33    "nuclei": "https://andrewjanowczyk.com/wp-static/nuclei.tgz",
 34    "epithelium": "https://andrewjanowczyk.com/wp-static/epi.tgz",
 35    "tubule": "https://andrewjanowczyk.com/wp-static/tubule.tgz",
 36}
 37
 # Expected checksum of each archive, handed to `util.download_source` together with the URL
 # (64 hex digits, presumably SHA-256 - TODO confirm against `util.download_source`).
 38CHECKSUM = {
 39    "nuclei": "cb881c29d9f0ae5ad1d953160a4e00be70af329e0351eed614d51b4b66c65e6b",
 40    "epithelium": "5ac91a48de7d4f158f72cfc239b9a465849166397580b95d8f695095f54bcf6d",
 41    "tubule": "4f3e49d32b993c773a4d437f7483677d6b7c53a1d29f6b0b359a21722fa1f8f3",
 42}
 43
 44
 45def _create_split_csv(path, split):
 46    "Create splits on patient level data."
 47    csv_path = os.path.join(path, 'janowczyk_split.csv')
 48    if os.path.exists(csv_path):
 49        df = pd.read_csv(csv_path)
 50        df[split] = df[split].apply(lambda x: json.loads(x.replace("'", '"')))  # ensures all items from column in list.
 51        split_list = df.iloc[0][split]
 52
 53    else:
 54        print(f"Creating a new split file at '{csv_path}'.")
 55        patient_ids = [
 56            os.path.basename(image).split("_original")[0]
 57            for image in glob(os.path.join(path, 'data', 'nuclei', '*original.tif'))
 58        ]
 59
 60        train_ids, test_ids = train_test_split(patient_ids, test_size=0.2)  # 20% for test split.
 61        train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% for train split.
 62
 63        split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}
 64        df = pd.DataFrame.from_dict([split_ids])
 65        df.to_csv(csv_path)
 66        split_list = split_ids[split]
 67
 68    return split_list
 69
 70
 71def get_janowczyk_data(
 72    path: Union[os.PathLike, str],
 73    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
 74    download: bool = False
 75) -> str:
 76    """Download the Janowczyk dataset.
 77
 78    Args:
 79        path: Filepath to a folder where the downloaded data will be saved.
 80        annotation: The choice of annotated labels.
 81        download: Whether to download the data if it is not present.
 82
 83    Returns:
 84        Filepath where the dataset is downloaded.
 85    """
 86    if annotation not in ['nuclei', 'epithelium', 'tubule']:
 87        raise ValueError(f"'{annotation}' is not a supported annotation for labels.")
 88
 89    data_dir = os.path.join(path, "data", annotation)
 90    if os.path.exists(data_dir):
 91        return data_dir
 92
 93    os.makedirs(path, exist_ok=True)
 94
 95    tar_path = os.path.join(path, f"{annotation}.tgz")
 96    util.download_source(
 97        path=tar_path, url=URL[annotation], download=download, checksum=CHECKSUM[annotation], verify=False
 98    )
 99    util.unzip_tarfile(tar_path=tar_path, dst=data_dir, remove=False)
100
101    return data_dir
102
103
def get_janowczyk_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the Janowczyk data.

    For the 'nuclei' annotations, the masks are converted once to instance labels by running
    connected components. The result is stored next to the masks as '*_preprocessed_labels.tif'
    and these files are returned as labels.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split. Required for 'nuclei' (one of 'train', 'val' or 'test'),
            must be None for 'epithelium' and 'tubule'.
        annotation: The choice of annotated labels.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `split` does not fit to the chosen `annotation`.
        RuntimeError: If no data was found for the chosen `annotation`.
    """
    # Check the split before any data is downloaded. Unknown annotations are rejected by
    # 'get_janowczyk_data' below, hence they are not handled here.
    if annotation == "nuclei":
        if split not in ("train", "val", "test"):
            raise ValueError(
                f"'{split}' is not a valid split for 'nuclei'. Choose one of 'train', 'val' or 'test'."
            )
    elif annotation in ("epithelium", "tubule") and split is not None:
        raise ValueError("No other dataset besides 'nuclei' has splits at the moment.")

    data_dir = get_janowczyk_data(path, annotation, download)

    if annotation == "nuclei":
        # Sort the identifiers once and derive both lists from them, so that the image and
        # label at the same index always belong together.
        names = natsorted(_create_split_csv(path, split))
        raw_paths = [os.path.join(data_dir, f"{name}_original.tif") for name in names]

        label_paths = []
        for name in tqdm(names, desc="Preprocessing 'nuclei' labels"):
            label_path = os.path.join(data_dir, f"{name}_preprocessed_labels.tif")
            label_paths.append(label_path)
            if os.path.exists(label_path):  # Already preprocessed in a previous call.
                continue

            mask = imageio.imread(os.path.join(data_dir, f"{name}_mask.png"))
            instances = connected_components(mask)  # run connected components on all nuclei instances.
            imageio.imwrite(label_path, instances, compression="zlib")

    elif annotation == "epithelium":
        label_paths = natsorted(glob(os.path.join(data_dir, "masks", "*_mask.png")))
        # The images are located one level above the 'masks' folder. Only the file name is
        # modified, so that separators or folder names in 'path' cannot interfere.
        raw_paths = [
            os.path.join(data_dir, os.path.basename(p)[:-len("_mask.png")] + ".tif") for p in label_paths
        ]

    else:  # tubule
        label_paths = natsorted(glob(os.path.join(data_dir, "*_anno.bmp")))
        # Image and annotation are in the same folder: strip the '_anno' suffix of the file name only.
        raw_paths = [p[:-len("_anno.bmp")] + ".bmp" for p in label_paths]

    if len(raw_paths) == 0 or len(raw_paths) != len(label_paths):
        raise RuntimeError(f"Could not find matching images and labels for '{annotation}' in '{data_dir}'.")

    return raw_paths, label_paths
158
159
def get_janowczyk_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the segmentation dataset for the Janowczyk nucleus, epithelium or tubule annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation: The choice of annotated labels.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, annotation_paths = get_janowczyk_paths(path, split, annotation, download)

    if resize_inputs:
        # The helper returns the updated dataset arguments and the patch shape to use with them.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    # Fixed settings of this dataset: 2d images with a channel axis, one file per image and label.
    dataset_settings = dict(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=annotation_paths,
        label_key=None,
        is_seg_dataset=False,
        with_channels=True,
        ndim=2,
        patch_shape=patch_shape,
    )
    return torch_em.default_segmentation_dataset(**dataset_settings, **kwargs)
202
203
def get_janowczyk_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the dataloader for the Janowczyk nucleus, epithelium or tubule annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation: The choice of annotated labels.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the remaining ones, which go to the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    janowczyk_dataset = get_janowczyk_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        annotation=annotation,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(janowczyk_dataset, batch_size, **dataloader_kwargs)
URL = {'nuclei': 'https://andrewjanowczyk.com/wp-static/nuclei.tgz', 'epithelium': 'https://andrewjanowczyk.com/wp-static/epi.tgz', 'tubule': 'https://andrewjanowczyk.com/wp-static/tubule.tgz'}
CHECKSUM = {'nuclei': 'cb881c29d9f0ae5ad1d953160a4e00be70af329e0351eed614d51b4b66c65e6b', 'epithelium': '5ac91a48de7d4f158f72cfc239b9a465849166397580b95d8f695095f54bcf6d', 'tubule': '4f3e49d32b993c773a4d437f7483677d6b7c53a1d29f6b0b359a21722fa1f8f3'}
def get_janowczyk_data( path: Union[os.PathLike, str], annotation: Literal['nuclei', 'epithelium', 'tubule'] = 'nuclei', download: bool = False) -> str:
 72def get_janowczyk_data(
 73    path: Union[os.PathLike, str],
 74    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
 75    download: bool = False
 76) -> str:
 77    """Download the Janowczyk dataset.
 78
 79    Args:
 80        path: Filepath to a folder where the downloaded data will be saved.
 81        annotation: The choice of annotated labels.
 82        download: Whether to download the data if it is not present.
 83
 84    Returns:
 85        Filepath where the dataset is downloaded.
 86    """
 87    if annotation not in ['nuclei', 'epithelium', 'tubule']:
 88        raise ValueError(f"'{annotation}' is not a supported annotation for labels.")
 89
 90    data_dir = os.path.join(path, "data", annotation)
 91    if os.path.exists(data_dir):
 92        return data_dir
 93
 94    os.makedirs(path, exist_ok=True)
 95
 96    tar_path = os.path.join(path, f"{annotation}.tgz")
 97    util.download_source(
 98        path=tar_path, url=URL[annotation], download=download, checksum=CHECKSUM[annotation], verify=False
 99    )
100    util.unzip_tarfile(tar_path=tar_path, dst=data_dir, remove=False)
101
102    return data_dir

Download the Janowczyk dataset.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • annotation: The choice of annotated labels.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the dataset is downloaded.

def get_janowczyk_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'val', 'test']] = None, annotation: Literal['nuclei', 'epithelium', 'tubule'] = 'nuclei', download: bool = False) -> Tuple[List[str], List[str]]:
105def get_janowczyk_paths(
106    path: Union[os.PathLike, str],
107    split: Optional[Literal["train", "val", "test"]] = None,
108    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
109    download: bool = False
110) -> Tuple[List[str], List[str]]:
111    """Get paths to the Janowczyk data.
112
113    Args:
114        path: Filepath to a folder where the downloaded data will be saved.
115        split: The choice of data split.
116        annotation: The choice of annotated labels.
117        download: Whether to download the data if it is not present.
118
119    Returns:
120        List of filepaths for the image data.
121        List of filepaths for the label data.
122    """
123    data_dir = get_janowczyk_data(path, annotation, download)
124
125    if annotation == "nuclei":
126        split_list = _create_split_csv(path, split)
127
128        raw_paths = [os.path.join(data_dir, f"{name}_original.tif") for name in split_list]
129        label_paths = [os.path.join(data_dir, f"{name}_mask.png") for name in split_list]
130
131        neu_label_paths = []
132        for lpath in tqdm(label_paths, desc="Preprocessing 'nuclei' labels"):
133            neu_label_path = lpath.replace("_mask.png", "_preprocessed_labels.tif")
134            neu_label_paths.append(neu_label_path)
135            if os.path.exists(neu_label_path):
136                continue
137
138            label = imageio.imread(lpath)
139            label = connected_components(label)  # run connected components on all nuclei instances.
140            imageio.imwrite(neu_label_path, label, compression="zlib")
141
142        label_paths = natsorted(label_paths)
143        raw_paths = natsorted(raw_paths)
144
145    else:
146        assert split is None, "No other dataset besides 'nuclei' has splits at the moment."
147
148        if annotation == "epithelium":
149            label_paths = natsorted(glob(os.path.join(data_dir, "masks", "*_mask.png")))
150            raw_paths = [p.replace("masks/", "").replace("_mask.png", ".tif") for p in label_paths]
151
152        else:  # tubule
153            label_paths = natsorted(glob(os.path.join(data_dir, "*_anno.bmp")))
154            raw_paths = [p.replace("_anno", "") for p in label_paths]
155
156    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0
157
158    return raw_paths, label_paths

Get paths to the Janowczyk data.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • split: The choice of data split.
  • annotation: The choice of annotated labels.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_janowczyk_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, annotation: Literal['nuclei', 'epithelium', 'tubule'] = 'nuclei', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
161def get_janowczyk_dataset(
162    path: Union[os.PathLike, str],
163    patch_shape: Tuple[int, int],
164    split: Optional[Literal["train", "val", "test"]] = None,
165    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
166    resize_inputs: bool = False,
167    download: bool = False,
168    **kwargs
169) -> Dataset:
170    """Get the Janowczyk dataset for nucleus, epithelium and tubule segmentation.
171
172    Args:
173        path: Filepath to a folder where the downloaded data will be saved.
174        patch_shape: The patch shape to use for training.
175        split: The choice of data split.
176        annotation: The choice of annotated labels.
177        resize_inputs: Whether to resize the inputs.
178        download: Whether to download the data if it is not present.
179        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
180
181    Returns:
182        The segmentation dataset.
183    """
184    raw_paths, label_paths = get_janowczyk_paths(path, split, annotation, download)
185
186    if resize_inputs:
187        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
188        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
189            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
190        )
191
192    return torch_em.default_segmentation_dataset(
193        raw_paths=raw_paths,
194        raw_key=None,
195        label_paths=label_paths,
196        label_key=None,
197        is_seg_dataset=False,
198        with_channels=True,
199        ndim=2,
200        patch_shape=patch_shape,
201        **kwargs
202    )

Get the Janowczyk dataset for nucleus, epithelium and tubule segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • annotation: The choice of annotated labels.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_janowczyk_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, annotation: Literal['nuclei', 'epithelium', 'tubule'] = 'nuclei', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
205def get_janowczyk_loader(
206    path: Union[os.PathLike, str],
207    batch_size: int,
208    patch_shape: Tuple[int, int],
209    split: Optional[Literal["train", "val", "test"]] = None,
210    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
211    resize_inputs: bool = False,
212    download: bool = False,
213    **kwargs
214) -> DataLoader:
215    """Get the Janowczyk dataloader for nucleus, epithelium and tubule segmentation.
216
217    Args:
218        path: Filepath to a folder where the downloaded data will be saved.
219        batch_size: The batch size for training.
220        patch_shape: The patch shape to use for training.
221        split: The choice of data split.
222        annotation: The choice of annotated labels.
223        resize_inputs: Whether to resize the inputs.
224        download: Whether to download the data if it is not present.
225        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
226
227    Returns:
228        The DataLoader.
229    """
230    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
231    dataset = get_janowczyk_dataset(path, patch_shape, split, annotation, resize_inputs, download, **ds_kwargs)
232    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)

Get the Janowczyk dataloader for nucleus, epithelium and tubule segmentation.

Arguments:
  • path: Filepath to a folder where the downloaded data will be saved.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • split: The choice of data split.
  • annotation: The choice of annotated labels.
  • resize_inputs: Whether to resize the inputs.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.