torch_em.data.datasets.histopathology.janowczyk
The Janowczyk dataset contains annotations for nucleus, epithelium and tubule segmentation in H&E stained histopathology images for breast cancer.
NOTE:
- The nuclei are sparsely annotated instances for ER+ breast cancer images.
- The epithelium and tubule are dense semantic annotations for breast cancer images.
The dataset is located at https://andrewjanowczyk.com/deep-learning/. This dataset is from the publication https://doi.org/10.4103/2153-3539.186902. Please cite it if you use this dataset for your research.
"""The Janowczyk dataset contains annotations for nucleus, epithelium and tubule segmentation
in H&E stained histopathology images for breast cancer.

NOTE:
- The nuclei are sparsely annotated instances for ER+ breast cancer images.
- The epithelium and tubule are dense semantic annotations for breast cancer images.

The dataset is located at https://andrewjanowczyk.com/deep-learning/.
This dataset is from the publication https://doi.org/10.4103/2153-3539.186902.
Please cite it if you use this dataset for your research.
"""

import os
from glob import glob
from tqdm import tqdm
from natsort import natsorted
from typing import Union, Tuple, Literal, List, Optional

import json
import pandas as pd
import imageio.v3 as imageio
from sklearn.model_selection import train_test_split
from skimage.measure import label as connected_components

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URL = {
    "nuclei": "https://andrewjanowczyk.com/wp-static/nuclei.tgz",
    "epithelium": "https://andrewjanowczyk.com/wp-static/epi.tgz",
    "tubule": "https://andrewjanowczyk.com/wp-static/tubule.tgz",
}

CHECKSUM = {
    "nuclei": "cb881c29d9f0ae5ad1d953160a4e00be70af329e0351eed614d51b4b66c65e6b",
    "epithelium": "5ac91a48de7d4f158f72cfc239b9a465849166397580b95d8f695095f54bcf6d",
    "tubule": "4f3e49d32b993c773a4d437f7483677d6b7c53a1d29f6b0b359a21722fa1f8f3",
}


def _create_split_csv(path, split):
    """Load the stored train / val / test split for the nuclei data, creating it on first use.

    The split is stored in `<path>/janowczyk_split.csv` as a single row with one column per split.
    Each cell holds the list of image identifiers of that split, where an identifier is the
    file name prefix in front of `_original` of the images in `<path>/data/nuclei`.

    Args:
        path: The root folder of the dataset.
        split: The name of the split to return, one of 'train', 'val' or 'test'.

    Returns:
        The list of identifiers that belong to the requested split.
    """
    csv_path = os.path.join(path, 'janowczyk_split.csv')
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        # The cell stores the 'repr' of a python list; swap the quotes so that it parses as JSON.
        return json.loads(df.iloc[0][split].replace("'", '"'))

    print(f"Creating a new split file at '{csv_path}'.")
    image_paths = glob(os.path.join(path, 'data', 'nuclei', '*original.tif'))
    patient_ids = [os.path.basename(image).split("_original")[0] for image in image_paths]

    # NOTE: No random state is set, so the split is only reproducible through the stored csv file.
    train_ids, test_ids = train_test_split(patient_ids, test_size=0.2)  # 20% of all ids for the test split.
    train_ids, val_ids = train_test_split(train_ids, test_size=0.15)  # 15% of the remaining ids for the val split.

    split_ids = {"train": train_ids, "val": val_ids, "test": test_ids}
    pd.DataFrame.from_dict([split_ids]).to_csv(csv_path)

    return split_ids[split]


def get_janowczyk_data(
    path: Union[os.PathLike, str],
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    download: bool = False
) -> str:
    """Fetch and unpack one annotation subset of the Janowczyk dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        annotation: The choice of annotated labels.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath of the folder that holds the extracted data, i.e. `<path>/data/<annotation>`.

    Raises:
        ValueError: If `annotation` is not one of 'nuclei', 'epithelium' or 'tubule'.
    """
    supported_annotations = ('nuclei', 'epithelium', 'tubule')
    if annotation not in supported_annotations:
        raise ValueError(f"'{annotation}' is not a supported annotation for labels.")

    data_dir = os.path.join(path, "data", annotation)

    # The archive is only fetched and extracted when the target folder does not exist yet.
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)

        tar_path = os.path.join(path, f"{annotation}.tgz")
        util.download_source(
            path=tar_path, url=URL[annotation], download=download, checksum=CHECKSUM[annotation], verify=False
        )
        util.unzip_tarfile(tar_path=tar_path, dst=data_dir, remove=False)

    return data_dir


def get_janowczyk_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the Janowczyk data.

    For the 'nuclei' annotation the binary masks are converted to instance labels via connected
    components on first use. The result is stored next to the masks as `*_preprocessed_labels.tif`
    and these preprocessed files are returned as label paths.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split. Required for 'nuclei', must be None otherwise.
        annotation: The choice of annotated labels.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `annotation` is not supported, or if `annotation` is 'nuclei' and `split`
            is not one of 'train', 'val' or 'test'.
        AssertionError: If a split is requested for 'epithelium' or 'tubule', or if no data is found.
    """
    # Fail early with a clear message instead of a 'KeyError' from the split lookup further below.
    if annotation == "nuclei" and split not in ("train", "val", "test"):
        raise ValueError(
            f"'{split}' is not a supported split for the 'nuclei' annotation. "
            "Choose from 'train', 'val' or 'test'."
        )

    data_dir = get_janowczyk_data(path, annotation, download)

    if annotation == "nuclei":
        # Sort the identifiers once and derive both lists from them, so images and labels stay paired.
        split_list = natsorted(_create_split_csv(path, split))
        raw_paths = [os.path.join(data_dir, f"{name}_original.tif") for name in split_list]

        label_paths = []
        for name in tqdm(split_list, desc="Preprocessing 'nuclei' labels"):
            label_path = os.path.join(data_dir, f"{name}_preprocessed_labels.tif")
            label_paths.append(label_path)
            if os.path.exists(label_path):
                continue

            mask = imageio.imread(os.path.join(data_dir, f"{name}_mask.png"))
            instances = connected_components(mask)  # run connected components on all nuclei instances.
            imageio.imwrite(label_path, instances, compression="zlib")

    else:
        assert split is None, "No other dataset besides 'nuclei' has splits at the moment."

        if annotation == "epithelium":
            label_paths = natsorted(glob(os.path.join(data_dir, "masks", "*_mask.png")))
            # The images are stored one level above the 'masks' folder, without the '_mask' suffix.
            raw_paths = [
                os.path.join(data_dir, os.path.basename(p)[:-len("_mask.png")] + ".tif") for p in label_paths
            ]

        else:  # tubule
            label_paths = natsorted(glob(os.path.join(data_dir, "*_anno.bmp")))
            # Only strip the suffix of the file name, so folder names containing '_anno' stay intact.
            raw_paths = [p[:-len("_anno.bmp")] + ".bmp" for p in label_paths]

    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0

    return raw_paths, label_paths


def get_janowczyk_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the Janowczyk segmentation dataset for nucleus, epithelium or tubule annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation: The choice of annotated labels.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, annotation_paths = get_janowczyk_paths(path, split, annotation, download)

    if resize_inputs:
        # The images are RGB, which the resize transformation has to be told about.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=annotation_paths,
        label_key=None,
        is_seg_dataset=False,
        with_channels=True,
        ndim=2,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset


def get_janowczyk_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the Janowczyk dataloader for nucleus, epithelium or tubule annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation: The choice of annotated labels.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the ones meant for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_janowczyk_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        annotation=annotation,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL =
{'nuclei': 'https://andrewjanowczyk.com/wp-static/nuclei.tgz', 'epithelium': 'https://andrewjanowczyk.com/wp-static/epi.tgz', 'tubule': 'https://andrewjanowczyk.com/wp-static/tubule.tgz'}
CHECKSUM =
{'nuclei': 'cb881c29d9f0ae5ad1d953160a4e00be70af329e0351eed614d51b4b66c65e6b', 'epithelium': '5ac91a48de7d4f158f72cfc239b9a465849166397580b95d8f695095f54bcf6d', 'tubule': '4f3e49d32b993c773a4d437f7483677d6b7c53a1d29f6b0b359a21722fa1f8f3'}
def
get_janowczyk_data( path: Union[os.PathLike, str], annotation: Literal['nuclei', 'epithelium', 'tubule'] = 'nuclei', download: bool = False) -> str:
def get_janowczyk_data(
    path: Union[os.PathLike, str],
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    download: bool = False
) -> str:
    """Fetch and unpack one annotation subset of the Janowczyk dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        annotation: The choice of annotated labels.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath of the folder that holds the extracted data, i.e. `<path>/data/<annotation>`.

    Raises:
        ValueError: If `annotation` is not one of 'nuclei', 'epithelium' or 'tubule'.
    """
    supported_annotations = ('nuclei', 'epithelium', 'tubule')
    if annotation not in supported_annotations:
        raise ValueError(f"'{annotation}' is not a supported annotation for labels.")

    data_dir = os.path.join(path, "data", annotation)

    # The archive is only fetched and extracted when the target folder does not exist yet.
    if not os.path.exists(data_dir):
        os.makedirs(path, exist_ok=True)

        tar_path = os.path.join(path, f"{annotation}.tgz")
        util.download_source(
            path=tar_path, url=URL[annotation], download=download, checksum=CHECKSUM[annotation], verify=False
        )
        util.unzip_tarfile(tar_path=tar_path, dst=data_dir, remove=False)

    return data_dir
Download the Janowczyk dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- annotation: The choice of annotated labels.
- download: Whether to download the data if it is not present.
Returns:
Filepath where the dataset is downloaded.
def
get_janowczyk_paths( path: Union[os.PathLike, str], split: Optional[Literal['train', 'val', 'test']] = None, annotation: Literal['nuclei', 'epithelium', 'tubule'] = 'nuclei', download: bool = False) -> Tuple[List[str], List[str]]:
def get_janowczyk_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the Janowczyk data.

    For the 'nuclei' annotation the binary masks are converted to instance labels via connected
    components on first use. The result is stored next to the masks as `*_preprocessed_labels.tif`
    and these preprocessed files are returned as label paths.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split. Required for 'nuclei', must be None otherwise.
        annotation: The choice of annotated labels.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `annotation` is not supported, or if `annotation` is 'nuclei' and `split`
            is not one of 'train', 'val' or 'test'.
        AssertionError: If a split is requested for 'epithelium' or 'tubule', or if no data is found.
    """
    # Fail early with a clear message instead of a 'KeyError' from the split lookup further below.
    if annotation == "nuclei" and split not in ("train", "val", "test"):
        raise ValueError(
            f"'{split}' is not a supported split for the 'nuclei' annotation. "
            "Choose from 'train', 'val' or 'test'."
        )

    data_dir = get_janowczyk_data(path, annotation, download)

    if annotation == "nuclei":
        # Sort the identifiers once and derive both lists from them, so images and labels stay paired.
        split_list = natsorted(_create_split_csv(path, split))
        raw_paths = [os.path.join(data_dir, f"{name}_original.tif") for name in split_list]

        label_paths = []
        for name in tqdm(split_list, desc="Preprocessing 'nuclei' labels"):
            label_path = os.path.join(data_dir, f"{name}_preprocessed_labels.tif")
            label_paths.append(label_path)
            if os.path.exists(label_path):
                continue

            mask = imageio.imread(os.path.join(data_dir, f"{name}_mask.png"))
            instances = connected_components(mask)  # run connected components on all nuclei instances.
            imageio.imwrite(label_path, instances, compression="zlib")

    else:
        assert split is None, "No other dataset besides 'nuclei' has splits at the moment."

        if annotation == "epithelium":
            label_paths = natsorted(glob(os.path.join(data_dir, "masks", "*_mask.png")))
            # The images are stored one level above the 'masks' folder, without the '_mask' suffix.
            raw_paths = [
                os.path.join(data_dir, os.path.basename(p)[:-len("_mask.png")] + ".tif") for p in label_paths
            ]

        else:  # tubule
            label_paths = natsorted(glob(os.path.join(data_dir, "*_anno.bmp")))
            # Only strip the suffix of the file name, so folder names containing '_anno' stay intact.
            raw_paths = [p[:-len("_anno.bmp")] + ".bmp" for p in label_paths]

    assert len(raw_paths) == len(label_paths) and len(raw_paths) > 0

    return raw_paths, label_paths
Get paths to the Janowczyk data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The choice of data split.
- annotation: The choice of annotated labels.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_janowczyk_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, annotation: Literal['nuclei', 'epithelium', 'tubule'] = 'nuclei', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_janowczyk_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the Janowczyk segmentation dataset for nucleus, epithelium or tubule annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation: The choice of annotated labels.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, annotation_paths = get_janowczyk_paths(path, split, annotation, download)

    if resize_inputs:
        # The images are RGB, which the resize transformation has to be told about.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
        )

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=annotation_paths,
        label_key=None,
        is_seg_dataset=False,
        with_channels=True,
        ndim=2,
        patch_shape=patch_shape,
        **kwargs
    )
    return dataset
Get the Janowczyk dataset for nucleus, epithelium and tubule segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotation: The choice of annotated labels.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def
get_janowczyk_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Optional[Literal['train', 'val', 'test']] = None, annotation: Literal['nuclei', 'epithelium', 'tubule'] = 'nuclei', resize_inputs: bool = False, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_janowczyk_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Optional[Literal["train", "val", "test"]] = None,
    annotation: Literal['nuclei', 'epithelium', 'tubule'] = "nuclei",
    resize_inputs: bool = False,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the Janowczyk dataloader for nucleus, epithelium or tubule annotations.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The choice of data split.
        annotation: The choice of annotated labels.
        resize_inputs: Whether to resize the inputs.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments understood by the dataset factory from the ones meant for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_janowczyk_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        annotation=annotation,
        resize_inputs=resize_inputs,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get the Janowczyk dataloader for nucleus, epithelium and tubule segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The choice of data split.
- annotation: The choice of annotated labels.
- resize_inputs: Whether to resize the inputs.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.