torch_em.data.datasets.light_microscopy.bmgd
The BMGD (Breast Mammary Gland Dataset) contains DAPI-stained fluorescent microscopy images for nuclei segmentation in mammary gland tissue.
The dataset includes 819 image patches with over 9,500 manually segmented nuclei from mammary epithelial cells cultured under different microenvironmental stiffness conditions.
The dataset is from: https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD
Please cite the following paper if you use this dataset in your research: https://doi.org/10.21203/rs.3.rs-8263420/v1
1"""The BMGD (Breast Mammary Gland Dataset) contains DAPI-stained fluorescent microscopy 2images for nuclei segmentation in mammary gland tissue. 3 4The dataset includes 819 image patches with over 9,500 manually segmented nuclei 5from mammary epithelial cells cultured under different microenvironmental stiffness conditions. 6 7The dataset is from: https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD 8Please cite the following paper if you use this dataset in your research: 9https://doi.org/10.21203/rs.3.rs-8263420/v1 10""" 11 12import os 13from glob import glob 14from typing import Union, Tuple, List, Optional 15 16import numpy as np 17 18from torch.utils.data import Dataset, DataLoader 19 20import torch_em 21 22from .. import util 23 24 25URLS = { 26 "250pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/250%20Pa.7z", 27 "950pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/950%20Pa.7z", 28 "1200pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/1200%20Pa.7z", 29 "1800pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/1800%20Pa.7z", 30} 31 32# Folder names inside the archives (with spaces) 33_FOLDER_NAMES = { 34 "250pa": "250 Pa", 35 "950pa": "950 Pa", 36 "1200pa": "1200 Pa", 37 "1800pa": "1800 Pa", 38} 39 40STIFFNESS_LEVELS = list(URLS.keys()) 41 42 43def get_bmgd_data( 44 path: Union[os.PathLike, str], 45 stiffness: Optional[Union[str, List[str]]] = None, 46 download: bool = False, 47) -> str: 48 """Download the BMGD dataset. 49 50 Args: 51 path: Filepath to a folder where the downloaded data will be saved. 52 stiffness: The stiffness level(s) to download. One of '250pa', '950pa', '1200pa', '1800pa'. 53 If None, downloads all stiffness levels. 54 download: Whether to download the data if it is not present. 55 56 Returns: 57 The filepath to the dataset directory. 58 """ 59 if stiffness is None: 60 stiffness = STIFFNESS_LEVELS 61 elif isinstance(stiffness, str): 62 stiffness = [stiffness] 63 64 for s in stiffness: 65 assert s in STIFFNESS_LEVELS, f"'{s}' is not valid. Choose from {STIFFNESS_LEVELS}." 
66 67 folder_name = _FOLDER_NAMES[s] 68 data_dir = os.path.join(path, folder_name) 69 70 if os.path.exists(data_dir) and len(glob(os.path.join(data_dir, "image", "*.tif"))) > 0: 71 continue 72 73 os.makedirs(path, exist_ok=True) 74 75 archive_path = os.path.join(path, f"{s}.7z") 76 util.download_source(path=archive_path, url=URLS[s], download=download, checksum=None) 77 78 # Extract 7z archive 79 util.unzip(zip_path=archive_path, dst=path, remove=False) 80 81 return path 82 83 84def _create_bmgd_h5(path, stiffness): 85 """Create processed h5 files with instance labels from semantic masks.""" 86 import h5py 87 from skimage.measure import label 88 from tqdm import tqdm 89 import tifffile 90 91 folder_name = _FOLDER_NAMES[stiffness] 92 data_dir = os.path.join(path, folder_name) 93 h5_out_dir = os.path.join(path, "processed", stiffness) 94 os.makedirs(h5_out_dir, exist_ok=True) 95 96 images_dir = os.path.join(data_dir, "image") 97 masks_dir = os.path.join(data_dir, "mask") 98 99 # Find all image files 100 image_files = sorted(glob(os.path.join(images_dir, "*.tif"))) 101 102 for img_path in tqdm(image_files, desc=f"Processing BMGD {stiffness}"): 103 fname = os.path.basename(img_path) 104 mask_path = os.path.join(masks_dir, fname) 105 106 if not os.path.exists(mask_path): 107 continue 108 109 out_fname = f"bmgd_{stiffness}_{fname.replace('.tif', '.h5')}" 110 out_path = os.path.join(h5_out_dir, out_fname) 111 112 if os.path.exists(out_path): 113 continue 114 115 raw = tifffile.imread(img_path) 116 mask = tifffile.imread(mask_path) 117 118 # Convert semantic mask to instance labels using connected components 119 instances = label(mask > 0).astype("int64") 120 121 with h5py.File(out_path, "w") as f: 122 f.create_dataset("raw", data=raw, compression="gzip") 123 f.create_dataset("labels/instances", data=instances, compression="gzip") 124 f.create_dataset("labels/semantic", data=(mask > 0).astype("uint8"), compression="gzip") 125 126 return h5_out_dir 127 128 129def get_bmgd_paths( 130 path: Union[os.PathLike, str], 131 stiffness: Optional[Union[str, List[str]]] = None, 132 download: bool = False, 133) -> List[str]: 134 """Get paths to the BMGD data. 135 136 Args: 137 path: Filepath to a folder where the downloaded data will be saved. 138 stiffness: The stiffness level(s). If None, uses all levels. 139 download: Whether to download the data if it is not present. 140 141 Returns: 142 List of filepaths for the processed h5 data. 143 """ 144 from natsort import natsorted 145 146 get_bmgd_data(path, stiffness, download) 147 148 if stiffness is None: 149 stiffness = STIFFNESS_LEVELS 150 elif isinstance(stiffness, str): 151 stiffness = [stiffness] 152 153 all_h5_paths = [] 154 for s in stiffness: 155 h5_out_dir = os.path.join(path, "processed", s) 156 157 # Process data if not already done 158 if not os.path.exists(h5_out_dir) or len(glob(os.path.join(h5_out_dir, "*.h5"))) == 0: 159 _create_bmgd_h5(path, s) 160 161 h5_paths = glob(os.path.join(h5_out_dir, "*.h5")) 162 all_h5_paths.extend(h5_paths) 163 164 assert len(all_h5_paths) > 0, f"No data found for stiffness '{stiffness}'" 165 166 return natsorted(all_h5_paths) 167 168 169def get_bmgd_dataset( 170 path: Union[os.PathLike, str], 171 patch_shape: Tuple[int, int], 172 stiffness: Optional[Union[str, List[str]]] = None, 173 download: bool = False, 174 **kwargs 175) -> Dataset: 176 """Get the BMGD dataset for nuclei segmentation. 177 178 Args: 179 path: Filepath to a folder where the downloaded data will be saved. 
180 patch_shape: The patch shape to use for training. 181 stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'. 182 If None, uses all stiffness levels. 183 download: Whether to download the data if it is not present. 184 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 185 186 Returns: 187 The segmentation dataset. 188 """ 189 h5_paths = get_bmgd_paths(path, stiffness, download) 190 191 kwargs, _ = util.add_instance_label_transform( 192 kwargs, add_binary_target=True, label_dtype=np.int64, 193 ) 194 195 return torch_em.default_segmentation_dataset( 196 raw_paths=h5_paths, 197 raw_key="raw", 198 label_paths=h5_paths, 199 label_key="labels/instances", 200 patch_shape=patch_shape, 201 ndim=2, 202 **kwargs 203 ) 204 205 206def get_bmgd_loader( 207 path: Union[os.PathLike, str], 208 batch_size: int, 209 patch_shape: Tuple[int, int], 210 stiffness: Optional[Union[str, List[str]]] = None, 211 download: bool = False, 212 **kwargs 213) -> DataLoader: 214 """Get the BMGD dataloader for nuclei segmentation. 215 216 Args: 217 path: Filepath to a folder where the downloaded data will be saved. 218 batch_size: The batch size for training. 219 patch_shape: The patch shape to use for training. 220 stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'. 221 If None, uses all stiffness levels. 222 download: Whether to download the data if it is not present. 223 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 224 225 Returns: 226 The DataLoader. 227 """ 228 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 229 dataset = get_bmgd_dataset( 230 path=path, 231 patch_shape=patch_shape, 232 stiffness=stiffness, 233 download=download, 234 **ds_kwargs, 235 ) 236 return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
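As the source above shows, the private helper _create_bmgd_h5 converts each semantic TIFF mask into connected-component instance labels and writes one HDF5 file per image with the keys raw, labels/instances and labels/semantic. A minimal sketch of inspecting one such processed file; the download folder and the concrete filename below are hypothetical examples, not fixed names:

import h5py

# Processed files are written under <path>/processed/<stiffness>/.
with h5py.File("./data/bmgd/processed/250pa/bmgd_250pa_example.h5", "r") as f:
    raw = f["raw"][:]                     # DAPI image patch
    instances = f["labels/instances"][:]  # instance labels from connected components (int64)
    semantic = f["labels/semantic"][:]    # binary foreground mask (uint8)
    print(raw.shape, instances.max(), semantic.dtype)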
def get_bmgd_data(
    path: Union[os.PathLike, str],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> str:
    """Download the BMGD dataset.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        stiffness: The stiffness level(s) to download. One of '250pa', '950pa', '1200pa', '1800pa'.
            If None, downloads all stiffness levels.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the dataset directory.
    """
    if stiffness is None:
        stiffness = STIFFNESS_LEVELS
    elif isinstance(stiffness, str):
        stiffness = [stiffness]

    for s in stiffness:
        assert s in STIFFNESS_LEVELS, f"'{s}' is not valid. Choose from {STIFFNESS_LEVELS}."

        folder_name = _FOLDER_NAMES[s]
        data_dir = os.path.join(path, folder_name)

        if os.path.exists(data_dir) and len(glob(os.path.join(data_dir, "image", "*.tif"))) > 0:
            continue

        os.makedirs(path, exist_ok=True)

        archive_path = os.path.join(path, f"{s}.7z")
        util.download_source(path=archive_path, url=URLS[s], download=download, checksum=None)

        # Extract 7z archive
        util.unzip(zip_path=archive_path, dst=path, remove=False)

    return path
Download the BMGD dataset.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- stiffness: The stiffness level(s) to download. One of '250pa', '950pa', '1200pa', '1800pa'. If None, downloads all stiffness levels.
- download: Whether to download the data if it is not present.
Returns:
The filepath to the dataset directory.
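For example, a minimal call that fetches only two of the stiffness subsets; the target folder "./data/bmgd" is a hypothetical choice:

from torch_em.data.datasets.light_microscopy.bmgd import get_bmgd_data

# Download and extract only the 250 Pa and 950 Pa archives.
root = get_bmgd_data("./data/bmgd", stiffness=["250pa", "950pa"], download=True)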
def get_bmgd_paths(
    path: Union[os.PathLike, str],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the BMGD data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        stiffness: The stiffness level(s). If None, uses all levels.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the processed h5 data.
    """
    from natsort import natsorted

    get_bmgd_data(path, stiffness, download)

    if stiffness is None:
        stiffness = STIFFNESS_LEVELS
    elif isinstance(stiffness, str):
        stiffness = [stiffness]

    all_h5_paths = []
    for s in stiffness:
        h5_out_dir = os.path.join(path, "processed", s)

        # Process data if not already done
        if not os.path.exists(h5_out_dir) or len(glob(os.path.join(h5_out_dir, "*.h5"))) == 0:
            _create_bmgd_h5(path, s)

        h5_paths = glob(os.path.join(h5_out_dir, "*.h5"))
        all_h5_paths.extend(h5_paths)

    assert len(all_h5_paths) > 0, f"No data found for stiffness '{stiffness}'"

    return natsorted(all_h5_paths)
Get paths to the BMGD data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- stiffness: The stiffness level(s). If None, uses all levels.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the processed h5 data.
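A short usage sketch, assuming a hypothetical download folder; each returned path points to a processed .h5 file containing the raw image and its label volumes:

from torch_em.data.datasets.light_microscopy.bmgd import get_bmgd_paths

h5_paths = get_bmgd_paths("./data/bmgd", stiffness="1200pa", download=True)
print(len(h5_paths), h5_paths[0])  # naturally sorted paths to the processed files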
def get_bmgd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BMGD dataset for nuclei segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'.
            If None, uses all stiffness levels.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    h5_paths = get_bmgd_paths(path, stiffness, download)

    kwargs, _ = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=np.int64,
    )

    return torch_em.default_segmentation_dataset(
        raw_paths=h5_paths,
        raw_key="raw",
        label_paths=h5_paths,
        label_key="labels/instances",
        patch_shape=patch_shape,
        ndim=2,
        **kwargs
    )
Get the BMGD dataset for nuclei segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'. If None, uses all stiffness levels.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
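A minimal sketch of constructing the dataset; the download folder and patch shape are example choices, and the label target includes a binary foreground channel because the source applies add_instance_label_transform with add_binary_target=True:

from torch_em.data.datasets.light_microscopy.bmgd import get_bmgd_dataset

dataset = get_bmgd_dataset(
    path="./data/bmgd",      # hypothetical download folder
    patch_shape=(512, 512),  # example 2d patch shape
    stiffness=None,          # None selects all four stiffness levels
    download=True,
)
raw, labels = dataset[0]     # one raw patch and its label target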
def get_bmgd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the BMGD dataloader for nuclei segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'.
            If None, uses all stiffness levels.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_bmgd_dataset(
        path=path,
        patch_shape=patch_shape,
        stiffness=stiffness,
        download=download,
        **ds_kwargs,
    )
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
Get the BMGD dataloader for nuclei segmentation.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'. If None, uses all stiffness levels.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.
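A minimal usage sketch; the download folder, patch shape and worker count are example values. Extra keyword arguments are split by util.split_kwargs, so DataLoader options such as num_workers can be passed directly alongside dataset options:

from torch_em.data.datasets.light_microscopy.bmgd import get_bmgd_loader

loader = get_bmgd_loader(
    path="./data/bmgd",      # hypothetical download folder
    batch_size=4,
    patch_shape=(512, 512),  # example 2d patch shape
    stiffness="950pa",       # restrict training to a single stiffness level
    download=True,
    num_workers=2,           # forwarded to the PyTorch DataLoader
)

for x, y in loader:
    print(x.shape, y.shape)  # raw batch and label batch
    break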