torch_em.data.datasets.light_microscopy.bmgd

The BMGD (Breast Mammary Gland Dataset) contains DAPI-stained fluorescent microscopy images for nuclei segmentation in mammary gland tissue.

The dataset includes 819 image patches with over 9,500 manually segmented nuclei from mammary epithelial cells cultured under different microenvironmental stiffness conditions.

The dataset is from: https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD
Please cite the following paper if you use this dataset in your research: https://doi.org/10.21203/rs.3.rs-8263420/v1

View Source

  1"""The BMGD (Breast Mammary Gland Dataset) contains DAPI-stained fluorescent microscopy
  2images for nuclei segmentation in mammary gland tissue.
  3
  4The dataset includes 819 image patches with over 9,500 manually segmented nuclei
  5from mammary epithelial cells cultured under different microenvironmental stiffness conditions.
  6
  7The dataset is from: https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD
  8Please cite the following paper if you use this dataset in your research:
  9https://doi.org/10.21203/rs.3.rs-8263420/v1
 10"""
 11
 12import os
 13from glob import glob
 14from typing import Union, Tuple, List, Optional
 15
 16import numpy as np
 17
 18from torch.utils.data import Dataset, DataLoader
 19
 20import torch_em
 21
 22from .. import util
 23
 24
 25URLS = {
 26    "250pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/250%20Pa.7z",
 27    "950pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/950%20Pa.7z",
 28    "1200pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/1200%20Pa.7z",
 29    "1800pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/1800%20Pa.7z",
 30}
 31
 32# Folder names inside the archives (with spaces)
 33_FOLDER_NAMES = {
 34    "250pa": "250 Pa",
 35    "950pa": "950 Pa",
 36    "1200pa": "1200 Pa",
 37    "1800pa": "1800 Pa",
 38}
 39
 40STIFFNESS_LEVELS = list(URLS.keys())
 41
 42
 43def get_bmgd_data(
 44    path: Union[os.PathLike, str],
 45    stiffness: Optional[Union[str, List[str]]] = None,
 46    download: bool = False,
 47) -> str:
 48    """Download the BMGD dataset.
 49
 50    Args:
 51        path: Filepath to a folder where the downloaded data will be saved.
 52        stiffness: The stiffness level(s) to download. One of '250pa', '950pa', '1200pa', '1800pa'.
 53            If None, downloads all stiffness levels.
 54        download: Whether to download the data if it is not present.
 55
 56    Returns:
 57        The filepath to the dataset directory.
 58    """
 59    if stiffness is None:
 60        stiffness = STIFFNESS_LEVELS
 61    elif isinstance(stiffness, str):
 62        stiffness = [stiffness]
 63
 64    for s in stiffness:
 65        assert s in STIFFNESS_LEVELS, f"'{s}' is not valid. Choose from {STIFFNESS_LEVELS}."
 66
 67        folder_name = _FOLDER_NAMES[s]
 68        data_dir = os.path.join(path, folder_name)
 69
 70        if os.path.exists(data_dir) and len(glob(os.path.join(data_dir, "image", "*.tif"))) > 0:
 71            continue
 72
 73        os.makedirs(path, exist_ok=True)
 74
 75        archive_path = os.path.join(path, f"{s}.7z")
 76        util.download_source(path=archive_path, url=URLS[s], download=download, checksum=None)
 77
 78        # Extract 7z archive
 79        util.unzip(zip_path=archive_path, dst=path, remove=False)
 80
 81    return path
 82
 83
 84def _create_bmgd_h5(path, stiffness):
 85    """Create processed h5 files with instance labels from semantic masks."""
 86    import h5py
 87    from skimage.measure import label
 88    from tqdm import tqdm
 89    import tifffile
 90
 91    folder_name = _FOLDER_NAMES[stiffness]
 92    data_dir = os.path.join(path, folder_name)
 93    h5_out_dir = os.path.join(path, "processed", stiffness)
 94    os.makedirs(h5_out_dir, exist_ok=True)
 95
 96    images_dir = os.path.join(data_dir, "image")
 97    masks_dir = os.path.join(data_dir, "mask")
 98
 99    # Find all image files
100    image_files = sorted(glob(os.path.join(images_dir, "*.tif")))
101
102    for img_path in tqdm(image_files, desc=f"Processing BMGD {stiffness}"):
103        fname = os.path.basename(img_path)
104        mask_path = os.path.join(masks_dir, fname)
105
106        if not os.path.exists(mask_path):
107            continue
108
109        out_fname = f"bmgd_{stiffness}_{fname.replace('.tif', '.h5')}"
110        out_path = os.path.join(h5_out_dir, out_fname)
111
112        if os.path.exists(out_path):
113            continue
114
115        raw = tifffile.imread(img_path)
116        mask = tifffile.imread(mask_path)
117
118        # Convert semantic mask to instance labels using connected components
119        instances = label(mask > 0).astype("int64")
120
121        with h5py.File(out_path, "w") as f:
122            f.create_dataset("raw", data=raw, compression="gzip")
123            f.create_dataset("labels/instances", data=instances, compression="gzip")
124            f.create_dataset("labels/semantic", data=(mask > 0).astype("uint8"), compression="gzip")
125
126    return h5_out_dir
127
128
def get_bmgd_paths(
    path: Union[os.PathLike, str],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the BMGD data.

    The raw data is downloaded if necessary and converted to h5 files. The conversion is
    triggered again whenever a stiffness level has fewer h5 files than source images, so that
    a previously interrupted conversion is completed instead of being used as is.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        stiffness: The stiffness level(s). If None, uses all levels.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the processed h5 data.
    """
    from natsort import natsorted

    get_bmgd_data(path, stiffness, download)

    if stiffness is None:
        stiffness = STIFFNESS_LEVELS
    elif isinstance(stiffness, str):
        stiffness = [stiffness]

    all_h5_paths = []
    for s in stiffness:
        h5_pattern = os.path.join(path, "processed", s, "*.h5")

        # `_create_bmgd_h5` skips outputs that already exist, so calling it again only converts
        # what is missing. Images without a mask never get an h5 file; for such data the
        # (cheap) call is simply repeated. `max(..., 1)` keeps the conversion call for the
        # case that neither images nor h5 files are found.
        n_images = len(glob(os.path.join(path, _FOLDER_NAMES[s], "image", "*.tif")))
        if len(glob(h5_pattern)) < max(n_images, 1):
            _create_bmgd_h5(path, s)

        all_h5_paths.extend(glob(h5_pattern))

    assert len(all_h5_paths) > 0, f"No data found for stiffness '{stiffness}'"

    return natsorted(all_h5_paths)
167
168
def get_bmgd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BMGD dataset for nuclei segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'.
            If None, uses all stiffness levels.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Raw data and labels are stored in the same h5 files, under different keys.
    volume_paths = get_bmgd_paths(path, stiffness, download)

    # Only the updated keyword arguments are needed; the second return value is discarded.
    kwargs, _unused = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=np.int64,
    )

    fixed_arguments = dict(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels/instances",
        patch_shape=patch_shape,
        ndim=2,
    )
    return torch_em.default_segmentation_dataset(**fixed_arguments, **kwargs)
204
205
def get_bmgd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the BMGD dataloader for nuclei segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'.
            If None, uses all stiffness levels.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the arguments for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    return torch_em.get_data_loader(
        dataset=get_bmgd_dataset(
            path=path, patch_shape=patch_shape, stiffness=stiffness, download=download, **dataset_kwargs,
        ),
        batch_size=batch_size,
        **dataloader_kwargs,
    )

# Download location of the 7z archive of each stiffness level.
URLS = {
    "250pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/250%20Pa.7z",
    "950pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/950%20Pa.7z",
    "1200pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/1200%20Pa.7z",
    "1800pa": "https://github.com/zt089/Breast-Mammary-Gland-Dataset-BMGD/raw/main/1800%20Pa.7z",
}

# All stiffness levels that can be requested.
STIFFNESS_LEVELS = ["250pa", "950pa", "1200pa", "1800pa"]

def get_bmgd_data( path: Union[os.PathLike, str], stiffness: Union[List[str], str, NoneType] = None, download: bool = False) -> str: View Source

def get_bmgd_data(
    path: Union[os.PathLike, str],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> str:
    """Download the BMGD dataset.

    All requested stiffness levels are validated before anything is downloaded, so that an
    invalid entry does not leave a partially downloaded dataset behind.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        stiffness: The stiffness level(s) to download. One of '250pa', '950pa', '1200pa', '1800pa'.
            If None, downloads all stiffness levels.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the dataset directory.

    Raises:
        ValueError: If a requested stiffness level is not one of `STIFFNESS_LEVELS`.
    """
    if stiffness is None:
        stiffness = STIFFNESS_LEVELS
    elif isinstance(stiffness, str):
        stiffness = [stiffness]

    # Explicit check instead of `assert`, which is removed when Python runs with `-O`.
    invalid = [s for s in stiffness if s not in STIFFNESS_LEVELS]
    if invalid:
        raise ValueError(f"'{invalid[0]}' is not valid. Choose from {STIFFNESS_LEVELS}.")

    for s in stiffness:
        data_dir = os.path.join(path, _FOLDER_NAMES[s])

        # Nothing to do if the extracted images of this level are already on disk.
        # (The glob is empty when the folder does not exist.)
        if glob(os.path.join(data_dir, "image", "*.tif")):
            continue

        os.makedirs(path, exist_ok=True)

        archive_path = os.path.join(path, f"{s}.7z")
        util.download_source(path=archive_path, url=URLS[s], download=download, checksum=None)

        # Extract the archive next to it; the archive itself is kept.
        # NOTE(review): the archive is a 7z file - confirm that `util.unzip` supports this format.
        util.unzip(zip_path=archive_path, dst=path, remove=False)

    return path

Download the BMGD dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
stiffness: The stiffness level(s) to download. One of '250pa', '950pa', '1200pa', '1800pa'. If None, downloads all stiffness levels.
download: Whether to download the data if it is not present.

Returns:

The filepath to the dataset directory.

def get_bmgd_paths( path: Union[os.PathLike, str], stiffness: Union[List[str], str, NoneType] = None, download: bool = False) -> List[str]: View Source

def get_bmgd_paths(
    path: Union[os.PathLike, str],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the BMGD data.

    The raw data is downloaded if necessary and converted to h5 files. The conversion is
    triggered again whenever a stiffness level has fewer h5 files than source images, so that
    a previously interrupted conversion is completed instead of being used as is.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        stiffness: The stiffness level(s). If None, uses all levels.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the processed h5 data.
    """
    from natsort import natsorted

    get_bmgd_data(path, stiffness, download)

    if stiffness is None:
        stiffness = STIFFNESS_LEVELS
    elif isinstance(stiffness, str):
        stiffness = [stiffness]

    all_h5_paths = []
    for s in stiffness:
        h5_pattern = os.path.join(path, "processed", s, "*.h5")

        # `_create_bmgd_h5` skips outputs that already exist, so calling it again only converts
        # what is missing. Images without a mask never get an h5 file; for such data the
        # (cheap) call is simply repeated. `max(..., 1)` keeps the conversion call for the
        # case that neither images nor h5 files are found.
        n_images = len(glob(os.path.join(path, _FOLDER_NAMES[s], "image", "*.tif")))
        if len(glob(h5_pattern)) < max(n_images, 1):
            _create_bmgd_h5(path, s)

        all_h5_paths.extend(glob(h5_pattern))

    assert len(all_h5_paths) > 0, f"No data found for stiffness '{stiffness}'"

    return natsorted(all_h5_paths)

Get paths to the BMGD data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
stiffness: The stiffness level(s). If None, uses all levels.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the processed h5 data.

def get_bmgd_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], stiffness: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_bmgd_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the BMGD dataset for nuclei segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'.
            If None, uses all stiffness levels.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Raw data and labels are stored in the same h5 files, under different keys.
    volume_paths = get_bmgd_paths(path, stiffness, download)

    # Only the updated keyword arguments are needed; the second return value is discarded.
    kwargs, _unused = util.add_instance_label_transform(
        kwargs, add_binary_target=True, label_dtype=np.int64,
    )

    fixed_arguments = dict(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels/instances",
        patch_shape=patch_shape,
        ndim=2,
    )
    return torch_em.default_segmentation_dataset(**fixed_arguments, **kwargs)

Get the BMGD dataset for nuclei segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'. If None, uses all stiffness levels.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_bmgd_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], stiffness: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_bmgd_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    stiffness: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the BMGD dataloader for nuclei segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'.
            If None, uses all stiffness levels.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`
            or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments for the dataset from the arguments for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    return torch_em.get_data_loader(
        dataset=get_bmgd_dataset(
            path=path, patch_shape=patch_shape, stiffness=stiffness, download=download, **dataset_kwargs,
        ),
        batch_size=batch_size,
        **dataloader_kwargs,
    )

Get the BMGD dataloader for nuclei segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
stiffness: The stiffness level(s). One of '250pa', '950pa', '1200pa', '1800pa'. If None, uses all stiffness levels.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.