torch_em.data.datasets.light_microscopy.u20s

The U20S dataset contains annotations for nucleus segmentation in fluorescence microscopy images of U20S cells.

The dataset is hosted at https://bbbc.broadinstitute.org/BBBC039. This dataset is available as a BBBC collection, published by https://www.nature.com/articles/nmeth.2083. Please cite it if you use this dataset for your research.

  1"""The U20S dataset contains annotations for nucleus segmentation in
  2fluorescence microscopy images of U20S cells.
  3
  4The dataset is hosted at https://bbbc.broadinstitute.org/BBBC039.
  5This dataset is available as a BBBC collection, published by https://www.nature.com/articles/nmeth.2083.
  6Please cite it if you use this dataset for your research.
  7"""
  8
  9import os
 10import shutil
 11from glob import glob
 12from tqdm import tqdm
 13from pathlib import Path
 14from natsort import natsorted
 15from typing import List, Union, Tuple
 16
 17import imageio.v3 as imageio
 18from skimage.measure import label as connected_components
 19
 20from torch.utils.data import Dataset, DataLoader
 21
 22import torch_em
 23
 24from .. import util
 25
 26
# Download locations of the two BBBC039 archives, keyed by archive name.
# The keys are also used to name the downloaded zip files ("<key>.zip").
URLS = {
    "images": "https://data.broadinstitute.org/bbbc/BBBC039/images.zip",
    "masks": "https://data.broadinstitute.org/bbbc/BBBC039/masks.zip"
}

# Expected checksum of each archive, using the same keys as URLS. They are passed
# to `util.download_source` for verification.
# NOTE(review): 64 hex digits, so presumably SHA-256 - confirm against `util.download_source`.
CHECKSUMS = {
    "images": "6f30a5d4fe38c928ded972704f085975f8dc0d65d9aa366df00e5a9d449fddd7",
    "masks": "f9e6043d8ca56344a4886f96a700d804d6ee982f31e2b2cd3194af2a053c2710"
}
 36
 37
 38def _process_masks(path):
 39    label_dir = os.path.join(path, "labels")
 40    os.makedirs(label_dir)
 41
 42    for p in tqdm(glob(os.path.join(path, "masks", "*.png")), desc="Processing masks"):
 43        curr_mask = imageio.imread(p)
 44
 45        assert curr_mask.ndim == 3 and curr_mask.shape[-1] == 4  # Making the obvious assumption here.
 46
 47        # Choose the first channel and run cc.
 48        curr_mask = connected_components(curr_mask[:, :, 0])
 49
 50        # Store labels as tif now.
 51        imageio.imwrite(os.path.join(label_dir, f"{Path(p).stem}.tif"), curr_mask, compression="zlib")
 52
 53    # Remove the mask directory and random MAC cache files now.
 54    shutil.rmtree(os.path.join(path, "masks"))
 55    shutil.rmtree(os.path.join(path, "__MACOSX"))
 56
 57
 58def get_u20s_data(path: Union[os.PathLike, str], download: bool = False) -> str:
 59    """Download the U20S dataset.
 60
 61    Args:
 62        path: Filepath to a folder where the data is downloaded for further processing.
 63        download: Whether to download the data if it is not present.
 64
 65    Returns:
 66        The path where the dataset is downloaded for further processing.
 67    """
 68    label_dir = os.path.join(path, "labels")
 69    if os.path.exists(label_dir):
 70        return path
 71
 72    os.makedirs(path, exist_ok=True)
 73
 74    # Download the image and labels
 75    for name, url in URLS.items():
 76        zip_path = os.path.join(path, f"{name}.zip")
 77        util.download_source(path=zip_path, url=url, download=download, checksum=CHECKSUMS[name])
 78        util.unzip(zip_path, dst=path)
 79
 80    # Postprocess masks
 81    _process_masks(path)
 82
 83    return path
 84
 85
 86def get_u20s_paths(
 87    path: Union[os.PathLike, str], download: bool = False
 88) -> Tuple[List[str], List[str]]:
 89    """Get paths to the Usiigaci data.
 90
 91    Args:
 92        path: Filepath to a folder where the data is downloaded for further processing.
 93        download: Whether to download the data if it is not present.
 94
 95    Returns:
 96        List of filepaths for the image data.
 97        List of filepaths for the label data.
 98    """
 99    data_dir = get_u20s_data(path, download)
100
101    image_paths = natsorted(glob(os.path.join(data_dir, "images", "*.tif")))
102    label_paths = natsorted(glob(os.path.join(data_dir, "labels", "*.tif")))
103
104    return image_paths, label_paths
105
106
def get_u20s_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the U20S dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_u20s_paths(path, download)

    # The images and labels are plain 2d image files, hence no internal keys are passed.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_files, raw_key=None,
        label_paths=label_files, label_key=None,
        patch_shape=patch_shape, ndim=2, is_seg_dataset=False,
        **kwargs
    )
    return dataset
136
137
def get_u20s_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the U20S dataloader for nucleus segmentation.

    The keyword arguments are split into those accepted by
    `torch_em.default_segmentation_dataset` and the remaining ones, which are
    forwarded to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_u20s_dataset(path, patch_shape, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )
# Download locations of the two BBBC039 archives, keyed by archive name.
URLS = {'images': 'https://data.broadinstitute.org/bbbc/BBBC039/images.zip', 'masks': 'https://data.broadinstitute.org/bbbc/BBBC039/masks.zip'}
# Expected checksum of each archive, using the same keys as URLS.
CHECKSUMS = {'images': '6f30a5d4fe38c928ded972704f085975f8dc0d65d9aa366df00e5a9d449fddd7', 'masks': 'f9e6043d8ca56344a4886f96a700d804d6ee982f31e2b2cd3194af2a053c2710'}
def get_u20s_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_u20s_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make sure the U20S data is available in the given folder.

    If the `labels` subfolder already exists the data is considered to be prepared
    and nothing is done. Otherwise every archive listed in `URLS` is downloaded and
    extracted into `path`, and the masks are converted to label images.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        The path where the dataset is downloaded for further processing.
    """
    already_prepared = os.path.exists(os.path.join(path, "labels"))
    if not already_prepared:
        os.makedirs(path, exist_ok=True)

        # Fetch and extract the image and mask archives.
        for name in URLS:
            archive = os.path.join(path, f"{name}.zip")
            util.download_source(path=archive, url=URLS[name], download=download, checksum=CHECKSUMS[name])
            util.unzip(archive, dst=path)

        # Turn the extracted masks into label images.
        _process_masks(path)

    return path

Download the U20S dataset.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

The path where the dataset is downloaded for further processing.

def get_u20s_paths( path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
 87def get_u20s_paths(
 88    path: Union[os.PathLike, str], download: bool = False
 89) -> Tuple[List[str], List[str]]:
 90    """Get paths to the Usiigaci data.
 91
 92    Args:
 93        path: Filepath to a folder where the data is downloaded for further processing.
 94        download: Whether to download the data if it is not present.
 95
 96    Returns:
 97        List of filepaths for the image data.
 98        List of filepaths for the label data.
 99    """
100    data_dir = get_u20s_data(path, download)
101
102    image_paths = natsorted(glob(os.path.join(data_dir, "images", "*.tif")))
103    label_paths = natsorted(glob(os.path.join(data_dir, "labels", "*.tif")))
104
105    return image_paths, label_paths

Get paths to the U20S data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the image data. List of filepaths for the label data.

def get_u20s_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_u20s_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the U20S dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    raw_files, label_files = get_u20s_paths(path, download)

    # The images and labels are plain 2d image files, hence no internal keys are passed.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=raw_files, raw_key=None,
        label_paths=label_files, label_key=None,
        patch_shape=patch_shape, ndim=2, is_seg_dataset=False,
        **kwargs
    )
    return dataset

Get the U20S dataset for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.

def get_u20s_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_u20s_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the U20S dataloader for nucleus segmentation.

    The keyword arguments are split into those accepted by
    `torch_em.default_segmentation_dataset` and the remaining ones, which are
    forwarded to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_u20s_dataset(path, patch_shape, download, **dataset_kwargs),
        batch_size,
        **dataloader_kwargs
    )

Get the U20S dataloader for nucleus segmentation.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.