torch_em.data.datasets.light_microscopy.u20s
The U20S dataset contains annotations for nucleus segmentation in fluorescence microscopy images of U20S cells.
The dataset is hosted at https://bbbc.broadinstitute.org/BBBC039. This dataset is available as a BBBC collection, published by https://www.nature.com/articles/nmeth.2083. Please cite it if you use this dataset for your research.
"""The U20S dataset contains annotations for nucleus segmentation in
fluorescence microscopy images of U20S cells.

The dataset is hosted at https://bbbc.broadinstitute.org/BBBC039.
This dataset is available as a BBBC collection, published by https://www.nature.com/articles/nmeth.2083.
Please cite it if you use this dataset for your research.
"""

import os
import shutil
from glob import glob
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
from typing import List, Union, Tuple

import imageio.v3 as imageio
from skimage.measure import label as connected_components

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URLS = {
    "images": "https://data.broadinstitute.org/bbbc/BBBC039/images.zip",
    "masks": "https://data.broadinstitute.org/bbbc/BBBC039/masks.zip"
}

CHECKSUMS = {
    "images": "6f30a5d4fe38c928ded972704f085975f8dc0d65d9aa366df00e5a9d449fddd7",
    "masks": "f9e6043d8ca56344a4886f96a700d804d6ee982f31e2b2cd3194af2a053c2710"
}


def _process_masks(path):
    """Convert the extracted PNG masks into label images stored as tif.

    For every `*.png` file in `<path>/masks` the first channel is labeled with connected components
    and the result is written to `<path>/labels/<stem>.tif`. Afterwards the `masks` folder and the
    `__MACOSX` folder (if there is one) are removed.

    Args:
        path: Folder that contains the extracted `masks` directory.

    Raises:
        ValueError: If a mask is not a 3d array with 4 channels in the last axis.
    """
    label_dir = os.path.join(path, "labels")

    # `get_u20s_data` treats an existing `labels` folder as "data is ready". Hence the labels are
    # written to a scratch folder first and the final folder only appears once all masks are converted.
    tmp_label_dir = os.path.join(path, "labels_tmp")
    shutil.rmtree(tmp_label_dir, ignore_errors=True)  # Leftover from an earlier, interrupted run.
    os.makedirs(tmp_label_dir)

    for mask_path in tqdm(glob(os.path.join(path, "masks", "*.png")), desc="Processing masks"):
        curr_mask = imageio.imread(mask_path)

        # Explicit check instead of `assert`, which would be skipped when running with `-O`.
        if curr_mask.ndim != 3 or curr_mask.shape[-1] != 4:
            raise ValueError(
                f"Expected a mask with 4 channels in the last axis, got shape {curr_mask.shape} for '{mask_path}'."
            )

        # Choose the first channel and run cc.
        curr_labels = connected_components(curr_mask[:, :, 0])

        # Store labels as tif now.
        label_path = os.path.join(tmp_label_dir, f"{Path(mask_path).stem}.tif")
        imageio.imwrite(label_path, curr_labels, compression="zlib")

    os.rename(tmp_label_dir, label_dir)

    # Remove the mask directory and random MAC cache files now.
    shutil.rmtree(os.path.join(path, "masks"))
    shutil.rmtree(os.path.join(path, "__MACOSX"), ignore_errors=True)


def get_u20s_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Download the U20S dataset.

    Nothing is done if `path` already contains a `labels` folder.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        The path where the dataset is downloaded for further processing.
    """
    label_dir = os.path.join(path, "labels")
    if os.path.exists(label_dir):
        return path

    os.makedirs(path, exist_ok=True)

    # Download the image and labels
    for name, url in URLS.items():
        zip_path = os.path.join(path, f"{name}.zip")
        util.download_source(path=zip_path, url=url, download=download, checksum=CHECKSUMS[name])
        util.unzip(zip_path, dst=path)

    # Postprocess masks
    _process_masks(path)

    return path


def get_u20s_paths(
    path: Union[os.PathLike, str], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the U20S data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    data_dir = get_u20s_data(path, download)

    image_paths = natsorted(glob(os.path.join(data_dir, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(data_dir, "labels", "*.tif")))

    return image_paths, label_paths


def get_u20s_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the U20S dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, label_paths = get_u20s_paths(path, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        ndim=2,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )


def get_u20s_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the U20S dataloader for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_u20s_dataset(path, patch_shape, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URLS =
{'images': 'https://data.broadinstitute.org/bbbc/BBBC039/images.zip', 'masks': 'https://data.broadinstitute.org/bbbc/BBBC039/masks.zip'}
CHECKSUMS =
{'images': '6f30a5d4fe38c928ded972704f085975f8dc0d65d9aa366df00e5a9d449fddd7', 'masks': 'f9e6043d8ca56344a4886f96a700d804d6ee982f31e2b2cd3194af2a053c2710'}
def
get_u20s_data(path: Union[os.PathLike, str], download: bool = False) -> str:
def get_u20s_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """Make the U20S data available in the given folder.

    If the folder already holds a `labels` sub-folder, nothing is done. Otherwise the archives
    listed in `URLS` are fetched and extracted into `path` and the masks are converted
    with `_process_masks`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        The path where the dataset is downloaded for further processing.
    """
    # An existing label folder marks the data as already prepared.
    if os.path.exists(os.path.join(path, "labels")):
        return path

    os.makedirs(path, exist_ok=True)

    # Fetch and extract the archives with the images and the masks.
    for name in URLS:
        archive = os.path.join(path, f"{name}.zip")
        util.download_source(path=archive, url=URLS[name], download=download, checksum=CHECKSUMS[name])
        util.unzip(archive, dst=path)

    # Turn the extracted masks into label images.
    _process_masks(path)

    return path
Download the U20S dataset.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
The path where the dataset is downloaded for further processing.
def
get_u20s_paths( path: Union[os.PathLike, str], download: bool = False) -> Tuple[List[str], List[str]]:
def get_u20s_paths(
    path: Union[os.PathLike, str], download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the U20S data.

    The data is prepared via `get_u20s_data` first. The `*.tif` files in the `images` and `labels`
    sub-folders are then collected and each list is sorted with `natsorted`.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.
    """
    data_dir = get_u20s_data(path, download)

    image_paths = natsorted(glob(os.path.join(data_dir, "images", "*.tif")))
    label_paths = natsorted(glob(os.path.join(data_dir, "labels", "*.tif")))

    return image_paths, label_paths
Get paths to the U20S data.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the image data. List of filepaths for the label data.
def
get_u20s_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
def get_u20s_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the U20S dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, label_paths = get_u20s_paths(path, download)

    # The images and labels are passed as plain image files, hence no internal keys.
    return torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        ndim=2,
        patch_shape=patch_shape,
        is_seg_dataset=False,
        **kwargs
    )
Get the U20S dataset for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The segmentation dataset.
def
get_u20s_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
def get_u20s_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the dataloader for nucleus segmentation on the U20S data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments. They are split with `util.split_kwargs` into the ones for
            `torch_em.default_segmentation_dataset` and the remaining ones, which go to `torch_em.get_data_loader`.

    Returns:
        The DataLoader.
    """
    # Separate the dataset arguments from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    return torch_em.get_data_loader(
        get_u20s_dataset(path, patch_shape, download, **dataset_kwargs), batch_size, **dataloader_kwargs
    )
Get the U20S dataloader for nucleus segmentation.
Arguments:
- path: Filepath to a folder where the data is downloaded for further processing.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.