torch_em.data.datasets.light_microscopy.tissuenet
The TissueNet dataset contains annotations for cell and nucleus segmentation in microscopy images of different tissue types.
This dataset is from the publication https://doi.org/10.1038/s41587-021-01094-0. Please cite it if you use this dataset for your research.
This dataset cannot be downloaded automatically. Please visit https://datasets.deepcell.org/data and download it yourself.
"""The TissueNet dataset contains annotations for cell and nucleus segmentation in microscopy images
of different tissue types.

This dataset is from the publication https://doi.org/10.1038/s41587-021-01094-0.
Please cite it if you use this dataset for your research.

This dataset cannot be downloaded automatically. Please visit https://datasets.deepcell.org/data
and download it yourself.
"""

import os
from glob import glob
from tqdm import tqdm
from typing import Tuple, Union, List, Literal

import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


def _create_split(path, split):
    """Convert the npz archive of one split into one zarr file per image.

    Reads 'tissuenet_v1.1_{split}.npz' from `path`, writes the files 'image_XXXX.zarr'
    into the folder '{path}/{split}' and deletes the npz archive afterwards.
    Each zarr file contains the datasets 'raw/nucleus', 'raw/cell', 'raw/rgb',
    'labels/nucleus' and 'labels/cell'.

    Args:
        path: The folder that contains the npz archive of the split.
        split: The name of the split.
    """
    import z5py

    split_file = os.path.join(path, f"tissuenet_v1.1_{split}.npz")
    split_folder = os.path.join(path, split)
    os.makedirs(split_folder, exist_ok=True)

    # Read everything inside a context manager, so that the file handle of the archive
    # is released before the archive is deleted at the end of this function.
    # NOTE(review): allow_pickle=True unpickles data from the archive; only use this with
    # the archive downloaded from the official source.
    with np.load(split_file, allow_pickle=True) as data:
        x, y = data["X"], data["y"]
        metadata = data["meta"]

    # The first row of 'meta' holds the column names.
    # The metadata is parsed but not stored yet, see the TODO at `get_tissuenet_loader`.
    metadata = pd.DataFrame(metadata[1:], columns=metadata[0])  # noqa: F841

    for i, (im, label) in tqdm(enumerate(zip(x, y)), total=len(x), desc=f"Creating files for {split}-split"):
        out_path = os.path.join(split_folder, f"image_{i:04}.zarr")
        nucleus_channel = im[..., 0]
        cell_channel = im[..., 1]
        # Channel-first stack of (cell, nucleus, zeros).
        rgb = np.stack([cell_channel, nucleus_channel, np.zeros_like(nucleus_channel)])
        # Each image is stored as a single chunk.
        chunks = cell_channel.shape
        with z5py.File(out_path, "a") as f:

            f.create_dataset("raw/nucleus", data=nucleus_channel, compression="gzip", chunks=chunks)
            f.create_dataset("raw/cell", data=cell_channel, compression="gzip", chunks=chunks)
            f.create_dataset("raw/rgb", data=rgb, compression="gzip", chunks=(3,) + chunks)

            # the switch 0<->1 is intentional, the data format is chaotic...
            f.create_dataset("labels/nucleus", data=label[..., 1], compression="gzip", chunks=chunks)
            f.create_dataset("labels/cell", data=label[..., 0], compression="gzip", chunks=chunks)

    os.remove(split_file)


def _create_dataset(path, zip_path):
    """Unpack the downloaded zip file and convert the train, val and test split.

    Args:
        path: The folder where the data is extracted to.
        zip_path: The path to the downloaded zip file. It is kept after unpacking.
    """
    util.unzip(zip_path, path, remove=False)
    splits = ["train", "val", "test"]
    assert all(os.path.exists(os.path.join(path, f"tissuenet_v1.1_{split}.npz")) for split in splits)
    for split in splits:
        _create_split(path, split)


def get_tissuenet_data(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Obtain the TissueNet dataset.

    NOTE: Automatic download is not supported for the TissueNet dataset.
    Please download the dataset from https://datasets.deepcell.org/data.

    Args:
        path: Filepath to the folder that contains the manually downloaded 'tissuenet_v1.1.zip'
            or the already converted split folders.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Not used by this function; automatic download is not supported.

    Returns:
        The path where inputs are stored per split.

    Raises:
        RuntimeError: If neither the split folders nor the zip file exist in `path`.
    """
    splits = ["train", "val", "test"]
    assert split in splits, f"Invalid split '{split}', expected one of {splits}."

    zip_path = os.path.join(path, "tissuenet_v1.1.zip")
    # The dataset counts as available if the folders for all three splits exist.
    dataset_exists = all(os.path.exists(os.path.join(path, name)) for name in splits)
    if not dataset_exists:
        if os.path.exists(zip_path):  # The zip file is there and can be unpacked.
            _create_dataset(path, zip_path)
        else:
            raise RuntimeError(
                "We do not support automatic download for the tissuenet datasets yet. "
                f"Please download the dataset from https://datasets.deepcell.org/data and put it here: {zip_path}"
            )

    split_folder = os.path.join(path, split)
    return split_folder


def get_tissuenet_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the TissueNet data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Passed on to `get_tissuenet_data`.

    Returns:
        Sorted list of filepaths for the data.
    """
    split_folder = get_tissuenet_data(path, split, download)
    assert os.path.exists(split_folder), f"The split folder {split_folder} does not exist."
    # glob returns the files in arbitrary order, sort them to get a reproducible result.
    data_paths = sorted(glob(os.path.join(split_folder, "*.zarr")))
    assert len(data_paths) > 0, f"Did not find any zarr files in {split_folder}."

    return data_paths


def get_tissuenet_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int],
    raw_channel: Literal["nucleus", "cell", "rgb"],
    label_channel: Literal["nucleus", "cell"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the TissueNet dataset for segmenting cells and nuclei in microscopy tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
        label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
        download: Passed on to `get_tissuenet_paths`.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert raw_channel in ("nucleus", "cell", "rgb")
    assert label_channel in ("nucleus", "cell")

    data_paths = get_tissuenet_paths(path, split, download)

    # Only 'raw/rgb' is stored with a leading channel axis.
    with_channels = raw_channel == "rgb"
    kwargs = util.update_kwargs(kwargs, "with_channels", with_channels)
    kwargs = util.update_kwargs(kwargs, "is_seg_dataset", True)
    kwargs = util.update_kwargs(kwargs, "ndim", 2)

    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key=f"raw/{raw_channel}",
        label_paths=data_paths,
        label_key=f"labels/{label_channel}",
        patch_shape=patch_shape,
        **kwargs
    )


# TODO enable loading specific tissue types etc. (from the 'meta' attributes)
def get_tissuenet_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int],
    batch_size: int,
    raw_channel: Literal["nucleus", "cell", "rgb"],
    label_channel: Literal["nucleus", "cell"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the TissueNet dataloader for segmenting cells and nuclei in microscopy tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
        label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
        download: Passed on to `get_tissuenet_dataset`.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_tissuenet_dataset(path, split, patch_shape, raw_channel, label_channel, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
def get_tissuenet_data(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Obtain the TissueNet dataset.

    NOTE: Automatic download is not supported for the TissueNet dataset.
    Please download the dataset from https://datasets.deepcell.org/data.

    Args:
        path: Filepath to the folder that contains the manually downloaded 'tissuenet_v1.1.zip'
            or the already converted split folders.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Not used by this function; automatic download is not supported.

    Returns:
        The path where inputs are stored per split.

    Raises:
        RuntimeError: If neither the split folders nor the zip file exist in `path`.
    """
    splits = ["train", "val", "test"]
    assert split in splits, f"Invalid split '{split}', expected one of {splits}."

    zip_path = os.path.join(path, "tissuenet_v1.1.zip")
    # The dataset counts as available if the folders for all three splits exist.
    dataset_exists = all(os.path.exists(os.path.join(path, name)) for name in splits)
    if not dataset_exists:
        if os.path.exists(zip_path):  # The zip file is there and can be unpacked.
            _create_dataset(path, zip_path)
        else:
            # The trailing space keeps the two sentences of the message separated.
            raise RuntimeError(
                "We do not support automatic download for the tissuenet datasets yet. "
                f"Please download the dataset from https://datasets.deepcell.org/data and put it here: {zip_path}"
            )

    split_folder = os.path.join(path, split)
    return split_folder
Obtain the TissueNet dataset.
NOTE: Automatic download is not supported for the TissueNet dataset. Please download the dataset from https://datasets.deepcell.org/data.
Arguments:
- path: Filepath to a folder where the manually downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
The path where inputs are stored per split.
def get_tissuenet_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the TissueNet data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Passed on to `get_tissuenet_data`.

    Returns:
        Sorted list of filepaths for the data.
    """
    split_folder = get_tissuenet_data(path, split, download)
    assert os.path.exists(split_folder), f"The split folder {split_folder} does not exist."
    # glob returns the files in arbitrary order, sort them to get a reproducible result.
    data_paths = sorted(glob(os.path.join(split_folder, "*.zarr")))
    assert len(data_paths) > 0, f"Did not find any zarr files in {split_folder}."

    return data_paths
Get paths to the TissueNet data.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the data.
def get_tissuenet_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int],
    raw_channel: Literal["nucleus", "cell", "rgb"],
    label_channel: Literal["nucleus", "cell"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Create the TissueNet segmentation dataset for cells or nuclei in microscopy tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
        label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
        download: Passed on to `get_tissuenet_paths`.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert raw_channel in ("nucleus", "cell", "rgb")
    assert label_channel in ("nucleus", "cell")

    # Raw data and labels are read from the same files, just with different keys.
    file_paths = get_tissuenet_paths(path, split, download)

    # Settings that are fixed for this dataset; applied in the same order as before.
    fixed_settings = (
        ("with_channels", raw_channel == "rgb"),
        ("is_seg_dataset", True),
        ("ndim", 2),
    )
    for setting_name, setting_value in fixed_settings:
        kwargs = util.update_kwargs(kwargs, setting_name, setting_value)

    raw_key = f"raw/{raw_channel}"
    label_key = f"labels/{label_channel}"
    return torch_em.default_segmentation_dataset(
        raw_paths=file_paths,
        raw_key=raw_key,
        label_paths=file_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        **kwargs
    )
Get the TissueNet dataset for segmenting cells and nuclei in microscopy tissue images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
- label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
Returns:
The segmentation dataset.
def get_tissuenet_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int],
    batch_size: int,
    raw_channel: Literal["nucleus", "cell", "rgb"],
    label_channel: Literal["nucleus", "cell"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Create the TissueNet dataloader for segmenting cells or nuclei in microscopy tissue images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
        label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
        download: Passed on to `get_tissuenet_dataset`.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    tissuenet_dataset = get_tissuenet_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        raw_channel=raw_channel,
        label_channel=label_channel,
        download=download,
        **dataset_kwargs
    )
    loader = torch_em.get_data_loader(tissuenet_dataset, batch_size, **dataloader_kwargs)
    return loader
Get the TissueNet dataloader for segmenting cells and nuclei in microscopy tissue images.
Arguments:
- path: Filepath to a folder where the downloaded data will be saved.
- split: The data split to use. Either 'train', 'val' or 'test'.
- patch_shape: The patch shape to use for training.
- batch_size: The batch size for training.
- raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
- label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
Returns:
The DataLoader.