torch_em.data.datasets.light_microscopy.tissuenet

The TissueNet dataset contains annotations for cell and nucleus segmentation in microscopy images of different tissue types.

This dataset is from the publication https://doi.org/10.1038/s41587-021-01094-0. Please cite it if you use this dataset for your research.

This dataset cannot be downloaded automatically. Please visit https://datasets.deepcell.org/data and download it manually.

View Source

  1"""The TissueNet dataset contains annotations for cell and nucleus segmentation in microscopy images
  2of different tissue types.
  3
  4This dataset is from the publication https://doi.org/10.1038/s41587-021-01094-0.
  5Please cite it if you use this dataset for your research.
  6
  7This dataset cannot be downloaded automatically, please visit https://datasets.deepcell.org/data
  8and download it yourself.
  9"""
 10
 11import os
 12from glob import glob
 13from tqdm import tqdm
 14from typing import Tuple, Union, List, Literal
 15
 16import numpy as np
 17import pandas as pd
 18
 19from torch.utils.data import Dataset, DataLoader
 20
 21import torch_em
 22
 23from .. import util
 24
 25
 26def _create_split(path, split):
 27    import z5py
 28
 29    split_file = os.path.join(path, f"tissuenet_v1.1_{split}.npz")
 30    split_folder = os.path.join(path, split)
 31    os.makedirs(split_folder, exist_ok=True)
 32    data = np.load(split_file, allow_pickle=True)
 33
 34    x, y = data["X"], data["y"]
 35    metadata = data["meta"]
 36    metadata = pd.DataFrame(metadata[1:], columns=metadata[0])
 37
 38    for i, (im, label) in tqdm(enumerate(zip(x, y)), total=len(x), desc=f"Creating files for {split}-split"):
 39        out_path = os.path.join(split_folder, f"image_{i:04}.zarr")
 40        nucleus_channel = im[..., 0]
 41        cell_channel = im[..., 1]
 42        rgb = np.stack([cell_channel, nucleus_channel, np.zeros_like(nucleus_channel)])
 43        chunks = cell_channel.shape
 44        with z5py.File(out_path, "a") as f:
 45
 46            f.create_dataset("raw/nucleus", data=nucleus_channel, compression="gzip", chunks=chunks)
 47            f.create_dataset("raw/cell", data=cell_channel, compression="gzip", chunks=chunks)
 48            f.create_dataset("raw/rgb", data=rgb, compression="gzip", chunks=(3,) + chunks)
 49
 50            # the switch 0<->1 is intentional, the data format is chaotic...
 51            f.create_dataset("labels/nucleus", data=label[..., 1], compression="gzip", chunks=chunks)
 52            f.create_dataset("labels/cell", data=label[..., 0], compression="gzip", chunks=chunks)
 53
 54    os.remove(split_file)
 55
 56
 57def _create_dataset(path, zip_path):
 58    util.unzip(zip_path, path, remove=False)
 59    splits = ["train", "val", "test"]
 60    assert all([os.path.exists(os.path.join(path, f"tissuenet_v1.1_{split}.npz")) for split in splits])
 61    for split in splits:
 62        _create_split(path, split)
 63
 64
 65def get_tissuenet_data(
 66    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
 67) -> str:
 68    """Obtain the TissueNet dataset.
 69
 70    NOTE: Automatic download is not supported for TissueNet dataset.
 71    Please download the dataset from https://datasets.deepcell.org/data.
 72
 73    Args:
 74        path: Filepath to a folder where the manually downloaded data will be saved.
 75        split: The data split to use. Either 'train', 'val' or 'test'.
 76        download: Whether to download the data if it is not present.
 77
 78    Returns:
 79        The path where inputs are stored per split.
 80    """
 81    splits = ["train", "val", "test"]
 82    assert split in splits
 83
 84    # check if the dataset exists already
 85    zip_path = os.path.join(path, "tissuenet_v1.1.zip")
 86    if all([os.path.exists(os.path.join(path, split)) for split in splits]):  # yes it does
 87        pass
 88    elif os.path.exists(zip_path):  # no it does not, but we have the zip there and can unpack it
 89        _create_dataset(path, zip_path)
 90    else:
 91        raise RuntimeError(
 92            "We do not support automatic download for the tissuenet datasets yet."
 93            f"Please download the dataset from https://datasets.deepcell.org/data and put it here: {zip_path}"
 94        )
 95
 96    split_folder = os.path.join(path, split)
 97    return split_folder
 98
 99
def get_tissuenet_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the TissueNet data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Sorted list of filepaths to the zarr files of the requested split.
    """
    split_folder = get_tissuenet_data(path, split, download)
    assert os.path.exists(split_folder), f"The folder for the split does not exist: {split_folder}"

    # glob yields matches in arbitrary order; sorting makes the sample order reproducible.
    data_paths = sorted(glob(os.path.join(split_folder, "*.zarr")))
    assert len(data_paths) > 0, f"No zarr files found in {split_folder}"

    return data_paths
119
120
def get_tissuenet_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int],
    raw_channel: Literal["nucleus", "cell", "rgb"],
    label_channel: Literal["nucleus", "cell"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the TissueNet dataset for cell or nucleus segmentation in tissue microscopy images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
        label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert raw_channel in ("nucleus", "cell", "rgb")
    assert label_channel in ("nucleus", "cell")

    data_paths = get_tissuenet_paths(path, split, download)

    # Settings passed on for this dataset; only the 'rgb' raw data is treated as multi-channel.
    dataset_settings = {
        "with_channels": raw_channel == "rgb",
        "is_seg_dataset": True,
        "ndim": 2,
    }
    for setting, value in dataset_settings.items():
        kwargs = util.update_kwargs(kwargs, setting, value)

    # Raw data and labels are stored in the same zarr files under different keys.
    raw_key = f"raw/{raw_channel}"
    label_key = f"labels/{label_channel}"
    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key=raw_key,
        label_paths=data_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        **kwargs
    )
162
163
164# TODO enable loading specific tissue types etc. (from the 'meta' attributes)
def get_tissuenet_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int],
    batch_size: int,
    raw_channel: Literal["nucleus", "cell", "rgb"],
    label_channel: Literal["nucleus", "cell"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the TissueNet dataloader for cell or nucleus segmentation in tissue microscopy images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
        label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_tissuenet_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        raw_channel=raw_channel,
        label_channel=label_channel,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

def get_tissuenet_data( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> str: View Source

def get_tissuenet_data(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> str:
    """Obtain the TissueNet dataset.

    NOTE: Automatic download is not supported for TissueNet dataset.
    Please download the dataset from https://datasets.deepcell.org/data.

    If the folders for all three splits already exist in `path` nothing is done.
    Otherwise the file `tissuenet_v1.1.zip` is expected in `path` and is unpacked and converted.

    Args:
        path: Filepath to a folder where the manually downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.
            This flag is currently ignored because automatic download is not supported.

    Returns:
        The path where inputs are stored per split.

    Raises:
        RuntimeError: If neither the split folders nor the zip file are found in `path`.
    """
    splits = ["train", "val", "test"]
    assert split in splits, f"Invalid split '{split}', expected one of {splits}."

    zip_path = os.path.join(path, "tissuenet_v1.1.zip")

    # The dataset counts as prepared once the folders of all splits exist.
    is_prepared = all(os.path.exists(os.path.join(path, name)) for name in splits)
    if not is_prepared:
        if not os.path.exists(zip_path):
            # Trailing space keeps the two sentences separated in the concatenated message.
            raise RuntimeError(
                "We do not support automatic download for the tissuenet datasets yet. "
                f"Please download the dataset from https://datasets.deepcell.org/data and put it here: {zip_path}"
            )
        # The zip file is available, so it can be unpacked and converted.
        _create_dataset(path, zip_path)

    return os.path.join(path, split)

Obtain the TissueNet dataset.

NOTE: Automatic download is not supported for the TissueNet dataset. Please download the dataset manually from https://datasets.deepcell.org/data.

Arguments:

path: Filepath to a folder where the manually downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
download: Whether to download the data if it is not present. Automatic download is not supported for TissueNet, so this flag currently has no effect.

Returns:

The path where inputs are stored per split.

def get_tissuenet_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], download: bool = False) -> List[str]: View Source

def get_tissuenet_paths(
    path: Union[os.PathLike, str], split: Literal["train", "val", "test"], download: bool = False
) -> List[str]:
    """Get paths to the TissueNet data.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        download: Whether to download the data if it is not present.

    Returns:
        Sorted list of filepaths to the zarr files of the requested split.
    """
    split_folder = get_tissuenet_data(path, split, download)
    assert os.path.exists(split_folder), f"The folder for the split does not exist: {split_folder}"

    # glob yields matches in arbitrary order; sorting makes the sample order reproducible.
    data_paths = sorted(glob(os.path.join(split_folder, "*.zarr")))
    assert len(data_paths) > 0, f"No zarr files found in {split_folder}"

    return data_paths

Get paths to the TissueNet data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the data.

def get_tissuenet_dataset( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], patch_shape: Tuple[int, int], raw_channel: Literal['nucleus', 'cell', 'rgb'], label_channel: Literal['nucleus', 'cell'], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_tissuenet_dataset(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int],
    raw_channel: Literal["nucleus", "cell", "rgb"],
    label_channel: Literal["nucleus", "cell"],
    download: bool = False,
    **kwargs
) -> Dataset:
    """Build the TissueNet dataset for cell or nucleus segmentation in tissue microscopy images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
        label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    assert raw_channel in ("nucleus", "cell", "rgb")
    assert label_channel in ("nucleus", "cell")

    data_paths = get_tissuenet_paths(path, split, download)

    # Settings passed on for this dataset; only the 'rgb' raw data is treated as multi-channel.
    dataset_settings = {
        "with_channels": raw_channel == "rgb",
        "is_seg_dataset": True,
        "ndim": 2,
    }
    for setting, value in dataset_settings.items():
        kwargs = util.update_kwargs(kwargs, setting, value)

    # Raw data and labels are stored in the same zarr files under different keys.
    raw_key = f"raw/{raw_channel}"
    label_key = f"labels/{label_channel}"
    return torch_em.default_segmentation_dataset(
        raw_paths=data_paths,
        raw_key=raw_key,
        label_paths=data_paths,
        label_key=label_key,
        patch_shape=patch_shape,
        **kwargs
    )

Get the TissueNet dataset for segmenting cells and nucleus in microscopy tissue images.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
patch_shape: The patch shape to use for training.
raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_tissuenet_loader( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], patch_shape: Tuple[int, int], batch_size: int, raw_channel: Literal['nucleus', 'cell', 'rgb'], label_channel: Literal['nucleus', 'cell'], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_tissuenet_loader(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    patch_shape: Tuple[int, int],
    batch_size: int,
    raw_channel: Literal["nucleus", "cell", "rgb"],
    label_channel: Literal["nucleus", "cell"],
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Build the TissueNet dataloader for cell or nucleus segmentation in tissue microscopy images.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', 'val' or 'test'.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
        label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_tissuenet_dataset(
        path=path,
        split=split,
        patch_shape=patch_shape,
        raw_channel=raw_channel,
        label_channel=label_channel,
        download=download,
        **dataset_kwargs
    )
    return torch_em.get_data_loader(dataset, batch_size, **dataloader_kwargs)

Get the TissueNet dataloader for segmenting cells and nucleus in microscopy tissue images.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
split: The data split to use. Either 'train', 'val' or 'test'.
patch_shape: The patch shape to use for training.
batch_size: The batch size for training.
raw_channel: The channel to load for the raw data. Either 'nucleus', 'cell' or 'rgb'.
label_channel: The channel to load for the label data. Either 'nucleus' or 'cell'.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.