torch_em.data.datasets.light_microscopy.morphonet

The MorphoNet dataset contains 3D segmentation annotations for five organisms from the MorphoNet 2.0 publication.

The dataset provides 3D instance segmentation labels for cell and nucleus segmentation across five model organisms imaged with confocal and light-sheet microscopy:

Patiria miniata (starfish embryo, confocal, membrane/nuclei) from https://doi.org/10.1242/dev.202362.
Tribolium castaneum (beetle embryo, light-sheet, nuclei) from https://doi.org/10.1038/s41592-023-01879-y.
Arabidopsis thaliana (plant shoot apical meristem, confocal, membranes) from https://doi.org/10.1073/pnas.1616768113.
Caenorhabditis elegans (nematode embryo, confocal, nuclei) from https://doi.org/10.1038/s41592-023-01879-y.
Phallusia mammillata (ascidian embryo, light-sheet, membranes) from https://doi.org/10.1126/science.aar5663.

The dataset is located at https://doi.org/10.6084/m9.figshare.30529745.v2. This dataset is from the publication https://doi.org/10.7554/eLife.106227.2. Please cite it if you use this dataset in your research.

View Source

  1"""The MorphoNet dataset contains 3D segmentation annotations for five organisms
  2from the MorphoNet 2.0 publication.
  3
  4The dataset provides 3D instance segmentation labels for cell and nucleus segmentation
  5across five model organisms imaged with confocal and light-sheet microscopy:
  6- Patiria miniata (starfish embryo, confocal, membrane/nuclei) from https://doi.org/10.1242/dev.202362.
  7- Tribolium castaneum (beetle embryo, light-sheet, nuclei) from https://doi.org/10.1038/s41592-023-01879-y.
  8- Arabidopsis thaliana (plant shoot apical meristem, confocal, membranes) from https://doi.org/10.1073/pnas.1616768113.
  9- Caenorhabditis elegans (nematode embryo, confocal, nuclei) from https://doi.org/10.1038/s41592-023-01879-y.
 10- Phallusia mammillata (ascidian embryo, light-sheet, membranes) from https://doi.org/10.1126/science.aar5663.
 11
 12The dataset is located at https://doi.org/10.6084/m9.figshare.30529745.v2.
 13This dataset is from the publication https://doi.org/10.7554/eLife.106227.2.
 14Please cite it if you use this dataset in your research.
 15"""
 16
 17import os
 18from glob import glob
 19from typing import Union, Tuple, List, Optional
 20
 21from torch.utils.data import Dataset, DataLoader
 22
 23import torch_em
 24
 25from .. import util
 26
 27
 28URLS = {
 29    "patiria_miniata": "https://ndownloader.figshare.com/files/59296676",
 30    "tribolium_castaneum": "https://ndownloader.figshare.com/files/59296685",
 31    "arabidopsis_thaliana": "https://ndownloader.figshare.com/files/59296700",
 32    "caenorhabditis_elegans": "https://ndownloader.figshare.com/files/59296703",
 33    "phallusia_mammillata": "https://ndownloader.figshare.com/files/59296712",
 34}
 35
 36DIR_NAMES = {
 37    "patiria_miniata": "Patiria miniata",
 38    "tribolium_castaneum": "Tribolium castaneum",
 39    "arabidopsis_thaliana": "Arabidopsis thaliana",
 40    "caenorhabditis_elegans": "Caenorhabditis elegans",
 41    "phallusia_mammillata": "Phallusia mammillata",
 42}
 43
 44ORGANISMS = list(URLS.keys())
 45
 46
 47def _get_tif_files(directory):
 48    """Get all TIF/TIFF files from a directory."""
 49    files = glob(os.path.join(directory, "*.tif")) + glob(os.path.join(directory, "*.tiff"))
 50    # Exclude macOS metadata files.
 51    files = [f for f in files if not os.path.basename(f).startswith(".")]
 52    return files
 53
 54
 55def _match_raw_seg_files(raw_dir, seg_dir, organism):
 56    """Match RAW and SEG files for a given organism."""
 57    from natsort import natsorted
 58
 59    raw_files = natsorted(_get_tif_files(raw_dir))
 60    seg_files = natsorted(_get_tif_files(seg_dir))
 61
 62    # For Tribolium, filter out the empty channel 0 from RAW (only ch1 has nuclei).
 63    if organism == "tribolium_castaneum":
 64        raw_files = [f for f in raw_files if "ch0" not in os.path.basename(f)]
 65
 66    assert len(raw_files) > 0, f"No RAW files found in {raw_dir}"
 67    assert len(seg_files) > 0, f"No SEG files found in {seg_dir}"
 68    assert len(raw_files) == len(seg_files), (
 69        f"Mismatch for {organism}: {len(raw_files)} RAW files vs {len(seg_files)} SEG files"
 70    )
 71
 72    return list(zip(raw_files, seg_files))
 73
 74
 75def _create_h5_data(path, organism):
 76    """Create h5 files with raw images and instance segmentation labels."""
 77    import h5py
 78    import imageio.v3 as imageio
 79    from tqdm import tqdm
 80
 81    h5_dir = os.path.join(path, "h5_data", organism)
 82    os.makedirs(h5_dir, exist_ok=True)
 83
 84    org_dir = os.path.join(path, DIR_NAMES[organism])
 85    raw_dir = os.path.join(org_dir, "published", "RAW")
 86    seg_dir = os.path.join(org_dir, "published", "SEG")
 87
 88    pairs = _match_raw_seg_files(raw_dir, seg_dir, organism)
 89
 90    for i, (raw_path, seg_path) in enumerate(tqdm(pairs, desc=f"Creating h5 for {organism}")):
 91        h5_path = os.path.join(h5_dir, f"t{i:04d}.h5")
 92
 93        if os.path.exists(h5_path):
 94            continue
 95
 96        raw = imageio.imread(raw_path)
 97        seg = imageio.imread(seg_path)
 98
 99        with h5py.File(h5_path, "w") as f:
100            f.create_dataset("raw", data=raw, compression="gzip")
101            f.create_dataset("labels", data=seg.astype("int64"), compression="gzip")
102
103    return h5_dir
104
105
def get_morphonet_data(
    path: Union[os.PathLike, str],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> str:
    """Download the MorphoNet dataset.

    Organisms whose data folder already exists in `path` are skipped.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        organism: The organism(s) to download data for. If None, all organisms will be downloaded.
            Available organisms: patiria_miniata, tribolium_castaneum, arabidopsis_thaliana,
            caenorhabditis_elegans, phallusia_mammillata.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.

    Raises:
        AssertionError: If one of the requested organisms is not available.
    """
    if organism is None:
        organism = ORGANISMS
    elif isinstance(organism, str):
        organism = [organism]

    # Validate all requested organisms up front, so that an invalid name is reported
    # before any archive has been downloaded and unpacked.
    for org in organism:
        assert org in ORGANISMS, f"'{org}' is not a valid organism. Choose from {ORGANISMS}."

    for org in organism:
        org_dir = os.path.join(path, DIR_NAMES[org])
        if os.path.exists(org_dir):
            continue

        os.makedirs(path, exist_ok=True)
        zip_path = os.path.join(path, f"{org}.zip")
        util.download_source(path=zip_path, url=URLS[org], download=download, checksum=None)
        util.unzip(zip_path=zip_path, dst=path)

    return path
141
142
def get_morphonet_paths(
    path: Union[os.PathLike, str],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the MorphoNet data.

    The h5 files of an organism are created from the downloaded tif files if none exist yet.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data, in natural sort order.
    """
    from natsort import natsorted

    if isinstance(organism, str):
        organism = [organism]
    elif organism is None:
        organism = ORGANISMS

    get_morphonet_data(path, organism, download)

    all_h5_paths = []
    for org in organism:
        h5_pattern = os.path.join(path, "h5_data", org, "*.h5")

        # An empty match covers both a missing folder and a folder without h5 files.
        if not glob(h5_pattern):
            _create_h5_data(path, org)

        all_h5_paths += glob(h5_pattern)

    assert len(all_h5_paths) > 0, f"No data found for organism(s) '{organism}'"

    return natsorted(all_h5_paths)
179
180
def get_morphonet_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the MorphoNet dataset for 3D cell/nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_morphonet_paths(path, organism, download)

    # Let the shared dataset helpers update the keyword arguments: the instance label
    # transform is requested with a binary target, and the transforms are set up for 3D data.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    kwargs = util.ensure_transforms(ndim=3, **kwargs)

    # Raw data and labels are stored in the same h5 files, under different keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=3,
        **kwargs
    )
    return dataset
216
217
def get_morphonet_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the MorphoNet dataloader for 3D cell/nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the remaining ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    morphonet_dataset = get_morphonet_dataset(
        path=path, patch_shape=patch_shape, organism=organism, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset=morphonet_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

# Figshare download link of the zip archive for each supported organism.
URLS = {
    "patiria_miniata": "https://ndownloader.figshare.com/files/59296676",
    "tribolium_castaneum": "https://ndownloader.figshare.com/files/59296685",
    "arabidopsis_thaliana": "https://ndownloader.figshare.com/files/59296700",
    "caenorhabditis_elegans": "https://ndownloader.figshare.com/files/59296703",
    "phallusia_mammillata": "https://ndownloader.figshare.com/files/59296712",
}

# Name of the folder (inside the data directory) where the extracted data of each organism is expected.
DIR_NAMES = {
    "patiria_miniata": "Patiria miniata",
    "tribolium_castaneum": "Tribolium castaneum",
    "arabidopsis_thaliana": "Arabidopsis thaliana",
    "caenorhabditis_elegans": "Caenorhabditis elegans",
    "phallusia_mammillata": "Phallusia mammillata",
}

# Keys of all supported organisms, in the order in which they are listed in URLS.
ORGANISMS = list(URLS)

def get_morphonet_data( path: Union[os.PathLike, str], organism: Union[List[str], str, NoneType] = None, download: bool = False) -> str: View Source

def get_morphonet_data(
    path: Union[os.PathLike, str],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> str:
    """Download the MorphoNet dataset.

    Organisms whose data folder already exists in `path` are skipped.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        organism: The organism(s) to download data for. If None, all organisms will be downloaded.
            Available organisms: patiria_miniata, tribolium_castaneum, arabidopsis_thaliana,
            caenorhabditis_elegans, phallusia_mammillata.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath to the directory with the data.

    Raises:
        AssertionError: If one of the requested organisms is not available.
    """
    if organism is None:
        organism = ORGANISMS
    elif isinstance(organism, str):
        organism = [organism]

    # Validate all requested organisms up front, so that an invalid name is reported
    # before any archive has been downloaded and unpacked.
    for org in organism:
        assert org in ORGANISMS, f"'{org}' is not a valid organism. Choose from {ORGANISMS}."

    for org in organism:
        org_dir = os.path.join(path, DIR_NAMES[org])
        if os.path.exists(org_dir):
            continue

        os.makedirs(path, exist_ok=True)
        zip_path = os.path.join(path, f"{org}.zip")
        util.download_source(path=zip_path, url=URLS[org], download=download, checksum=None)
        util.unzip(zip_path=zip_path, dst=path)

    return path

Download the MorphoNet dataset.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
organism: The organism(s) to download data for. If None, all organisms will be downloaded. Available organisms: patiria_miniata, tribolium_castaneum, arabidopsis_thaliana, caenorhabditis_elegans, phallusia_mammillata.
download: Whether to download the data if it is not present.

Returns:

The filepath to the directory with the data.

def get_morphonet_paths( path: Union[os.PathLike, str], organism: Union[List[str], str, NoneType] = None, download: bool = False) -> List[str]: View Source

def get_morphonet_paths(
    path: Union[os.PathLike, str],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the MorphoNet data.

    The h5 files of an organism are created from the downloaded tif files if none exist yet.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the h5 data, in natural sort order.
    """
    from natsort import natsorted

    if isinstance(organism, str):
        organism = [organism]
    elif organism is None:
        organism = ORGANISMS

    get_morphonet_data(path, organism, download)

    all_h5_paths = []
    for org in organism:
        h5_pattern = os.path.join(path, "h5_data", org, "*.h5")

        # An empty match covers both a missing folder and a folder without h5 files.
        if not glob(h5_pattern):
            _create_h5_data(path, org)

        all_h5_paths += glob(h5_pattern)

    assert len(all_h5_paths) > 0, f"No data found for organism(s) '{organism}'"

    return natsorted(all_h5_paths)

Get paths to the MorphoNet data.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
organism: The organism(s) to use. If None, all organisms will be used.
download: Whether to download the data if it is not present.

Returns:

List of filepaths for the h5 data.

def get_morphonet_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int, int], organism: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset: View Source

def get_morphonet_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int, int],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> Dataset:
    """Get the MorphoNet dataset for 3D cell/nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    volume_paths = get_morphonet_paths(path, organism, download)

    # Let the shared dataset helpers update the keyword arguments: the instance label
    # transform is requested with a binary target, and the transforms are set up for 3D data.
    kwargs, _ = util.add_instance_label_transform(kwargs, add_binary_target=True)
    kwargs = util.ensure_transforms(ndim=3, **kwargs)

    # Raw data and labels are stored in the same h5 files, under different keys.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=volume_paths,
        raw_key="raw",
        label_paths=volume_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=3,
        **kwargs
    )
    return dataset

Get the MorphoNet dataset for 3D cell/nucleus segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
patch_shape: The patch shape to use for training.
organism: The organism(s) to use. If None, all organisms will be used.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.

Returns:

The segmentation dataset.

def get_morphonet_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int, int], organism: Union[List[str], str, NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader: View Source

def get_morphonet_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int, int],
    organism: Optional[Union[str, List[str]]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the MorphoNet dataloader for 3D cell/nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        organism: The organism(s) to use. If None, all organisms will be used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    # Separate the keyword arguments for the dataset from the remaining ones for the loader.
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    morphonet_dataset = get_morphonet_dataset(
        path=path, patch_shape=patch_shape, organism=organism, download=download, **dataset_kwargs
    )
    loader = torch_em.get_data_loader(dataset=morphonet_dataset, batch_size=batch_size, **dataloader_kwargs)
    return loader

Get the MorphoNet dataloader for 3D cell/nucleus segmentation.

Arguments:

path: Filepath to a folder where the downloaded data will be saved.
batch_size: The batch size for training.
patch_shape: The patch shape to use for training.
organism: The organism(s) to use. If None, all organisms will be used.
download: Whether to download the data if it is not present.
kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.

Returns:

The DataLoader.