torch_em.data.datasets.light_microscopy.orgline
The OrgLine dataset contains organoid images and associated segmentation masks.
The organoids come from different organs and were assembled from different prior publications. Specifically:
- Intestine: from OrgaQuant (https://doi.org/10.1038/s41598-019-48874-y) and from OrgaSegment (https://doi.org/10.1038/s42003-024-05966-4)
- Brain: from https://doi.org/10.1038/s41597-024-03330-z
- Colon: from OrgaExtractor (https://doi.org/10.1038/s41598-023-46485-2)
- PDAC: from OrganoID (https://doi.org/10.1371/journal.pcbi.1010584) and from OrganoidNet (https://doi.org/10.1007/s13402-024-00958-2)
- Stomach: from https://zenodo.org/records/18447547
- Breast: from https://zenodo.org/records/18447547
Please cite the associated zenodo entry (https://zenodo.org/records/16355179) and the relevant original publications if you use this dataset for your research.
"""The OrgLine dataset contains organoid images and associated segmentation masks.

The organoids come from different organs and were assembled from different prior publications.
Specifically:
- Intestine: from OrgaQuant (https://doi.org/10.1038/s41598-019-48874-y)
  and from OrgaSegment (https://doi.org/10.1038/s42003-024-05966-4)
- Brain: from https://doi.org/10.1038/s41597-024-03330-z
- Colon: from OrgaExtractor (https://doi.org/10.1038/s41598-023-46485-2)
- PDAC: from OrganoID (https://doi.org/10.1371/journal.pcbi.1010584)
  and from OrganoidNet (https://doi.org/10.1007/s13402-024-00958-2)
- Stomach: from https://zenodo.org/records/18447547
- Breast: from https://zenodo.org/records/18447547

Please cite the associated zenodo entry (https://zenodo.org/records/16355179) and the relevant original publications
if you use this dataset for your research.
"""

import os
import shutil
from glob import glob
from typing import Union, Tuple, List, Literal, Optional, Sequence

import h5py
import imageio.v3 as imageio
import numpy as np

from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

import torch_em

from .. import util

# pycocotools is an optional dependency; it is only needed to rasterize the
# COCO annotations of the stomach / breast data in '_prepare_data'.
try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None


URL1 = "https://zenodo.org/records/16355179/files/InstanceSeg.zip?download=1"
URL2 = "https://zenodo.org/records/18447547/files/data.zip?download=1"

CHECKSUM1 = "6787dc47ee5f800e7ecf4a51d958fc88591c877ca7f8f03c2aa3e7fa7c4aca50"
CHECKSUM2 = "8b5984ee19232c06cdf5366080a3f3b27fb2109f38a2a345316e22dd2bb9a1c2"

# Organs shipped in the first zenodo record (already split into train/val/test)
# and in the second one (COCO annotations, split on the fly).
ORGANS1 = ("PDAC", "colon", "Intestine", "brain")
ORGANS2 = ("stomach", "breast")


def _annotations_to_instances(coco, image_metadata):
    """Rasterize the COCO annotations of a single image into an instance segmentation.

    Annotations are painted in order of increasing area, so larger organoids
    overwrite smaller ones where they overlap. Small left-over fragments from
    this overwriting are then filtered out.
    """
    from skimage.measure import label
    from skimage.segmentation import relabel_sequential

    # Load all annotations for this image; the instance ids must fit into uint16.
    annotation_ids = coco.getAnnIds(imgIds=image_metadata["id"])
    annotations = coco.loadAnns(annotation_ids)
    assert len(annotations) <= np.iinfo("uint16").max
    shape = (image_metadata["height"], image_metadata["width"])
    seg = np.zeros(shape, dtype="uint32")

    # Sort the annotations by ascending area.
    sizes = [ann["area"] for ann in annotations]
    sorting = np.argsort(sizes)
    annotations = [annotations[i] for i in sorting]

    for seg_id, annotation in enumerate(annotations, 1):
        mask = coco.annToMask(annotation).astype("bool")
        assert mask.shape == seg.shape
        seg[mask] = seg_id

    # Filter out small pieces from pasting organoids on top of each other.
    min_size = 25
    seg = label(seg)
    seg_ids, sizes = np.unique(seg, return_counts=True)
    seg[np.isin(seg, seg_ids[sizes < min_size])] = 0
    seg, _, _ = relabel_sequential(seg)

    return seg.astype("uint16")


def _convert_presplit_data(data_dir):
    """Convert the pre-split image/mask folders of ORGANS1 to per-sample h5 files."""
    for org in ORGANS1:
        input_root, output_root = os.path.join(data_dir, "InstanceSeg", org), os.path.join(data_dir, org)
        for split in ("train", "val", "test"):
            images = sorted(glob(os.path.join(input_root, split, "images", "*")))
            masks = sorted(glob(os.path.join(input_root, split, "masks", "*")))
            # Skip a split with mismatching images and masks (best effort,
            # this should not happen for an intact download).
            if len(images) != len(masks):
                continue
            output_folder = os.path.join(output_root, split)
            os.makedirs(output_folder, exist_ok=True)
            for im_path, mask_path in tqdm(
                zip(images, masks), total=len(images), desc=f"Converting {org}, {split}-split"
            ):
                im = imageio.imread(im_path)
                mask = np.load(mask_path) if mask_path.endswith(".npy") else imageio.imread(mask_path)
                if im.ndim == 3:  # Keep only the first channel of RGB images.
                    im = im[..., 0]
                assert im.shape == mask.shape
                out_path = os.path.join(output_folder, f"{os.path.basename(im_path)}.h5")
                with h5py.File(out_path, mode="w") as f:
                    f.create_dataset("image", data=im, compression="gzip")
                    f.create_dataset("masks", data=mask, compression="gzip")
    shutil.rmtree(os.path.join(data_dir, "InstanceSeg"))


def _convert_coco_data(data_dir):
    """Convert the COCO-annotated data of ORGANS2 to per-sample h5 files with splits."""
    if COCO is None:
        raise ModuleNotFoundError(
            "'pycocotools' is required for processing the OrgLine ground-truth. "
            "Install it with 'conda install -c conda-forge pycocotools'."
        )
    for org in ORGANS2:
        input_root, output_root = os.path.join(data_dir, org), os.path.join(data_dir, org)
        coco_file = os.path.join(input_root, "coco.json")
        coco = COCO(coco_file)

        image_ids = coco.getImgIds()
        # Create the splits: 80% train, then 40% / 60% of the remainder for test / val.
        train_ids, test_ids = train_test_split(image_ids, test_size=0.2, random_state=42)
        test_ids, val_ids = train_test_split(test_ids, test_size=0.6, random_state=42)
        # Sets for O(1) membership tests in the conversion loop below.
        train_ids, val_ids = set(train_ids), set(val_ids)
        train_out, val_out = os.path.join(output_root, "train"), os.path.join(output_root, "val")
        test_out = os.path.join(output_root, "test")
        os.makedirs(train_out, exist_ok=True)
        os.makedirs(val_out, exist_ok=True)
        os.makedirs(test_out, exist_ok=True)

        for image_id in tqdm(image_ids, desc=f"Converting {org}"):
            image_metadata = coco.loadImgs(image_id)[0]
            file_name = image_metadata["file_name"]
            image_path = os.path.join(input_root, file_name)
            im = imageio.imread(image_path)
            if im.ndim == 3:  # Convert RGB(A) images to grayscale by averaging RGB.
                im = np.mean(im[..., :3], axis=-1)
            mask = _annotations_to_instances(coco, image_metadata)
            assert im.shape == mask.shape
            if image_id in train_ids:
                output_folder = train_out
            elif image_id in val_ids:
                output_folder = val_out
            else:
                output_folder = test_out
            out_path = os.path.join(output_folder, f"{os.path.basename(image_path)}.h5")
            with h5py.File(out_path, mode="w") as f:
                f.create_dataset("image", data=im, compression="gzip")
                f.create_dataset("masks", data=mask, compression="gzip")

        # Clean up the raw images and annotation files.
        shutil.rmtree(os.path.join(input_root, "images"))
        json_files = glob(os.path.join(input_root, "*.json"))
        for json_file in json_files:
            os.remove(json_file)


def _prepare_data(data_dir, organ):
    """Convert the downloaded raw data into the per-organ h5 layout expected by the dataset."""
    if organ in ORGANS1:
        _convert_presplit_data(data_dir)
    else:
        _convert_coco_data(data_dir)


def get_orgline_data(path: Union[os.PathLike, str], organ: str, download: bool = False) -> str:
    """Download the OrgLine dataset.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        organ: The organ from which the organoids are derived.
        download: Whether to download the data if it is not present.

    Returns:
        The filepath where the data is downloaded.

    Raises:
        ValueError: If `organ` is not one of the supported organs.
    """
    if organ in ORGANS1:
        url, checksum = URL1, CHECKSUM1
        data_folder = "data1"
    elif organ in ORGANS2:
        url, checksum = URL2, CHECKSUM2
        data_folder = "data2"
    else:
        raise ValueError(f"Invalid organ: {organ}. Must be one of {ORGANS1 + ORGANS2}.")

    data_dir = os.path.join(path, data_folder)
    if os.path.exists(data_dir):  # Data was already downloaded and prepared.
        return data_dir

    os.makedirs(data_dir, exist_ok=True)
    zip_path = os.path.join(data_dir, "data.zip")
    util.download_source(path=zip_path, url=url, download=download, checksum=checksum)
    util.unzip(zip_path=zip_path, dst=data_dir, remove=True)
    _prepare_data(data_dir, organ)
    return data_dir


def get_orgline_paths(
    path: Union[os.PathLike, str],
    split: Literal["train", "val", "test"],
    organs: Optional[Union[str, Sequence[str]]] = None,
    download: bool = False,
) -> List[str]:
    """Get paths to the OrgLine data.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        split: The data split to use.
        organs: The organ(s) from which the organoids are derived.
            By default, data for all organs is returned.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    if isinstance(organs, str):
        organs = [organs]
    elif organs is None:
        organs = ORGANS1 + ORGANS2
    paths = []
    for organ in organs:
        data_dir = get_orgline_data(path, organ, download)
        this_paths = sorted(glob(os.path.join(data_dir, organ, split, "*.h5")))
        paths.extend(this_paths)
    return paths


def get_orgline_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    organs: Optional[Union[str, Sequence[str]]] = None,
    download: bool = False,
    **kwargs,
) -> Dataset:
    """Get OrgLine dataset for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        organs: The organ(s) from which the organoids are derived.
            By default, data for all organs is used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    paths = get_orgline_paths(path, split, organs, download)
    return torch_em.default_segmentation_dataset(
        raw_paths=paths,
        raw_key="image",
        label_paths=paths,
        label_key="masks",
        is_seg_dataset=True,
        patch_shape=patch_shape,
        ndim=2,
        **kwargs
    )


def get_orgline_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, int],
    split: Literal["train", "val", "test"],
    organs: Optional[Union[str, Sequence[str]]] = None,
    download: bool = False,
    **kwargs,
) -> DataLoader:
    """Get OrgLine dataloader for organoid segmentation in brightfield microscopy images.

    Args:
        path: Filepath to the folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use.
        organs: The organ(s) from which the organoids are derived.
            By default, data for all organs is used.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_orgline_dataset(path, patch_shape, split=split, organs=organs, download=download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
URL1 =
'https://zenodo.org/records/16355179/files/InstanceSeg.zip?download=1'
URL2 =
'https://zenodo.org/records/18447547/files/data.zip?download=1'
CHECKSUM1 =
'6787dc47ee5f800e7ecf4a51d958fc88591c877ca7f8f03c2aa3e7fa7c4aca50'
CHECKSUM2 =
'8b5984ee19232c06cdf5366080a3f3b27fb2109f38a2a345316e22dd2bb9a1c2'
ORGANS1 =
('PDAC', 'colon', 'Intestine', 'brain')
ORGANS2 =
('stomach', 'breast')
def
get_orgline_data(path: Union[os.PathLike, str], organ: str, download: bool = False) -> str:
162def get_orgline_data(path: Union[os.PathLike, str], organ: str, download: bool = False) -> str: 163 """Download the OrgLine dataset. 164 165 Args: 166 path: Filepath to the folder where the downloaded data will be saved. 167 organ: The organ from which the organoids are derived. 168 download: Whether to download the data if it is not present. 169 170 Returns: 171 The filepath where the data is downloaded. 172 """ 173 if organ in ORGANS1: 174 url, checksum = URL1, CHECKSUM1 175 data_folder = "data1" 176 elif organ in ORGANS2: 177 url, checksum = URL2, CHECKSUM2 178 data_folder = "data2" 179 else: 180 raise ValueError(f"Invalid organ: {organ}. Must be one of {ORGANS1 + ORGANS2}.") 181 182 data_dir = os.path.join(path, data_folder) 183 if os.path.exists(data_dir): 184 return data_dir 185 186 os.makedirs(data_dir, exist_ok=True) 187 zip_path = os.path.join(data_dir, "data.zip") 188 util.download_source(path=zip_path, url=url, download=download, checksum=checksum) 189 util.unzip(zip_path=zip_path, dst=data_dir, remove=True) 190 _prepare_data(data_dir, organ) 191 return data_dir
Download the OrgLine dataset.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- organ: The organ from which the organoids are derived.
- download: Whether to download the data if it is not present.
Returns:
The filepath where the data is downloaded.
def
get_orgline_paths( path: Union[os.PathLike, str], split: Literal['train', 'val', 'test'], organs: Union[str, Sequence[str], NoneType] = None, download: bool = False) -> List[str]:
194def get_orgline_paths( 195 path: Union[os.PathLike, str], 196 split: Literal["train", "val", "test"], 197 organs: Optional[Union[str, Sequence[str]]] = None, 198 download: bool = False, 199) -> List[str]: 200 """Get paths to the OrgLine data. 201 202 Args: 203 path: Filepath to the folder where the downloaded data will be saved. 204 organ: . 205 split: The data split to use. 206 download: Whether to download the data if it is not present. 207 208 Returns: 209 List of filepaths for the input data. 210 """ 211 if isinstance(organs, str): 212 organs = [organs] 213 elif organs is None: 214 organs = ORGANS1 + ORGANS2 215 paths = [] 216 for organ in organs: 217 data_dir = get_orgline_data(path, organ, download) 218 this_paths = sorted(glob(os.path.join(data_dir, organ, split, "*.h5"))) 219 paths.extend(this_paths) 220 return paths
Get paths to the OrgLine data.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- organs: The organ(s) from which the organoids are derived; by default, all organs are used.
- split: The data split to use.
- download: Whether to download the data if it is not present.
Returns:
List of filepaths for the input data.
def
get_orgline_dataset( path: Union[os.PathLike, str], patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], organs: Union[str, Sequence[str], NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:
223def get_orgline_dataset( 224 path: Union[os.PathLike, str], 225 patch_shape: Tuple[int, int], 226 split: Literal["train", "val", "test"], 227 organs: Optional[Union[str, Sequence[str]]] = None, 228 download: bool = False, 229 **kwargs, 230) -> Dataset: 231 """Get OrgLine dataset for organoid segmentation in brightfield microscopy images. 232 233 Args: 234 path: Filepath to the folder where the downloaded data will be saved. 235 patch_shape: The patch shape to use for training. 236 split: The data split to use. 237 organ: 238 download: Whether to download the data if it is not present. 239 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`. 240 241 Returns: 242 The segmentation dataset. 243 """ 244 paths = get_orgline_paths(path, split, organs, download) 245 return torch_em.default_segmentation_dataset( 246 raw_paths=paths, 247 raw_key="image", 248 label_paths=paths, 249 label_key="masks", 250 is_seg_dataset=True, 251 patch_shape=patch_shape, 252 ndim=2, 253 **kwargs 254 )
Get OrgLine dataset for organoid segmentation in brightfield microscopy images.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- patch_shape: The patch shape to use for training.
- split: The data split to use.
- organs: The organ(s) from which the organoids are derived; by default, all organs are used.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset.
Returns:
The segmentation dataset.
def
get_orgline_loader( path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], split: Literal['train', 'val', 'test'], organs: Union[str, Sequence[str], NoneType] = None, download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:
257def get_orgline_loader( 258 path: Union[os.PathLike, str], 259 batch_size: int, 260 patch_shape: Tuple[int, int], 261 split: Literal["train", "val", "test"], 262 organs: Optional[Union[str, Sequence[str]]] = None, 263 download: bool = False, 264 **kwargs, 265) -> DataLoader: 266 """Get OrgLine dataloader for organoid segmentation in brightfield microscopy images. 267 268 Args: 269 path: Filepath to the folder where the downloaded data will be saved. 270 batch_size: The batch size for training. 271 patch_shape: The patch shape to use for training. 272 273 split: The data split to use. 274 275 download: Whether to download the data if it is not present. 276 kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader. 277 278 Returns: 279 The DataLoader. 280 """ 281 ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs) 282 dataset = get_orgline_dataset(path, patch_shape, split=split, organs=organs, download=download, **ds_kwargs) 283 return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
Get OrgLine dataloader for organoid segmentation in brightfield microscopy images.
Arguments:
- path: Filepath to the folder where the downloaded data will be saved.
- batch_size: The batch size for training.
- patch_shape: The patch shape to use for training.
- split: The data split to use.
- organs: The organ(s) from which the organoids are derived; by default, all organs are used.
- download: Whether to download the data if it is not present.
- kwargs: Additional keyword arguments for
torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:
The DataLoader.