torch_em.data.datasets.medical.sa_med2d

The SA-Med2D-20M dataset contains annotations for several organs and structures in biomedical images from various imaging modalities.

NOTE: The current version contains 3.7M images and 15.8M masks.

The dataset is hosted on HuggingFace at https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M. The dataset is from the publication https://arxiv.org/abs/2311.11969, and is curated in alignment with the publication https://doi.org/10.48550/arXiv.2308.16184. Please cite them if you use this dataset in your research.

  1"""The SA-Med2D-20M dataset contains annotations for several organs and structures in biomedical
  2images from several imaging modalities.
  3
  4NOTE: The current version contains 3.7M images and 15.8M masks.
  5
  6The dataset is located in HuggingFace at https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M.
  7The dataset is from the publication: https://arxiv.org/abs/2311.11969.
  8And the dataset is curated in alignment with the publication: https://doi.org/10.48550/arXiv.2308.16184.
  9Please cite it if you use this dataset in your research.
 10"""
 11
import os
import shutil
import zipfile
from glob import glob
from math import ceil
from tqdm import tqdm
from natsort import natsorted
from typing import Union, Tuple, List
from concurrent.futures import ThreadPoolExecutor, as_completed

import json
import numpy as np
import imageio.v3 as imageio
from skimage.segmentation import relabel_sequential

from torch.utils.data import Dataset, DataLoader

import torch_em
from torch_em.transform.generic import ResizeLongestSideInputs

from .. import util


DATASET_NAMES = [
    "ACDC",  # cardiac structures in MRI
    "AMOS2022",  # multi-organ in CT
    "ATM2022",  # pulmonary airway in CT
    "AbdomenCT1K",  # abdominal organ in CT
    "ASC18",  # left atrium in LGE-MRI
    "COSMOS2022",  # carotid vessel wall in MRI
    "BTCV",  # organs in CT
    "BTCV_Cervix",  # cervical organs in CT
    "BraTS2013",  # brain tumour in MRI
    "BraTS2015",  # brain tumour in MRI
    "BraTS2018",  # brain tumour in MRI
    "BraTS2019",  # brain tumour in MRI
    "BraTS2020",  # brain tumour in MRI
    "BraTS2021",  # brain tumour in MRI
    "Brain_PTM",  # white matter tracts in brain MRI
    "CAD_PE",  # pulmonary embolism in CTPA
    "CHAOS_Task_4",  # liver, kidney and spleen in T1W-MR
    "CMRxMotions",  # cardiac structures in CMR
    "COVID19CTscans",  # lung and covid infection in CT
    "COVID-19-20",  # covid infection in CT
    "covid_19_ct_cxr",  # lung in CXR
    "crass",  # clavicle in CXR
    "CTPelvic1k",  # pelvic bones in CT
    "CTSpine1K_Full",  # spinal vertebrae in CT
    "cvc_clinicdb",  # polyp in colonoscopy
    "Chest_Image_Pneum",  # pneumonia in CXR
    "cranium",  # cranial segmentation in CT
    "CrossMoDA21",  # vestibular schwannoma and cochlea segmentation in T1-CE and TI-HR MRI
    "CrossMoDA22",  # vestibular schwannoma and cochlea segmentation in T1-CE and TI-HR MRI
    "EMIDEC",  # cardiac structures in MRI
    "endovis15",  # polyp in endoscopy
    "FLARE21",  # abdominal organs in CT
    "FLARE22",  # abdominal organs in CT
    "fusc2021",  # skin lesion in dermoscopy
    "hvsmr_2016",  # blood pool and ventricular myocardium in CMR
    "Heart_Seg_MRI",  # heart in MRI
    "ichallenge_adam_task2",  # optic disc in fundus images
    "PALM19",  # optic disc in fundus images
    "gamma",  # optic disk, optic cup and ring in fundus images
    "gamma3",  # optic disk, optic cup and ring in fundus images
    "ISLES_SPES",  # ischemic stroke lesion in brain MRI
    "ISLES_SISS",  # ischemic stroke lesion in brain MRI
    "ISLES2016",  # ischemic stroke lesion in brain MRI
    "ISLES2017",  # ischemic stroke lesion in brain MRI
    "ISLES2018",  # ischemic stroke in brain CT
    "ISLES2022",  # ischemic stroke in brain MRI
    "Instance22",  # intracranial hemorrhage in nc-ct
    "KiTS",  # kidney and kidney tumor in CT
    "KiTS2021",  # kidney and kidney tumor in CT
    "LNDb",  # lung nodules in thoracic CT
    "LUNA16",  # lung and trachea in thoracic CT
    "LongitudinalMultipleSclerosisLesionSegmentation",  # MS lesion in FLAIR-MRI
    "mnms2",  # cardiac structures in MRI
    "MMWHS",  # whole heart in CT
    "BrainTumour",  # brain tumor in MRI
    "MSD_Heart",  # heart in MRI
    "MSD_Liver",  # liver in CT
    "MSD_Prostate",  # prostate in ADC-MRI
    "MSD_Lung",  # lung tumour in CT
    "MSD_Pancreas",  # pancreas in CT
    "MSD_HepaticVessel",  # hepatic vessel in CT
    "MSD_Spleen",  # spleen in CT
    "MSD_Colon",  # colon in CT
    "CT_ORG",  # multiple organ in CT
    "picai_baseline",  # prostate cancer in MRI
    "picai_semi",  # prostate cancer in MRI
    "Promise09",  # prostate in MRI
    "PROMISE12",  # prostate in MRI
    "Parse22",  # pulmonary artery in CT
    "chest_x_ray_images_with_pneumothorax_masks",  # pneumothorax in CXR
    "Prostate_MRI_Segmentation_Dataset",  # prostate in MRI
    "Pulmonary_Chest_X-Ray_Abnormalities_seg",  # lung in CXR
    "QUBIQ2020",  # kidney in CT
    "StructSeg2019_subtask1",  # OAR in H&N CT
    "StructSeg2019_subtask2",  # OAR in chest CT
    "Totalsegmentator_dataset",  # organ in CT
    "ultrasound_nerve_segmentation",  # nerve in US
    "VESSEL2012",  # lung in CT
    "VerSe20",  # vertebrae in CT
    "VerSe19",  # vertebrae in CT
    "WORD",  # abdominal organs in CT
    "autoPET",  # lesions in PET and CT
    "braimMRI",  # brain lesions in MRI
    "breast_ultrasound_images_dataset",  # breast cancer in US
    "kvasircapsule_seg",  # polyp in endoscopy
    "sz_cxr",  # lungs in CXR
    "EndoVis_2017_RIS",  # instruments in endoscopy
    "kvasir_seg",  # polyp in endoscopy
    "isic2018_task1",  # skin lesions in dermoscopy
    "isic2017_task1",  # skin lesions in dermoscopy
    "isic2016_task1",  # skin lesions in dermoscopy
]

MODALITY_NAMES = [
    # CT modalities
    'ct_00', 'ct_cbf', 'ct_cbv', 'ct_mtt', 'ct_tmax',
    # RGB-image modalities
    'dermoscopy_00', 'endoscopy_00', 'fundus_photography',
    # MRI modalities
    'mr_00', 'mr_adc', 'mr_cbf', 'mr_cbv', 'mr_cmr', 'mr_dwi',
    'mr_flair', 'mr_hbv', 'mr_lge', 'mr_mprage', 'mr_mtt',
    'mr_pd', 'mr_rcbf', 'mr_rcbv', 'mr_t1', 'mr_t1c', 'mr_t1ce',
    'mr_t1gd', 'mr_t1w', 'mr_t2', 'mr_t2w', 'mr_tmax', 'mr_ttp',
    # mono-channel modalities
    'pet_00', 'ultrasound_00', 'x_ray'
]


# Datasets with fewer than 1000 samples.
SMALL_DATASETS = [
    "crass", "covid_19_ct_cxr", "cvc_clinicdb", "cranium", "CrossMoDA21", "EMIDEC",
    "endovis15", "fusc2021", "Heart_Seg_MRI", "ichallenge_adam_task2", "gamma", "gamma3",
    "Instance22", "LNDb", "MSD_Heart", "MSD_Prostate", "MSD_Spleen", "MSD_Colon",
    "picai_baseline", "picai_semi", "Promise09", "PROMISE12", "Pulmonary_Chest_X-Ray_Abnormalities_seg",
    "QUBIQ2020", "breast_ultrasound_images_dataset", "kvasircapsule_seg", "sz_cxr", "kvasir_seg"
]

SHARD_SIZE = 50000   # Maximum number of images per dataset container file.


def _preprocess_data(path):
    import h5py

    data_dir = os.path.join(path, "data")
    if os.path.exists(data_dir):
        return data_dir

    os.makedirs(data_dir, exist_ok=True)

    # We must ensure that the core zipfile (all small zipped splits merged into one) exists as expected.
    zip_path = os.path.join(path, "data.zip")  # NOTE: The zipfile name is hard-coded to 'data.zip'.
    if not os.path.exists(zip_path):
        raise FileNotFoundError(
            f"The combined zip file does not exist under the file name 'data.zip' at '{path}'. "
            "Please see 'get_sa_med2d_data' for details."
        )

    # Function to preprocess each image.
    def _process_each_image(image_path, data, dataset_name, data_dir, raw_transform, label_transform):
        image = imageio.imread(image_path)

        if image.ndim == 3:
            image = image.transpose(2, 0, 1)  # Make channels first for the transform to work.
        else:
            assert image.ndim == 2, image.ndim
            image = np.stack([image] * 3, axis=0)

        shape = image.shape[1:]

        # Get the image filename.
        image_fname = f"images/{os.path.basename(image_path)}"
        instances = np.zeros(shape, dtype="uint8")

        # Merge all masks into one label image.
        for idx, gt_fname in enumerate(sorted(data.get(image_fname, [])), start=1):
            # HACK: (SKIP) We remove the segmentation of the entire ventricular cavity in ACDC,
            # i.e. we specifically avoid the whole-ventricle ROIs.
            if dataset_name == "ACDC" and "0003_000" in gt_fname and len(data[image_fname]) > 1:
                continue

            gt_path = os.path.join(data_dir, "SAMed2Dv1", gt_fname)
            gt_mask = imageio.imread(gt_path)

            if gt_mask.shape != shape:
                print("Skipping image with mismatching ground-truth shape.")
                continue

            # HACK: (UPDATE) The optic disc is mapped as 0 and the background as 1,
            # so we simply invert the binary optic disc masks.
            if dataset_name == "ichallenge_adam_task2":
                gt_mask = (gt_mask == 0).astype("uint8")

            instances[gt_mask > 0] = idx

        # Check if the image and corresponding labels are valid, i.e. the labels have at least
        # one foreground object and the raw data contains some valid information.
        if len(np.unique(instances)) > 1 and len(np.unique(image)) > 1:
            instances = relabel_sequential(instances)[0]
            return raw_transform(image), label_transform(instances)
        else:
            return None

217    print("We will start pre-processing the dataset. This might take a while.")
218    with zipfile.ZipFile(zip_path, "r") as f:
219        all_members = f.namelist()
220
221        # First, we extract json files.
222        json_members = [m for m in all_members if m.endswith(".json")]
223        f.extractall(path=data_dir, members=json_members)
224
225        # Load the json file.
226        with open(os.path.join(data_dir, "SAMed2Dv1", "SAMed2D_v1.json")) as j:
227            data = json.load(j)
228
229        # Get image and label transforms to resize images to expected patch shape for training.
230        raw_transform = ResizeLongestSideInputs(target_shape=(512, 512), is_rgb=True)
231        label_transform = ResizeLongestSideInputs(target_shape=(512, 512), is_label=True)
232
233        # Get members per dataset and extract them one-by-one.
234        for dataset_name in tqdm(DATASET_NAMES, desc="Preprocessing data"):
235            # First, we check if this dataset has any related h5 files, otherwise proceed with extraction.
236            if len(glob(os.path.join(data_dir, f"{dataset_name}*.h5"))) > 0:
237                continue
238
239            # Extract only the images and labels matching the dataset name.
240            dataset_members = [m for m in all_members if dataset_name in m]
241            f.extractall(path=data_dir, members=dataset_members)
242
243            # Get all image and label paths.
244            image_dir = os.path.join(data_dir, "SAMed2Dv1", "images")
245            image_paths = natsorted(glob(os.path.join(image_dir, "*")))
246            num_images = len(image_paths)
247
248            # Compute the total number of shards.
249            # The files blow up some strange buffer memory, so I just piece the datasets down a bit.
250            num_shards = ceil(num_images / SHARD_SIZE)
251
            for shard_idx in range(num_shards):
                start_idx = shard_idx * SHARD_SIZE
                end_idx = min((shard_idx + 1) * SHARD_SIZE, num_images)
                shard_image_paths = image_paths[start_idx:end_idx]

                # Store all images of the current shard inside one h5 file.
                shard_fpath = os.path.join(data_dir, f"{dataset_name}_{shard_idx:02d}.h5")
                if os.path.exists(shard_fpath):
                    continue

                with h5py.File(shard_fpath, "w") as h:
                    raw_ds = h.create_dataset(
                        "raw",
                        shape=(3, 0, 512, 512),
                        maxshape=(3, None, 512, 512),
                        chunks=(3, 1, 512, 512),
                        compression="lzf",
                    )
                    label_ds = h.create_dataset(
                        "labels",
                        shape=(0, 512, 512),
                        maxshape=(None, 512, 512),
                        chunks=(1, 512, 512),
                        compression="lzf",
                    )

                    # We need to preprocess images and corresponding labels, and store them.
                    curr_len = 0
                    with ThreadPoolExecutor(max_workers=32) as executor:
                        # Map each future to its image path, so that skipped images are reported with
                        # the correct path ('as_completed' does not preserve the submission order).
                        futures = {
                            executor.submit(
                                _process_each_image,
                                image_path, data, dataset_name, data_dir, raw_transform, label_transform,
                            ): image_path for image_path in shard_image_paths
                        }

                        for future in tqdm(
                            as_completed(futures), total=len(futures),
                            desc=f"Processing '{dataset_name}' images for shard '{shard_idx:02d}'",
                        ):
                            result = future.result()

                            if result is None:  # When the image or corresponding labels are not valid.
                                print(f"Skipping invalid image and labels: {futures[future]}")
                                continue

                            image_transformed, label_transformed = result

                            # We resize the dataset objects to incrementally add new samples.
                            raw_ds.resize((3, curr_len + 1, 512, 512))
                            label_ds.resize((curr_len + 1, 512, 512))

                            # And write the images and labels incrementally.
                            raw_ds[:, curr_len] = image_transformed
                            label_ds[curr_len] = label_transformed

                            curr_len += 1

            # Finally, remove all extracted files for the current dataset.
            shutil.rmtree(os.path.join(data_dir, "SAMed2Dv1", "images"))
            shutil.rmtree(os.path.join(data_dir, "SAMed2Dv1", "masks"))

    # And remove the json files as well.
    shutil.rmtree(os.path.join(data_dir, "SAMed2Dv1"))

    return data_dir


def get_sa_med2d_data(path: Union[os.PathLike, str], download: bool = False) -> str:
    """This function describes the download functionality and ensures your data has been downloaded in the expected format.

    The dataset is located at https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M.

    There are two ways of downloading the dataset:
    1. wget (Recommended):
        - There are 10 `.z*` split files and 1 `.zip` file which need to be downloaded together.
        - Go to `Files` and download each file individually using `wget <LINK>`. The links are:
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z01
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z02
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z03
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z04
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z05
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z06
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z07
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z08
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z09
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z10
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.zip

    2. Using Git Large File Storage (lfs):
        - `git lfs install` (Make sure you have git-lfs installed (https://git-lfs.com))
        - `git clone https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M`
            - This step takes several hours; make sure you have a stable internet connection and sufficient space.

    Once you have downloaded the archives, please run the following command to create one unified zipfile:
    - `zip SA-Med2D-16M.zip SA-Med2D-16M.z0* SA-Med2D-16M.z10 -s=0 --out data.zip`
        - NOTE: This merges the split archives into a single zip; make sure you have ~1.5TB of free space.

    The remaining preprocessing steps are taken care of by `get_sa_med2d_data` for you.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        Filepath where the data is already downloaded and unzipped.
    """
    if download:
        print("Download is not supported, as the data is huge and takes quite a while to download and extract.")

    # The final stage is preprocessing the images, so that the entire dataset can be accessed efficiently.
    data_dir = _preprocess_data(path)
    print("Looks like the dataset is ready to use.")
    return data_dir


def get_sa_med2d_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:
    """Get paths to the SA-Med2D-20M data.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the input data.
    """
    data_dir = get_sa_med2d_data(path, download)
    input_paths = natsorted(glob(os.path.join(data_dir, "*.h5")))
    return input_paths


def get_sa_med2d_dataset(
    path: Union[os.PathLike, str], patch_shape: Tuple[int, int], download: bool = False, **kwargs,
) -> Dataset:
    """Get the SA-Med2D-20M dataset for various medical image segmentation tasks.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    input_paths = get_sa_med2d_paths(path, download)

    return torch_em.default_segmentation_dataset(
        raw_paths=input_paths,
        raw_key="raw",
        label_paths=input_paths,
        label_key="labels",
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        is_seg_dataset=True,
        verify_paths=False,
        **kwargs
    )


def get_sa_med2d_loader(
    path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], download: bool = False, **kwargs,
) -> DataLoader:
    """Get the SA-Med2D-20M dataloader for various medical image segmentation tasks.

    Args:
        path: Filepath to a folder where the data is downloaded for further processing.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_sa_med2d_dataset(path, patch_shape, download, **ds_kwargs)
    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
DATASET_NAMES = ['ACDC', 'AMOS2022', 'ATM2022', 'AbdomenCT1K', 'ASC18', 'COSMOS2022', 'BTCV', 'BTCV_Cervix', 'BraTS2013', 'BraTS2015', 'BraTS2018', 'BraTS2019', 'BraTS2020', 'BraTS2021', 'Brain_PTM', 'CAD_PE', 'CHAOS_Task_4', 'CMRxMotions', 'COVID19CTscans', 'COVID-19-20', 'covid_19_ct_cxr', 'crass', 'CTPelvic1k', 'CTSpine1K_Full', 'cvc_clinicdb', 'Chest_Image_Pneum', 'cranium', 'CrossMoDA21', 'CrossMoDA22', 'EMIDEC', 'endovis15', 'FLARE21', 'FLARE22', 'fusc2021', 'hvsmr_2016', 'Heart_Seg_MRI', 'ichallenge_adam_task2', 'PALM19', 'gamma', 'gamma3', 'ISLES_SPES', 'ISLES_SISS', 'ISLES2016', 'ISLES2017', 'ISLES2018', 'ISLES2022', 'Instance22', 'KiTS', 'KiTS2021', 'LNDb', 'LUNA16', 'LongitudinalMultipleSclerosisLesionSegmentation', 'mnms2', 'MMWHS', 'BrainTumour', 'MSD_Heart', 'MSD_Liver', 'MSD_Prostate', 'MSD_Lung', 'MSD_Pancreas', 'MSD_HepaticVessel', 'MSD_Spleen', 'MSD_Colon', 'CT_ORG', 'picai_baseline', 'picai_semi', 'Promise09', 'PROMISE12', 'Parse22', 'chest_x_ray_images_with_pneumothorax_masks', 'Prostate_MRI_Segmentation_Dataset', 'Pulmonary_Chest_X-Ray_Abnormalities_seg', 'QUBIQ2020', 'StructSeg2019_subtask1', 'StructSeg2019_subtask2', 'Totalsegmentator_dataset', 'ultrasound_nerve_segmentation', 'VESSEL2012', 'VerSe20', 'VerSe19', 'WORD', 'autoPET', 'braimMRI', 'breast_ultrasound_images_dataset', 'kvasircapsule_seg', 'sz_cxr', 'EndoVis_2017_RIS', 'kvasir_seg', 'isic2018_task1', 'isic2017_task1', 'isic2016_task1']
MODALITY_NAMES = ['ct_00', 'ct_cbf', 'ct_cbv', 'ct_mtt', 'ct_tmax', 'dermoscopy_00', 'endoscopy_00', 'fundus_photography', 'mr_00', 'mr_adc', 'mr_cbf', 'mr_cbv', 'mr_cmr', 'mr_dwi', 'mr_flair', 'mr_hbv', 'mr_lge', 'mr_mprage', 'mr_mtt', 'mr_pd', 'mr_rcbf', 'mr_rcbv', 'mr_t1', 'mr_t1c', 'mr_t1ce', 'mr_t1gd', 'mr_t1w', 'mr_t2', 'mr_t2w', 'mr_tmax', 'mr_ttp', 'pet_00', 'ultrasound_00', 'x_ray']
SMALL_DATASETS = ['crass', 'covid_19_ct_cxr', 'cvc_clinicdb', 'cranium', 'CrossMoDA21', 'EMIDEC', 'endovis15', 'fusc2021', 'Heart_Seg_MRI', 'ichallenge_adam_task2', 'gamma', 'gamma3', 'Instance22', 'LNDb', 'MSD_Heart', 'MSD_Prostate', 'MSD_Spleen', 'MSD_Colon', 'picai_baseline', 'picai_semi', 'Promise09', 'PROMISE12', 'Pulmonary_Chest_X-Ray_Abnormalities_seg', 'QUBIQ2020', 'breast_ultrasound_images_dataset', 'kvasircapsule_seg', 'sz_cxr', 'kvasir_seg']
SHARD_SIZE = 50000
def get_sa_med2d_data(path: Union[os.PathLike, str], download: bool = False) -> str:

This function describes the download functionality and ensures your data has been downloaded in the expected format.

The dataset is located at https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M.

There are two ways of downloading the dataset:

  1. wget (Recommended):
    • There are 10 split files (`SA-Med2D-16M.z01` to `SA-Med2D-16M.z10`) and 1 `.zip` file which need to be downloaded together.
    • Go to `Files` and download each file individually using `wget <LINK>`; the links follow the pattern https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z01 (and likewise for `.z02` to `.z10` and `.zip`).
  2. Using Git Large File Storage (lfs):
    • `git lfs install` (Make sure you have git-lfs installed (https://git-lfs.com))
    • `git clone https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M`
      • This step takes several hours; make sure you have a stable internet connection and sufficient space.

Once you have downloaded the archives, please run the following command to create one unified zipfile:

  • `zip SA-Med2D-16M.zip SA-Med2D-16M.z0* SA-Med2D-16M.z10 -s=0 --out data.zip`
    • NOTE: This merges the split archives into a single zip; make sure you have ~1.5TB of free space.

The remaining preprocessing steps are taken care of by get_sa_med2d_data for you.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

Filepath where the data is already downloaded and unzipped.
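
For orientation, here is a minimal usage sketch (the folder path is a placeholder and must already contain the merged `data.zip` described above):

    from torch_em.data.datasets.medical.sa_med2d import get_sa_med2d_data

    # '/path/to/sa_med2d' is a placeholder folder that already holds the merged 'data.zip'.
    data_dir = get_sa_med2d_data("/path/to/sa_med2d", download=False)
    print(data_dir)  # e.g. '/path/to/sa_med2d/data', containing the preprocessed *.h5 shards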

def get_sa_med2d_paths(path: Union[os.PathLike, str], download: bool = False) -> List[str]:

Get paths to the SA-Med2D-20M data.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • download: Whether to download the data if it is not present.
Returns:

List of filepaths for the input data.
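
A small sketch of inspecting the returned shard files (assuming `h5py` is installed and the data has already been preprocessed; the path below is a placeholder):

    import h5py
    from torch_em.data.datasets.medical.sa_med2d import get_sa_med2d_paths

    paths = get_sa_med2d_paths("/path/to/sa_med2d", download=False)
    # Each shard stores the raw data as (3, N, 512, 512) and the labels as (N, 512, 512).
    with h5py.File(paths[0], "r") as f:
        print(paths[0], f["raw"].shape, f["labels"].shape)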

def get_sa_med2d_dataset(path: Union[os.PathLike, str], patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataset.Dataset:

Get the SA-Med2D-20M dataset for various medical image segmentation tasks.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset.
Returns:

The segmentation dataset.
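
A minimal usage sketch (the folder path is a placeholder; the patch shape of (1, 512, 512), i.e. one full 512x512 slice per sample, is an assumption and may need to be adjusted for your training setup):

    from torch_em.data.datasets.medical.sa_med2d import get_sa_med2d_dataset

    # Placeholder path; the patch shape samples single 512x512 slices from the stacked shards.
    dataset = get_sa_med2d_dataset("/path/to/sa_med2d", patch_shape=(1, 512, 512), download=False)
    print(len(dataset))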

def get_sa_med2d_loader(path: Union[os.PathLike, str], batch_size: int, patch_shape: Tuple[int, int], download: bool = False, **kwargs) -> torch.utils.data.dataloader.DataLoader:

Get the SA-Med2D-20M dataloader for various medical image segmentation tasks.

Arguments:
  • path: Filepath to a folder where the data is downloaded for further processing.
  • batch_size: The batch size for training.
  • patch_shape: The patch shape to use for training.
  • download: Whether to download the data if it is not present.
  • kwargs: Additional keyword arguments for torch_em.default_segmentation_dataset or for the PyTorch DataLoader.
Returns:

The DataLoader.
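
And a sketch of building a loader and drawing one batch (paths and settings are placeholders; extra keyword arguments such as `num_workers` and `shuffle` are forwarded to the PyTorch DataLoader):

    from torch_em.data.datasets.medical.sa_med2d import get_sa_med2d_loader

    loader = get_sa_med2d_loader(
        "/path/to/sa_med2d", batch_size=4, patch_shape=(1, 512, 512),
        download=False, num_workers=8, shuffle=True,
    )
    images, labels = next(iter(loader))
    print(images.shape, labels.shape)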