torch_em.data.datasets.medical.sa_med2d

import os
import random
from tqdm import tqdm
from pathlib import Path
from typing import Union, Tuple, Optional

import json
import numpy as np
import imageio.v3 as imageio
from skimage.segmentation import relabel_sequential
from sklearn.model_selection import train_test_split

import torch_em

from .. import util
from ..light_microscopy.neurips_cell_seg import to_rgb


DATASET_NAMES = [
    "ACDC",  # cardiac structures in MRI
    "AMOS2022",  # multi-organ in CT
    "ATM2022",  # pulmonary airway in CT
    "AbdomenCT1K",  # abdominal organs in CT
    "ASC18",  # left atrium in LGE-MRI
    "COSMOS2022",  # carotid vessel wall in MRI
    "BTCV",  # organs in CT
    "BTCV_Cervix",  # cervical organs in CT
    "BraTS2013",  # brain tumour in MRI
    "BraTS2015",  # brain tumour in MRI
    "BraTS2018",  # brain tumour in MRI
    "BraTS2019",  # brain tumour in MRI
    "BraTS2020",  # brain tumour in MRI
    "BraTS2021",  # brain tumour in MRI
    "Brain_PTM",  # white matter tracts in brain MRI
    "CAD_PE",  # pulmonary embolism in CTPA
    "CHAOS_Task_4",  # liver, kidney and spleen in T1W-MR
    "CMRxMotions",  # cardiac structures in CMR
    "COVID19CTscans",  # lung and covid infection in CT
    "COVID-19-20",  # covid infection in CT
    "covid_19_ct_cxr",  # lung in CXR
    "crass",  # clavicle in CXR
    "CTPelvic1k",  # pelvic bones in CT
    "CTSpine1K_Full",  # spinal vertebrae in CT
    "cvc_clinicdb",  # polyp in colonoscopy
    "Chest_Image_Pneum",  # pneumonia in CXR
    "cranium",  # cranial segmentation in CT
    "CrossMoDA21",  # vestibular schwannoma and cochlea segmentation in T1-CE and T2-HR MRI
    "CrossMoDA22",  # vestibular schwannoma and cochlea segmentation in T1-CE and T2-HR MRI
    "EMIDEC",  # cardiac structures in MRI
    "endovis15",  # polyp in endoscopy
    "FLARE21",  # abdominal organs in CT
    "FLARE22",  # abdominal organs in CT
    "fusc2021",  # skin lesion in dermoscopy
    "hvsmr_2016",  # blood pool and ventricular myocardium in CMR
    "Heart_Seg_MRI",  # heart in MRI
    "ichallenge_adam_task2",  # optic disc in fundus images
    "PALM19",  # optic disc in fundus images
    "gamma",  # optic disk, optic cup and ring in fundus images
    "gamma3",  # optic disk, optic cup and ring in fundus images
    "ISLES_SPES",  # ischemic stroke lesion in brain MRI
    "ISLES_SISS",  # ischemic stroke lesion in brain MRI
    "ISLES2016",  # ischemic stroke lesion in brain MRI
    "ISLES2017",  # ischemic stroke lesion in brain MRI
    "ISLES2018",  # ischemic stroke in brain CT
    "ISLES2022",  # ischemic stroke in brain MRI
    "Instance22",  # intracranial hemorrhage in non-contrast CT
    "KiTS",  # kidney and kidney tumor in CT
    "KiTS2021",  # kidney and kidney tumor in CT
    "LNDb",  # lung nodules in thoracic CT
    "LUNA16",  # lung and trachea in thoracic CT
    "LongitudinalMultipleSclerosisLesionSegmentation",  # MS lesion in FLAIR-MRI
    "mnms2",  # cardiac structures in MRI
    "MMWHS",  # whole heart in CT
    "BrainTumour",  # brain tumor in MRI
    "MSD_Heart",  # heart in MRI
    "MSD_Liver",  # liver in CT
    "MSD_Prostate",  # prostate in ADC-MRI
    "MSD_Lung",  # lung tumour in CT
    "MSD_Pancreas",  # pancreas in CT
    "MSD_HepaticVessel",  # hepatic vessel in CT
    "MSD_Spleen",  # spleen in CT
    "MSD_Colon",  # colon in CT
    "CT_ORG",  # multiple organs in CT
    "picai_baseline",  # prostate cancer in MRI
    "picai_semi",  # prostate cancer in MRI
    "Promise09",  # prostate in MRI
    "PROMISE12",  # prostate in MRI
    "Parse22",  # pulmonary artery in CT
    "chest_x_ray_images_with_pneumothorax_masks",  # pneumothorax in CXR
    "Prostate_MRI_Segmentation_Dataset",  # prostate in MRI
    "Pulmonary_Chest_X-Ray_Abnormalities_seg",  # lung in CXR
    "QUBIQ2020",  # kidney in CT
    "StructSeg2019_subtask1",  # organs-at-risk in head & neck CT
    "StructSeg2019_subtask2",  # organs-at-risk in chest CT
    "Totalsegmentator_dataset",  # organs in CT
    "ultrasound_nerve_segmentation",  # nerve in US
    "VESSEL2012",  # lung in CT
    "VerSe20",  # vertebrae in CT
    "VerSe19",  # vertebrae in CT
    "WORD",  # abdominal organs in CT
    "autoPET",  # lesions in PET and CT
    "braimMRI",  # brain lesions in MRI
    "breast_ultrasound_images_dataset",  # breast cancer in US
    "kvasircapsule_seg",  # polyp in endoscopy
    "sz_cxr",  # lungs in CXR
    "EndoVis_2017_RIS",  # instruments in endoscopy
    "kvasir_seg",  # polyp in endoscopy
    "isic2018_task1",  # skin lesions in dermoscopy
    "isic2017_task1",  # skin lesions in dermoscopy
    "isic2016_task1",  # skin lesions in dermoscopy
]

MODALITY_NAMES = [
    # CT modalities
    'ct_00', 'ct_cbf', 'ct_cbv', 'ct_mtt', 'ct_tmax',
    # RGB-image modalities
    'dermoscopy_00', 'endoscopy_00', 'fundus_photography',
    # MRI modalities
    'mr_00', 'mr_adc', 'mr_cbf', 'mr_cbv', 'mr_cmr', 'mr_dwi',
    'mr_flair', 'mr_hbv', 'mr_lge', 'mr_mprage', 'mr_mtt',
    'mr_pd', 'mr_rcbf', 'mr_rcbv', 'mr_t1', 'mr_t1c', 'mr_t1ce',
    'mr_t1gd', 'mr_t1w', 'mr_t2', 'mr_t2w', 'mr_tmax', 'mr_ttp',
    # mono-channel modalities
    'pet_00', 'ultrasound_00', 'x_ray'
]


# datasets with fewer than 1000 samples
SMALL_DATASETS = [
    "crass", "covid_19_ct_cxr", "cvc_clinicdb", "cranium", "CrossMoDA21", "EMIDEC",
    "endovis15", "fusc2021", "Heart_Seg_MRI", "ichallenge_adam_task2", "gamma", "gamma3",
    "Instance22", "LNDb", "MSD_Heart", "MSD_Prostate", "MSD_Spleen", "MSD_Colon",
    "picai_baseline", "picai_semi", "Promise09", "PROMISE12", "Pulmonary_Chest_X-Ray_Abnormalities_seg",
    "QUBIQ2020", "breast_ultrasound_images_dataset", "kvasircapsule_seg", "sz_cxr", "kvasir_seg"
]
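
# NOTE: datasets listed in SMALL_DATASETS are exempt from the optional
# 'n_fraction_per_dataset' subsampling (see '_get_split_wise_paths' below),
# so their few samples are always kept in full.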


def get_sa_med2d_data(path, download):
    """This function describes the download functionality and ensures the data is available in the expected format.

    The dataset is located at https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M.

    There are two ways of downloading the dataset:
    1. wget (recommended):
        - There are 10 split files (`.z01` to `.z10`) and 1 `.zip` file, which need to be downloaded together.
        - Go to `Files` and download each file individually using `wget <LINK>`. The links are:
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z01
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z02
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z03
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z04
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z05
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z06
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z07
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z08
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z09
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z10
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.zip

    2. Using Git Large File Storage (lfs):
        - `git lfs install` (make sure you have git-lfs installed: https://git-lfs.com)
        - `git clone https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M`
            - This step takes several hours; make sure you have a stable internet connection and sufficient disk space.

    Once you have downloaded the archives, you need to combine and extract the split zip files:
    - On Windows: decompress SA-Med2D-16M.zip to automatically extract the other volumes with it.
    - On Linux:
        - `zip SA-Med2D-16M.zip SA-Med2D-16M.z0* SA-Med2D-16M.z10 -s=0 --out {full}.zip`
            - NOTE: this merges the split archives into one zip file; make sure you have ~1.5TB of free space.
        - `unzip {full}.zip`
            - NOTE: there are >4M images paired with >19M ground-truth masks; unzipping takes a lot of inodes and time.
    """
    if download:
        print("Download is not supported, as the data is huge and takes quite a while to download and extract.")

    data_dir = os.path.join(path, "SAMed2Dv1")

    # first, ensure that the data has been unzipped into the expected data directory
    msg = "The data directory was not found. "
    msg += "Please ensure that you provide the path to the parent directory where the unzip operation took place. "
    msg += "For example: `unzip <ZIPFILE> -d /path/to/dir/`. Hence, the argument 'path' expects '/path/to/dir/'."
    assert os.path.exists(data_dir), msg

    # next, check for the presence of the expected json files
    json_file = "SAMed2D_v1.json"
    assert os.path.exists(os.path.join(data_dir, json_file)), f"The json file '{json_file}' is missing."

    json_file = "SAMed2D_v1_class_mapping_id.json"
    assert os.path.exists(os.path.join(data_dir, json_file)), f"The json file '{json_file}' is missing."

    print("Looks like the dataset is ready to use.")

    return data_dir
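

# A minimal usage sketch for the check above ("/path/to/dir" is a placeholder
# for the directory that the archives were unzipped into):
#
#     data_dir = get_sa_med2d_data(path="/path/to/dir", download=False)
#     # returns "/path/to/dir/SAMed2Dv1" once the data is extracted as expected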


def _assort_sa_med2d_data(data_dir):
    with open(os.path.join(data_dir, "SAMed2D_v1.json")) as f:
        data = json.load(f)

    image_files = list(data.keys())

    gt_instances_dir = os.path.join(data_dir, "preprocessed_instances")
    os.makedirs(gt_instances_dir, exist_ok=True)

    skipped_files = []
    for ifile in tqdm(image_files):
        image_path = os.path.join(data_dir, ifile)
        image_id = Path(image_path).stem

        gt_path = os.path.join(gt_instances_dir, f"{image_id}.tif")
        if os.path.exists(gt_path):
            continue

        # let's split the filename into its components
        splits = image_id.split("--")
        dataset = splits[1]

        # HACK: (SKIP) there are some known images which are pretty weird (binary brain masks as inputs)
        if splits[2].find("brain-growth") != -1:
            skipped_files.append(ifile)
            continue

        # let's get the shape of the image
        image = imageio.imread(image_path)
        shape = image.shape if image.ndim == 2 else image.shape[:-1]

        # HACK: (SKIP) there are weird images which appear to be whole-brain binary masks
        if dataset == "Brain_PTM":
            if len(np.unique(image)) == 2:  # easy check for binary values in the input image
                skipped_files.append(ifile)
                continue

        # let's create an empty array and merge all segmentations into one
        instances = np.zeros(shape, dtype="uint8")
        for idx, gfile in enumerate(sorted(data[ifile]), start=1):
            # HACK: (SKIP) we remove the segmentation of the entire ventricular cavity in ACDC
            if dataset == "ACDC":
                if gfile.find("0003_000") != -1 and len(data[ifile]) > 1:  # to avoid whole-ventricular ROIs
                    continue

            per_gt = imageio.imread(os.path.join(data_dir, gfile))

            # HACK: need to see if we can resize these inputs
            if per_gt.shape != shape:
                print("Skipping a ground-truth mask whose shape does not match its image.")
                continue

            # HACK: (UPDATE) optic disc is mapped as 0, and background as 1
            if dataset == "ichallenge_adam_task2":
                per_gt = (per_gt == 0).astype("uint8")  # simply reversing the binary optic disc masks

            instances[per_gt > 0] = idx

        instances = relabel_sequential(instances)[0]
        imageio.imwrite(gt_path, instances, compression="zlib")

    return skipped_files


def _create_splits_per_dataset(data_dir, json_file, skipped_files, val_fraction=0.1):
    with open(os.path.join(data_dir, "SAMed2D_v1.json")) as f:
        data = json.load(f)

    image_files = list(data.keys())

    # now, let's group the files per dataset and make splits for each dataset
    data_dict = {}
    for image_file in image_files:
        if image_file in skipped_files:
            print("Skipping this file:", image_file)
            continue

        _image_file = os.path.split(image_file)[-1]
        splits = _image_file.split("--")
        dataset = splits[1]

        if dataset in data_dict:
            data_dict[dataset].append(_image_file)
        else:
            data_dict[dataset] = [_image_file]

    # next, let's make a train-val split per dataset and write the splits to a json file
    train_dict, val_dict = {}, {}
    for dataset, dfiles in data_dict.items():
        tr_split, val_split = train_test_split(dfiles, test_size=val_fraction)
        train_dict[dataset] = tr_split
        val_dict[dataset] = val_split

    fdict = {"train": train_dict, "val": val_dict}
    with open(json_file, "w") as f:
        json.dump(fdict, f)


def _get_split_wise_paths(data_dir, json_file, split, exclude_dataset, exclude_modality, n_fraction_per_dataset):
    with open(json_file, "r") as f:
        data = json.load(f)

    if exclude_dataset is not None and not isinstance(exclude_dataset, list):
        exclude_dataset = [exclude_dataset]

    if exclude_modality is not None and not isinstance(exclude_modality, list):
        exclude_modality = [exclude_modality]

    image_files = data[split]
    image_paths, gt_paths = [], []
    for dfiles in image_files.values():
        splits = dfiles[0].split("--")
        modality = splits[0]
        dataset = splits[1]

        if exclude_dataset is not None and dataset in exclude_dataset:
            continue

        if exclude_modality is not None and modality in exclude_modality:
            continue

        if n_fraction_per_dataset is not None and dataset not in SMALL_DATASETS:
            dfiles = random.sample(dfiles, k=int(n_fraction_per_dataset * len(dfiles)))

        per_dataset_ipaths = [os.path.join(data_dir, "images", fname) for fname in dfiles]
        per_dataset_gpaths = [
            os.path.join(data_dir, "preprocessed_instances", f"{Path(fname).stem}.tif") for fname in dfiles
        ]

        image_paths.extend(per_dataset_ipaths)
        gt_paths.extend(per_dataset_gpaths)

    return image_paths, gt_paths


def _get_sa_med2d_paths(path, split, exclude_dataset, exclude_modality, n_fraction_per_dataset, download):
    data_dir = get_sa_med2d_data(path=path, download=download)

    json_file = os.path.join(data_dir, "preprocessed_inputs.json")
    if not os.path.exists(json_file):
        skipped_files = _assort_sa_med2d_data(data_dir=data_dir)
        _create_splits_per_dataset(data_dir=data_dir, json_file=json_file, skipped_files=skipped_files)

    image_paths, gt_paths = _get_split_wise_paths(
        data_dir=data_dir,
        json_file=json_file,
        split=split,
        exclude_dataset=exclude_dataset,
        exclude_modality=exclude_modality,
        n_fraction_per_dataset=n_fraction_per_dataset
    )

    return image_paths, gt_paths


def get_sa_med2d_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: str,
    resize_inputs: bool = False,
    exclude_dataset: Optional[Union[str, list]] = None,
    exclude_modality: Optional[Union[str, list]] = None,
    n_fraction_per_dataset: Optional[float] = None,
    download: bool = False,
    **kwargs
):
    """Dataset for segmentation of various organs and structures in multiple medical imaging modalities.

    You should download the dataset yourself. See `get_sa_med2d_data` for details.

    The dataset is from Ye et al. - https://doi.org/10.48550/arXiv.2311.11969.
    The dataset is curated in alignment with Cheng et al. - https://doi.org/10.48550/arXiv.2308.16184.

    Please cite it if you use it in a publication.
    """
    image_paths, gt_paths = _get_sa_med2d_paths(
        path=path,
        split=split,
        exclude_dataset=exclude_dataset,
        exclude_modality=exclude_modality,
        n_fraction_per_dataset=n_fraction_per_dataset,
        download=download,
    )

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs=resize_kwargs,
            ensure_rgb=to_rgb,
        )

    print("Creating the SA-Med2D-20M dataset. This takes a bit of time.")

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=gt_paths,
        label_key=None,
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        is_seg_dataset=False,
        verify_paths=False,
        **kwargs
    )

    return dataset
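

# A minimal usage sketch (the path is a placeholder; the other arguments are
# actual parameters of 'get_sa_med2d_dataset'): create a training dataset with
# resizing enabled and one modality excluded.
#
#     dataset = get_sa_med2d_dataset(
#         path="/path/to/dir", patch_shape=(512, 512), split="train",
#         resize_inputs=True, exclude_modality=["mr_flair"],
#     )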


def get_sa_med2d_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    split: str,
    resize_inputs: bool = False,
    exclude_dataset: Optional[Union[str, list]] = None,
    exclude_modality: Optional[Union[str, list]] = None,
    n_fraction_per_dataset: Optional[float] = None,
    download: bool = False,
    **kwargs
):
    """Dataloader for segmentation of various organs and structures in multiple medical imaging modalities.

    See `get_sa_med2d_dataset` for details.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_sa_med2d_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        exclude_dataset=exclude_dataset,
        exclude_modality=exclude_modality,
        n_fraction_per_dataset=n_fraction_per_dataset,
        download=download,
        **ds_kwargs
    )
    print("Creating the dataloader for the SA-Med2D-20M dataset. This takes a bit of time.")
    loader = torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
    return loader
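

# A minimal usage sketch (the path is a placeholder): build a training loader
# that excludes one dataset and subsamples all non-small datasets to 25% of
# their images via 'n_fraction_per_dataset'.
#
#     loader = get_sa_med2d_loader(
#         path="/path/to/dir", patch_shape=(512, 512), batch_size=8,
#         split="train", resize_inputs=True,
#         exclude_dataset=["Totalsegmentator_dataset"],
#         n_fraction_per_dataset=0.25,
#     )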