torch_em.data.datasets.medical.sa_med2d
import os
import random
from tqdm import tqdm
from pathlib import Path
from typing import Union, Tuple, Optional

import json
import numpy as np
import imageio.v3 as imageio
from skimage.segmentation import relabel_sequential
from sklearn.model_selection import train_test_split

import torch_em

from .. import util
from ..light_microscopy.neurips_cell_seg import to_rgb


# Names of the individual datasets bundled in SA-Med2D-20M.
DATASET_NAMES = [
    "ACDC",  # cardiac structures in MRI
    "AMOS2022",  # multi-organ in CT
    "ATM2022",  # pulmonary airway in CT
    "AbdomenCT1K",  # abdominal organ in CT
    "ASC18",  # left atrium in LGE-MRI
    "COSMOS2022",  # carotid vessel wall in MRI
    "BTCV",  # organs in CT
    "BTCV_Cervix",  # cervical organs in CT
    "BraTS2013",  # brain tumour in MRI
    "BraTS2015",  # brain tumour in MRI
    "BraTS2018",  # brain tumour in MRI
    "BraTS2019",  # brain tumour in MRI
    "BraTS2020",  # brain tumour in MRI
    "BraTS2021",  # brain tumour in MRI
    "Brain_PTM",  # white matter tracts in brain MRI
    "CAD_PE",  # pulmonary embolism in CTPA
    "CHAOS_Task_4",  # liver, kidney and spleen in T1W-MR
    "CMRxMotions",  # cardiac structures in CMR
    "COVID19CTscans",  # lung and covid infection in CT
    "COVID-19-20",  # covid infection in CT
    "covid_19_ct_cxr",  # lung in CXR
    "crass",  # clavicle in CXR
    "CTPelvic1k",  # pelvic bones in CT
    "CTSpine1K_Full",  # spinal vertebrae in CT
    "cvc_clinicdb",  # polyp in colonoscopy
    "Chest_Image_Pneum",  # pneumonia in CXR
    "cranium",  # cranial segmentation in CT
    "CrossMoDA21",  # vestibular schwannoma and cochlea segmentation in T1-CE and TI-HR MRI
    "CrossMoDA22",  # vestibular schwannoma and cochlea segmentation in T1-CE and TI-HR MRI
    "EMIDEC",  # cardiac structures in MRI
    "endovis15",  # polyp in endoscopy
    "FLARE21",  # abdominal organs in CT
    "FLARE22",  # abdominal organs in CT
    "fusc2021",  # skin lesion in dermoscopy
    "hvsmr_2016",  # blood pool and ventricular myocardium in CMR
    "Heart_Seg_MRI",  # heart in MRI
    "ichallenge_adam_task2",  # optic disc in fundus images
    "PALM19",  # optic disc in fundus images
    "gamma",  # optic disk, optic cup and ring in fundus images
    "gamma3",  # optic disk, optic cup and ring in fundus images
    "ISLES_SPES",  # ischemic stroke lesion in brain MRI
    "ISLES_SISS",  # ischemic stroke lesion in brain MRI
    "ISLES2016",  # ischemic stroke lesion in brain MRI
    "ISLES2017",  # ischemic stroke lesion in brain MRI
    "ISLES2018",  # ischemic stroke in brain CT
    "ISLES2022",  # ischemic stroke in brain MRI
    "Instance22",  # intracranial hemorrhage in nc-ct
    "KiTS",  # kidney and kidney tumor in CT
    "KiTS2021",  # kidney and kidney tumor in CT
    "LNDb",  # lung nodules in thoracic CT
    "LUNA16",  # lung and trachea in thoracic CT
    "LongitudinalMultipleSclerosisLesionSegmentation",  # MS lesion in FLAIR-MRI
    "mnms2",  # cardiac structures in MRI
    "MMWHS",  # whole heart in CT
    "BrainTumour",  # brain tumor in MRI
    "MSD_Heart",  # heart in MRI
    "MSD_Liver",  # liver in CT
    "MSD_Prostate",  # prostate in ADC-MRI
    "MSD_Lung",  # lung tumour in CT
    "MSD_Pancreas",  # pancreas in CT
    "MSD_HepaticVessel",  # hepatic vessel in CT
    "MSD_Spleen",  # spleen in CT
    "MSD_Colon",  # colon in CT
    "CT_ORG",  # multiple organ in CT
    "picai_baseline",  # prostate cancer in MRI
    "picai_semi",  # prostate cancer in MRI
    "Promise09",  # prostate in MRI
    "PROMISE12",  # prostate in MRI
    "Parse22",  # pulmonary artery in CT
    "chest_x_ray_images_with_pneumothorax_masks",  # pneumothorax in CXR
    "Prostate_MRI_Segmentation_Dataset",  # prostate in MRI
    "Pulmonary_Chest_X-Ray_Abnormalities_seg",  # lung in CXR
    "QUBIQ2020",  # kidney in CT
    "StructSeg2019_subtask1",  # OAR in H&N CT
    "StructSeg2019_subtask2",  # OAR in chest CT
    "Totalsegmentator_dataset",  # organ in CT
    "ultrasound_nerve_segmentation",  # nerve in US
    "VESSEL2012",  # lung in CT
    "VerSe20",  # vertebrae in CT
    "VerSe19",  # vertebrae in CT
    "WORD",  # abdominal organs in CT
    "autoPET",  # lesions in PET and CT
    "braimMRI",  # brain lesions in MRI
    "breast_ultrasound_images_dataset",  # breast cancer in US
    "kvasircapsule_seg",  # polyp in endoscopy
    "sz_cxr",  # lungs in CXR
    "EndoVis_2017_RIS",  # instruments in endoscopy
    "kvasir_seg",  # polyp in endoscopy
    "isic2018_task1",  # skin lesions in dermoscopy
    "isic2017_task1",  # skin lesions in dermoscopy
    "isic2016_task1",  # skin lesions in dermoscopy
]

# Modality identifiers, i.e. the first `--`-separated component of the image filenames.
MODALITY_NAMES = [
    # CT modalities
    'ct_00', 'ct_cbf', 'ct_cbv', 'ct_mtt', 'ct_tmax',
    # RGB-image modalities
    'dermoscopy_00', 'endoscopy_00', 'fundus_photography',
    # MRI modalities
    'mr_00', 'mr_adc', 'mr_cbf', 'mr_cbv', 'mr_cmr', 'mr_dwi',
    'mr_flair', 'mr_hbv', 'mr_lge', 'mr_mprage', 'mr_mtt',
    'mr_pd', 'mr_rcbf', 'mr_rcbv', 'mr_t1', 'mr_t1c', 'mr_t1ce',
    'mr_t1gd', 'mr_t1w', 'mr_t2', 'mr_t2w', 'mr_tmax', 'mr_ttp',
    # mono-channel modalities
    'pet_00', 'ultrasound_00', 'x_ray'
]


# datasets under 1000 samples
SMALL_DATASETS = [
    "crass", "covid_19_ct_cxr", "cvc_clinicdb", "cranium", "CrossMoDA21", "EMIDEC",
    "endovis15", "fusc2021", "Heart_Seg_MRI", "ichallenge_adam_task2", "gamma", "gamma3",
    "Instance22", "LNDb", "MSD_Heart", "MSD_Prostate", "MSD_Spleen", "MSD_Colon",
    "picai_baseline", "picai_semi", "Promise09", "PROMISE12", "Pulmonary_Chest_X-Ray_Abnormalities_seg",
    "QUBIQ2020", "breast_ultrasound_images_dataset", "kvasircapsule_seg", "sz_cxr", "kvasir_seg"
]


def get_sa_med2d_data(path, download):
    """This function describes the download functionality and ensures your data has been downloaded in
    the expected format.

    The dataset is located at https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M.

    There are two ways of downloading the dataset:
    1. wget (Recommended):
        - There are 10 `.z*` files and 1 `.zip` file which need to be downloaded together.
        - Go to `Files` -> download each file individually using `wget <LINK>`. Below mentioned are the links:
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z01
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z02
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z03
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z04
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z05
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z06
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z07
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z08
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z09
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z10
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.zip

    2. Using Git Large File Storage (lfs):
        - `git lfs install` (Make sure you have git-lfs installed (https://git-lfs.com))
        - `git clone https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M`
            - This step takes several hours, make sure you have a stable internet connection and sufficient space.

    Once you have downloaded the archives, you need to unzip the split zip files:
    - For Windows: decompress SA-Med2D-16M.zip to automatically extract the other volumes together.
    - For Linux:
        - `zip SA-Med2D-16M.zip SA-Med2D-16M.z0* SA-Med2D-16M.z10 -s=0 --out {full}.zip`
            - NOTE: this reassembles the entire dataset into one zip, make sure you have ~1.5TB free space.
        - `unzip {full}.zip`
            - NOTE: there are >4M images paired with >19M ground-truth masks. unzipping takes a lot of inodes and time.

    Args:
        path: The parent directory that contains the extracted `SAMed2Dv1` folder.
        download: Automatic download is not supported; if set, only a notice is printed.

    Returns:
        The path to the `SAMed2Dv1` data directory.

    Raises:
        AssertionError: If the data directory or one of the expected json files is missing.
    """
    if download:
        print("Download is not supported, as the data is huge and takes quite a while to download and extract.")

    data_dir = os.path.join(path, "SAMed2Dv1")

    # The checks below raise explicitly (instead of using `assert`) so that they are not
    # stripped when running with `python -O`. The exception type is kept for compatibility.

    # the first part is to ensure if the data has been unzipped in the expected data directory
    if not os.path.exists(data_dir):
        msg = "The data directory is not found. "
        msg += "Please ensure that you provide the path to the parent directory where the unzip operation took place. "
        msg += "For example: `unzip <ZIPFILE> -d /path/to/dir/`. Hence, the argument 'path' expects '/path/to/dir/'."
        raise AssertionError(msg)

    # next, let's investigate the presence of the json files
    for json_file in ("SAMed2D_v1.json", "SAMed2D_v1_class_mapping_id.json"):
        if not os.path.exists(os.path.join(data_dir, json_file)):
            raise AssertionError(f"The json file '{json_file}' is missing.")

    print("Looks like the dataset is ready to use.")

    return data_dir


def _assort_sa_med2d_data(data_dir):
    """Merge the per-object masks of each image into a single instance label image.

    Reads the image -> mask-files mapping from `SAMed2D_v1.json` and writes one label image per
    input image to `<data_dir>/preprocessed_instances/<image_id>.tif`. Images for which the label
    file already exists are not processed again, so an interrupted run can be resumed.

    Args:
        data_dir: The `SAMed2Dv1` data directory.

    Returns:
        The list of image entries (keys of `SAMed2D_v1.json`) that were skipped on purpose.
    """
    with open(os.path.join(data_dir, "SAMed2D_v1.json")) as f:
        data = json.load(f)

    gt_instances_dir = os.path.join(data_dir, "preprocessed_instances")
    os.makedirs(gt_instances_dir, exist_ok=True)

    skipped_files = []
    for ifile in tqdm(data):
        image_path = os.path.join(data_dir, ifile)
        image_id = Path(image_path).stem

        gt_path = os.path.join(gt_instances_dir, f"{image_id}.tif")
        if os.path.exists(gt_path):
            continue

        # let's split different components (the filename follows '<modality>--<dataset>--<...>')
        splits = image_id.split("--")
        dataset = splits[1]

        # HACK: (SKIP) there are some known images which are pretty weird (binary brain masks as inputs)
        if "brain-growth" in splits[2]:
            skipped_files.append(ifile)
            continue

        # let's get the shape of the image (dropping the trailing channel axis, if there is one)
        image = imageio.imread(image_path)
        shape = image.shape if image.ndim == 2 else image.shape[:-1]

        # HACK: (SKIP) there are weird images which appear to be whole brain binary masks
        if dataset == "Brain_PTM" and len(np.unique(image)) == 2:  # easy check for binary values in the input
            skipped_files.append(ifile)
            continue

        gt_files = data[ifile]

        # let's create an empty array and merge all segmentations into one.
        # The dtype is the smallest unsigned type that can hold the largest id: 'uint8' for up to
        # 255 masks (as before), and a wider type instead of overflowing for more masks.
        instances = np.zeros(shape, dtype=np.min_scalar_type(len(gt_files)))
        for idx, gfile in enumerate(sorted(gt_files), start=1):
            # HACK: (SKIP) we remove the segmentation of entire ventricular cavity in ACDC
            if dataset == "ACDC" and "0003_000" in gfile and len(gt_files) > 1:  # to avoid whole ventricular rois
                continue

            per_gt = imageio.imread(os.path.join(data_dir, gfile))

            # HACK: need to see if we can resize this inputs
            if per_gt.shape != shape:
                print("Skipping these images with mismatching ground-truth shapes.")
                continue

            # HACK: (UPDATE) optic disk is mapped as 0, and background as 1
            if dataset == "ichallenge_adam_task2":
                per_gt = (per_gt == 0).astype("uint8")  # simply reversing the binary optic disc masks

            # masks that come later in the sorted order overwrite earlier ones where they overlap
            instances[per_gt > 0] = idx

        # skipped / fully overwritten masks leave gaps in the ids, so make them consecutive again
        instances = relabel_sequential(instances)[0]
        imageio.imwrite(gt_path, instances, compression="zlib")

    return skipped_files


def _create_splits_per_dataset(data_dir, json_file, skipped_files, val_fraction=0.1):
    """Create a train / val split for each dataset and store the splits in a json file.

    Args:
        data_dir: The `SAMed2Dv1` data directory (contains `SAMed2D_v1.json`).
        json_file: The filepath where the splits are written, as
            `{"train": {dataset: [filenames]}, "val": {dataset: [filenames]}}`.
        skipped_files: The image entries (keys of `SAMed2D_v1.json`) to leave out of the splits.
        val_fraction: The fraction of each dataset that goes into the validation split.
    """
    with open(os.path.join(data_dir, "SAMed2D_v1.json")) as f:
        data = json.load(f)

    # a set makes the membership test below O(1) per image instead of scanning a list
    skipped = set(skipped_files)

    # now, let's group them data-wise and make splits per dataset
    data_dict = {}
    for image_file in data:
        if image_file in skipped:
            print("Skipping this file:", image_file)
            continue

        fname = os.path.split(image_file)[-1]
        dataset = fname.split("--")[1]
        data_dict.setdefault(dataset, []).append(fname)

    # next, let's make a train-val split out of the dataset and write them in a json file
    train_dict, val_dict = {}, {}
    for dataset, dfiles in data_dict.items():
        if len(dfiles) < 2:
            # 'train_test_split' raises an error if one side of the split would be empty,
            # so a single-sample dataset is kept entirely for training.
            tr_split, val_split = list(dfiles), []
        else:
            tr_split, val_split = train_test_split(dfiles, test_size=val_fraction)

        train_dict[dataset] = tr_split
        val_dict[dataset] = val_split

    fdict = {"train": train_dict, "val": val_dict}
    with open(json_file, "w") as f:
        json.dump(fdict, f)


def _get_split_wise_paths(data_dir, json_file, split, exclude_dataset, exclude_modality, n_fraction_per_dataset):
    """Collect the image and label paths of a split, applying the exclusion and subsampling options.

    Args:
        data_dir: The `SAMed2Dv1` data directory.
        json_file: The filepath to the splits written by `_create_splits_per_dataset`.
        split: The name of the split, i.e. a top-level key of the json file.
        exclude_dataset: Name(s) of the dataset(s) to leave out, or None.
        exclude_modality: Name(s) of the modality(ies) to leave out, or None.
        n_fraction_per_dataset: The fraction of files to sample randomly from each dataset
            that is not listed in `SMALL_DATASETS`, or None to use all files.

    Returns:
        The list of image paths and the list of the matching label paths.
    """
    with open(json_file, "r") as f:
        data = json.load(f)

    # accept a single name as well as any collection of names
    if exclude_dataset is not None:
        exclude_dataset = [exclude_dataset] if isinstance(exclude_dataset, str) else list(exclude_dataset)

    if exclude_modality is not None:
        exclude_modality = [exclude_modality] if isinstance(exclude_modality, str) else list(exclude_modality)

    image_paths, gt_paths = [], []
    for dataset, dfiles in data[split].items():
        if exclude_dataset is not None and dataset in exclude_dataset:
            continue

        # The modality is the first filename component. A dataset can contain several modalities,
        # so the filter is applied to each file rather than to the dataset as a whole.
        if exclude_modality is not None:
            dfiles = [fname for fname in dfiles if fname.split("--")[0] not in exclude_modality]

        if not dfiles:
            continue

        if n_fraction_per_dataset is not None and dataset not in SMALL_DATASETS:
            dfiles = random.sample(dfiles, k=int(n_fraction_per_dataset * len(dfiles)))

        image_paths.extend(os.path.join(data_dir, "images", fname) for fname in dfiles)
        gt_paths.extend(
            os.path.join(data_dir, "preprocessed_instances", f"{Path(fname).stem}.tif") for fname in dfiles
        )

    return image_paths, gt_paths


def _get_sa_med2d_paths(path, split, exclude_dataset, exclude_modality, n_fraction_per_dataset, download):
    """Check the data, run the one-time preprocessing if needed and return the paths of a split.

    The preprocessing (merging the masks and creating the splits) runs only if
    `<data_dir>/preprocessed_inputs.json` does not exist yet.
    """
    data_dir = get_sa_med2d_data(path=path, download=download)

    json_file = os.path.join(data_dir, "preprocessed_inputs.json")
    if not os.path.exists(json_file):
        skipped_files = _assort_sa_med2d_data(data_dir=data_dir)
        _create_splits_per_dataset(data_dir=data_dir, json_file=json_file, skipped_files=skipped_files)

    image_paths, gt_paths = _get_split_wise_paths(
        data_dir=data_dir,
        json_file=json_file,
        split=split,
        exclude_dataset=exclude_dataset,
        exclude_modality=exclude_modality,
        n_fraction_per_dataset=n_fraction_per_dataset
    )

    return image_paths, gt_paths


def get_sa_med2d_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: str,
    resize_inputs: bool = False,
    exclude_dataset: Optional[Union[str, list]] = None,
    exclude_modality: Optional[Union[str, list]] = None,
    n_fraction_per_dataset: Optional[float] = None,
    download: bool = False,
    **kwargs
):
    """Dataset for segmentation of various organs and structures in multiple medical imaging modalities.

    You should download the dataset yourself. See `get_sa_med2d_data` for details.

    The dataset is from Ye et al. - https://doi.org/10.48550/arXiv.2311.11969.
    The dataset is curated in alignment with Cheng et al. - https://doi.org/10.48550/arXiv.2308.16184.

    Please cite it if you use it in a publication.

    Args:
        path: The parent directory that contains the extracted `SAMed2Dv1` folder.
        patch_shape: The patch shape to use for training.
        split: The data split to use, either 'train' or 'val'.
        resize_inputs: Whether to resize the inputs to the patch shape.
        exclude_dataset: Name(s) of the dataset(s) to exclude. See `DATASET_NAMES`.
        exclude_modality: Name(s) of the modality(ies) to exclude. See `MODALITY_NAMES`.
        n_fraction_per_dataset: The fraction of samples to draw randomly from each dataset
            that is not listed in `SMALL_DATASETS`.
        download: Automatic download is not supported; if set, only a notice is printed.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, gt_paths = _get_sa_med2d_paths(
        path=path,
        split=split,
        exclude_dataset=exclude_dataset,
        exclude_modality=exclude_modality,
        n_fraction_per_dataset=n_fraction_per_dataset,
        download=download,
    )

    if resize_inputs:
        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs=resize_kwargs,
            ensure_rgb=to_rgb,
        )

    print("Creating the dataset for the SA-Med2D-20M dataset. This takes a bit of time.")

    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=gt_paths,
        label_key=None,
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        is_seg_dataset=False,
        verify_paths=False,
        **kwargs
    )

    return dataset


def get_sa_med2d_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    split: str,
    resize_inputs: bool = False,
    exclude_dataset: Optional[Union[str, list]] = None,
    exclude_modality: Optional[Union[str, list]] = None,
    n_fraction_per_dataset: Optional[float] = None,
    download: bool = False,
    **kwargs
):
    """Dataloader for segmentation of various organs and structures in multiple medical imaging modalities.
    See `get_sa_med2d_dataset` for details.

    Args:
        path: The parent directory that contains the extracted `SAMed2Dv1` folder.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The data split to use, either 'train' or 'val'.
        resize_inputs: Whether to resize the inputs to the patch shape.
        exclude_dataset: Name(s) of the dataset(s) to exclude. See `DATASET_NAMES`.
        exclude_modality: Name(s) of the modality(ies) to exclude. See `MODALITY_NAMES`.
        n_fraction_per_dataset: The fraction of samples to draw randomly from each dataset
            that is not listed in `SMALL_DATASETS`.
        download: Automatic download is not supported; if set, only a notice is printed.
        kwargs: Additional keyword arguments, which are divided between
            `torch_em.default_segmentation_dataset` and the data loader.

    Returns:
        The data loader.
    """
    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
    dataset = get_sa_med2d_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        exclude_dataset=exclude_dataset,
        exclude_modality=exclude_modality,
        n_fraction_per_dataset=n_fraction_per_dataset,
        download=download,
        **ds_kwargs
    )
    print("Creating the dataloader for the SA-Med2D-20M dataset. This takes a bit of time.")
    loader = torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
    return loader
def get_sa_med2d_data(path, download):
    """This function describes the download functionality and ensures your data has been downloaded in
    the expected format.

    The dataset is located at https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M.

    There are two ways of downloading the dataset:
    1. wget (Recommended):
        - There are 10 `.z*` files and 1 `.zip` file which need to be downloaded together.
        - Go to `Files` -> download each file individually using `wget <LINK>`. Below mentioned are the links:
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z01
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z02
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z03
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z04
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z05
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z06
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z07
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z08
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z09
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z10
            - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.zip

    2. Using Git Large File Storage (lfs):
        - `git lfs install` (Make sure you have git-lfs installed (https://git-lfs.com))
        - `git clone https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M`
            - This step takes several hours, make sure you have a stable internet connection and sufficient space.

    Once you have downloaded the archives, you need to unzip the split zip files:
    - For Windows: decompress SA-Med2D-16M.zip to automatically extract the other volumes together.
    - For Linux:
        - `zip SA-Med2D-16M.zip SA-Med2D-16M.z0* SA-Med2D-16M.z10 -s=0 --out {full}.zip`
            - NOTE: this reassembles the entire dataset into one zip, make sure you have ~1.5TB free space.
        - `unzip {full}.zip`
            - NOTE: there are >4M images paired with >19M ground-truth masks. unzipping takes a lot of inodes and time.

    Args:
        path: The parent directory that contains the extracted `SAMed2Dv1` folder.
        download: Automatic download is not supported; if set, only a notice is printed.

    Returns:
        The path to the `SAMed2Dv1` data directory.

    Raises:
        AssertionError: If the data directory or one of the expected json files is missing.
    """
    if download:
        print("Download is not supported, as the data is huge and takes quite a while to download and extract.")

    data_dir = os.path.join(path, "SAMed2Dv1")

    # The checks below raise explicitly (instead of using `assert`) so that they are not
    # stripped when running with `python -O`. The exception type is kept for compatibility.

    # the first part is to ensure if the data has been unzipped in the expected data directory
    if not os.path.exists(data_dir):
        msg = "The data directory is not found. "
        msg += "Please ensure that you provide the path to the parent directory where the unzip operation took place. "
        msg += "For example: `unzip <ZIPFILE> -d /path/to/dir/`. Hence, the argument 'path' expects '/path/to/dir/'."
        raise AssertionError(msg)

    # next, let's investigate the presence of the json files
    for json_file in ("SAMed2D_v1.json", "SAMed2D_v1_class_mapping_id.json"):
        if not os.path.exists(os.path.join(data_dir, json_file)):
            raise AssertionError(f"The json file '{json_file}' is missing.")

    print("Looks like the dataset is ready to use.")

    return data_dir
This function describes the download functionality and ensures your data has been downloaded in the expected format.
The dataset is located at https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M.
There are two ways of downloading the dataset:
1. wget (Recommended):
   - There are 10 `.z*` files and 1 `.zip` file, which all need to be downloaded together.
   - Go to `Files` and download each file individually using `wget <LINK>`. The links are:
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z01
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z02
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z03
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z04
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z05
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z06
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z07
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z08
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z09
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.z10
     - https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M/resolve/main/raw/SA-Med2D-16M.zip
2. Using Git Large File Storage (lfs):
   - `git lfs install` (make sure you have git-lfs installed (https://git-lfs.com))
   - `git clone https://huggingface.co/datasets/OpenGVLab/SA-Med2D-20M`
     - This step takes several hours; make sure you have a stable internet connection and sufficient disk space.
Once you have downloaded the archives, you need to unzip the split zip files:
- For Windows: decompress `SA-Med2D-16M.zip` to automatically extract the other volumes together.
- For Linux:
  - `zip SA-Med2D-16M.zip SA-Med2D-16M.z0* SA-Med2D-16M.z10 -s=0 --out {full}.zip`
    - NOTE: this reassembles the entire dataset into one zip; make sure you have ~1.5TB free space.
  - `unzip {full}.zip`
    - NOTE: there are >4M images paired with >19M ground-truth masks. Unzipping takes a lot of inodes and time.
def get_sa_med2d_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    split: str,
    resize_inputs: bool = False,
    exclude_dataset: Optional[Union[str, list]] = None,
    exclude_modality: Optional[Union[str, list]] = None,
    n_fraction_per_dataset: Optional[float] = None,
    download: bool = False,
    **kwargs
):
    """Dataset for segmentation of various organs and structures in multiple medical imaging modalities.

    You should download the dataset yourself. See `get_sa_med2d_data` for details.

    The dataset is from Ye et al. - https://doi.org/10.48550/arXiv.2311.11969.
    The dataset is curated in alignment with Cheng et al. - https://doi.org/10.48550/arXiv.2308.16184.

    Please cite it if you use it in a publication.

    Args:
        path: The parent directory that contains the extracted `SAMed2Dv1` folder.
        patch_shape: The patch shape to use for training.
        split: The data split to use, either 'train' or 'val'.
        resize_inputs: Whether to resize the inputs to the patch shape.
        exclude_dataset: Name(s) of the dataset(s) to exclude.
        exclude_modality: Name(s) of the modality(ies) to exclude.
        n_fraction_per_dataset: The fraction of samples to draw randomly from each of the larger datasets.
        download: Automatic download is not supported; if set, only a notice is printed.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    # Resolve the images and the matching instance labels of the requested split.
    raw_paths, label_paths = _get_sa_med2d_paths(
        path=path,
        split=split,
        exclude_dataset=exclude_dataset,
        exclude_modality=exclude_modality,
        n_fraction_per_dataset=n_fraction_per_dataset,
        download=download,
    )

    if resize_inputs:
        # The resize helper returns the updated dataset kwargs and patch shape.
        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
            kwargs=kwargs,
            patch_shape=patch_shape,
            resize_inputs=resize_inputs,
            resize_kwargs={"patch_shape": patch_shape, "is_rgb": True},
            ensure_rgb=to_rgb,
        )

    print("Creating the dataset for the SA-Med2D-20M dataset. This takes a bit of time.")

    return torch_em.default_segmentation_dataset(
        raw_paths=raw_paths,
        raw_key=None,
        label_paths=label_paths,
        label_key=None,
        patch_shape=patch_shape,
        ndim=2,
        with_channels=True,
        is_seg_dataset=False,
        verify_paths=False,
        **kwargs
    )
Dataset for segmentation of various organs and structures in multiple medical imaging modalities.
You should download the dataset yourself. See `get_sa_med2d_data` for details.
The dataset is from Ye et al. - https://doi.org/10.48550/arXiv.2311.11969. The dataset is curated in alignment with Cheng et al. - https://doi.org/10.48550/arXiv.2308.16184.
Please cite it if you use it in a publication.
def get_sa_med2d_loader(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, int],
    batch_size: int,
    split: str,
    resize_inputs: bool = False,
    exclude_dataset: Optional[Union[str, list]] = None,
    exclude_modality: Optional[Union[str, list]] = None,
    n_fraction_per_dataset: Optional[float] = None,
    download: bool = False,
    **kwargs
):
    """Dataloader for segmentation of various organs and structures in multiple medical imaging modalities.
    See `get_sa_med2d_dataset` for details.

    Args:
        path: The parent directory that contains the extracted `SAMed2Dv1` folder.
        patch_shape: The patch shape to use for training.
        batch_size: The batch size for training.
        split: The data split to use, either 'train' or 'val'.
        resize_inputs: Whether to resize the inputs to the patch shape.
        exclude_dataset: Name(s) of the dataset(s) to exclude.
        exclude_modality: Name(s) of the modality(ies) to exclude.
        n_fraction_per_dataset: The fraction of samples to draw randomly from each of the larger datasets.
        download: Automatic download is not supported; if set, only a notice is printed.
        kwargs: Additional keyword arguments, which are divided between
            `torch_em.default_segmentation_dataset` and the data loader.

    Returns:
        The data loader.
    """
    # Separate the arguments meant for the dataset from the ones meant for the loader.
    dataset_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    dataset = get_sa_med2d_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        resize_inputs=resize_inputs,
        exclude_dataset=exclude_dataset,
        exclude_modality=exclude_modality,
        n_fraction_per_dataset=n_fraction_per_dataset,
        download=download,
        **dataset_kwargs
    )

    print("Creating the dataloader for the SA-Med2D-20M dataset. This takes a bit of time.")
    return torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
Dataloader for segmentation of various organs and structures in multiple medical imaging modalities.
See `get_sa_med2d_dataset` for details.