From 8d820e9b1bb9cb373100187a88c60f72beb62f59 Mon Sep 17 00:00:00 2001 From: camillebrianceau <57992134+camillebrianceau@users.noreply.github.com> Date: Thu, 15 Feb 2024 18:22:55 +0100 Subject: [PATCH] hugging face (#468) * adapt to clinicadl hugging face organization * finish skip leak check to use clinicadl with downloaded masp --- clinicadl/cmdline.py | 2 + clinicadl/hugging_face/hugging_face.py | 232 +++++++++++++++++++ clinicadl/hugging_face/hugging_face_cli.py | 18 ++ clinicadl/hugging_face/pull_cli.py | 32 +++ clinicadl/hugging_face/push_cli.py | 33 +++ clinicadl/utils/maps_manager/maps_manager.py | 9 +- docs/Train/Share.md | 66 ++++++ docs/index.md | 3 + 8 files changed, 391 insertions(+), 4 deletions(-) create mode 100644 clinicadl/hugging_face/hugging_face.py create mode 100644 clinicadl/hugging_face/hugging_face_cli.py create mode 100644 clinicadl/hugging_face/pull_cli.py create mode 100644 clinicadl/hugging_face/push_cli.py create mode 100644 docs/Train/Share.md diff --git a/clinicadl/cmdline.py b/clinicadl/cmdline.py index 67ff3a1c2..30bae6d78 100644 --- a/clinicadl/cmdline.py +++ b/clinicadl/cmdline.py @@ -2,6 +2,7 @@ import click from clinicadl.generate.generate_cli import cli as generate_cli +from clinicadl.hugging_face.hugging_face_cli import cli as hf_cli from clinicadl.interpret.interpret_cli import cli as interpret_cli from clinicadl.predict.predict_cli import cli as predict_cli from clinicadl.prepare_data.prepare_data_cli import cli as prepare_data_cli @@ -45,6 +46,7 @@ def cli(verbose): cli.add_command(interpret_cli) cli.add_command(qc_cli) cli.add_command(random_search_cli) +cli.add_command(hf_cli) if __name__ == "__main__": cli() diff --git a/clinicadl/hugging_face/hugging_face.py b/clinicadl/hugging_face/hugging_face.py new file mode 100644 index 000000000..8bc5a4ae3 --- /dev/null +++ b/clinicadl/hugging_face/hugging_face.py @@ -0,0 +1,232 @@ +import importlib +import os +from logging import getLogger +from pathlib import Path + +import 
toml + +from clinicadl.utils.exceptions import ClinicaDLArgumentError +from clinicadl.utils.maps_manager.maps_manager_utils import ( + change_str_to_path, + read_json, + remove_unused_tasks, +) + +logger = getLogger("clinicadl") + + +def hf_hub_is_available(): + return importlib.util.find_spec("huggingface_hub") is not None + + +def push_to_hf_hub( + hf_hub_path: str, + maps_dir: Path, + model_name: str, +): + if hf_hub_is_available(): + from huggingface_hub import CommitOperationAdd, HfApi + else: + raise ModuleNotFoundError( + "`huggingface_hub` package must be installed to push your model to the HF hub. " + "Run `python -m pip install huggingface_hub` and log in to your account with " + "`huggingface-cli login`." + ) + + model_card_ = """ +--- +language: en +library_name: clinicadl +tags: +- clinicadl +license: mit +--- +""" + hf_hub_path = "ClinicaDL" if hf_hub_path.lower() == "clinicadl" else hf_hub_path + + config_file = maps_dir / "maps.json" + n_splits = create_readme( + config_file=config_file, model_name=model_name, model_card=model_card_ + ) + logger.info(f"Uploading {model_name} model to {hf_hub_path} repo in HF hub...") + api = HfApi() + hf_operations = [] + id_ = os.path.join(hf_hub_path, model_name) + user = api.whoami() + list_orgs = [x["name"] for x in user["orgs"]] + + if hf_hub_path == "ClinicaDL": + if "ClinicaDL" not in list_orgs: + raise ClinicaDLArgumentError( + "You're not in the ClinicaDL organization on Hugging Face. Please follow the link to request to join the organization: https://huggingface.co/clinicadl-test" + ) + elif hf_hub_path != user["name"]: + raise ClinicaDLArgumentError( + f"You're logged as {user['name']} in Hugging Face and you are trying to push a model under {hf_hub_path} logging." 
+ ) + + tmp_file = "tmp_README.md" + hf_operations = [ + CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=tmp_file), + CommitOperationAdd( + path_in_repo="maps.json", path_or_fileobj=maps_dir / "maps.json" + ), + ] + + for split in range(n_splits): + hf_operations.append( + CommitOperationAdd( + path_in_repo=str(("split-" + str(split)) + "/best-loss/model.pth.tar"), + path_or_fileobj=str( + maps_dir / ("split-" + str(split)) / "best-loss" / "model.pth.tar" + ), + ) + ) + + for root, dirs, files in os.walk(maps_dir, topdown=False): + for name in files: + hf_operations.append( + CommitOperationAdd( + path_in_repo=str( + ("split-" + str(split)) + "/best-loss/model.pth.tar" + ), + path_or_fileobj=str( + maps_dir + / ("split-" + str(split)) + / "best-loss" + / "model.pth.tar" + ), + ) + ) + + try: + api.create_commit( + commit_message=f"Uploading {model_name} in {maps_dir}", + repo_id=id_, + operations=hf_operations, + private=True, + ) + logger.info(f"Successfully uploaded {model_name} to {maps_dir} repo in HF hub!") + + except: + from huggingface_hub import create_repo + + repo_name = maps_dir.name + logger.info(f"Creating {repo_name} in the HF hub since it does not exist...") + create_repo(repo_id=id_) + logger.info(f"Successfully created {repo_name} in the HF hub!") + + api.create_commit( + commit_message=f"Uploading {model_name} in {maps_dir}", + repo_id=id_, + operations=hf_operations, + ) + + if Path(tmp_file).exists(): + Path(tmp_file).unlink() + + +def create_readme( + config_file: Path = None, model_name: str = "test", model_card: str = None +): + if not config_file.is_file(): + raise ClinicaDLArgumentError("There is no maps.json file in your repository.") + + clinicadl_root_dir = (Path(__file__) / "../..").resolve() + config_path = ( + Path(clinicadl_root_dir) / "resources" / "config" / "train_config.toml" + ) + config_dict = toml.load(config_path) + + train_dict = read_json(config_file) + train_dict = change_str_to_path(train_dict) + + task = 
train_dict["network_task"] + + config_dict = remove_unused_tasks(config_dict, task) + config_dict = change_str_to_path(config_dict) + + file = open("tmp_README.md", "w") + list_lines = [] + list_lines.append(model_card) + list_lines.append(f"# Model Card for {model_name} \n") + list_lines.append( + f"This model was trained with ClinicaDL. You can find here all the information.\n" + ) + + list_lines.append(f"## General information \n") + + if train_dict["multi_cohort"]: + list_lines.append( + f"This model was trained on several datasets at the same time. \n" + ) + list_lines.append( + f"This model was trained for **{task}** and the architecture chosen is **{train_dict['architecture']}**. \n" + ) + + for config_section in config_dict: + list_lines.append(f"### {config_section} \n") + for key in config_dict[config_section]: + if key == "preprocessing_dict": + list_lines.append(f"### Preprocessing \n") + for key_bis in config_dict[config_section][key]: + list_lines.append( + f"**{key_bis}**: {config_dict[config_section][key][key_bis]} \n" + ) + else: + if key in train_dict: + config_dict[config_section][key] = train_dict[key] + train_dict.pop(key) + list_lines.append(f"**{key}**: {config_dict[config_section][key]} \n") + list_lines.append(f"### Other information \n") + for key in train_dict: + list_lines.append(f"**{key}**: {train_dict[key]} \n") + + file.writelines(list_lines) + file.close() + return config_dict["Cross_validation"]["n_splits"] + + +def load_from_hf_hub( + output_maps: Path, hf_hub_path: str, maps_name: str +): # pragma: no cover + """Class method to be used to load a pretrained model from the Hugging Face hub + + Parameters + ---------- + output_path: str, + + hf_hub_path: (str) + The path where the model should have been be saved on thehugginface hub. 
+ maps_name: str + """ + + if hf_hub_is_available(): + from huggingface_hub import HfApi, snapshot_download + else: + raise ModuleNotFoundError( + "`huggingface_hub` package must be installed to push your model to the HF hub. " + "Run `python -m pip install huggingface_hub` and log in to your account with " + "`huggingface-cli login`." + ) + + hf_hub_path = "ClinicaDL" if hf_hub_path.lower() == "clinicadl" else hf_hub_path + + api = HfApi() + id_ = os.path.join(hf_hub_path, maps_name) + user = api.whoami() + list_orgs = [x["name"] for x in user["orgs"]] + + if hf_hub_path == "ClinicaDL": + if "ClinicaDL" not in list_orgs: + raise ClinicaDLArgumentError( + "You're not in the ClinicaDL organization on Hugging Face. Please follow the link to request to join the organization: https://huggingface.co/clinicadl-test" + ) + elif hf_hub_path != user["name"]: + logger.warning( + f"You're logged as {user['name']} in Hugging Face and you are trying to pull a model from {hf_hub_path}." + ) + else: + logger.info(f"Downloading {hf_hub_path} files for rebuilding...") + + environment_json = snapshot_download(repo_id=id_, local_dir=output_maps) diff --git a/clinicadl/hugging_face/hugging_face_cli.py b/clinicadl/hugging_face/hugging_face_cli.py new file mode 100644 index 000000000..939064049 --- /dev/null +++ b/clinicadl/hugging_face/hugging_face_cli.py @@ -0,0 +1,18 @@ +import click + +from .pull_cli import cli as pull_cli +from .push_cli import cli as push_cli + + +@click.group(name="hugging-face", no_args_is_help=True) +def cli(): + """Train a deep learning model for a specific task.""" + pass + + +cli.add_command(push_cli) +cli.add_command(pull_cli) + + +if __name__ == "__main__": + cli() diff --git a/clinicadl/hugging_face/pull_cli.py b/clinicadl/hugging_face/pull_cli.py new file mode 100644 index 000000000..36158ef5c --- /dev/null +++ b/clinicadl/hugging_face/pull_cli.py @@ -0,0 +1,32 @@ +from pathlib import Path + +import click + +from clinicadl.utils import cli_param +from 
clinicadl.utils.maps_manager import MapsManager + + +@click.command(name="pull", no_args_is_help=True) +@click.argument( + "hf_hub_path", + type=str, + default=None, +) +@click.argument( + "maps_name", + type=str, + default="maps", +) +@cli_param.argument.output_maps +def cli(hf_hub_path, maps_name, output_maps_directory): + from .hugging_face import load_from_hf_hub + + load_from_hf_hub( + output_maps=output_maps_directory, + hf_hub_path=hf_hub_path, + maps_name=maps_name, + ) + + +if __name__ == "__main__": + cli() diff --git a/clinicadl/hugging_face/push_cli.py b/clinicadl/hugging_face/push_cli.py new file mode 100644 index 000000000..48657c03b --- /dev/null +++ b/clinicadl/hugging_face/push_cli.py @@ -0,0 +1,33 @@ +import click + +from clinicadl.utils import cli_param + + +@click.command(name="push", no_args_is_help=True) +@click.argument( + "organization", + type=str, + default=None, +) +@cli_param.argument.input_maps +@click.argument( + "hf_maps_directory", + type=str, + default=None, +) +def cli( + organization, + input_maps_directory, + hf_maps_directory, +): + from .hugging_face import push_to_hf_hub + + push_to_hf_hub( + hf_hub_path=organization, + maps_dir=input_maps_directory, + model_name=hf_maps_directory, + ) + + +if __name__ == "__main__": + cli() diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index 2ca73e7b3..b2668a29f 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -426,7 +426,7 @@ def predict( if cluster.master: self._ensemble_prediction( - data_group, split, selection_metrics, use_labels + data_group, split, selection_metrics, use_labels, skip_leak_check ) def interpret( @@ -1990,6 +1990,7 @@ def _ensemble_prediction( split, selection_metrics, use_labels=True, + skip_leak_check=False, ): """Computes the results on the image-level.""" @@ -1998,14 +1999,14 @@ def _ensemble_prediction( for selection_metric in selection_metrics: 
# Soft voting - if self.num_networks > 1: + if self.num_networks > 1 and not skip_leak_check: self._ensemble_to_tsv( split, selection=selection_metric, data_group=data_group, use_labels=use_labels, ) - elif self.mode != "image": + elif self.mode != "image" and not skip_leak_check: self._mode_to_image_tsv( split, selection=selection_metric, @@ -2241,7 +2242,7 @@ def _check_data_group( f"To erase {data_group} please set overwrite to True." ) - if not group_dir.is_dir() and ( + elif not group_dir.is_dir() and ( caps_directory is None or df is None ): # Data group does not exist yet / was overwritten + missing data raise ClinicaDLArgumentError( diff --git a/docs/Train/Share.md b/docs/Train/Share.md new file mode 100644 index 000000000..f8c7cea5f --- /dev/null +++ b/docs/Train/Share.md @@ -0,0 +1,66 @@ +# `clinicadl hugging-face` - Share your experiment with hugging-face + +[Hugging Face](https://huggingface.co) is a machine learning (ML) and data science platform and community that helps users build, deploy and train machine learning models. + +It provides the infrastructure to demo, run and deploy artificial intelligence (AI) in live applications. Users can also browse through models and data sets that other people have uploaded. +Hugging Face is often called the GitHub of machine learning because it lets developers share and test their work openly. + +For now, you can push and pull [MAPS](../Introduction.md#maps-definition) in Hugging Face with clinicadl. + +You can find the ClinicaDL organization [here](https://huggingface.co/ClinicaDL) and send a request to join and access our pre-trained models. + +!!! warning "Sharing limitations" + Depending on the data, you may not be authorized to share the whole MAPS, + because it includes TSV files with patient information. + +## `push` - Push your MAPS to Hugging Face + +### Description + +This command will create a new repository on Hugging Face or update the existing repository with the given name.
+ +### Running the task + +```bash +clinicadl hugging-face push [OPTIONS] ORGANIZATION MAPS_DIRECTORY HF_MAPS_DIRECTORY +``` +where: + + - `ORGANIZATION` (str) is either the name of the organization or your profile. + - `MAPS_DIRECTORY` (Path) is the folder you want to push, containing the dataset in a MAPS hierarchy. + - `HF_MAPS_DIRECTORY` (str) is the name you want the repo to have on Hugging Face. + + +#### Example of how to run the task: + +```bash +clinicadl hugging-face push clinicadl /DATA/maps maps-test +``` + +## `pull` - Pull a MAPS from Hugging Face + +### Description + +This command will download a repository from Hugging Face. + +### Running the task + +```bash +clinicadl hugging-face pull [OPTIONS] ORGANIZATION HF_MAPS_DIRECTORY MAPS_DIRECTORY +``` +where: + + - `ORGANIZATION` (str) is either the name of the organization or your profile. + - `HF_MAPS_DIRECTORY` (str) is the name of the repository to download from Hugging Face. + - `MAPS_DIRECTORY` (Path) is the local folder in which the MAPS will be downloaded. + + +#### Example of how to run the task: + +```bash +clinicadl hugging-face pull clinicadl maps-test /DATA/maps +``` + +This command will download the MAPS named `maps-test` from the clinicadl organization on Hugging Face into a new directory (`/DATA/maps`). + +### Output tree diff --git a/docs/index.md b/docs/index.md index 6a18f4adb..f7efaefd2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,6 +60,9 @@ command line. - `clinicadl train resume` - [Resume a prematurely stopped job](./Train/Resume.md) - `clinicadl train custom` - [Custom experiments](./Contribute/Custom/) +### Share pretrained models +- `clinicadl hugging-face` - [Share an experiment with hugging-face](./Train/Share.md) + ### Inference using pretrained models - `clinicadl predict` - [Predict one image or a list of images with your previously trained network](Predict.md)