From 683396c48186f5ad911fe74f78ced7f4a9ae21ac Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 23 Apr 2025 23:58:34 +0200 Subject: [PATCH 1/5] arg : clean up handling --mmproj with -hf --- common/arg.cpp | 51 ++++++++++++++++++++++++++++++++++--------------- common/common.h | 1 + 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1cfd0168d95ae..b5741c85995ef 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -641,11 +641,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s // utils // -static void common_params_handle_model( +struct handle_model_result { + bool found_mmproj = false; + common_params_model mmproj; +}; + +static handle_model_result common_params_handle_model( struct common_params_model & model, const std::string & bearer_token, - const std::string & model_path_default, - bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files? + const std::string & model_path_default) { + handle_model_result result; // handle pre-fill default model path and url based on hf_repo and hf_file { if (!model.hf_repo.empty()) { @@ -657,7 +662,12 @@ static void common_params_handle_model( exit(1); // built without CURL, error message already printed } model.hf_repo = auto_detected.repo; - model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile; + model.hf_file = auto_detected.ggufFile; + if (!auto_detected.mmprojFile.empty()) { + result.found_mmproj = true; + result.mmproj.hf_repo = model.hf_repo; + result.mmproj.hf_file = auto_detected.mmprojFile; + } } else { model.hf_file = model.path; } @@ -694,6 +704,8 @@ static void common_params_handle_model( exit(1); } } + + return result; } const std::vector kv_cache_types = { @@ -827,16 +839,17 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); - common_params_handle_model(params.speculative.model, params.hf_token, ""); - common_params_handle_model(params.vocoder.model, params.hf_token, ""); - - // allow --mmproj to be set from -hf - // assuming that mmproj is always in the same repo as text model - if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) { - params.mmproj.hf_repo = params.model.hf_repo; + // handle model and download + { + auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); + // optionally, handle mmproj model when -hf is specified + if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { + params.mmproj = res.mmproj; + } + common_params_handle_model(params.mmproj, params.hf_token, ""); + common_params_handle_model(params.speculative.model, params.hf_token, ""); + common_params_handle_model(params.vocoder.model, params.hf_token, ""); } - common_params_handle_model(params.mmproj, params.hf_token, "", true); if (params.escape) { string_process_escapes(params.prompt); @@ -2095,18 +2108,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(common_arg( {"--mmproj"}, "FILE", - "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + "path to a multimodal projector file. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj.path = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( {"--mmproj-url"}, "URL", - "URL to a multimodal projector file for LLaVA. see examples/llava/README.md", + "URL to a multimodal projector file. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj.url = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(common_arg( + {"--no-mmproj"}, + "explicitly disable multimodal projector, useful when using -hf", + [](common_params & params) { + params.no_mmproj = true; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", @@ -2381,6 +2401,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" + "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n" "example: unsloth/phi-4-GGUF:q4_k_m\n" "(default: unused)", [](common_params & params, const std::string & value) { diff --git a/common/common.h b/common/common.h index e6eaa8e80cf05..70d3ef8f27870 100644 --- a/common/common.h +++ b/common/common.h @@ -342,6 +342,7 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) // embedding From 2cac8e0efb629d66c612f137e75d562f94bb9e6c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Apr 2025 10:05:18 +0200 Subject: [PATCH 2/5] rm change about no_mmproj --- common/arg.cpp | 7 ------- common/common.h | 1 - 2 files changed, 8 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b5741c85995ef..9b08b9a65ae42 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2120,13 +2120,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.mmproj.url = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); - add_opt(common_arg( - {"--no-mmproj"}, - "explicitly disable multimodal projector, useful when using -hf", - [](common_params & params) { - params.no_mmproj = true; - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", diff --git a/common/common.h b/common/common.h index 70d3ef8f27870..e6eaa8e80cf05 100644 --- a/common/common.h +++ b/common/common.h @@ -342,7 +342,6 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; - bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) // embedding From aa88f32afd136f6f3c7ba15033504c372644c36e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Apr 2025 10:08:13 +0200 Subject: [PATCH 3/5] Revert "rm change about no_mmproj" This reverts commit 2cac8e0efb629d66c612f137e75d562f94bb9e6c. --- common/arg.cpp | 7 +++++++ common/common.h | 1 + 2 files changed, 8 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 9b08b9a65ae42..b5741c85995ef 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2120,6 +2120,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.mmproj.url = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(common_arg( + {"--no-mmproj"}, + "explicitly disable multimodal projector, useful when using -hf", + [](common_params & params) { + params.no_mmproj = true; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", diff --git a/common/common.h b/common/common.h index e6eaa8e80cf05..70d3ef8f27870 100644 --- a/common/common.h +++ b/common/common.h @@ -342,6 +342,7 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) // embedding From 397348c23add9eba09c7bcfd5018e1ee265cc65a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Apr 2025 10:12:56 +0200 Subject: [PATCH 4/5] handle no_mmproj explicitly --- common/arg.cpp | 6 ++++-- examples/llava/mtmd-cli.cpp | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b5741c85995ef..194614256c8bc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -842,8 +842,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // handle model and download { auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); - // optionally, handle mmproj model when -hf is specified - if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { + if (params.no_mmproj) { + params.mmproj = {}; + } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { + // optionally, handle mmproj model when -hf is specified params.mmproj = res.mmproj; } common_params_handle_model(params.mmproj, params.hf_token, ""); diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index e80845a2c5469..b4b226bebb119 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -249,6 +249,7 @@ int main(int argc, char ** argv) { if (params.mmproj.path.empty()) { show_additional_info(argc, argv); + LOG_ERR("ERR: Missing --mmproj argument\n"); return 1; } From 1d85e731084d9c310676d09a9d9b766fcb886319 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Apr 2025 11:03:00 +0200 Subject: [PATCH 5/5] skip download mmproj on examples not using it --- common/arg.cpp | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 194614256c8bc..85ba411146786 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -38,6 +38,11 @@ using json = nlohmann::ordered_json; +std::initializer_list mmproj_examples = { + LLAMA_EXAMPLE_LLAVA, + // TODO: add LLAMA_EXAMPLE_SERVER when it's ready +}; + common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = std::move(examples); return *this; @@ -841,14 +846,20 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // handle model and download { - auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); + auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); if (params.no_mmproj) { params.mmproj = {}; } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { // optionally, handle mmproj model when -hf is specified params.mmproj = res.mmproj; } - common_params_handle_model(params.mmproj, params.hf_token, ""); + // only download mmproj if the current example is using it + for (auto & ex : mmproj_examples) { + if (ctx_arg.ex == ex) { + common_params_handle_model(params.mmproj, params.hf_token, ""); + break; + } + } common_params_handle_model(params.speculative.model, params.hf_token, ""); common_params_handle_model(params.vocoder.model, params.hf_token, ""); } @@ -2114,21 +2125,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.mmproj.path = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--mmproj-url"}, "URL", "URL to a multimodal projector file. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj.url = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--no-mmproj"}, "explicitly disable multimodal projector, useful when using -hf", [](common_params & params) { params.no_mmproj = true; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching",