From 683396c48186f5ad911fe74f78ced7f4a9ae21ac Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Wed, 23 Apr 2025 23:58:34 +0200
Subject: [PATCH 1/5] arg : clean up handling --mmproj with -hf

---
 common/arg.cpp  | 51 ++++++++++++++++++++++++++++++++++---------------
 common/common.h |  1 +
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 1cfd0168d95ae..b5741c85995ef 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -641,11 +641,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
 // utils
 //
 
-static void common_params_handle_model(
+struct handle_model_result { // value returned by common_params_handle_model
+    bool found_mmproj = false;  // set to true when the auto-detected HF repo info names an mmproj file
+    common_params_model mmproj; // hf_repo / hf_file of that mmproj; only filled in when found_mmproj is true
+};
+
+static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default,
-        bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
+        const std::string & model_path_default) {
+    handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
         if (!model.hf_repo.empty()) {
@@ -657,7 +662,12 @@ static void common_params_handle_model(
                         exit(1); // built without CURL, error message already printed
                     }
                     model.hf_repo = auto_detected.repo;
-                    model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
+                    model.hf_file = auto_detected.ggufFile;
+                    if (!auto_detected.mmprojFile.empty()) {
+                        result.found_mmproj   = true;
+                        result.mmproj.hf_repo = model.hf_repo;
+                        result.mmproj.hf_file = auto_detected.mmprojFile;
+                    }
                 } else {
                     model.hf_file = model.path;
                 }
@@ -694,6 +704,8 @@ static void common_params_handle_model(
             exit(1);
         }
     }
+
+    return result;
 }
 
 const std::vector<ggml_type> kv_cache_types = {
@@ -827,16 +839,17 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    common_params_handle_model(params.model,             params.hf_token, DEFAULT_MODEL_PATH);
-    common_params_handle_model(params.speculative.model, params.hf_token, "");
-    common_params_handle_model(params.vocoder.model,     params.hf_token, "");
-
-    // allow --mmproj to be set from -hf
-    // assuming that mmproj is always in the same repo as text model
-    if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
-        params.mmproj.hf_repo = params.model.hf_repo;
+    // handle model and download
+    {
+        auto res = common_params_handle_model(params.model,  params.hf_token, DEFAULT_MODEL_PATH);
+        // optionally, handle mmproj model when -hf is specified
+        if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            params.mmproj = res.mmproj;
+        }
+        common_params_handle_model(params.mmproj,            params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "");
+        common_params_handle_model(params.vocoder.model,     params.hf_token, "");
     }
-    common_params_handle_model(params.mmproj,            params.hf_token, "", true);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -2095,18 +2108,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        "path to a multimodal projector file. see examples/llava/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        "URL to a multimodal projector file. see examples/llava/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    add_opt(common_arg(
+        {"--no-mmproj"},
+        "explicitly disable multimodal projector, useful when using -hf",
+        [](common_params & params) {
+            params.no_mmproj = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2381,6 +2401,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
         "example: unsloth/phi-4-GGUF:q4_k_m\n"
         "(default: unused)",
         [](common_params & params, const std::string & value) {
diff --git a/common/common.h b/common/common.h
index e6eaa8e80cf05..70d3ef8f27870 100644
--- a/common/common.h
+++ b/common/common.h
@@ -342,6 +342,7 @@ struct common_params {
 
     // multimodal models (see examples/llava)
     struct common_params_model mmproj;
+    bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding

From 2cac8e0efb629d66c612f137e75d562f94bb9e6c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 24 Apr 2025 10:05:18 +0200
Subject: [PATCH 2/5] rm change about no_mmproj

---
 common/arg.cpp  | 7 -------
 common/common.h | 1 -
 2 files changed, 8 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index b5741c85995ef..9b08b9a65ae42 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2120,13 +2120,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.mmproj.url = value;
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
-    add_opt(common_arg(
-        {"--no-mmproj"},
-        "explicitly disable multimodal projector, useful when using -hf",
-        [](common_params & params) {
-            params.no_mmproj = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
diff --git a/common/common.h b/common/common.h
index 70d3ef8f27870..e6eaa8e80cf05 100644
--- a/common/common.h
+++ b/common/common.h
@@ -342,7 +342,6 @@ struct common_params {
 
     // multimodal models (see examples/llava)
     struct common_params_model mmproj;
-    bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding

From aa88f32afd136f6f3c7ba15033504c372644c36e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 24 Apr 2025 10:08:13 +0200
Subject: [PATCH 3/5] Revert "rm change about no_mmproj"

This reverts commit 2cac8e0efb629d66c612f137e75d562f94bb9e6c.
---
 common/arg.cpp  | 7 +++++++
 common/common.h | 1 +
 2 files changed, 8 insertions(+)

diff --git a/common/arg.cpp b/common/arg.cpp
index 9b08b9a65ae42..b5741c85995ef 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2120,6 +2120,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.mmproj.url = value;
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    add_opt(common_arg(
+        {"--no-mmproj"},
+        "explicitly disable multimodal projector, useful when using -hf",
+        [](common_params & params) {
+            params.no_mmproj = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
diff --git a/common/common.h b/common/common.h
index e6eaa8e80cf05..70d3ef8f27870 100644
--- a/common/common.h
+++ b/common/common.h
@@ -342,6 +342,7 @@ struct common_params {
 
     // multimodal models (see examples/llava)
     struct common_params_model mmproj;
+    bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding

From 397348c23add9eba09c7bcfd5018e1ee265cc65a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 24 Apr 2025 10:12:56 +0200
Subject: [PATCH 4/5] handle no_mmproj explicitly

---
 common/arg.cpp              | 6 ++++--
 examples/llava/mtmd-cli.cpp | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index b5741c85995ef..194614256c8bc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -842,8 +842,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     // handle model and download
     {
         auto res = common_params_handle_model(params.model,  params.hf_token, DEFAULT_MODEL_PATH);
-        // optionally, handle mmproj model when -hf is specified
-        if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
             params.mmproj = res.mmproj;
         }
         common_params_handle_model(params.mmproj,            params.hf_token, "");
diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp
index e80845a2c5469..b4b226bebb119 100644
--- a/examples/llava/mtmd-cli.cpp
+++ b/examples/llava/mtmd-cli.cpp
@@ -249,6 +249,7 @@ int main(int argc, char ** argv) {
 
     if (params.mmproj.path.empty()) {
         show_additional_info(argc, argv);
+        LOG_ERR("ERR: Missing --mmproj argument\n");
         return 1;
     }
 

From 1d85e731084d9c310676d09a9d9b766fcb886319 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 24 Apr 2025 11:03:00 +0200
Subject: [PATCH 5/5] skip downloading mmproj for examples that do not use it

---
 common/arg.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 194614256c8bc..85ba411146786 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -38,6 +38,11 @@
 
 using json = nlohmann::ordered_json;
 
+static const std::initializer_list<enum llama_example> mmproj_examples = { // examples that take the mmproj options; file-local (internal linkage)
+    LLAMA_EXAMPLE_LLAVA,
+    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+};
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = std::move(examples);
     return *this;
@@ -841,14 +846,20 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     {
-        auto res = common_params_handle_model(params.model,  params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
             // optionally, handle mmproj model when -hf is specified
             params.mmproj = res.mmproj;
         }
-        common_params_handle_model(params.mmproj,            params.hf_token, "");
+        // only download mmproj if the current example is using it
+        for (auto & ex : mmproj_examples) {
+            if (ctx_arg.ex == ex) {
+                common_params_handle_model(params.mmproj,    params.hf_token, "");
+                break;
+            }
+        }
         common_params_handle_model(params.speculative.model, params.hf_token, "");
         common_params_handle_model(params.vocoder.model,     params.hf_token, "");
     }
@@ -2114,21 +2125,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples(mmproj_examples));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
         "URL to a multimodal projector file. see examples/llava/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples(mmproj_examples));
     add_opt(common_arg(
         {"--no-mmproj"},
         "explicitly disable multimodal projector, useful when using -hf",
         [](common_params & params) {
             params.no_mmproj = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples(mmproj_examples));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",