Commit 554fd57

server : fix mtmd checkpoints (ggml-org#16591)
1 parent fa882fd

2 files changed: +6, -5 lines

tools/server/server.cpp

Lines changed: 3 additions & 3 deletions
@@ -3812,7 +3812,7 @@ struct server_context {
             if (slot.n_past > 0 && slot.n_past < (int) slot.prompt.tokens.size()) {
                 const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
                 if (pos_min == -1) {
-                    SLT_ERR(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
+                    SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
                     GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
                 }

@@ -3860,7 +3860,7 @@ struct server_context {
                 }

                 if (pos_min > pos_min_thold) {
-                    SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
+                    SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);

                     // search for a context checkpoint
                     const auto it = std::find_if(
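The hunk above breaks off at the start of the checkpoint lookup (the std::find_if call is truncated in this diff). For context: when the SWA window has slid past the reusable prefix, the server scans its saved context checkpoints for one it can restore instead of reprocessing the prompt from position 0. Below is a minimal sketch of that kind of scan, with a hypothetical ctx_checkpoint record and find_checkpoint helper; the server's actual types and predicate live in tools/server/server.cpp and are not shown here.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for the server's saved-state record.
    struct ctx_checkpoint {
        int32_t pos_min; // first position covered by the saved state
        int32_t pos_max; // last position covered by the saved state
    };

    // Scan newest-to-oldest for a checkpoint that still covers the SWA
    // threshold, so decoding can resume from it rather than from scratch.
    static const ctx_checkpoint * find_checkpoint(
            const std::vector<ctx_checkpoint> & checkpoints, int32_t pos_min_thold) {
        const auto it = std::find_if(checkpoints.rbegin(), checkpoints.rend(),
            [&](const ctx_checkpoint & cp) {
                return cp.pos_min <= pos_min_thold && pos_min_thold <= cp.pos_max;
            });
        return it == checkpoints.rend() ? nullptr : &*it;
    }

This only illustrates the shape of the lookup; the real predicate and checkpoint bookkeeping may differ.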
@@ -4028,7 +4028,7 @@ struct server_context {
                 }
             }

-            // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
+            // SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.slot.prompt.tokens.str().c_str());

            SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_past / slot.n_prompt_tokens());


tools/server/utils.hpp

Lines changed: 3 additions & 2 deletions
@@ -1237,9 +1237,10 @@ struct server_tokens {
         // allowed to resize ^ ^
         // disallowed to resize ^ ^ ^
         if (n > 0) {
-            llama_token last_token = tokens[n - 1];
             // make sure we never remove tokens in the middle of an image
-            if (last_token == LLAMA_TOKEN_NULL) {
+            // note that the case where we keep a full image at the end is allowed:
+            //   tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL
+            if (tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL) {
                 find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
             }
         }
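The rewritten condition above is the functional fix of this commit. In server_tokens, an image chunk occupies a run of LLAMA_TOKEN_NULL placeholder positions (LLAMA_TOKEN_NULL is -1 in llama.h), so a truncation point is suspect only when it lands inside such a run, i.e. both the last kept position and the first removed position are placeholders. The old check looked only at the last kept token, which wrongly rejected keeping a complete image at the very end of the prompt. A self-contained sketch of the boundary test, with a hypothetical cut_may_split_image helper (not the server's API):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    using llama_token = int32_t;
    constexpr llama_token LLAMA_TOKEN_NULL = -1; // as in llama.h

    // True when keeping the first n tokens may cut into an image run:
    // the last kept position and the first removed one are both placeholders.
    // Keeping a complete image at the end (tokens[n - 1] == LLAMA_TOKEN_NULL,
    // tokens[n] != LLAMA_TOKEN_NULL) is allowed, which is exactly the case
    // the old last_token-only check sent down the error path.
    static bool cut_may_split_image(const std::vector<llama_token> & tokens, size_t n) {
        if (n == 0 || n >= tokens.size()) {
            return false; // keeping nothing, or keeping everything
        }
        return tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL;
    }

For tokens = {a, NULL, NULL, b} (an image spanning positions 1-2), n = 3 keeps the whole image and passes the new test, while n = 2 trips it and falls through to find_chunk(n - 1). The pre-fix check, looking only at tokens[n - 1], sent the harmless n = 3 case into find_chunk as well, which then threw because position 2 is mid-chunk.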
