@@ -36,14 +36,27 @@ int main(int argc, char ** argv) {
     llama_context * ctx_tgt = NULL;
     llama_context * ctx_dft = NULL;
 
+    bool self_speculation = false;
+
     // load the target model
     params.logits_all = true;
     std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
 
     // load the draft model
-    params.model = params.model_draft;
-    params.n_gpu_layers = params.n_gpu_layers_draft;
-    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
+    if (params.model != params.model_draft) {
+        params.model = params.model_draft;
+        params.n_gpu_layers = params.n_gpu_layers_draft;
+        std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
+    } else {
+        self_speculation = true;
+        model_dft = model_tgt;
+        ctx_dft = ctx_tgt;
+    }
+
+    // the 2 models should have the same vocab
+    const int n_ctx = llama_n_ctx(ctx_tgt);
+    const int n_vocab = llama_n_vocab(model_tgt);
+    GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
 
     // tokenize the prompt
     std::vector<llama_token> inp;
@@ -68,6 +81,7 @@ int main(int argc, char ** argv) {
     const int n_input = inp.size();
 
     llama_batch batch_dft = llama_batch_get_one(NULL, 0, 0, 0);
+    llama_batch batch_tgt = llama_batch_get_one(NULL, 0, 0, 1);
     std::vector<int32_t> run_layers_dft = {
         0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 1, 0, 3, 3, 0, 3, 0, 1, 1,
         3, 3, 3, 0, 2, 3, 2, 3, 3, 3, 1, 3, 0, 0, 2, 1, 0, 2, 0, 0,
@@ -76,25 +90,39 @@ int main(int argc, char ** argv) {
 
     const auto t_enc_start = ggml_time_us();
 
-    // eval the prompt with both models
-    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
-    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
+    std::vector<float> logits_tgt, logits_dft;
 
-    batch_dft.n_tokens = n_input;
-    batch_dft.token = inp.data();
-    batch_dft.all_pos_0 = 0;
-    llama_decode(ctx_dft, batch_dft);
+    if (self_speculation) {
+        logits_tgt.resize(n_vocab * 30);
+        logits_dft.resize(n_vocab);
+    }
+
+    // eval the prompt with both models
+    batch_tgt.n_tokens = n_input - 1;
+    batch_tgt.token = inp.data();
+    batch_tgt.all_pos_0 = 0;
+    llama_decode(ctx_tgt, batch_tgt);
+    batch_tgt.n_tokens = 1;
+    batch_tgt.token = &inp.back();
+    batch_tgt.all_pos_0 = n_input - 1;
+    llama_decode(ctx_tgt, batch_tgt);
+
+    if (!self_speculation) {
+        batch_dft.n_tokens = n_input;
+        batch_dft.token = inp.data();
+        batch_dft.all_pos_0 = 0;
+        llama_decode(ctx_dft, batch_dft);
+    } else {
+        memcpy(logits_tgt.data(), llama_get_logits(ctx_tgt), sizeof(float) * n_vocab);
+        memcpy(logits_dft.data(), llama_get_logits(ctx_tgt), sizeof(float) * n_vocab);
+        llama_kv_cache_seq_cp(ctx_dft, 1, 0, 0, -1);
+    }
 
     const auto t_enc_end = ggml_time_us();
 
     // Don't skip layers until after prompt eval.
     batch_dft.run_layers = run_layers_dft.data();
 
-    // the 2 models should have the same vocab
-    const int n_ctx = llama_n_ctx(ctx_tgt);
-    const int n_vocab = llama_n_vocab(model_tgt);
-    // GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
-
     // how many tokens to draft each time
     int n_draft = params.n_draft;
 
@@ -150,7 +178,15 @@ int main(int argc, char ** argv) {
 
         while (true) {
             // sample from the target model
-            llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
+            llama_token id;
+            if (!self_speculation) {
+                id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft, 1);
+            } else {
+                memcpy(llama_get_logits(ctx_tgt),
+                       logits_tgt.data() + i_dft * n_vocab,
+                       sizeof(float) * size_t(n_vocab));
+                id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, 0, 1);
+            }
 
             // remember which tokens were sampled - used for repetition penalties during sampling
             last_tokens.erase(last_tokens.begin());
@@ -193,6 +229,11 @@ int main(int argc, char ** argv) {
             batch_dft.n_tokens = 1;
             batch_dft.all_pos_0 = n_past_dft;
             llama_decode(ctx_dft, batch_dft);
+
+            if (self_speculation) {
+                memcpy(logits_dft.data(), llama_get_logits(ctx_dft), sizeof(float) * n_vocab);
+            }
+
             ++n_past_dft;
 
             // heuristic for n_draft
@@ -212,7 +253,7 @@ int main(int argc, char ** argv) {
                 LOG(" - partially drafted tokens accepted - no change\n");
             } else {
                 LOG(" - drafted token rejected - n_draft -= 1\n");
-                n_draft = std::max(2, n_draft - 1);
+                n_draft = std::max(6, n_draft - 1);
             }
         }
 
@@ -244,6 +285,10 @@ int main(int argc, char ** argv) {
         // sample n_draft tokens from the draft model using greedy decoding
         int n_past_cur = n_past_dft;
         for (int i = 0; i < n_draft; ++i) {
+
+            if (self_speculation) {
+                memcpy(llama_get_logits(ctx_dft), logits_dft.data(), sizeof(float) * n_vocab);
+            }
             float * logits = llama_get_logits(ctx_dft);
 
             candidates.clear();
@@ -265,7 +310,13 @@ int main(int argc, char ** argv) {
             }
 
             // TODO: better logic?
-            if (cur_p.data[0].p < 2*cur_p.data[1].p) {
+
+            // const float skip_scale = 1.25f + std::min(2.0f, 0.25f * float(i)); // 46.6
+            // const float skip_scale = 1.35f + std::min(2.5f, 0.15f * float(i)); // 48.48
+            // const float skip_scale = 1.35f + std::min(2.0f, 0.15f * float(i)); // 48.98
+            // const float skip_scale = 1.50f + std::min(2.0f, 0.10f * float(i)); // 51.64
+            const float skip_scale = 1.50f + std::min(2.0f, 0.75f * float(i)); // 61.76
+            if (cur_p.data[0].p < skip_scale*cur_p.data[1].p) {
                 LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
                 break;
             }
@@ -287,6 +338,11 @@ int main(int argc, char ** argv) {
             batch_dft.n_tokens = 1;
             batch_dft.all_pos_0 = n_past_cur;
             llama_decode(ctx_dft, batch_dft);
+
+            if (self_speculation) {
+                memcpy(logits_dft.data(), llama_get_logits(ctx_dft), sizeof(float) * n_vocab);
+            }
+
             ++n_past_cur;
 
             if (grammar_dft != NULL) {
@@ -295,8 +351,16 @@ int main(int argc, char ** argv) {
         }
 
         // evaluate the target model on the drafted tokens
-        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
-        llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
+        llama_kv_cache_seq_rm(ctx_tgt, 1, n_past_tgt, -1);
+        batch_tgt.n_tokens = drafted.size();
+        batch_tgt.token = drafted.data();
+        batch_tgt.all_pos_0 = n_past_tgt;
+        llama_decode(ctx_tgt, batch_tgt);
+
+        if (self_speculation) {
+            memcpy(logits_tgt.data(), llama_get_logits(ctx_tgt),
+                   sizeof(float) * n_vocab * size_t(batch_tgt.n_tokens));
+        }
         ++n_past_tgt;
 
         // the first token is always proposed by the traget model before the speculation loop
@@ -327,8 +391,10 @@ int main(int argc, char ** argv) {
     llama_free(ctx_tgt);
     llama_free_model(model_tgt);
 
-    llama_free(ctx_dft);
-    llama_free_model(model_dft);
+    if (!self_speculation) {
+        llama_free(ctx_dft);
+        llama_free_model(model_dft);
+    }
 
     if (grammar_dft != NULL) {
         llama_grammar_free(grammar_dft);
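
Note (not part of the commit): in the self-speculation path above, the draft and target roles share a single llama_context, so the diff keeps their KV-cache entries apart by sequence id (seq 0 for the draft pass, seq 1 for the target batch created with llama_batch_get_one(NULL, 0, 0, 1)) and snapshots logits before the other role overwrites them. The sketch below factors out that snapshot/restore pattern for illustration only; save_logits/restore_logits are hypothetical helpers that do not exist in the diff, and the diff's logits_tgt sizing of n_vocab * 30 is assumed to cover the maximum number of drafted positions.

#include "llama.h"

#include <cstring>
#include <vector>

// Hypothetical helper: copy the last n_rows rows of logits out of the shared context
// before the other role (draft vs. target) clobbers them with its next llama_decode().
static void save_logits(llama_context * ctx, std::vector<float> & dst, int n_vocab, int n_rows = 1) {
    dst.resize((size_t) n_vocab * n_rows);
    std::memcpy(dst.data(), llama_get_logits(ctx), sizeof(float) * (size_t) n_vocab * n_rows);
}

// Hypothetical helper: write one saved row back into the context's logits buffer so the
// next llama_sampling_sample() call sees the distribution that was computed earlier.
static void restore_logits(llama_context * ctx, const std::vector<float> & src, int n_vocab, int i_row = 0) {
    std::memcpy(llama_get_logits(ctx), src.data() + (size_t) i_row * n_vocab, sizeof(float) * (size_t) n_vocab);
}

In the diff this is done inline with memcpy: logits_tgt holds one row per drafted position (restored with i_dft as the row index before sampling from the target), while logits_dft holds only the single most recent draft distribution.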