@@ -37,7 +37,7 @@ bool starcoder_eval(
37
37
const int n_head = hparams.n_head ;
38
38
const int n_vocab = hparams.n_vocab ;
39
39
40
- static size_t buf_size = 256u *1024 *1024 ;
40
+ static size_t buf_size = 512u *1024 *1024 ;
41
41
static void * buf = malloc (buf_size);
42
42
43
43
// use 2 scratch buffers
@@ -48,17 +48,21 @@ bool starcoder_eval(
48
48
static size_t scr1_size = 512u *1024 *1024 ;
49
49
static void * scr1 = malloc (scr1_size);
50
50
51
- if (mem_per_token > 0 && mem_per_token*N > buf_size) {
52
- const size_t buf_size_new = 1.1 *(mem_per_token*N); // add 10% to account for ggml object overhead
53
- spdlog::debug (" {}: reallocating buffer from {} to {} bytes\n " , __func__, buf_size, buf_size_new);
51
+ if (mem_per_token > 0 && 2 *mem_per_token*N > buf_size) {
52
+ const size_t buf_size_new = 2 *(mem_per_token*N); // reserve 2x the per-token estimate to account for ggml object overhead
54
53
55
- // reallocate
56
- buf_size = buf_size_new;
57
- buf = realloc (buf, buf_size);
58
- if (buf == nullptr ) {
59
- spdlog::error (" {}: failed to allocate {} bytes\n " , __func__, buf_size);
60
- return false ;
54
+ if (buf_size_new > buf_size){
55
+ spdlog::debug (" {}: reallocating buffer from {} to {} bytes\n " , __func__, buf_size, buf_size_new);
56
+
57
+ // reallocate
58
+ buf_size = buf_size_new;
59
+ buf = realloc (buf, buf_size);
60
+ if (buf == nullptr ) {
61
+ spdlog::error (" {}: failed to allocate {} bytes\n " , __func__, buf_size);
62
+ return false ;
63
+ }
61
64
}
65
+
62
66
}
63
67
64
68
struct ggml_init_params params = {
@@ -67,6 +71,7 @@ bool starcoder_eval(
67
71
/* .no_alloc =*/ false ,
68
72
};
69
73
74
+
70
75
struct ggml_context * ctx0 = ggml_init (params);
71
76
struct ggml_cgraph gf = {};
72
77
@@ -338,7 +343,9 @@ bool starcoder_eval(
338
343
if (mem_per_token == 0 ) {
339
344
mem_per_token = ggml_used_mem (ctx0)/N;
340
345
}
341
- // printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024));
346
+
347
+ spdlog::debug (" {}: used mem buf={} bytes" , __func__, ggml_used_mem (ctx0));
348
+
342
349
343
350
ggml_free (ctx0);
344
351
@@ -743,11 +750,22 @@ std::stringstream StarcoderModel::predict_impl(std::string prompt, int max_lengt
743
750
size_t mem_per_token = 0 ;
744
751
745
752
std::vector<float > logits;
753
+ std::vector<gpt_vocab::id> test = {};
754
+
755
+ for (int i=0 ;i<64 ;i++){
756
+ test.push_back (i);
757
+ }
758
+
759
+ spdlog::debug (" {}: calculate required memory per token" , __func__);
760
+ starcoder_eval ((*model), config.n_threads , 0 , test, logits, mem_per_token);
761
+ spdlog::debug (" {}: mem_per_token={}" , __func__, mem_per_token);
762
+ spdlog::debug (" {}: total mem needed for prompt = {}*{}={}" , __func__, embd_inp.size (), mem_per_token, embd_inp.size ()*mem_per_token);
746
763
747
- starcoder_eval ((*model), config.n_threads , 0 , { 0 , 1 , 2 , 3 }, logits, mem_per_token);
748
764
749
765
for (int i = embd.size (); i < embd_inp.size () + n_predict; i++) {
750
766
// predict
767
+ spdlog::debug (" {}: process token #{}: " , __func__, i);
768
+
751
769
if (embd.size () > 0 ) {
752
770
const int64_t t_start_us = ggml_time_us ();
753
771
0 commit comments