@@ -1156,6 +1156,7 @@ static void llama_model_load_internal(
             }
         }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1164,6 +1165,10 @@ static void llama_model_load_internal(
             fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
         }
         size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+        const int max_backend_supported_layers = hparams.n_layer + 3;
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1180,14 +1185,18 @@ static void llama_model_load_internal(
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
             }
         }
-        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
         fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
         fprintf(stderr, "%s: total VRAM used: %zu MB\n",
                 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     }
 
     // populate `tensors_by_name`