feat: improve RAM estimation by using values from summary

mudler · mudler · commit 4909aa6750ab · 2025-05-29T22:42:58.000+02:00
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
diff --git a/pkg/xsysinfo/gguf.go b/pkg/xsysinfo/gguf.go
@@ -1,8 +1,6 @@
 package xsysinfo
 
 import (
-	"errors"
-
 	gguf "github.com/gpustack/gguf-parser-go"
 )
 
@@ -18,35 +16,45 @@ type VRAMEstimate struct {
 func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
 	// Get model metadata
 	m := f.Metadata()
-	a := f.Architecture()
+
+	estimate := f.EstimateLLaMACppRun()
+
+	lmes := estimate.SummarizeItem(true, 0, 0)
+	estimatedVRAM := uint64(0)
+	availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here
+
+	for _, vram := range lmes.VRAMs {
+		estimatedVRAM += uint64(vram.NonUMA)
+	}
 
 	// Calculate base model size
 	modelSize := uint64(m.Size)
 
-	if a.BlockCount == 0 {
-		return nil, errors.New("block count is 0")
+	if availableLayers == 0 {
+		availableLayers = 1
+	}
+
+	if estimatedVRAM == 0 {
+		estimatedVRAM = 1
 	}
 
 	// Estimate number of layers that can fit in VRAM
 	// Each layer typically requires about 1/32 of the model size
-	layerSize := modelSize / uint64(a.BlockCount)
-	estimatedLayers := int(availableVRAM / layerSize)
+	layerSize := estimatedVRAM / availableLayers
 
-	// If we can't fit even one layer, we need to do full offload
-	isFullOffload := estimatedLayers <= 0
-	if isFullOffload {
-		estimatedLayers = 0
+	estimatedLayers := int(availableVRAM / layerSize)
+	if availableVRAM > estimatedVRAM {
+		estimatedLayers = int(availableLayers)
 	}
 
 	// Calculate estimated VRAM usage
-	estimatedVRAM := uint64(estimatedLayers) * layerSize
 
 	return &VRAMEstimate{
 		TotalVRAM:       availableVRAM,
 		AvailableVRAM:   availableVRAM,
 		ModelSize:       modelSize,
 		EstimatedLayers: estimatedLayers,
 		EstimatedVRAM:   estimatedVRAM,
-		IsFullOffload:   isFullOffload,
+		IsFullOffload:   availableVRAM > estimatedVRAM,
 	}, nil
 }