11package xsysinfo
22
33import (
4- "errors"
5-
64 gguf "github.com/gpustack/gguf-parser-go"
75)
86
@@ -18,35 +16,45 @@ type VRAMEstimate struct {
1816func EstimateGGUFVRAMUsage (f * gguf.GGUFFile , availableVRAM uint64 ) (* VRAMEstimate , error ) {
1917 // Get model metadata
2018 m := f .Metadata ()
21- a := f .Architecture ()
19+
20+ estimate := f .EstimateLLaMACppRun ()
21+
22+ lmes := estimate .SummarizeItem (true , 0 , 0 )
23+ estimatedVRAM := uint64 (0 )
24+ availableLayers := lmes .OffloadLayers // TODO: check if we can just use OffloadLayers here
25+
26+ for _ , vram := range lmes .VRAMs {
27+ estimatedVRAM += uint64 (vram .NonUMA )
28+ }
2229
2330 // Calculate base model size
2431 modelSize := uint64 (m .Size )
2532
26- if a .BlockCount == 0 {
27- return nil , errors .New ("block count is 0" )
33+ if availableLayers == 0 {
34+ availableLayers = 1
35+ }
36+
37+ if estimatedVRAM == 0 {
38+ estimatedVRAM = 1
2839 }
2940
3041 // Estimate number of layers that can fit in VRAM
3142 // Each layer typically requires about 1/32 of the model size
32- layerSize := modelSize / uint64 (a .BlockCount )
33- estimatedLayers := int (availableVRAM / layerSize )
43+ layerSize := estimatedVRAM / availableLayers
3444
35- // If we can't fit even one layer, we need to do full offload
36- isFullOffload := estimatedLayers <= 0
37- if isFullOffload {
38- estimatedLayers = 0
45+ estimatedLayers := int (availableVRAM / layerSize )
46+ if availableVRAM > estimatedVRAM {
47+ estimatedLayers = int (availableLayers )
3948 }
4049
4150 // Calculate estimated VRAM usage
42- estimatedVRAM := uint64 (estimatedLayers ) * layerSize
4351
4452 return & VRAMEstimate {
4553 TotalVRAM : availableVRAM ,
4654 AvailableVRAM : availableVRAM ,
4755 ModelSize : modelSize ,
4856 EstimatedLayers : estimatedLayers ,
4957 EstimatedVRAM : estimatedVRAM ,
50- IsFullOffload : isFullOffload ,
58+ IsFullOffload : availableVRAM > estimatedVRAM ,
5159 }, nil
5260}
0 commit comments