From 800f749c7ba2fdefce6270fc22167a256a418983 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sun, 1 Feb 2026 17:33:28 +0100
Subject: [PATCH] fix: drop gguf VRAM estimation (now redundant) (#8325)

fix: drop gguf VRAM estimation

Cleanup: this is now handled directly in llama.cpp, so there is no need
to estimate from Go. VRAM estimation is tricky in general, but llama.cpp
(https://github.com/ggml-org/llama.cpp/blob/41ea26144e55d23f37bb765f88c07588d786567f/src/llama.cpp#L168)
has recently added automatic "fitting" of models to VRAM. Since we
already enable that feature
(https://github.com/mudler/LocalAI/blob/397f7f0862d4105b874523e1a0105ae036db18ec/backend/cpp/llama-cpp/grpc-server.cpp#L393),
we can drop the backend-specific GGUF VRAM estimation from our code
instead of trying to guess.

Fixes: https://github.com/mudler/LocalAI/issues/8302
See: https://github.com/mudler/LocalAI/issues/8302#issuecomment-3830773472
---
 core/config/gguf.go  | 24 ------------------------
 pkg/xsysinfo/gguf.go | 60 --------------------------------------------
 2 files changed, 84 deletions(-)
 delete mode 100644 pkg/xsysinfo/gguf.go

diff --git a/core/config/gguf.go b/core/config/gguf.go
index 0d788dad4..7b23c8ce9 100644
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -38,30 +38,6 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		}
 	}
 
-	// vram estimation
-	vram, err := xsysinfo.TotalAvailableVRAM()
-	if err != nil {
-		xlog.Error("guessDefaultsFromFile(TotalAvailableVRAM)", "error", err)
-	} else if vram > 0 {
-		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
-		if err != nil {
-			xlog.Error("guessDefaultsFromFile(EstimateGGUFVRAMUsage)", "error", err)
-		} else {
-			if estimate.IsFullOffload {
-				xlog.Warn("guessDefaultsFromFile: full offload is recommended")
-			}
-
-			if estimate.EstimatedVRAM > vram {
-				xlog.Warn("guessDefaultsFromFile: estimated VRAM usage is greater than available VRAM")
-			}
-
-			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
-				xlog.Debug("guessDefaultsFromFile: layers estimated", "layers", estimate.EstimatedLayers)
-				cfg.NGPULayers = &estimate.EstimatedLayers
-			}
-		}
-	}
-
 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
 		defaultHigh := defaultNGPULayers
diff --git a/pkg/xsysinfo/gguf.go b/pkg/xsysinfo/gguf.go
deleted file mode 100644
index 0ea9bca06..000000000
--- a/pkg/xsysinfo/gguf.go
+++ /dev/null
@@ -1,60 +0,0 @@
-package xsysinfo
-
-import (
-	gguf "github.com/gpustack/gguf-parser-go"
-)
-
-type VRAMEstimate struct {
-	TotalVRAM       uint64
-	AvailableVRAM   uint64
-	ModelSize       uint64
-	EstimatedLayers int
-	EstimatedVRAM   uint64
-	IsFullOffload   bool
-}
-
-func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
-	// Get model metadata
-	m := f.Metadata()
-
-	estimate := f.EstimateLLaMACppRun()
-
-	lmes := estimate.SummarizeItem(true, 0, 0)
-	estimatedVRAM := uint64(0)
-	availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here
-
-	for _, vram := range lmes.VRAMs {
-		estimatedVRAM += uint64(vram.NonUMA)
-	}
-
-	// Calculate base model size
-	modelSize := uint64(m.Size)
-
-	if availableLayers == 0 {
-		availableLayers = 1
-	}
-
-	if estimatedVRAM == 0 {
-		estimatedVRAM = 1
-	}
-
-	// Estimate number of layers that can fit in VRAM
-	// Each layer typically requires about 1/32 of the model size
-	layerSize := estimatedVRAM / availableLayers
-
-	estimatedLayers := int(availableVRAM / layerSize)
-	if availableVRAM > estimatedVRAM {
-		estimatedLayers = int(availableLayers)
-	}
-
-	// Calculate estimated VRAM usage
-
-	return &VRAMEstimate{
-		TotalVRAM:       availableVRAM,
-		AvailableVRAM:   availableVRAM,
-		ModelSize:       modelSize,
-		EstimatedLayers: estimatedLayers,
-		EstimatedVRAM:   estimatedVRAM,
-		IsFullOffload:   availableVRAM > estimatedVRAM,
-	}, nil
-}
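
For reference, a minimal sketch of what the Go side still does after this
change (the ModelConfig stand-in and the sentinel value below are
illustrative assumptions; the real definitions live in core/config/gguf.go):
instead of estimating how many layers fit, we default NGPULayers to a large
"offload everything" value and rely on llama.cpp's automatic fitting to trim
it to the available VRAM.

    package main

    import "fmt"

    // ModelConfig is a stripped-down stand-in for LocalAI's model config;
    // only the field relevant to this patch is included (assumption).
    type ModelConfig struct {
            NGPULayers *int
    }

    // applyDefaultGPULayers mirrors the logic left in guessGGUFFromFile:
    // with the Go-side estimator gone, default to offloading all layers
    // and let llama.cpp's automatic fitting decide what actually fits.
    func applyDefaultGPULayers(cfg *ModelConfig) {
            // Hypothetical "offload every layer" sentinel; the real value
            // of defaultNGPULayers in LocalAI may differ.
            const defaultNGPULayers = 99999999
            if cfg.NGPULayers == nil {
                    n := defaultNGPULayers
                    cfg.NGPULayers = &n
            }
    }

    func main() {
            cfg := &ModelConfig{}
            applyDefaultGPULayers(cfg)
            fmt.Println("n_gpu_layers default:", *cfg.NGPULayers)
    }

The design choice is to push the hard problem (fitting a model into VRAM)
down to the one component that actually knows the allocation details, rather
than duplicating a fragile heuristic in Go.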