fix: drop gguf VRAM estimation (now redundant) (#8325)

fix: drop gguf VRAM estimation

Cleanup. This is now handled directly in llama.cpp, so there is no need to estimate from Go. VRAM estimation is tricky in general, but llama.cpp (41ea26144e/src/llama.cpp (L168)) has recently added automatic "fitting" of models to the available VRAM, and we already enable it (397f7f0862/backend/cpp/llama-cpp/grpc-server.cpp (L393)). We can therefore drop the backend-specific GGUF VRAM estimation from our code instead of trying to guess.

Fixes: https://github.com/mudler/LocalAI/issues/8302
See: https://github.com/mudler/LocalAI/issues/8302#issuecomment-3830773472
parent b6459ddd57
commit 800f749c7b
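With the estimator gone, the Go side no longer guesses how many layers fit: when NGPULayers is unset it simply falls through to the existing "offload all layers" default (visible in the context lines of the hunk below) and relies on llama.cpp to fit the model to VRAM. A minimal, self-contained sketch of that idea follows; the helper name and the concrete value of defaultNGPULayers are stand-ins for illustration, not the actual LocalAI code.

package main

import "fmt"

// Minimal stand-in for LocalAI's ModelConfig; only the field used here.
type ModelConfig struct {
	NGPULayers *int
}

// Stand-in value: some "high enough" layer count meaning "offload everything".
const defaultNGPULayers = 99999999

// applyDefaultGPULayers (hypothetical helper) shows the behavior that remains
// after this change: if the user did not pin a layer count, default to
// offloading all layers and let llama.cpp's fitting logic trim it to what
// actually fits in VRAM.
func applyDefaultGPULayers(cfg *ModelConfig) {
	if cfg.NGPULayers == nil {
		defaultHigh := defaultNGPULayers
		cfg.NGPULayers = &defaultHigh
	}
}

func main() {
	cfg := &ModelConfig{}
	applyDefaultGPULayers(cfg)
	fmt.Println(*cfg.NGPULayers) // prints the offload-all default
}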
@@ -38,30 +38,6 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		}
 	}
 
-	// vram estimation
-	vram, err := xsysinfo.TotalAvailableVRAM()
-	if err != nil {
-		xlog.Error("guessDefaultsFromFile(TotalAvailableVRAM)", "error", err)
-	} else if vram > 0 {
-		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
-		if err != nil {
-			xlog.Error("guessDefaultsFromFile(EstimateGGUFVRAMUsage)", "error", err)
-		} else {
-			if estimate.IsFullOffload {
-				xlog.Warn("guessDefaultsFromFile: full offload is recommended")
-			}
-
-			if estimate.EstimatedVRAM > vram {
-				xlog.Warn("guessDefaultsFromFile: estimated VRAM usage is greater than available VRAM")
-			}
-
-			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
-				xlog.Debug("guessDefaultsFromFile: layers estimated", "layers", estimate.EstimatedLayers)
-				cfg.NGPULayers = &estimate.EstimatedLayers
-			}
-		}
-	}
-
 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
 		defaultHigh := defaultNGPULayers
@@ -1,60 +0,0 @@
-package xsysinfo
-
-import (
-	gguf "github.com/gpustack/gguf-parser-go"
-)
-
-type VRAMEstimate struct {
-	TotalVRAM       uint64
-	AvailableVRAM   uint64
-	ModelSize       uint64
-	EstimatedLayers int
-	EstimatedVRAM   uint64
-	IsFullOffload   bool
-}
-
-func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
-	// Get model metadata
-	m := f.Metadata()
-
-	estimate := f.EstimateLLaMACppRun()
-
-	lmes := estimate.SummarizeItem(true, 0, 0)
-	estimatedVRAM := uint64(0)
-	availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here
-
-	for _, vram := range lmes.VRAMs {
-		estimatedVRAM += uint64(vram.NonUMA)
-	}
-
-	// Calculate base model size
-	modelSize := uint64(m.Size)
-
-	if availableLayers == 0 {
-		availableLayers = 1
-	}
-
-	if estimatedVRAM == 0 {
-		estimatedVRAM = 1
-	}
-
-	// Estimate number of layers that can fit in VRAM
-	// Each layer typically requires about 1/32 of the model size
-	layerSize := estimatedVRAM / availableLayers
-
-	estimatedLayers := int(availableVRAM / layerSize)
-	if availableVRAM > estimatedVRAM {
-		estimatedLayers = int(availableLayers)
-	}
-
-	// Calculate estimated VRAM usage
-
-	return &VRAMEstimate{
-		TotalVRAM:       availableVRAM,
-		AvailableVRAM:   availableVRAM,
-		ModelSize:       modelSize,
-		EstimatedLayers: estimatedLayers,
-		EstimatedVRAM:   estimatedVRAM,
-		IsFullOffload:   availableVRAM > estimatedVRAM,
-	}, nil
-}
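To see why the removed heuristic was only a rough guess, consider a made-up example: if SummarizeItem reports 32 offloadable layers and the non-UMA VRAM contributions sum to 8 GiB, then layerSize = 8 GiB / 32 = 256 MiB. With 6 GiB of VRAM available, estimatedLayers = 6 GiB / 256 MiB = 24 and IsFullOffload is false (6 GiB < 8 GiB); with 12 GiB available, the code instead reports all 32 layers and IsFullOffload = true. As the commit message notes, llama.cpp now fits the model to the available VRAM itself, so this Go-side approximation is no longer needed.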