From 800f749c7ba2fdefce6270fc22167a256a418983 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sun, 1 Feb 2026 17:33:28 +0100
Subject: [PATCH] fix: drop gguf VRAM estimation (now redundant) (#8325)

fix: drop gguf VRAM estimation

Cleanup: this is now handled directly in llama.cpp, so there is no need
to estimate from Go. VRAM estimation is tricky in general, but llama.cpp
(https://github.com/ggml-org/llama.cpp/blob/41ea26144e55d23f37bb765f88c07588d786567f/src/llama.cpp#L168)
has recently added automatic "fitting" of models to VRAM. Since we
already enable that feature
(https://github.com/mudler/LocalAI/blob/397f7f0862d4105b874523e1a0105ae036db18ec/backend/cpp/llama-cpp/grpc-server.cpp#L393),
we can drop the backend-specific GGUF VRAM estimation from our code
instead of trying to guess.

Fixes: https://github.com/mudler/LocalAI/issues/8302
See: https://github.com/mudler/LocalAI/issues/8302#issuecomment-3830773472
---
 core/config/gguf.go  | 24 ------------------------
 pkg/xsysinfo/gguf.go | 60 --------------------------------------------
 2 files changed, 84 deletions(-)
 delete mode 100644 pkg/xsysinfo/gguf.go

diff --git a/core/config/gguf.go b/core/config/gguf.go
index 0d788dad4..7b23c8ce9 100644
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -38,30 +38,6 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		}
 	}
 
-	// vram estimation
-	vram, err := xsysinfo.TotalAvailableVRAM()
-	if err != nil {
-		xlog.Error("guessDefaultsFromFile(TotalAvailableVRAM)", "error", err)
-	} else if vram > 0 {
-		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
-		if err != nil {
-			xlog.Error("guessDefaultsFromFile(EstimateGGUFVRAMUsage)", "error", err)
-		} else {
-			if estimate.IsFullOffload {
-				xlog.Warn("guessDefaultsFromFile: full offload is recommended")
-			}
-
-			if estimate.EstimatedVRAM > vram {
-				xlog.Warn("guessDefaultsFromFile: estimated VRAM usage is greater than available VRAM")
-			}
-
-			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
-				xlog.Debug("guessDefaultsFromFile: layers estimated", "layers", estimate.EstimatedLayers)
-				cfg.NGPULayers = &estimate.EstimatedLayers
-			}
-		}
-	}
-
 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
 		defaultHigh := defaultNGPULayers
diff --git a/pkg/xsysinfo/gguf.go b/pkg/xsysinfo/gguf.go
deleted file mode 100644
index 0ea9bca06..000000000
--- a/pkg/xsysinfo/gguf.go
+++ /dev/null
@@ -1,60 +0,0 @@
-package xsysinfo
-
-import (
-	gguf "github.com/gpustack/gguf-parser-go"
-)
-
-type VRAMEstimate struct {
-	TotalVRAM       uint64
-	AvailableVRAM   uint64
-	ModelSize       uint64
-	EstimatedLayers int
-	EstimatedVRAM   uint64
-	IsFullOffload   bool
-}
-
-func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
-	// Get model metadata
-	m := f.Metadata()
-
-	estimate := f.EstimateLLaMACppRun()
-
-	lmes := estimate.SummarizeItem(true, 0, 0)
-	estimatedVRAM := uint64(0)
-	availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here
-
-	for _, vram := range lmes.VRAMs {
-		estimatedVRAM += uint64(vram.NonUMA)
-	}
-
-	// Calculate base model size
-	modelSize := uint64(m.Size)
-
-	if availableLayers == 0 {
-		availableLayers = 1
-	}
-
-	if estimatedVRAM == 0 {
-		estimatedVRAM = 1
-	}
-
-	// Estimate number of layers that can fit in VRAM
-	// Each layer typically requires about 1/32 of the model size
-	layerSize := estimatedVRAM / availableLayers
-
-	estimatedLayers := int(availableVRAM / layerSize)
-	if availableVRAM > estimatedVRAM {
-		estimatedLayers = int(availableLayers)
-	}
-
-	// Calculate estimated VRAM usage
-
-	return &VRAMEstimate{
-		TotalVRAM:       availableVRAM,
-		AvailableVRAM:   availableVRAM,
-		ModelSize:       modelSize,
-		EstimatedLayers: estimatedLayers,
-		EstimatedVRAM:   estimatedVRAM,
-		IsFullOffload:   availableVRAM > estimatedVRAM,
-	}, nil
-}
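
For reference, a minimal sketch of what the Go side still does after this
change (the ModelConfig stand-in and the sentinel value below are
illustrative assumptions; the real definitions live in core/config/gguf.go):
instead of estimating how many layers fit, we default NGPULayers to a large
"offload everything" value and rely on llama.cpp's automatic fitting to trim
it to the available VRAM.

    package main

    import "fmt"

    // ModelConfig is a stripped-down stand-in for LocalAI's model config;
    // only the field relevant to this patch is included (assumption).
    type ModelConfig struct {
            NGPULayers *int
    }

    // applyDefaultGPULayers mirrors the logic left in guessGGUFFromFile:
    // with the Go-side estimator gone, default to offloading all layers
    // and let llama.cpp's automatic fitting decide what actually fits.
    func applyDefaultGPULayers(cfg *ModelConfig) {
            // Hypothetical "offload every layer" sentinel; the real value
            // of defaultNGPULayers in LocalAI may differ.
            const defaultNGPULayers = 99999999
            if cfg.NGPULayers == nil {
                    n := defaultNGPULayers
                    cfg.NGPULayers = &n
            }
    }

    func main() {
            cfg := &ModelConfig{}
            applyDefaultGPULayers(cfg)
            fmt.Println("n_gpu_layers default:", *cfg.NGPULayers)
    }

The design choice is to push the hard problem (fitting a model into VRAM)
down to the one component that actually knows the allocation details, rather
than duplicating a fragile heuristic in Go.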