fix: drop gguf VRAM estimation (now redundant) (#8325)

fix: drop gguf VRAM estimation

Cleanup. This is now handled directly in llama.cpp, so there is no need to estimate from Go. VRAM estimation is tricky in general, but llama.cpp (41ea26144e/src/llama.cpp (L168)) has recently added automatic "fitting" of models to the available VRAM, and we already enable it (397f7f0862/backend/cpp/llama-cpp/grpc-server.cpp (L393)). We can therefore drop the backend-specific GGUF VRAM estimation from our code instead of trying to guess.

Fixes: https://github.com/mudler/LocalAI/issues/8302
See: https://github.com/mudler/LocalAI/issues/8302#issuecomment-3830773472
parent b6459ddd57
commit 800f749c7b
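With the estimator gone, the Go side no longer guesses how many layers fit: when NGPULayers is unset it simply falls through to the existing "offload all layers" default (visible in the context lines of the hunk below) and relies on llama.cpp to fit the model to VRAM. A minimal, self-contained sketch of that idea follows; the helper name and the concrete value of defaultNGPULayers are stand-ins for illustration, not the actual LocalAI code.

package main

import "fmt"

// Minimal stand-in for LocalAI's ModelConfig; only the field used here.
type ModelConfig struct {
	NGPULayers *int
}

// Stand-in value: some "high enough" layer count meaning "offload everything".
const defaultNGPULayers = 99999999

// applyDefaultGPULayers (hypothetical helper) shows the behavior that remains
// after this change: if the user did not pin a layer count, default to
// offloading all layers and let llama.cpp's fitting logic trim it to what
// actually fits in VRAM.
func applyDefaultGPULayers(cfg *ModelConfig) {
	if cfg.NGPULayers == nil {
		defaultHigh := defaultNGPULayers
		cfg.NGPULayers = &defaultHigh
	}
}

func main() {
	cfg := &ModelConfig{}
	applyDefaultGPULayers(cfg)
	fmt.Println(*cfg.NGPULayers) // prints the offload-all default
}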
@@ -38,30 +38,6 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		}
 	}
 
-	// vram estimation
-	vram, err := xsysinfo.TotalAvailableVRAM()
-	if err != nil {
-		xlog.Error("guessDefaultsFromFile(TotalAvailableVRAM)", "error", err)
-	} else if vram > 0 {
-		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
-		if err != nil {
-			xlog.Error("guessDefaultsFromFile(EstimateGGUFVRAMUsage)", "error", err)
-		} else {
-			if estimate.IsFullOffload {
-				xlog.Warn("guessDefaultsFromFile: full offload is recommended")
-			}
-
-			if estimate.EstimatedVRAM > vram {
-				xlog.Warn("guessDefaultsFromFile: estimated VRAM usage is greater than available VRAM")
-			}
-
-			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
-				xlog.Debug("guessDefaultsFromFile: layers estimated", "layers", estimate.EstimatedLayers)
-				cfg.NGPULayers = &estimate.EstimatedLayers
-			}
-		}
-	}
-
 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
 		defaultHigh := defaultNGPULayers
@@ -1,60 +0,0 @@
-package xsysinfo
-
-import (
-	gguf "github.com/gpustack/gguf-parser-go"
-)
-
-type VRAMEstimate struct {
-	TotalVRAM       uint64
-	AvailableVRAM   uint64
-	ModelSize       uint64
-	EstimatedLayers int
-	EstimatedVRAM   uint64
-	IsFullOffload   bool
-}
-
-func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
-	// Get model metadata
-	m := f.Metadata()
-
-	estimate := f.EstimateLLaMACppRun()
-
-	lmes := estimate.SummarizeItem(true, 0, 0)
-	estimatedVRAM := uint64(0)
-	availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here
-
-	for _, vram := range lmes.VRAMs {
-		estimatedVRAM += uint64(vram.NonUMA)
-	}
-
-	// Calculate base model size
-	modelSize := uint64(m.Size)
-
-	if availableLayers == 0 {
-		availableLayers = 1
-	}
-
-	if estimatedVRAM == 0 {
-		estimatedVRAM = 1
-	}
-
-	// Estimate number of layers that can fit in VRAM
-	// Each layer typically requires about 1/32 of the model size
-	layerSize := estimatedVRAM / availableLayers
-
-	estimatedLayers := int(availableVRAM / layerSize)
-	if availableVRAM > estimatedVRAM {
-		estimatedLayers = int(availableLayers)
-	}
-
-	// Calculate estimated VRAM usage
-
-	return &VRAMEstimate{
-		TotalVRAM:       availableVRAM,
-		AvailableVRAM:   availableVRAM,
-		ModelSize:       modelSize,
-		EstimatedLayers: estimatedLayers,
-		EstimatedVRAM:   estimatedVRAM,
-		IsFullOffload:   availableVRAM > estimatedVRAM,
-	}, nil
-}
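To see why the removed heuristic was only a rough guess, consider a made-up example: if SummarizeItem reports 32 offloadable layers and the non-UMA VRAM contributions sum to 8 GiB, then layerSize = 8 GiB / 32 = 256 MiB. With 6 GiB of VRAM available, estimatedLayers = 6 GiB / 256 MiB = 24 and IsFullOffload is false (6 GiB < 8 GiB); with 12 GiB available, the code instead reports all 32 layers and IsFullOffload = true. As the commit message notes, llama.cpp now fits the model to the available VRAM itself, so this Go-side approximation is no longer needed.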