feat(loader): enhance single active backend to support LRU eviction (#7535)

* feat(loader): refactor single active backend support to LRU

This changeset introduces LRU management of loaded backends. Users can
now set a maximum number of models to be loaded concurrently, and, when
setting LocalAI in single active backend mode, the LRU limit is set to 1
for backward compatibility.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2025-12-12 12:28:38 +01:00
committed by GitHub
parent c141a40e00
commit fc5b9ebfcc
39 changed files with 836 additions and 131 deletions

View File

@@ -29,7 +29,7 @@ type Application struct {
func newApplication(appConfig *config.ApplicationConfig) *Application {
return &Application{
backendLoader: config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
modelLoader: model.NewModelLoader(appConfig.SystemState, appConfig.SingleBackend),
modelLoader: model.NewModelLoader(appConfig.SystemState),
applicationConfig: appConfig,
templatesEvaluator: templates.NewEvaluator(appConfig.SystemState.Model.ModelsPath),
}

View File

@@ -191,7 +191,8 @@ type runtimeSettings struct {
WatchdogBusyEnabled *bool `json:"watchdog_busy_enabled,omitempty"`
WatchdogIdleTimeout *string `json:"watchdog_idle_timeout,omitempty"`
WatchdogBusyTimeout *string `json:"watchdog_busy_timeout,omitempty"`
SingleBackend *bool `json:"single_backend,omitempty"`
SingleBackend *bool `json:"single_backend,omitempty"` // Deprecated: use MaxActiveBackends = 1 instead
MaxActiveBackends *int `json:"max_active_backends,omitempty"` // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
ParallelBackendRequests *bool `json:"parallel_backend_requests,omitempty"`
Threads *int `json:"threads,omitempty"`
ContextSize *int `json:"context_size,omitempty"`
@@ -224,6 +225,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
envWatchdogIdleTimeout := appConfig.WatchDogIdleTimeout == startupAppConfig.WatchDogIdleTimeout
envWatchdogBusyTimeout := appConfig.WatchDogBusyTimeout == startupAppConfig.WatchDogBusyTimeout
envSingleBackend := appConfig.SingleBackend == startupAppConfig.SingleBackend
envMaxActiveBackends := appConfig.MaxActiveBackends == startupAppConfig.MaxActiveBackends
envParallelRequests := appConfig.ParallelBackendRequests == startupAppConfig.ParallelBackendRequests
envThreads := appConfig.Threads == startupAppConfig.Threads
envContextSize := appConfig.ContextSize == startupAppConfig.ContextSize
@@ -275,8 +277,19 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
log.Warn().Err(err).Str("timeout", *settings.WatchdogBusyTimeout).Msg("invalid watchdog busy timeout in runtime_settings.json")
}
}
if settings.SingleBackend != nil && !envSingleBackend {
// Handle MaxActiveBackends (new) and SingleBackend (deprecated)
if settings.MaxActiveBackends != nil && !envMaxActiveBackends {
appConfig.MaxActiveBackends = *settings.MaxActiveBackends
// For backward compatibility, also set SingleBackend if MaxActiveBackends == 1
appConfig.SingleBackend = (*settings.MaxActiveBackends == 1)
} else if settings.SingleBackend != nil && !envSingleBackend {
// Legacy: SingleBackend maps to MaxActiveBackends = 1
appConfig.SingleBackend = *settings.SingleBackend
if *settings.SingleBackend {
appConfig.MaxActiveBackends = 1
} else {
appConfig.MaxActiveBackends = 0
}
}
if settings.ParallelBackendRequests != nil && !envParallelRequests {
appConfig.ParallelBackendRequests = *settings.ParallelBackendRequests

View File

@@ -224,7 +224,8 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
WatchdogBusyEnabled *bool `json:"watchdog_busy_enabled,omitempty"`
WatchdogIdleTimeout *string `json:"watchdog_idle_timeout,omitempty"`
WatchdogBusyTimeout *string `json:"watchdog_busy_timeout,omitempty"`
SingleBackend *bool `json:"single_backend,omitempty"`
SingleBackend *bool `json:"single_backend,omitempty"` // Deprecated: use MaxActiveBackends = 1 instead
MaxActiveBackends *int `json:"max_active_backends,omitempty"` // Maximum number of active backends (0 = unlimited)
ParallelBackendRequests *bool `json:"parallel_backend_requests,omitempty"`
AgentJobRetentionDays *int `json:"agent_job_retention_days,omitempty"`
}
@@ -280,9 +281,21 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
}
}
}
if settings.SingleBackend != nil {
// Handle MaxActiveBackends (new) and SingleBackend (deprecated)
if settings.MaxActiveBackends != nil {
// Only apply if current value is default (0), suggesting it wasn't set from env var
if options.MaxActiveBackends == 0 {
options.MaxActiveBackends = *settings.MaxActiveBackends
// For backward compatibility, also set SingleBackend if MaxActiveBackends == 1
options.SingleBackend = (*settings.MaxActiveBackends == 1)
}
} else if settings.SingleBackend != nil {
// Legacy: SingleBackend maps to MaxActiveBackends = 1
if !options.SingleBackend {
options.SingleBackend = *settings.SingleBackend
if *settings.SingleBackend {
options.MaxActiveBackends = 1
}
}
}
if settings.ParallelBackendRequests != nil {
@@ -307,15 +320,25 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
// initializeWatchdog initializes the watchdog with current ApplicationConfig settings
func initializeWatchdog(application *Application, options *config.ApplicationConfig) {
if options.WatchDog {
// Get effective max active backends (considers both MaxActiveBackends and deprecated SingleBackend)
lruLimit := options.GetEffectiveMaxActiveBackends()
// Create watchdog if enabled OR if LRU limit is set
if options.WatchDog || lruLimit > 0 {
wd := model.NewWatchDog(
application.ModelLoader(),
options.WatchDogBusyTimeout,
options.WatchDogIdleTimeout,
options.WatchDogBusy,
options.WatchDogIdle)
options.WatchDogIdle,
lruLimit)
application.ModelLoader().SetWatchDog(wd)
go wd.Run()
// Start watchdog goroutine only if busy/idle checks are enabled
if options.WatchDogBusy || options.WatchDogIdle {
go wd.Run()
}
go func() {
<-options.Context.Done()
log.Debug().Msgf("Context canceled, shutting down")

View File

@@ -20,21 +20,29 @@ func (a *Application) StopWatchdog() error {
func (a *Application) startWatchdog() error {
appConfig := a.ApplicationConfig()
// Create new watchdog if enabled
if appConfig.WatchDog {
// Get effective max active backends (considers both MaxActiveBackends and deprecated SingleBackend)
lruLimit := appConfig.GetEffectiveMaxActiveBackends()
// Create watchdog if enabled OR if LRU limit is set
// LRU eviction requires watchdog infrastructure even without busy/idle checks
if appConfig.WatchDog || lruLimit > 0 {
wd := model.NewWatchDog(
a.modelLoader,
appConfig.WatchDogBusyTimeout,
appConfig.WatchDogIdleTimeout,
appConfig.WatchDogBusy,
appConfig.WatchDogIdle)
appConfig.WatchDogIdle,
lruLimit)
a.modelLoader.SetWatchDog(wd)
// Create new stop channel
a.watchdogStop = make(chan bool, 1)
// Start watchdog goroutine
go wd.Run()
// Start watchdog goroutine only if busy/idle checks are enabled
// LRU eviction doesn't need the Run() loop - it's triggered on model load
if appConfig.WatchDogBusy || appConfig.WatchDogIdle {
go wd.Run()
}
// Setup shutdown handler
go func() {
@@ -48,7 +56,7 @@ func (a *Application) startWatchdog() error {
}
}()
log.Info().Msg("Watchdog started with new settings")
log.Info().Int("lruLimit", lruLimit).Bool("busyCheck", appConfig.WatchDogBusy).Bool("idleCheck", appConfig.WatchDogIdle).Msg("Watchdog started with new settings")
} else {
log.Info().Msg("Watchdog disabled")
}