feat(loader): enhance single active backend to support LRU eviction (#7535)

* feat(loader): refactor single active backend support to LRU

This changeset introduces LRU management of loaded backends. Users can
now set a maximum number of models to be loaded concurrently, and, when
setting LocalAI in single active backend mode, the LRU limit is set to 1
for backward compatibility.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2025-12-12 12:28:38 +01:00
committed by GitHub
parent c141a40e00
commit fc5b9ebfcc
39 changed files with 836 additions and 131 deletions

View File

@@ -29,7 +29,7 @@ type Application struct {
func newApplication(appConfig *config.ApplicationConfig) *Application {
return &Application{
backendLoader: config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
modelLoader: model.NewModelLoader(appConfig.SystemState, appConfig.SingleBackend),
modelLoader: model.NewModelLoader(appConfig.SystemState),
applicationConfig: appConfig,
templatesEvaluator: templates.NewEvaluator(appConfig.SystemState.Model.ModelsPath),
}

View File

@@ -191,7 +191,8 @@ type runtimeSettings struct {
WatchdogBusyEnabled *bool `json:"watchdog_busy_enabled,omitempty"`
WatchdogIdleTimeout *string `json:"watchdog_idle_timeout,omitempty"`
WatchdogBusyTimeout *string `json:"watchdog_busy_timeout,omitempty"`
SingleBackend *bool `json:"single_backend,omitempty"`
SingleBackend *bool `json:"single_backend,omitempty"` // Deprecated: use MaxActiveBackends = 1 instead
MaxActiveBackends *int `json:"max_active_backends,omitempty"` // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
ParallelBackendRequests *bool `json:"parallel_backend_requests,omitempty"`
Threads *int `json:"threads,omitempty"`
ContextSize *int `json:"context_size,omitempty"`
@@ -224,6 +225,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
envWatchdogIdleTimeout := appConfig.WatchDogIdleTimeout == startupAppConfig.WatchDogIdleTimeout
envWatchdogBusyTimeout := appConfig.WatchDogBusyTimeout == startupAppConfig.WatchDogBusyTimeout
envSingleBackend := appConfig.SingleBackend == startupAppConfig.SingleBackend
envMaxActiveBackends := appConfig.MaxActiveBackends == startupAppConfig.MaxActiveBackends
envParallelRequests := appConfig.ParallelBackendRequests == startupAppConfig.ParallelBackendRequests
envThreads := appConfig.Threads == startupAppConfig.Threads
envContextSize := appConfig.ContextSize == startupAppConfig.ContextSize
@@ -275,8 +277,19 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
log.Warn().Err(err).Str("timeout", *settings.WatchdogBusyTimeout).Msg("invalid watchdog busy timeout in runtime_settings.json")
}
}
if settings.SingleBackend != nil && !envSingleBackend {
// Handle MaxActiveBackends (new) and SingleBackend (deprecated)
if settings.MaxActiveBackends != nil && !envMaxActiveBackends {
appConfig.MaxActiveBackends = *settings.MaxActiveBackends
// For backward compatibility, also set SingleBackend if MaxActiveBackends == 1
appConfig.SingleBackend = (*settings.MaxActiveBackends == 1)
} else if settings.SingleBackend != nil && !envSingleBackend {
// Legacy: SingleBackend maps to MaxActiveBackends = 1
appConfig.SingleBackend = *settings.SingleBackend
if *settings.SingleBackend {
appConfig.MaxActiveBackends = 1
} else {
appConfig.MaxActiveBackends = 0
}
}
if settings.ParallelBackendRequests != nil && !envParallelRequests {
appConfig.ParallelBackendRequests = *settings.ParallelBackendRequests

View File

@@ -224,7 +224,8 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
WatchdogBusyEnabled *bool `json:"watchdog_busy_enabled,omitempty"`
WatchdogIdleTimeout *string `json:"watchdog_idle_timeout,omitempty"`
WatchdogBusyTimeout *string `json:"watchdog_busy_timeout,omitempty"`
SingleBackend *bool `json:"single_backend,omitempty"`
SingleBackend *bool `json:"single_backend,omitempty"` // Deprecated: use MaxActiveBackends = 1 instead
MaxActiveBackends *int `json:"max_active_backends,omitempty"` // Maximum number of active backends (0 = unlimited)
ParallelBackendRequests *bool `json:"parallel_backend_requests,omitempty"`
AgentJobRetentionDays *int `json:"agent_job_retention_days,omitempty"`
}
@@ -280,9 +281,21 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
}
}
}
if settings.SingleBackend != nil {
// Handle MaxActiveBackends (new) and SingleBackend (deprecated)
if settings.MaxActiveBackends != nil {
// Only apply if current value is default (0), suggesting it wasn't set from env var
if options.MaxActiveBackends == 0 {
options.MaxActiveBackends = *settings.MaxActiveBackends
// For backward compatibility, also set SingleBackend if MaxActiveBackends == 1
options.SingleBackend = (*settings.MaxActiveBackends == 1)
}
} else if settings.SingleBackend != nil {
// Legacy: SingleBackend maps to MaxActiveBackends = 1
if !options.SingleBackend {
options.SingleBackend = *settings.SingleBackend
if *settings.SingleBackend {
options.MaxActiveBackends = 1
}
}
}
if settings.ParallelBackendRequests != nil {
@@ -307,15 +320,25 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
// initializeWatchdog initializes the watchdog with current ApplicationConfig settings
func initializeWatchdog(application *Application, options *config.ApplicationConfig) {
if options.WatchDog {
// Get effective max active backends (considers both MaxActiveBackends and deprecated SingleBackend)
lruLimit := options.GetEffectiveMaxActiveBackends()
// Create watchdog if enabled OR if LRU limit is set
if options.WatchDog || lruLimit > 0 {
wd := model.NewWatchDog(
application.ModelLoader(),
options.WatchDogBusyTimeout,
options.WatchDogIdleTimeout,
options.WatchDogBusy,
options.WatchDogIdle)
options.WatchDogIdle,
lruLimit)
application.ModelLoader().SetWatchDog(wd)
go wd.Run()
// Start watchdog goroutine only if busy/idle checks are enabled
if options.WatchDogBusy || options.WatchDogIdle {
go wd.Run()
}
go func() {
<-options.Context.Done()
log.Debug().Msgf("Context canceled, shutting down")

View File

@@ -20,21 +20,29 @@ func (a *Application) StopWatchdog() error {
func (a *Application) startWatchdog() error {
appConfig := a.ApplicationConfig()
// Create new watchdog if enabled
if appConfig.WatchDog {
// Get effective max active backends (considers both MaxActiveBackends and deprecated SingleBackend)
lruLimit := appConfig.GetEffectiveMaxActiveBackends()
// Create watchdog if enabled OR if LRU limit is set
// LRU eviction requires watchdog infrastructure even without busy/idle checks
if appConfig.WatchDog || lruLimit > 0 {
wd := model.NewWatchDog(
a.modelLoader,
appConfig.WatchDogBusyTimeout,
appConfig.WatchDogIdleTimeout,
appConfig.WatchDogBusy,
appConfig.WatchDogIdle)
appConfig.WatchDogIdle,
lruLimit)
a.modelLoader.SetWatchDog(wd)
// Create new stop channel
a.watchdogStop = make(chan bool, 1)
// Start watchdog goroutine
go wd.Run()
// Start watchdog goroutine only if busy/idle checks are enabled
// LRU eviction doesn't need the Run() loop - it's triggered on model load
if appConfig.WatchDogBusy || appConfig.WatchDogIdle {
go wd.Run()
}
// Setup shutdown handler
go func() {
@@ -48,7 +56,7 @@ func (a *Application) startWatchdog() error {
}
}()
log.Info().Msg("Watchdog started with new settings")
log.Info().Int("lruLimit", lruLimit).Bool("busyCheck", appConfig.WatchDogBusy).Bool("idleCheck", appConfig.WatchDogIdle).Msg("Watchdog started with new settings")
} else {
log.Info().Msg("Watchdog disabled")
}