diff --git a/AGENTS.md b/AGENTS.md index 87a96ec88..d8b902ebe 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -280,3 +280,11 @@ Always check `llama.cpp` for new model configuration options that should be supp - `llama.cpp/common/chat-parser.cpp` - Format presets and model-specific handlers - `llama.cpp/common/chat.h` - Format enums and parameter structures - `llama.cpp/tools/server/server-context.cpp` - Server configuration options + +# Documentation + +The project documentation is located in `docs/content`. When adding new features or changing existing functionality, it is crucial to update the documentation to reflect these changes. This helps users understand how to use the new capabilities and ensures the documentation stays relevant. + +- **Feature Documentation**: If you add a new feature (like a new backend or API endpoint), create a new markdown file in `docs/content/features/` explaining what it is, how to configure it, and how to use it. +- **Configuration**: If you modify configuration options, update the relevant sections in `docs/content/`. +- **Examples**: providing concrete examples (like YAML configuration blocks) is highly encouraged to help users get started quickly. diff --git a/README.md b/README.md index fcb146a54..aa7337414 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,7 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`) - 🎨 [Image generation](https://localai.io/features/image-generation) - 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) +- ⚡ [Realtime API](https://localai.io/features/openai-realtime/) (Speech-to-speech) - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/) - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/) - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 517fa0045..6339c7cd3 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -6,7 +6,6 @@ import ( "encoding/json" "fmt" "os" - "strings" "sync" "time" @@ -16,24 +15,25 @@ import ( "github.com/gorilla/websocket" "github.com/labstack/echo/v4" "github.com/mudler/LocalAI/core/application" + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/templates" laudio "github.com/mudler/LocalAI/pkg/audio" "github.com/mudler/LocalAI/pkg/functions" "github.com/mudler/LocalAI/pkg/grpc/proto" model "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/reasoning" "github.com/mudler/LocalAI/pkg/sound" - "google.golang.org/grpc" - "github.com/mudler/xlog" ) const ( localSampleRate = 16000 remoteSampleRate = 24000 - vadModel = "silero-vad-ggml" ) // A model can be "emulated" that is: transcribe audio to text -> feed text to the LLM -> generate audio as result @@ -41,62 +41,69 @@ const ( // Session represents a single WebSocket connection and its state type Session struct { - ID string - TranscriptionOnly bool - Model string + ID string + TranscriptionOnly bool + // The pipeline or any-to-any model name (full realtime mode) + Model string + // The voice may be a TTS model name or a parameter passed to a TTS model Voice string - TurnDetection 
*types.ServerTurnDetection `json:"turn_detection"` // "server_vad" or "none" - InputAudioTranscription *types.InputAudioTranscription - Functions functions.Functions + TurnDetection *types.TurnDetectionUnion // "server_vad", "semantic_vad" or "none" + InputAudioTranscription *types.AudioTranscription + Tools []types.ToolUnion + ToolChoice *types.ToolChoiceUnion Conversations map[string]*Conversation InputAudioBuffer []byte AudioBufferLock sync.Mutex Instructions string DefaultConversationID string ModelInterface Model + // The pipeline model config or the config for an any-to-any model + ModelConfig *config.ModelConfig } -func (s *Session) FromClient(session *types.ClientSession) { +func (s *Session) FromClient(session *types.SessionUnion) { } -func (s *Session) ToServer() types.ServerSession { - return types.ServerSession{ - ID: s.ID, - Object: func() string { - if s.TranscriptionOnly { - return "realtime.transcription_session" - } else { - return "realtime.session" - } - }(), - Model: s.Model, - Modalities: []types.Modality{types.ModalityText, types.ModalityAudio}, - Instructions: s.Instructions, - Voice: s.Voice, - InputAudioFormat: types.AudioFormatPcm16, - OutputAudioFormat: types.AudioFormatPcm16, - TurnDetection: s.TurnDetection, - InputAudioTranscription: s.InputAudioTranscription, - // TODO: Should be constructed from Functions? - Tools: []types.Tool{}, - // TODO: ToolChoice - // TODO: Temperature - // TODO: MaxOutputTokens - // TODO: InputAudioNoiseReduction +func (s *Session) ToServer() types.SessionUnion { + if s.TranscriptionOnly { + return types.SessionUnion{ + Transcription: &types.TranscriptionSession{ + ID: s.ID, + Object: "realtime.transcription_session", + Audio: &types.TranscriptionSessionAudio{ + Input: &types.SessionAudioInput{ + Transcription: s.InputAudioTranscription, + }, + }, + }, + } + } else { + return types.SessionUnion{ + Realtime: &types.RealtimeSession{ + ID: s.ID, + Object: "realtime.session", + Model: s.Model, + Instructions: s.Instructions, + Tools: s.Tools, + ToolChoice: s.ToolChoice, + Audio: &types.RealtimeSessionAudio{ + Input: &types.SessionAudioInput{ + TurnDetection: s.TurnDetection, + Transcription: s.InputAudioTranscription, + }, + Output: &types.SessionAudioOutput{ + Voice: types.Voice(s.Voice), + }, + }, + }, + } } } -// TODO: Update to tools? -// FunctionCall represents a function call initiated by the model -type FunctionCall struct { - Name string `json:"name"` - Arguments map[string]interface{} `json:"arguments"` -} - // Conversation represents a conversation with a list of items type Conversation struct { ID string - Items []*types.MessageItem + Items []*types.MessageItemUnion Lock sync.Mutex } @@ -107,66 +114,16 @@ func (c *Conversation) ToServer() types.Conversation { } } -// Item represents a message, function_call, or function_call_output -type Item struct { - ID string `json:"id"` - Object string `json:"object"` - Type string `json:"type"` // "message", "function_call", "function_call_output" - Status string `json:"status"` - Role string `json:"role"` - Content []ConversationContent `json:"content,omitempty"` - FunctionCall *FunctionCall `json:"function_call,omitempty"` -} - -// ConversationContent represents the content of an item -type ConversationContent struct { - Type string `json:"type"` // "input_text", "input_audio", "text", "audio", etc. 
- Audio string `json:"audio,omitempty"` - Text string `json:"text,omitempty"` - // Additional fields as needed -} - -// Define the structures for incoming messages -type IncomingMessage struct { - Type types.ClientEventType `json:"type"` - Session json.RawMessage `json:"session,omitempty"` - Item json.RawMessage `json:"item,omitempty"` - Audio string `json:"audio,omitempty"` - Response json.RawMessage `json:"response,omitempty"` - Error *ErrorMessage `json:"error,omitempty"` - // Other fields as needed -} - -// ErrorMessage represents an error message sent to the client -type ErrorMessage struct { - Type string `json:"type"` - Code string `json:"code"` - Message string `json:"message"` - Param string `json:"param,omitempty"` - EventID string `json:"event_id,omitempty"` -} - -// Define a structure for outgoing messages -type OutgoingMessage struct { - Type string `json:"type"` - Session *Session `json:"session,omitempty"` - Conversation *Conversation `json:"conversation,omitempty"` - Item *Item `json:"item,omitempty"` - Content string `json:"content,omitempty"` - Audio string `json:"audio,omitempty"` - Error *ErrorMessage `json:"error,omitempty"` -} - // Map to store sessions (in-memory) var sessions = make(map[string]*Session) var sessionLock sync.Mutex -// TODO: implement interface as we start to define usages type Model interface { - VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) - Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) - Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) - PredictStream(ctx context.Context, in *proto.PredictOptions, f func(*proto.Reply), opts ...grpc.CallOption) error + VAD(ctx context.Context, request *schema.VADRequest) (*schema.VADResponse, error) + Transcribe(ctx context.Context, audio, language string, translate bool, diarize bool, prompt string) (*schema.TranscriptionResult, error) + Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) + TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) + PredictConfig() *config.ModelConfig } var upgrader = websocket.Upgrader{ @@ -198,46 +155,53 @@ func Realtime(application *application.Application) echo.HandlerFunc { // Extract query parameters from Echo context before passing to websocket handler model := c.QueryParam("model") - if model == "" { - model = "gpt-4o" - } - intent := c.QueryParam("intent") - registerRealtime(application, model, intent)(ws) + registerRealtime(application, model)(ws) return nil } } -func registerRealtime(application *application.Application, model, intent string) func(c *websocket.Conn) { +func registerRealtime(application *application.Application, model string) func(c *websocket.Conn) { return func(c *websocket.Conn) { evaluator := application.TemplatesEvaluator() - xlog.Debug("WebSocket connection established", "address", c.RemoteAddr().String()) - if intent != "transcription" { - sendNotImplemented(c, "Only transcription mode is supported which requires the intent=transcription parameter") + xlog.Debug("Realtime WebSocket connection established", "address", c.RemoteAddr().String(), "model", model) + + // TODO: Allow any-to-any 
model to be specified + cl := application.ModelConfigLoader() + cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(model, application.ApplicationConfig()) + if err != nil { + xlog.Error("failed to load model config", "error", err) + sendError(c, "model_load_error", "Failed to load model config", "", "") + return } - xlog.Debug("Realtime params", "model", model, "intent", intent) + if cfg == nil || (cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "") { + xlog.Error("model is not a pipeline", "model", model) + sendError(c, "invalid_model", "Model is not a pipeline model", "", "") + return + } + + sttModel := cfg.Pipeline.Transcription + ttsModel := cfg.Pipeline.TTS sessionID := generateSessionID() session := &Session{ ID: sessionID, - TranscriptionOnly: true, - Model: model, // default model - Voice: "alloy", // default voice - TurnDetection: &types.ServerTurnDetection{ - Type: types.ServerTurnDetectionTypeServerVad, - TurnDetectionParams: types.TurnDetectionParams{ - // TODO: Need some way to pass this to the backend - Threshold: 0.5, - // TODO: This is ignored and the amount of padding is random at present - PrefixPaddingMs: 30, + TranscriptionOnly: false, + Model: model, + Voice: ttsModel, + ModelConfig: cfg, + TurnDetection: &types.TurnDetectionUnion{ + ServerVad: &types.ServerVad{ + Threshold: 0.5, + PrefixPaddingMs: 300, SilenceDurationMs: 500, - CreateResponse: func() *bool { t := true; return &t }(), + CreateResponse: true, }, }, - InputAudioTranscription: &types.InputAudioTranscription{ - Model: "whisper-1", + InputAudioTranscription: &types.AudioTranscription{ + Model: sttModel, }, Conversations: make(map[string]*Conversation), } @@ -245,24 +209,20 @@ func registerRealtime(application *application.Application, model, intent string // Create a default conversation conversationID := generateConversationID() conversation := &Conversation{ - ID: conversationID, - Items: []*types.MessageItem{}, + ID: conversationID, + // TODO: We need to truncate the conversation items when a new item is added and we have run out of space. 
There are multiple places where items + // can be added so we could use a datastructure here that enforces truncation upon addition + Items: []*types.MessageItemUnion{}, } session.Conversations[conversationID] = conversation session.DefaultConversationID = conversationID - // TODO: The API has no way to configure the VAD model or other models that make up a pipeline to fake any-to-any - // So possibly we could have a way to configure a composite model that can be used in situations where any-to-any is expected - pipeline := config.Pipeline{ - VAD: vadModel, - Transcription: session.InputAudioTranscription.Model, - } - - m, cfg, err := newTranscriptionOnlyModel( - &pipeline, + m, err := newModel( + &cfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), + evaluator, ) if err != nil { xlog.Error("failed to load model", "error", err) @@ -276,28 +236,41 @@ func registerRealtime(application *application.Application, model, intent string sessions[sessionID] = session sessionLock.Unlock() - sendEvent(c, types.TranscriptionSessionCreatedEvent{ + sendEvent(c, types.SessionCreatedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", - Type: types.ServerEventTypeTranscriptionSessionCreated, }, Session: session.ToServer(), }) var ( - // mt int msg []byte wg sync.WaitGroup done = make(chan struct{}) ) - vadServerStarted := true - wg.Add(1) - go func() { - defer wg.Done() - conversation := session.Conversations[session.DefaultConversationID] - handleVAD(cfg, evaluator, session, conversation, c, done) - }() + vadServerStarted := false + toggleVAD := func() { + if session.TurnDetection.ServerVad != nil && !vadServerStarted { + xlog.Debug("Starting VAD goroutine...") + wg.Add(1) + go func() { + defer wg.Done() + conversation := session.Conversations[session.DefaultConversationID] + handleVAD(session, conversation, c, done) + }() + vadServerStarted = true + } else if session.TurnDetection.ServerVad == nil && vadServerStarted { + xlog.Debug("Stopping VAD goroutine...") + + go func() { + done <- struct{}{} + }() + vadServerStarted = false + } + } + + toggleVAD() for { if _, msg, err = c.ReadMessage(); err != nil { @@ -306,100 +279,76 @@ func registerRealtime(application *application.Application, model, intent string } // Parse the incoming message - var incomingMsg IncomingMessage - if err := json.Unmarshal(msg, &incomingMsg); err != nil { + event, err := types.UnmarshalClientEvent(msg) + if err != nil { xlog.Error("invalid json", "error", err) sendError(c, "invalid_json", "Invalid JSON format", "", "") continue } - var sessionUpdate types.ClientSession - switch incomingMsg.Type { - case types.ClientEventTypeTranscriptionSessionUpdate: + switch e := event.(type) { + case types.SessionUpdateEvent: xlog.Debug("recv", "message", string(msg)) - if err := json.Unmarshal(incomingMsg.Session, &sessionUpdate); err != nil { - xlog.Error("failed to unmarshal 'transcription_session.update'", "error", err) - sendError(c, "invalid_session_update", "Invalid session update format", "", "") - continue - } - if err := updateTransSession( - session, - &sessionUpdate, - application.ModelConfigLoader(), - application.ModelLoader(), - application.ApplicationConfig(), - ); err != nil { - xlog.Error("failed to update session", "error", err) - sendError(c, "session_update_error", "Failed to update session", "", "") - continue + // Handle transcription session update + if e.Session.Transcription != nil { + if err := updateTransSession( + session, + &e.Session, + 
application.ModelConfigLoader(), + application.ModelLoader(), + application.ApplicationConfig(), + ); err != nil { + xlog.Error("failed to update session", "error", err) + sendError(c, "session_update_error", "Failed to update session", "", "") + continue + } + + toggleVAD() + + sendEvent(c, types.SessionUpdatedEvent{ + ServerEventBase: types.ServerEventBase{ + EventID: "event_TODO", + }, + Session: session.ToServer(), + }) } - sendEvent(c, types.SessionUpdatedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - Type: types.ServerEventTypeTranscriptionSessionUpdated, - }, - Session: session.ToServer(), - }) + // Handle realtime session update + if e.Session.Realtime != nil { + if err := updateSession( + session, + &e.Session, + application.ModelConfigLoader(), + application.ModelLoader(), + application.ApplicationConfig(), + evaluator, + ); err != nil { + xlog.Error("failed to update session", "error", err) + sendError(c, "session_update_error", "Failed to update session", "", "") + continue + } - case types.ClientEventTypeSessionUpdate: - xlog.Debug("recv", "message", string(msg)) + toggleVAD() - // Update session configurations - if err := json.Unmarshal(incomingMsg.Session, &sessionUpdate); err != nil { - xlog.Error("failed to unmarshal 'session.update'", "error", err) - sendError(c, "invalid_session_update", "Invalid session update format", "", "") - continue - } - if err := updateSession( - session, - &sessionUpdate, - application.ModelConfigLoader(), - application.ModelLoader(), - application.ApplicationConfig(), - ); err != nil { - xlog.Error("failed to update session", "error", err) - sendError(c, "session_update_error", "Failed to update session", "", "") - continue + sendEvent(c, types.SessionUpdatedEvent{ + ServerEventBase: types.ServerEventBase{ + EventID: "event_TODO", + }, + Session: session.ToServer(), + }) } - sendEvent(c, types.SessionUpdatedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - Type: types.ServerEventTypeSessionUpdated, - }, - Session: session.ToServer(), - }) - - if session.TurnDetection.Type == types.ServerTurnDetectionTypeServerVad && !vadServerStarted { - xlog.Debug("Starting VAD goroutine...") - wg.Add(1) - go func() { - defer wg.Done() - conversation := session.Conversations[session.DefaultConversationID] - handleVAD(cfg, evaluator, session, conversation, c, done) - }() - vadServerStarted = true - } else if session.TurnDetection.Type != types.ServerTurnDetectionTypeServerVad && vadServerStarted { - xlog.Debug("Stopping VAD goroutine...") - - wg.Add(-1) - go func() { - done <- struct{}{} - }() - vadServerStarted = false - } - case types.ClientEventTypeInputAudioBufferAppend: + case types.InputAudioBufferAppendEvent: // Handle 'input_audio_buffer.append' - if incomingMsg.Audio == "" { + if e.Audio == "" { xlog.Error("Audio data is missing in 'input_audio_buffer.append'") sendError(c, "missing_audio_data", "Audio data is missing", "", "") continue } // Decode base64 audio data - decodedAudio, err := base64.StdEncoding.DecodeString(incomingMsg.Audio) + decodedAudio, err := base64.StdEncoding.DecodeString(e.Audio) if err != nil { xlog.Error("failed to decode audio data", "error", err) sendError(c, "invalid_audio_data", "Failed to decode audio data", "", "") @@ -411,110 +360,78 @@ func registerRealtime(application *application.Application, model, intent string session.InputAudioBuffer = append(session.InputAudioBuffer, decodedAudio...) 
session.AudioBufferLock.Unlock() - case types.ClientEventTypeInputAudioBufferCommit: + case types.InputAudioBufferCommitEvent: xlog.Debug("recv", "message", string(msg)) - // TODO: Trigger transcription. - // TODO: Ignore this if VAD enabled or interrupt VAD? + sessionLock.Lock() + isServerVAD := session.TurnDetection.ServerVad != nil + sessionLock.Unlock() - if session.TranscriptionOnly { + // TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this + if isServerVAD { + sendNotImplemented(c, "input_audio_buffer.commit in conjunction with VAD") continue } - // Commit the audio buffer to the conversation as a new item - item := &types.MessageItem{ - ID: generateItemID(), - Type: "message", - Status: "completed", - Role: "user", - Content: []types.MessageContentPart{ - { - Type: "input_audio", - Audio: base64.StdEncoding.EncodeToString(session.InputAudioBuffer), - }, - }, - } - - // Add item to conversation - conversation.Lock.Lock() - conversation.Items = append(conversation.Items, item) - conversation.Lock.Unlock() - - // Reset InputAudioBuffer session.AudioBufferLock.Lock() + allAudio := make([]byte, len(session.InputAudioBuffer)) + copy(allAudio, session.InputAudioBuffer) session.InputAudioBuffer = nil session.AudioBufferLock.Unlock() - // Send item.created event - sendEvent(c, types.ConversationItemCreatedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - Type: "conversation.item.created", - }, - Item: types.ResponseMessageItem{ - Object: "realtime.item", - MessageItem: *item, - }, - }) + go commitUtterance(context.TODO(), allAudio, session, conversation, c) - case types.ClientEventTypeConversationItemCreate: + case types.ConversationItemCreateEvent: + xlog.Debug("recv", "message", string(msg)) + sendNotImplemented(c, "conversation.item.create") + + case types.ConversationItemDeleteEvent: + sendError(c, "not_implemented", "Deleting items not implemented", "", "event_TODO") + + case types.ConversationItemRetrieveEvent: xlog.Debug("recv", "message", string(msg)) - // Handle creating new conversation items - var item types.ConversationItemCreateEvent - if err := json.Unmarshal(incomingMsg.Item, &item); err != nil { - xlog.Error("failed to unmarshal 'conversation.item.create'", "error", err) - sendError(c, "invalid_item", "Invalid item format", "", "") + if e.ItemID == "" { + sendError(c, "invalid_item_id", "Need item_id, but none specified", "", "event_TODO") continue } - sendNotImplemented(c, "conversation.item.create") + conversation.Lock.Lock() + var retrievedItem types.MessageItemUnion + for _, item := range conversation.Items { + // We need to check ID in the union + var id string + if item.System != nil { + id = item.System.ID + } else if item.User != nil { + id = item.User.ID + } else if item.Assistant != nil { + id = item.Assistant.ID + } else if item.FunctionCall != nil { + id = item.FunctionCall.ID + } else if item.FunctionCallOutput != nil { + id = item.FunctionCallOutput.ID + } - // Generate item ID and set status - // item.ID = generateItemID() - // item.Object = "realtime.item" - // item.Status = "completed" - // - // // Add item to conversation - // conversation.Lock.Lock() - // conversation.Items = append(conversation.Items, &item) - // conversation.Lock.Unlock() - // - // // Send item.created event - // sendEvent(c, OutgoingMessage{ - // Type: "conversation.item.created", - // Item: &item, - // }) - - case types.ClientEventTypeConversationItemDelete: - sendError(c, "not_implemented", "Deleting items not 
implemented", "", "event_TODO") - - case types.ClientEventTypeResponseCreate: - // Handle generating a response - var responseCreate types.ResponseCreateEvent - if len(incomingMsg.Response) > 0 { - if err := json.Unmarshal(incomingMsg.Response, &responseCreate); err != nil { - xlog.Error("failed to unmarshal 'response.create' response object", "error", err) - sendError(c, "invalid_response_create", "Invalid response create format", "", "") - continue + if id == e.ItemID { + retrievedItem = *item + break } } + conversation.Lock.Unlock() - // Update session functions if provided - if len(responseCreate.Response.Tools) > 0 { - // TODO: Tools -> Functions - } + sendEvent(c, types.ConversationItemRetrievedEvent{ + ServerEventBase: types.ServerEventBase{ + EventID: "event_TODO", + }, + Item: retrievedItem, + }) + case types.ResponseCreateEvent: + xlog.Debug("recv", "message", string(msg)) sendNotImplemented(c, "response.create") - // TODO: Generate a response based on the conversation history - // wg.Add(1) - // go func() { - // defer wg.Done() - // generateResponse(cfg, evaluator, session, conversation, responseCreate, c, mt) - // }() - - case types.ClientEventTypeResponseCancel: + case types.ResponseCancelEvent: xlog.Debug("recv", "message", string(msg)) // Handle cancellation of ongoing responses @@ -522,8 +439,8 @@ func registerRealtime(application *application.Application, model, intent string sendNotImplemented(c, "response.cancel") default: - xlog.Error("unknown message type", "type", incomingMsg.Type) - sendError(c, "unknown_message_type", fmt.Sprintf("Unknown message type: %s", incomingMsg.Type), "", "") + xlog.Error("unknown message type") + // sendError(c, "unknown_message_type", fmt.Sprintf("Unknown message type: %s", incomingMsg.Type), "", "") } } @@ -554,13 +471,13 @@ func sendEvent(c *websocket.Conn, event types.ServerEvent) { func sendError(c *websocket.Conn, code, message, param, eventID string) { errorEvent := types.ErrorEvent{ ServerEventBase: types.ServerEventBase{ - Type: types.ServerEventTypeError, EventID: eventID, }, Error: types.Error{ Type: "invalid_request_error", Code: code, Message: message, + Param: param, EventID: eventID, }, } @@ -572,25 +489,36 @@ func sendNotImplemented(c *websocket.Conn, message string) { sendError(c, "not_implemented", message, "", "event_TODO") } -func updateTransSession(session *Session, update *types.ClientSession, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error { +func updateTransSession(session *Session, update *types.SessionUnion, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error { sessionLock.Lock() defer sessionLock.Unlock() - trUpd := update.InputAudioTranscription + // In transcription session update, we look at Transcription field + if update.Transcription == nil || update.Transcription.Audio == nil || update.Transcription.Audio.Input == nil { + return nil + } + + trUpd := update.Transcription.Audio.Input.Transcription trCur := session.InputAudioTranscription + session.TranscriptionOnly = true + if trUpd != nil && trUpd.Model != "" && trUpd.Model != trCur.Model { - pipeline := config.Pipeline{ - VAD: vadModel, - Transcription: trUpd.Model, + cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(trUpd.Model, appConfig) + if err != nil { + return err + } + if cfg == nil || (cfg.Pipeline.VAD == "" || cfg.Pipeline.Transcription == "") { + return fmt.Errorf("model is not a valid pipeline model: %s", trUpd.Model) } - m, _, err := 
newTranscriptionOnlyModel(&pipeline, cl, ml, appConfig) + m, cfg, err := newTranscriptionOnlyModel(&cfg.Pipeline, cl, ml, appConfig) if err != nil { return err } session.ModelInterface = m + session.ModelConfig = cfg } if trUpd != nil { @@ -598,62 +526,91 @@ func updateTransSession(session *Session, update *types.ClientSession, cl *confi trCur.Prompt = trUpd.Prompt } - if update.TurnDetection != nil && update.TurnDetection.Type != "" { - session.TurnDetection.Type = types.ServerTurnDetectionType(update.TurnDetection.Type) - session.TurnDetection.TurnDetectionParams = update.TurnDetection.TurnDetectionParams + if update.Transcription.Audio.Input.TurnDetection != nil { + session.TurnDetection = update.Transcription.Audio.Input.TurnDetection } return nil } -// Function to update session configurations -func updateSession(session *Session, update *types.ClientSession, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error { +func updateSession(session *Session, update *types.SessionUnion, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator) error { sessionLock.Lock() defer sessionLock.Unlock() - if update.Model != "" { - pipeline := config.Pipeline{ - LLM: update.Model, - // TODO: Setup pipeline by configuring STT and TTS models + if update.Realtime == nil { + return nil + } + + session.TranscriptionOnly = false + rt := update.Realtime + + if rt.Model != "" { + cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(rt.Model, appConfig) + if err != nil { + return err } - m, err := newModel(&pipeline, cl, ml, appConfig) + if cfg == nil || (cfg.Pipeline.VAD == "" || cfg.Pipeline.Transcription == "" || cfg.Pipeline.TTS == "" || cfg.Pipeline.LLM == "") { + return fmt.Errorf("model is not a valid pipeline model: %s", rt.Model) + } + + if session.InputAudioTranscription == nil { + session.InputAudioTranscription = &types.AudioTranscription{} + } + session.InputAudioTranscription.Model = cfg.Pipeline.Transcription + session.Voice = cfg.Pipeline.TTS + session.Model = rt.Model + session.ModelConfig = cfg + } + + if rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "" { + xlog.Warn("Ignoring voice setting; not implemented", "voice", rt.Audio.Output.Voice) + } + + if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil { + session.InputAudioTranscription = rt.Audio.Input.Transcription + session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model + } + + if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) { + m, err := newModel(&session.ModelConfig.Pipeline, cl, ml, appConfig, evaluator) if err != nil { return err } session.ModelInterface = m - session.Model = update.Model } - if update.Voice != "" { - session.Voice = update.Voice - } - if update.TurnDetection != nil && update.TurnDetection.Type != "" { - session.TurnDetection.Type = types.ServerTurnDetectionType(update.TurnDetection.Type) - session.TurnDetection.TurnDetectionParams = update.TurnDetection.TurnDetectionParams - } - // TODO: We should actually check if the field was present in the JSON; empty string means clear the settings - if update.Instructions != "" { - session.Instructions = update.Instructions - } - if update.Tools != nil { - return fmt.Errorf("Haven't implemented tools") + if rt.Audio != nil && rt.Audio.Input != nil && 
rt.Audio.Input.TurnDetection != nil { + session.TurnDetection = rt.Audio.Input.TurnDetection } - session.InputAudioTranscription = update.InputAudioTranscription + if rt.Instructions != "" { + session.Instructions = rt.Instructions + } + + if rt.Tools != nil { + session.Tools = rt.Tools + } + if rt.ToolChoice != nil { + session.ToolChoice = rt.ToolChoice + } return nil } // handleVAD is a goroutine that listens for audio data from the client, // runs VAD on the audio data, and commits utterances to the conversation -func handleVAD(cfg *config.ModelConfig, evaluator *templates.Evaluator, session *Session, conv *Conversation, c *websocket.Conn, done chan struct{}) { +func handleVAD(session *Session, conv *Conversation, c *websocket.Conn, done chan struct{}) { vadContext, cancel := context.WithCancel(context.Background()) go func() { <-done cancel() }() - silenceThreshold := float64(session.TurnDetection.SilenceDurationMs) / 1000 + silenceThreshold := 0.5 // Default 500ms + if session.TurnDetection.ServerVad != nil { + silenceThreshold = float64(session.TurnDetection.ServerVad.SilenceDurationMs) / 1000 + } + speechStarted := false startTime := time.Now() @@ -702,7 +659,6 @@ func handleVAD(cfg *config.ModelConfig, evaluator *templates.Evaluator, session sendEvent(c, types.InputAudioBufferClearedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", - Type: types.ServerEventTypeInputAudioBufferCleared, }, }) @@ -715,15 +671,14 @@ func handleVAD(cfg *config.ModelConfig, evaluator *templates.Evaluator, session sendEvent(c, types.InputAudioBufferSpeechStartedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", - Type: types.ServerEventTypeInputAudioBufferSpeechStarted, }, - AudioStartMs: time.Now().Sub(startTime).Milliseconds(), + AudioStartMs: time.Since(startTime).Milliseconds(), }) speechStarted = true } // Segment still in progress when audio ended - segEndTime := segments[len(segments)-1].GetEnd() + segEndTime := segments[len(segments)-1].End if segEndTime == 0 { continue } @@ -737,16 +692,14 @@ func handleVAD(cfg *config.ModelConfig, evaluator *templates.Evaluator, session sendEvent(c, types.InputAudioBufferSpeechStoppedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", - Type: types.ServerEventTypeInputAudioBufferSpeechStopped, }, - AudioEndMs: time.Now().Sub(startTime).Milliseconds(), + AudioEndMs: time.Since(startTime).Milliseconds(), }) speechStarted = false sendEvent(c, types.InputAudioBufferCommittedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", - Type: types.ServerEventTypeInputAudioBufferCommitted, }, ItemID: generateItemID(), PreviousItemID: "TODO", @@ -754,19 +707,17 @@ func handleVAD(cfg *config.ModelConfig, evaluator *templates.Evaluator, session abytes := sound.Int16toBytesLE(aints) // TODO: Remove prefix silence that is is over TurnDetectionParams.PrefixPaddingMs - go commitUtterance(vadContext, abytes, cfg, evaluator, session, conv, c) + go commitUtterance(vadContext, abytes, session, conv, c) } } } } -func commitUtterance(ctx context.Context, utt []byte, cfg *config.ModelConfig, evaluator *templates.Evaluator, session *Session, conv *Conversation, c *websocket.Conn) { +func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, c *websocket.Conn) { if len(utt) == 0 { return } - // TODO: If we have a real any-to-any model then transcription is optional - f, err := os.CreateTemp("", "realtime-audio-chunk-*.wav") if err != nil { xlog.Error("failed to create temp file", 
"error", err) @@ -789,68 +740,37 @@ func commitUtterance(ctx context.Context, utt []byte, cfg *config.ModelConfig, e f.Sync() + // TODO: If we have a real any-to-any model then transcription is optional + var transcript string if session.InputAudioTranscription != nil { - tr, err := session.ModelInterface.Transcribe(ctx, &proto.TranscriptRequest{ - Dst: f.Name(), - Language: session.InputAudioTranscription.Language, - Translate: false, - Threads: uint32(*cfg.Threads), - Prompt: session.InputAudioTranscription.Prompt, - }) + tr, err := session.ModelInterface.Transcribe(ctx, f.Name(), session.InputAudioTranscription.Language, false, false, session.InputAudioTranscription.Prompt) if err != nil { sendError(c, "transcription_failed", err.Error(), "", "event_TODO") } - sendEvent(c, types.ResponseAudioTranscriptDoneEvent{ + transcript = tr.Text + sendEvent(c, types.ConversationItemInputAudioTranscriptionCompletedEvent{ ServerEventBase: types.ServerEventBase{ - Type: types.ServerEventTypeResponseAudioTranscriptDone, EventID: "event_TODO", }, - ItemID: generateItemID(), - ResponseID: "resp_TODO", - OutputIndex: 0, + ItemID: generateItemID(), + // ResponseID: "resp_TODO", // Not needed for transcription completed event + // OutputIndex: 0, ContentIndex: 0, - Transcript: tr.GetText(), + Transcript: transcript, }) - // TODO: Update the prompt with transcription result? + } else { + sendNotImplemented(c, "any-to-any models") + return } if !session.TranscriptionOnly { - sendNotImplemented(c, "Commiting items to the conversation not implemented") + generateResponse(session, utt, transcript, conv, c, websocket.TextMessage) } - - // TODO: Commit the audio and/or transcribed text to the conversation - // Commit logic: create item, broadcast item.created, etc. - // item := &Item{ - // ID: generateItemID(), - // Object: "realtime.item", - // Type: "message", - // Status: "completed", - // Role: "user", - // Content: []ConversationContent{ - // { - // Type: "input_audio", - // Audio: base64.StdEncoding.EncodeToString(utt), - // }, - // }, - // } - // conv.Lock.Lock() - // conv.Items = append(conv.Items, item) - // conv.Lock.Unlock() - // - // - // sendEvent(c, OutgoingMessage{ - // Type: "conversation.item.created", - // Item: item, - // }) - // - // - // // trigger the response generation - // generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage) } -func runVAD(ctx context.Context, session *Session, adata []int16) ([]*proto.VADSegment, error) { +func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) { soundIntBuffer := &audio.IntBuffer{ Format: &audio.Format{SampleRate: localSampleRate, NumChannels: 1}, SourceBitDepth: 16, @@ -859,7 +779,7 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]*proto.VADS float32Data := soundIntBuffer.AsFloat32Buffer().Data - resp, err := session.ModelInterface.VAD(ctx, &proto.VADRequest{ + resp, err := session.ModelInterface.VAD(ctx, &schema.VADRequest{ Audio: float32Data, }) if err != nil { @@ -870,406 +790,348 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]*proto.VADS return resp.Segments, nil } -// TODO: Below needed for normal mode instead of transcription only // Function to generate a response based on the conversation -// func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator, session *Session, conversation *Conversation, responseCreate ResponseCreate, c *websocket.Conn, mt int) { -// -// log.Debug().Msg("Generating realtime 
response...") -// -// // Compile the conversation history -// conversation.Lock.Lock() -// var conversationHistory []schema.Message -// var latestUserAudio string -// for _, item := range conversation.Items { -// for _, content := range item.Content { -// switch content.Type { -// case "input_text", "text": -// conversationHistory = append(conversationHistory, schema.Message{ -// Role: string(item.Role), -// StringContent: content.Text, -// Content: content.Text, -// }) -// case "input_audio": -// // We do not to turn to text here the audio result. -// // When generating it later on from the LLM, -// // we will also generate text and return it and store it in the conversation -// // Here we just want to get the user audio if there is any as a new input for the conversation. -// if item.Role == "user" { -// latestUserAudio = content.Audio -// } -// } -// } -// } -// -// conversation.Lock.Unlock() -// -// var generatedText string -// var generatedAudio []byte -// var functionCall *FunctionCall -// var err error -// -// if latestUserAudio != "" { -// // Process the latest user audio input -// decodedAudio, err := base64.StdEncoding.DecodeString(latestUserAudio) -// if err != nil { -// log.Error().Msgf("failed to decode latest user audio: %s", err.Error()) -// sendError(c, "invalid_audio_data", "Failed to decode audio data", "", "") -// return -// } -// -// // Process the audio input and generate a response -// generatedText, generatedAudio, functionCall, err = processAudioResponse(session, decodedAudio) -// if err != nil { -// log.Error().Msgf("failed to process audio response: %s", err.Error()) -// sendError(c, "processing_error", "Failed to generate audio response", "", "") -// return -// } -// } else { -// -// if session.Instructions != "" { -// conversationHistory = append([]schema.Message{{ -// Role: "system", -// StringContent: session.Instructions, -// Content: session.Instructions, -// }}, conversationHistory...) -// } -// -// funcs := session.Functions -// shouldUseFn := len(funcs) > 0 && config.ShouldUseFunctions() -// -// // Allow the user to set custom actions via config file -// // to be "embedded" in each model -// noActionName := "answer" -// noActionDescription := "use this action to answer without performing any action" -// -// if config.FunctionsConfig.NoActionFunctionName != "" { -// noActionName = config.FunctionsConfig.NoActionFunctionName -// } -// if config.FunctionsConfig.NoActionDescriptionName != "" { -// noActionDescription = config.FunctionsConfig.NoActionDescriptionName -// } -// -// if (!config.FunctionsConfig.GrammarConfig.NoGrammar) && shouldUseFn { -// noActionGrammar := functions.Function{ -// Name: noActionName, -// Description: noActionDescription, -// Parameters: map[string]interface{}{ -// "properties": map[string]interface{}{ -// "message": map[string]interface{}{ -// "type": "string", -// "description": "The message to reply the user with", -// }}, -// }, -// } -// -// // Append the no action function -// if !config.FunctionsConfig.DisableNoAction { -// funcs = append(funcs, noActionGrammar) -// } -// -// // Update input grammar -// jsStruct := funcs.ToJSONStructure(config.FunctionsConfig.FunctionNameKey, config.FunctionsConfig.FunctionNameKey) -// g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...) 
-// if err == nil { -// config.Grammar = g -// } -// } -// -// // Generate a response based on text conversation history -// prompt := evaluator.TemplateMessages(conversationHistory, config, funcs, shouldUseFn) -// -// generatedText, functionCall, err = processTextResponse(config, session, prompt) -// if err != nil { -// log.Error().Msgf("failed to process text response: %s", err.Error()) -// sendError(c, "processing_error", "Failed to generate text response", "", "") -// return -// } -// log.Debug().Any("text", generatedText).Msg("Generated text response") -// } -// -// if functionCall != nil { -// // The model wants to call a function -// // Create a function_call item and send it to the client -// item := &Item{ -// ID: generateItemID(), -// Object: "realtime.item", -// Type: "function_call", -// Status: "completed", -// Role: "assistant", -// FunctionCall: functionCall, -// } -// -// // Add item to conversation -// conversation.Lock.Lock() -// conversation.Items = append(conversation.Items, item) -// conversation.Lock.Unlock() -// -// // Send item.created event -// sendEvent(c, OutgoingMessage{ -// Type: "conversation.item.created", -// Item: item, -// }) -// -// // Optionally, you can generate a message to the user indicating the function call -// // For now, we'll assume the client handles the function call and may trigger another response -// -// } else { -// // Send response.stream messages -// if generatedAudio != nil { -// // If generatedAudio is available, send it as audio -// encodedAudio := base64.StdEncoding.EncodeToString(generatedAudio) -// outgoingMsg := OutgoingMessage{ -// Type: "response.stream", -// Audio: encodedAudio, -// } -// sendEvent(c, outgoingMsg) -// } else { -// // Send text response (could be streamed in chunks) -// chunks := splitResponseIntoChunks(generatedText) -// for _, chunk := range chunks { -// outgoingMsg := OutgoingMessage{ -// Type: "response.stream", -// Content: chunk, -// } -// sendEvent(c, outgoingMsg) -// } -// } -// -// // Send response.done message -// sendEvent(c, OutgoingMessage{ -// Type: "response.done", -// }) -// -// // Add the assistant's response to the conversation -// content := []ConversationContent{} -// if generatedAudio != nil { -// content = append(content, ConversationContent{ -// Type: "audio", -// Audio: base64.StdEncoding.EncodeToString(generatedAudio), -// }) -// // Optionally include a text transcript -// if generatedText != "" { -// content = append(content, ConversationContent{ -// Type: "text", -// Text: generatedText, -// }) -// } -// } else { -// content = append(content, ConversationContent{ -// Type: "text", -// Text: generatedText, -// }) -// } -// -// item := &Item{ -// ID: generateItemID(), -// Object: "realtime.item", -// Type: "message", -// Status: "completed", -// Role: "assistant", -// Content: content, -// } -// -// // Add item to conversation -// conversation.Lock.Lock() -// conversation.Items = append(conversation.Items, item) -// conversation.Lock.Unlock() -// -// // Send item.created event -// sendEvent(c, OutgoingMessage{ -// Type: "conversation.item.created", -// Item: item, -// }) -// -// log.Debug().Any("item", item).Msg("Realtime response sent") -// } -// } +func generateResponse(session *Session, utt []byte, transcript string, conv *Conversation, c *websocket.Conn, mt int) { + xlog.Debug("Generating realtime response...") -// Function to process text response and detect function calls -func processTextResponse(config *config.ModelConfig, session *Session, prompt string) (string, *FunctionCall, 
error) { + config := session.ModelInterface.PredictConfig() - // Placeholder implementation - // Replace this with actual model inference logic using session.Model and prompt - // For example, the model might return a special token or JSON indicating a function call - - /* - predFunc, err := backend.ModelInference(context.Background(), prompt, input.Messages, images, videos, audios, ml, *config, o, nil, "", "", nil, nil, nil) - - result, tokenUsage, err := ComputeChoices(input, prompt, config, startupOptions, ml, func(s string, c *[]schema.Choice) { - if !shouldUseFn { - // no function is called, just reply and use stop as finish reason - stopReason := FinishReasonStop - *c = append(*c, schema.Choice{FinishReason: &stopReason, Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}}) - return - } - - textContentToReturn = functions.ParseTextContent(s, config.FunctionsConfig) - s = functions.CleanupLLMResult(s, config.FunctionsConfig) - results := functions.ParseFunctionCall(s, config.FunctionsConfig) - xlog.Debug("Text content to return", "text", textContentToReturn) - noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0 - - switch { - case noActionsToRun: - result, err := handleQuestion(config, input, ml, startupOptions, results, s, predInput) - if err != nil { - xlog.Error("error handling question", "error", err) - return - } - *c = append(*c, schema.Choice{ - Message: &schema.Message{Role: "assistant", Content: &result}}) - default: - toolChoice := schema.Choice{ - Message: &schema.Message{ - Role: "assistant", - }, - } - - if len(input.Tools) > 0 { - toolCallsReason := FinishReasonToolCalls - toolChoice.FinishReason = &toolCallsReason - } - - for _, ss := range results { - name, args := ss.Name, ss.Arguments - if len(input.Tools) > 0 { - // If we are using tools, we condense the function calls into - // a single response choice with all the tools - toolChoice.Message.Content = textContentToReturn - toolChoice.Message.ToolCalls = append(toolChoice.Message.ToolCalls, - schema.ToolCall{ - ID: id, - Type: "function", - FunctionCall: schema.FunctionCall{ - Name: name, - Arguments: args, - }, - }, - ) - } else { - // otherwise we return more choices directly - functionCallReason := FinishReasonFunctionCall - *c = append(*c, schema.Choice{ - FinishReason: &functionCallReason, - Message: &schema.Message{ - Role: "assistant", - Content: &textContentToReturn, - FunctionCall: map[string]interface{}{ - "name": name, - "arguments": args, - }, - }, - }) - } - } - - if len(input.Tools) > 0 { - // we need to append our result if we are using tools - *c = append(*c, toolChoice) - } - } - - }, nil) - if err != nil { - return err - } - - resp := &schema.OpenAIResponse{ - ID: id, - Created: created, - Model: input.Model, // we have to return what the user sent here, due to OpenAI spec. 
- Choices: result, - Object: "chat.completion", - Usage: schema.OpenAIUsage{ - PromptTokens: tokenUsage.Prompt, - CompletionTokens: tokenUsage.Completion, - TotalTokens: tokenUsage.Prompt + tokenUsage.Completion, + item := types.MessageItemUnion{ + User: &types.MessageItemUser{ + ID: generateItemID(), + Status: types.ItemStatusCompleted, + Content: []types.MessageContentInput{ + { + Type: types.MessageContentTypeInputAudio, + Audio: base64.StdEncoding.EncodeToString(utt), + Transcript: transcript, + }, }, - } - respData, _ := json.Marshal(resp) - xlog.Debug("Response", "response", string(respData)) - - // Return the prediction in the response body - return c.JSON(resp) - - */ - - // TODO: use session.ModelInterface... - // Simulate a function call - if strings.Contains(prompt, "weather") { - functionCall := &FunctionCall{ - Name: "get_weather", - Arguments: map[string]interface{}{ - "location": "New York", - "scale": "celsius", - }, - } - return "", functionCall, nil + }, } + conv.Lock.Lock() + conv.Items = append(conv.Items, &item) + conv.Lock.Unlock() - // Otherwise, return a normal text response - return "This is a generated response based on the conversation.", nil, nil -} - -// Function to process audio response and detect function calls -func processAudioResponse(session *Session, audioData []byte) (string, []byte, *FunctionCall, error) { - // TODO: Do the below or use an any-to-any model like Qwen Omni - // Implement the actual model inference logic using session.Model and audioData - // For example: - // 1. Transcribe the audio to text - // 2. Generate a response based on the transcribed text - // 3. Check if the model wants to call a function - // 4. Convert the response text to speech (audio) - // - // Placeholder implementation: - - // TODO: template eventual messages, like chat.go - reply, err := session.ModelInterface.Predict(context.Background(), &proto.PredictOptions{ - Prompt: "What's the weather in New York?", + sendEvent(c, types.ConversationItemAddedEvent{ + Item: item, }) + var conversationHistory schema.Messages + conversationHistory = append(conversationHistory, schema.Message{ + Role: string(types.MessageRoleSystem), + StringContent: session.Instructions, + Content: session.Instructions, + }) + + conv.Lock.Lock() + for _, item := range conv.Items { + if item.User != nil { + for _, content := range item.User.Content { + switch content.Type { + case types.MessageContentTypeInputText: + conversationHistory = append(conversationHistory, schema.Message{ + Role: string(types.MessageRoleUser), + StringContent: content.Text, + Content: content.Text, + }) + case types.MessageContentTypeInputAudio: + conversationHistory = append(conversationHistory, schema.Message{ + Role: string(types.MessageRoleUser), + StringContent: content.Transcript, + Content: content.Transcript, + StringAudios: []string{content.Audio}, + }) + } + } + } else if item.Assistant != nil { + for _, content := range item.Assistant.Content { + switch content.Type { + case types.MessageContentTypeOutputText: + conversationHistory = append(conversationHistory, schema.Message{ + Role: string(types.MessageRoleAssistant), + StringContent: content.Text, + Content: content.Text, + }) + case types.MessageContentTypeOutputAudio: + conversationHistory = append(conversationHistory, schema.Message{ + Role: string(types.MessageRoleAssistant), + StringContent: content.Transcript, + Content: content.Transcript, + StringAudios: []string{content.Audio}, + }) + } + } + } else if item.System != nil { + for _, content := range 
item.System.Content { + conversationHistory = append(conversationHistory, schema.Message{ + Role: string(types.MessageRoleSystem), + StringContent: content.Text, + Content: content.Text, + }) + } + } + } + conv.Lock.Unlock() + + responseID := generateUniqueID() + sendEvent(c, types.ResponseCreatedEvent{ + ServerEventBase: types.ServerEventBase{}, + Response: types.Response{ + ID: responseID, + Object: "realtime.response", + Status: types.ResponseStatusInProgress, + }, + }) + + predFunc, err := session.ModelInterface.Predict(context.TODO(), conversationHistory, nil, nil, nil, nil, session.Tools, session.ToolChoice, nil, nil, nil) if err != nil { - return "", nil, nil, err + sendError(c, "inference_failed", fmt.Sprintf("backend error: %v", err), "", item.Assistant.ID) + return } - generatedAudio := reply.Audio + pred, err := predFunc() + if err != nil { + sendError(c, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", item.Assistant.ID) + return + } - transcribedText := "What's the weather in New York?" - var functionCall *FunctionCall + xlog.Debug("Function config for parsing", "function_name_key", config.FunctionsConfig.FunctionNameKey, "function_arguments_key", config.FunctionsConfig.FunctionArgumentsKey) - // Simulate a function call - if strings.Contains(transcribedText, "weather") { - functionCall = &FunctionCall{ - Name: "get_weather", - Arguments: map[string]interface{}{ - "location": "New York", - "scale": "celsius", + rawResponse := pred.Response + if config.TemplateConfig.ReplyPrefix != "" { + rawResponse = config.TemplateConfig.ReplyPrefix + rawResponse + } + + reasoningText, responseWithoutReasoning := reasoning.ExtractReasoningWithConfig(rawResponse, "", config.ReasoningConfig) + xlog.Debug("LLM Response", "reasoning", reasoningText, "response_without_reasoning", responseWithoutReasoning) + + textContent := functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig) + cleanedResponse := functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig) + toolCalls := functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig) + + xlog.Debug("Function call parsing", "textContent", textContent, "cleanedResponse", cleanedResponse, "toolCallsCount", len(toolCalls)) + + noActionName := "answer" + if config.FunctionsConfig.NoActionFunctionName != "" { + noActionName = config.FunctionsConfig.NoActionFunctionName + } + isNoAction := len(toolCalls) > 0 && toolCalls[0].Name == noActionName + + var finalSpeech string + var finalToolCalls []functions.FuncCallResults + + if isNoAction { + arg := toolCalls[0].Arguments + arguments := map[string]interface{}{} + if err := json.Unmarshal([]byte(arg), &arguments); err == nil { + if m, exists := arguments["message"]; exists { + if message, ok := m.(string); ok { + finalSpeech = message + } else { + xlog.Warn("NoAction function message field is not a string", "type", fmt.Sprintf("%T", m)) + } + } else { + xlog.Warn("NoAction function missing 'message' field in arguments") + } + } else { + xlog.Warn("Failed to unmarshal NoAction function arguments", "error", err, "arguments", arg) + } + if finalSpeech == "" { + // Fallback if parsing failed + xlog.Warn("NoAction function did not produce speech, using cleaned response as fallback") + finalSpeech = cleanedResponse + } + } else { + finalToolCalls = toolCalls + xlog.Debug("Setting finalToolCalls", "count", len(finalToolCalls)) + if len(toolCalls) > 0 { + finalSpeech = textContent + } else { + finalSpeech = cleanedResponse + } + } + + if finalSpeech != "" 
{ + // Create the assistant item now that we have content + item := types.MessageItemUnion{ + Assistant: &types.MessageItemAssistant{ + ID: generateItemID(), + Status: types.ItemStatusInProgress, + Content: []types.MessageContentOutput{ + { + Type: types.MessageContentTypeOutputAudio, + Transcript: finalSpeech, + }, + }, }, } - return "", nil, functionCall, nil - } - // Generate a response - generatedText := "This is a response to your speech input." + conv.Lock.Lock() + conv.Items = append(conv.Items, &item) + conv.Lock.Unlock() - return generatedText, generatedAudio, nil, nil -} + sendEvent(c, types.ResponseOutputItemAddedEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + OutputIndex: 0, + Item: item, + }) -// Function to split the response into chunks (for streaming) -func splitResponseIntoChunks(response string) []string { - // Split the response into chunks of fixed size - chunkSize := 50 // characters per chunk - var chunks []string - for len(response) > 0 { - if len(response) > chunkSize { - chunks = append(chunks, response[:chunkSize]) - response = response[chunkSize:] - } else { - chunks = append(chunks, response) - break + sendEvent(c, types.ResponseContentPartAddedEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + Part: item.Assistant.Content[0], + }) + + audioFilePath, res, err := session.ModelInterface.TTS(context.TODO(), finalSpeech, session.Voice, session.InputAudioTranscription.Language) + if err != nil { + xlog.Error("TTS failed", "error", err) + sendError(c, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID) + return } + if !res.Success { + xlog.Error("TTS failed", "message", res.Message) + sendError(c, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID) + return + } + defer os.Remove(audioFilePath) + + audioBytes, err := os.ReadFile(audioFilePath) + if err != nil { + xlog.Error("failed to read TTS file", "error", err) + sendError(c, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID) + return + } + audioString := base64.StdEncoding.EncodeToString(audioBytes) + + sendEvent(c, types.ResponseOutputAudioTranscriptDeltaEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + Delta: finalSpeech, + }) + sendEvent(c, types.ResponseOutputAudioTranscriptDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + Transcript: finalSpeech, + }) + + sendEvent(c, types.ResponseOutputAudioDeltaEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + Delta: audioString, + }) + sendEvent(c, types.ResponseOutputAudioDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + }) + + sendEvent(c, types.ResponseContentPartDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + Part: item.Assistant.Content[0], + }) + + conv.Lock.Lock() + item.Assistant.Status = types.ItemStatusCompleted + item.Assistant.Content[0].Audio = audioString + conv.Lock.Unlock() + + sendEvent(c, types.ResponseOutputItemDoneEvent{ + ServerEventBase: 
types.ServerEventBase{}, + ResponseID: responseID, + OutputIndex: 0, + Item: item, + }) } - return chunks + + // Handle Tool Calls + xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(finalToolCalls)) + for i, tc := range finalToolCalls { + toolCallID := generateItemID() + callID := "call_" + generateUniqueID() // OpenAI uses call_xyz + + // Create FunctionCall Item + fcItem := types.MessageItemUnion{ + FunctionCall: &types.MessageItemFunctionCall{ + ID: toolCallID, + CallID: callID, + Name: tc.Name, + Arguments: tc.Arguments, + Status: types.ItemStatusCompleted, + }, + } + + conv.Lock.Lock() + conv.Items = append(conv.Items, &fcItem) + conv.Lock.Unlock() + + outputIndex := i + if finalSpeech != "" { + outputIndex++ + } + + sendEvent(c, types.ResponseOutputItemAddedEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + OutputIndex: outputIndex, + Item: fcItem, + }) + + sendEvent(c, types.ResponseFunctionCallArgumentsDeltaEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: toolCallID, + OutputIndex: outputIndex, + CallID: callID, + Delta: tc.Arguments, + }) + + sendEvent(c, types.ResponseFunctionCallArgumentsDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: toolCallID, + OutputIndex: outputIndex, + CallID: callID, + Arguments: tc.Arguments, + Name: tc.Name, + }) + + sendEvent(c, types.ResponseOutputItemDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + OutputIndex: outputIndex, + Item: fcItem, + }) + } + + sendEvent(c, types.ResponseDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + Response: types.Response{ + ID: responseID, + Object: "realtime.response", + Status: types.ResponseStatusCompleted, + }, + }) + } // Helper functions to generate unique IDs @@ -1297,11 +1159,3 @@ func generateUniqueID() string { // Implement as needed return "unique_id" } - -// Structures for 'response.create' messages -type ResponseCreate struct { - Modalities []string `json:"modalities,omitempty"` - Instructions string `json:"instructions,omitempty"` - Functions functions.Functions `json:"functions,omitempty"` - // Other fields as needed -} diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go index ac52627a8..44631bf25 100644 --- a/core/http/endpoints/openai/realtime_model.go +++ b/core/http/endpoints/openai/realtime_model.go @@ -2,20 +2,23 @@ package openai import ( "context" + "encoding/json" "fmt" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" - grpcClient "github.com/mudler/LocalAI/pkg/grpc" + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/core/templates" + "github.com/mudler/LocalAI/pkg/functions" "github.com/mudler/LocalAI/pkg/grpc/proto" model "github.com/mudler/LocalAI/pkg/model" "github.com/mudler/xlog" - "google.golang.org/grpc" ) var ( _ Model = new(wrappedModel) - _ Model = new(anyToAnyModel) + _ Model = new(transcriptOnlyModel) ) // wrappedModel represent a model which does not support Any-to-Any operations @@ -25,12 +28,12 @@ type wrappedModel struct { TTSConfig *config.ModelConfig TranscriptionConfig *config.ModelConfig LLMConfig *config.ModelConfig - TTSClient grpcClient.Backend - TranscriptionClient grpcClient.Backend - LLMClient grpcClient.Backend - VADConfig *config.ModelConfig - VADClient grpcClient.Backend + + appConfig *config.ApplicationConfig + modelLoader 
*model.ModelLoader + confLoader *config.ModelConfigLoader + evaluator *templates.Evaluator } // anyToAnyModel represent a model which supports Any-to-Any operations @@ -38,71 +41,158 @@ type wrappedModel struct { // In the future there could be models that accept continous audio input only so this design will be useful for that type anyToAnyModel struct { LLMConfig *config.ModelConfig - LLMClient grpcClient.Backend - VADConfig *config.ModelConfig - VADClient grpcClient.Backend + + appConfig *config.ApplicationConfig + modelLoader *model.ModelLoader + confLoader *config.ModelConfigLoader } type transcriptOnlyModel struct { TranscriptionConfig *config.ModelConfig - TranscriptionClient grpcClient.Backend VADConfig *config.ModelConfig - VADClient grpcClient.Backend + + appConfig *config.ApplicationConfig + modelLoader *model.ModelLoader + confLoader *config.ModelConfigLoader } -func (m *transcriptOnlyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) { - return m.VADClient.VAD(ctx, in) +func (m *transcriptOnlyModel) VAD(ctx context.Context, request *schema.VADRequest) (*schema.VADResponse, error) { + return backend.VAD(request, ctx, m.modelLoader, m.appConfig, *m.VADConfig) } -func (m *transcriptOnlyModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) { - return m.TranscriptionClient.AudioTranscription(ctx, in, opts...) +func (m *transcriptOnlyModel) Transcribe(ctx context.Context, audio, language string, translate bool, diarize bool, prompt string) (*schema.TranscriptionResult, error) { + return backend.ModelTranscription(audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig) } -func (m *transcriptOnlyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) { +func (m *transcriptOnlyModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) { return nil, fmt.Errorf("predict operation not supported in transcript-only mode") } -func (m *transcriptOnlyModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error { - return fmt.Errorf("predict stream operation not supported in transcript-only mode") +func (m *transcriptOnlyModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) { + return "", nil, fmt.Errorf("TTS not supported in transcript-only mode") } -func (m *wrappedModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) { - return m.VADClient.VAD(ctx, in) +func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig { + return nil } -func (m *anyToAnyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) { - return m.VADClient.VAD(ctx, in) +func (m *wrappedModel) VAD(ctx context.Context, request *schema.VADRequest) (*schema.VADResponse, error) { + return backend.VAD(request, ctx, m.modelLoader, m.appConfig, *m.VADConfig) } -func (m *wrappedModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) { - return 
m.TranscriptionClient.AudioTranscription(ctx, in, opts...) +func (m *wrappedModel) Transcribe(ctx context.Context, audio, language string, translate bool, diarize bool, prompt string) (*schema.TranscriptionResult, error) { + return backend.ModelTranscription(audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig) } -func (m *anyToAnyModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) { - // TODO: Can any-to-any models transcribe? - return m.LLMClient.AudioTranscription(ctx, in, opts...) +func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) { + input := schema.OpenAIRequest{ + Messages: messages, + } + + var predInput string + var funcs []functions.Function + if !m.LLMConfig.TemplateConfig.UseTokenizerTemplate { + if len(tools) > 0 { + for _, t := range tools { + if t.Function != nil { + var params map[string]any + + switch p := t.Function.Parameters.(type) { + case map[string]any: + params = p + case string: + if err := json.Unmarshal([]byte(p), ¶ms); err != nil { + xlog.Warn("Failed to parse parameters JSON string", "error", err, "function", t.Function.Name) + } + } + + funcs = append(funcs, functions.Function{ + Name: t.Function.Name, + Description: t.Function.Description, + Parameters: params, + }) + } + } + } + + predInput = m.evaluator.TemplateMessages(input, input.Messages, m.LLMConfig, funcs, len(funcs) > 0) + + xlog.Debug("Prompt (after templating)", "prompt", predInput) + if m.LLMConfig.Grammar != "" { + xlog.Debug("Grammar", "grammar", m.LLMConfig.Grammar) + } + } + + // Generate grammar for function calling if tools are provided and grammar generation is enabled + shouldUseFn := len(tools) > 0 && m.LLMConfig.ShouldUseFunctions() + + if !m.LLMConfig.FunctionsConfig.GrammarConfig.NoGrammar && shouldUseFn { + // Allow the user to set custom actions via config file + noActionName := "answer" + noActionDescription := "use this action to answer without performing any action" + + if m.LLMConfig.FunctionsConfig.NoActionFunctionName != "" { + noActionName = m.LLMConfig.FunctionsConfig.NoActionFunctionName + } + if m.LLMConfig.FunctionsConfig.NoActionDescriptionName != "" { + noActionDescription = m.LLMConfig.FunctionsConfig.NoActionDescriptionName + } + + noActionGrammar := functions.Function{ + Name: noActionName, + Description: noActionDescription, + Parameters: map[string]interface{}{ + "properties": map[string]interface{}{ + "message": map[string]interface{}{ + "type": "string", + "description": "The message to reply the user with", + }, + }, + }, + } + + if !m.LLMConfig.FunctionsConfig.DisableNoAction { + funcs = append(funcs, noActionGrammar) + } + + // Force picking one of the functions by the request + if m.LLMConfig.FunctionToCall() != "" { + funcs = functions.Functions(funcs).Select(m.LLMConfig.FunctionToCall()) + } + + // Generate grammar from function definitions + jsStruct := functions.Functions(funcs).ToJSONStructure(m.LLMConfig.FunctionsConfig.FunctionNameKey, m.LLMConfig.FunctionsConfig.FunctionNameKey) + g, err := jsStruct.Grammar(m.LLMConfig.FunctionsConfig.GrammarOptions()...) 
+ if err == nil { + m.LLMConfig.Grammar = g + xlog.Debug("Generated grammar for function calling", "grammar", g) + } else { + xlog.Error("Failed generating grammar", "error", err) + } + } + + var toolsJSON string + if len(tools) > 0 { + b, _ := json.Marshal(tools) + toolsJSON = string(b) + } + + var toolChoiceJSON string + if toolChoice != nil { + b, _ := json.Marshal(toolChoice) + toolChoiceJSON = string(b) + } + + return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, ) } -func (m *wrappedModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) { - // TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it) - // sound.BufferAsWAV(audioData, "audio.wav") - - return m.LLMClient.Predict(ctx, in) +func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) { + return backend.ModelTTS(text, voice, language, m.modelLoader, m.appConfig, *m.TTSConfig) } -func (m *wrappedModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error { - // TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it) - - return m.LLMClient.PredictStream(ctx, in, f) -} - -func (m *anyToAnyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) { - return m.LLMClient.Predict(ctx, in) -} - -func (m *anyToAnyModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error { - return m.LLMClient.PredictStream(ctx, in, f) +func (m *wrappedModel) PredictConfig() *config.ModelConfig { + return m.LLMConfig } func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) { @@ -116,12 +206,6 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig return nil, nil, fmt.Errorf("failed to validate config: %w", err) } - opts := backend.ModelOptions(*cfgVAD, appConfig) - VADClient, err := ml.Load(opts...) - if err != nil { - return nil, nil, fmt.Errorf("failed to load tts model: %w", err) - } - cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath) if err != nil { @@ -132,22 +216,19 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig return nil, nil, fmt.Errorf("failed to validate config: %w", err) } - opts = backend.ModelOptions(*cfgSST, appConfig) - transcriptionClient, err := ml.Load(opts...) 
- if err != nil { - return nil, nil, fmt.Errorf("failed to load SST model: %w", err) - } - return &transcriptOnlyModel{ - VADConfig: cfgVAD, - VADClient: VADClient, TranscriptionConfig: cfgSST, - TranscriptionClient: transcriptionClient, + VADConfig: cfgVAD, + + confLoader: cl, + modelLoader: ml, + appConfig: appConfig, }, cfgSST, nil } // returns and loads either a wrapped model or a model that support audio-to-audio -func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, error) { +func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator) (Model, error) { + xlog.Debug("Creating new model pipeline model", "pipeline", pipeline) cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath) if err != nil { @@ -159,12 +240,6 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model return nil, fmt.Errorf("failed to validate config: %w", err) } - opts := backend.ModelOptions(*cfgVAD, appConfig) - VADClient, err := ml.Load(opts...) - if err != nil { - return nil, fmt.Errorf("failed to load tts model: %w", err) - } - // TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath) if err != nil { @@ -176,38 +251,24 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model return nil, fmt.Errorf("failed to validate config: %w", err) } - opts = backend.ModelOptions(*cfgSST, appConfig) - transcriptionClient, err := ml.Load(opts...) - if err != nil { - return nil, fmt.Errorf("failed to load SST model: %w", err) - } - // TODO: Decide when we have a real any-to-any model - if false { - - cfgAnyToAny, err := cl.LoadModelConfigFileByName(pipeline.LLM, ml.ModelPath) - if err != nil { - - return nil, fmt.Errorf("failed to load backend config: %w", err) - } - - if valid, _ := cfgAnyToAny.Validate(); !valid { - return nil, fmt.Errorf("failed to validate config: %w", err) - } - - opts := backend.ModelOptions(*cfgAnyToAny, appConfig) - anyToAnyClient, err := ml.Load(opts...) - if err != nil { - return nil, fmt.Errorf("failed to load tts model: %w", err) - } - - return &anyToAnyModel{ - LLMConfig: cfgAnyToAny, - LLMClient: anyToAnyClient, - VADConfig: cfgVAD, - VADClient: VADClient, - }, nil - } + // if false { + // + // cfgAnyToAny, err := cl.LoadModelConfigFileByName(pipeline.LLM, ml.ModelPath) + // if err != nil { + // + // return nil, fmt.Errorf("failed to load backend config: %w", err) + // } + // + // if valid, _ := cfgAnyToAny.Validate(); !valid { + // return nil, fmt.Errorf("failed to validate config: %w", err) + // } + // + // return &anyToAnyModel{ + // LLMConfig: cfgAnyToAny, + // VADConfig: cfgVAD, + // }, nil + // } xlog.Debug("Loading a wrapped model") @@ -232,27 +293,15 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model return nil, fmt.Errorf("failed to validate config: %w", err) } - opts = backend.ModelOptions(*cfgTTS, appConfig) - ttsClient, err := ml.Load(opts...) - if err != nil { - return nil, fmt.Errorf("failed to load tts model: %w", err) - } - - opts = backend.ModelOptions(*cfgLLM, appConfig) - llmClient, err := ml.Load(opts...) 
- if err != nil { - return nil, fmt.Errorf("failed to load LLM model: %w", err) - } - return &wrappedModel{ TTSConfig: cfgTTS, TranscriptionConfig: cfgSST, LLMConfig: cfgLLM, - TTSClient: ttsClient, - TranscriptionClient: transcriptionClient, - LLMClient: llmClient, - VADConfig: cfgVAD, - VADClient: VADClient, + + confLoader: cl, + modelLoader: ml, + appConfig: appConfig, + evaluator: evaluator, }, nil } diff --git a/core/http/endpoints/openai/types/client_events.go b/core/http/endpoints/openai/types/client_events.go new file mode 100644 index 000000000..3ce547971 --- /dev/null +++ b/core/http/endpoints/openai/types/client_events.go @@ -0,0 +1,413 @@ +package types + +import "encoding/json" + +// ClientEventType is the type of client event. See https://platform.openai.com/docs/guides/realtime/client-events +type ClientEventType string + +const ( + ClientEventTypeSessionUpdate ClientEventType = "session.update" + ClientEventTypeInputAudioBufferAppend ClientEventType = "input_audio_buffer.append" + ClientEventTypeInputAudioBufferCommit ClientEventType = "input_audio_buffer.commit" + ClientEventTypeInputAudioBufferClear ClientEventType = "input_audio_buffer.clear" + ClientEventTypeConversationItemCreate ClientEventType = "conversation.item.create" + ClientEventTypeConversationItemRetrieve ClientEventType = "conversation.item.retrieve" + ClientEventTypeConversationItemTruncate ClientEventType = "conversation.item.truncate" + ClientEventTypeConversationItemDelete ClientEventType = "conversation.item.delete" + ClientEventTypeResponseCreate ClientEventType = "response.create" + ClientEventTypeResponseCancel ClientEventType = "response.cancel" + ClientEventTypeOutputAudioBufferClear ClientEventType = "output_audio_buffer.clear" +) + +// ClientEvent is the interface for client event. +type ClientEvent interface { + ClientEventType() ClientEventType +} + +// EventBase is the base struct for all client events. +type EventBase struct { + Type string `json:"type"` + // Optional client-generated ID used to identify this event. + EventID string `json:"event_id,omitempty"` +} + +// Send this event to update the session’s configuration. The client may send this event at any time to update any field except for voice and model. voice can be updated only if there have been no other audio outputs yet. +// +// When the server receives a session.update, it will respond with a session.updated event showing the full, effective configuration. Only the fields that are present in the session.update are updated. To clear a field like instructions, pass an empty string. To clear a field like tools, pass an empty array. To clear a field like turn_detection, pass null.// +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/session/update +type SessionUpdateEvent struct { + EventBase + // Session configuration to update. + Session SessionUnion `json:"session"` +} + +func (m SessionUpdateEvent) ClientEventType() ClientEventType { + return ClientEventTypeSessionUpdate +} + +func (m SessionUpdateEvent) MarshalJSON() ([]byte, error) { + type typeAlias SessionUpdateEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +type NoiseReductionType string + +const ( + NoiseReductionNearField NoiseReductionType = "near_field" + NoiseReductionFarField NoiseReductionType = "far_field" +) + +// Send this event to append audio bytes to the input audio buffer. 
The audio buffer is temporary storage you can write to and later commit. A "commit" will create a new user message item in the conversation history from the buffer content and clear the buffer. Input audio transcription (if enabled) will be generated when the buffer is committed. +// +// If VAD is enabled the audio buffer is used to detect speech and the server will decide when to commit. When Server VAD is disabled, you must commit the audio buffer manually. Input audio noise reduction operates on writes to the audio buffer. +// +// The client may choose how much audio to place in each event up to a maximum of 15 MiB, for example streaming smaller chunks from the client may allow the VAD to be more responsive. Unlike most other client events, the server will not send a confirmation response to this event. +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/input_audio_buffer/append +type InputAudioBufferAppendEvent struct { + EventBase + Audio string `json:"audio"` // Base64-encoded audio bytes. +} + +func (m InputAudioBufferAppendEvent) ClientEventType() ClientEventType { + return ClientEventTypeInputAudioBufferAppend +} + +func (m InputAudioBufferAppendEvent) MarshalJSON() ([]byte, error) { + type typeAlias InputAudioBufferAppendEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// Send this event to commit the user input audio buffer, which will create a new user message item in the conversation. This event will produce an error if the input audio buffer is empty. When in Server VAD mode, the client does not need to send this event, the server will commit the audio buffer automatically. +// +// Committing the input audio buffer will trigger input audio transcription (if enabled in session configuration), but it will not create a response from the model. The server will respond with an input_audio_buffer.committed event. +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/input_audio_buffer/commit +type InputAudioBufferCommitEvent struct { + EventBase +} + +func (m InputAudioBufferCommitEvent) ClientEventType() ClientEventType { + return ClientEventTypeInputAudioBufferCommit +} + +func (m InputAudioBufferCommitEvent) MarshalJSON() ([]byte, error) { + type typeAlias InputAudioBufferCommitEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// Send this event to clear the audio bytes in the buffer. The server will respond with an input_audio_buffer.cleared event. +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/input_audio_buffer/clear +type InputAudioBufferClearEvent struct { + EventBase +} + +func (m InputAudioBufferClearEvent) ClientEventType() ClientEventType { + return ClientEventTypeInputAudioBufferClear +} + +func (m InputAudioBufferClearEvent) MarshalJSON() ([]byte, error) { + type typeAlias InputAudioBufferClearEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// Send this event to clear the audio bytes in the buffer. The server will respond with an input_audio_buffer.cleared event. 
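Taken together, the session and input-audio-buffer events above (and the output buffer clear just below) make up the client side of a turn. A minimal sketch of that flow, assuming a local instance listening at `ws://localhost:8080/v1/realtime`, a `model` query parameter selecting a pipeline named `my-realtime-pipeline`, and silence as stand-in audio — all assumptions, not taken from this change:

```go
package main

import (
	"encoding/base64"
	"log"

	"github.com/gorilla/websocket"
	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
)

func main() {
	// Endpoint path and model name are assumptions for a local setup.
	conn, _, err := websocket.DefaultDialer.Dial("ws://localhost:8080/v1/realtime?model=my-realtime-pipeline", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	// session.update: only the fields that are present are changed.
	update := types.SessionUpdateEvent{
		Session: types.SessionUnion{
			Realtime: &types.RealtimeSession{Instructions: "You are a helpful assistant."},
		},
	}
	if err := conn.WriteJSON(update); err != nil {
		log.Fatal(err)
	}

	// input_audio_buffer.append: stream base64-encoded audio into the buffer.
	// 4800 zero bytes stand in for ~100ms of 16-bit, 24kHz mono PCM.
	chunk := make([]byte, 4800)
	appendEv := types.InputAudioBufferAppendEvent{
		Audio: base64.StdEncoding.EncodeToString(chunk),
	}
	if err := conn.WriteJSON(appendEv); err != nil {
		log.Fatal(err)
	}

	// input_audio_buffer.commit: only needed when server VAD is not handling turns.
	if err := conn.WriteJSON(types.InputAudioBufferCommitEvent{}); err != nil {
		log.Fatal(err)
	}

	// output_audio_buffer.clear: drop queued assistant audio, e.g. when the user interrupts.
	if err := conn.WriteJSON(types.OutputAudioBufferClearEvent{}); err != nil {
		log.Fatal(err)
	}
}
```

Because each event type carries its own MarshalJSON that injects the `type` discriminator, `WriteJSON` produces the expected wire format without the caller setting `EventBase.Type` by hand.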
+// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/output_audio_buffer/clear + +type OutputAudioBufferClearEvent struct { + EventBase +} + +func (m OutputAudioBufferClearEvent) ClientEventType() ClientEventType { + return ClientEventTypeOutputAudioBufferClear +} + +func (m OutputAudioBufferClearEvent) MarshalJSON() ([]byte, error) { + type typeAlias OutputAudioBufferClearEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// Add a new Item to the Conversation's context, including messages, function calls, and function call responses. This event can be used both to populate a "history" of the conversation and to add new items mid-stream, but has the current limitation that it cannot populate assistant audio messages. +// +// If successful, the server will respond with a conversation.item.created event, otherwise an error event will be sent. +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/create +type ConversationItemCreateEvent struct { + EventBase + // The ID of the preceding item after which the new item will be inserted. + PreviousItemID string `json:"previous_item_id,omitempty"` + // The item to add to the conversation. + Item MessageItemUnion `json:"item"` +} + +func (m ConversationItemCreateEvent) ClientEventType() ClientEventType { + return ClientEventTypeConversationItemCreate +} + +func (m ConversationItemCreateEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemCreateEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD. The server will respond with a conversation.item.retrieved event, unless the item does not exist in the conversation history, in which case the server will respond with an error. +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/retrieve +type ConversationItemRetrieveEvent struct { + EventBase + // The ID of the item to retrieve. + ItemID string `json:"item_id"` +} + +func (m ConversationItemRetrieveEvent) ClientEventType() ClientEventType { + return ClientEventTypeConversationItemRetrieve +} + +func (m ConversationItemRetrieveEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemRetrieveEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// Send this event to truncate a previous assistant message’s audio. The server will produce audio faster than realtime, so this event is useful when the user interrupts to truncate audio that has already been sent to the client but not yet played. This will synchronize the server's understanding of the audio with the client's playback. +// +// Truncating audio will delete the server-side text transcript to ensure there is not text in the context that hasn't been heard by the user. +// +// If successful, the server will respond with a conversation.item.truncated event. 
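As an illustration of the truncation flow described above, a client that detects a user interruption after roughly 1.5 seconds of playback could send an event like the following; the item ID is hypothetical:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
)

func main() {
	// Truncate assistant audio that was already sent to the client but not fully played.
	ev := types.ConversationItemTruncateEvent{
		ItemID:       "item_abc123", // hypothetical assistant item ID
		ContentIndex: 0,
		AudioEndMs:   1500, // keep only the first 1.5s of audio
	}
	b, _ := json.Marshal(ev)
	fmt.Println(string(b))
	// Prints something like:
	// {"item_id":"item_abc123","content_index":0,"audio_end_ms":1500,"type":"conversation.item.truncate"}
}
```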
+// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/truncate +type ConversationItemTruncateEvent struct { + EventBase + // The ID of the assistant message item to truncate. + ItemID string `json:"item_id"` + // The index of the content part to truncate. + ContentIndex int `json:"content_index"` + // Inclusive duration up to which audio is truncated, in milliseconds. + AudioEndMs int `json:"audio_end_ms"` +} + +func (m ConversationItemTruncateEvent) ClientEventType() ClientEventType { + return ClientEventTypeConversationItemTruncate +} + +func (m ConversationItemTruncateEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemTruncateEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// Send this event when you want to remove any item from the conversation history. The server will respond with a conversation.item.deleted event, unless the item does not exist in the conversation history, in which case the server will respond with an error. +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/delete +type ConversationItemDeleteEvent struct { + EventBase + // The ID of the item to delete. + ItemID string `json:"item_id"` +} + +func (m ConversationItemDeleteEvent) ClientEventType() ClientEventType { + return ClientEventTypeConversationItemDelete +} + +func (m ConversationItemDeleteEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemDeleteEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// This event instructs the server to create a Response, which means triggering model inference. When in Server VAD mode, the server will create Responses automatically. +// +// A Response will include at least one Item, and may have two, in which case the second will be a function call. These Items will be appended to the conversation history by default. +// +// The server will respond with a response.created event, events for Items and content created, and finally a response.done event to indicate the Response is complete. +// +// The response.create event includes inference configuration like instructions and tools. If these are set, they will override the Session's configuration for this Response only. +// +// Responses can be created out-of-band of the default Conversation, meaning that they can have arbitrary input, and it's possible to disable writing the output to the Conversation. Only one Response can write to the default Conversation at a time, but otherwise multiple Responses can be created in parallel. The metadata field is a good way to disambiguate multiple simultaneous Responses. +// +// Clients can set conversation to none to create a Response that does not write to the default Conversation. Arbitrary input can be provided with the input field, which is an array accepting raw Items and references to existing Items. +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/response/create +type ResponseCreateEvent struct { + EventBase + // Configuration for the response. 
+ Response ResponseCreateParams `json:"response"` +} + +func (m ResponseCreateEvent) ClientEventType() ClientEventType { + return ClientEventTypeResponseCreate +} + +func (m ResponseCreateEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseCreateEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +// Send this event to cancel an in-progress response. The server will respond with a response.done event with a status of response.status=cancelled. If there is no response to cancel, the server will respond with an error. It's safe to call response.cancel even if no response is in progress, an error will be returned the session will remain unaffected. +// +// See https://platform.openai.com/docs/api-reference/realtime-client-events/response/cancel +type ResponseCancelEvent struct { + EventBase + // A specific response ID to cancel - if not provided, will cancel an in-progress response in the default conversation. + ResponseID string `json:"response_id,omitempty"` +} + +func (m ResponseCancelEvent) ClientEventType() ClientEventType { + return ClientEventTypeResponseCancel +} + +func (m ResponseCancelEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseCancelEvent + type typeWrapper struct { + typeAlias + Type ClientEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ClientEventType(), + } + return json.Marshal(shadow) +} + +type ClientEventInterface interface { + SessionUpdateEvent | + InputAudioBufferAppendEvent | + InputAudioBufferCommitEvent | + InputAudioBufferClearEvent | + OutputAudioBufferClearEvent | + ConversationItemCreateEvent | + ConversationItemRetrieveEvent | + ConversationItemTruncateEvent | + ConversationItemDeleteEvent | + ResponseCreateEvent | + ResponseCancelEvent +} + +func unmarshalClientEvent[T ClientEventInterface](data []byte) (T, error) { + var t T + err := json.Unmarshal(data, &t) + if err != nil { + return t, err + } + return t, nil +} + +// UnmarshalClientEvent unmarshals the client event from the given JSON data. 
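A sketch of how a handler might dispatch incoming frames with the `UnmarshalClientEvent` helper defined below; the payloads are illustrative, and the returned concrete value types can be routed with a plain type switch:

```go
package main

import (
	"fmt"
	"log"

	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
)

func main() {
	// Frames a client might send over the websocket (illustrative only).
	frames := [][]byte{
		[]byte(`{"type":"input_audio_buffer.commit"}`),
		[]byte(`{"type":"response.create","response":{}}`),
		[]byte(`{"type":"response.cancel","response_id":"resp_123"}`),
	}

	for _, raw := range frames {
		ev, err := types.UnmarshalClientEvent(raw)
		if err != nil {
			log.Fatal(err)
		}
		switch e := ev.(type) {
		case types.InputAudioBufferCommitEvent:
			fmt.Println("commit input buffer", e.EventID)
		case types.ResponseCreateEvent:
			fmt.Println("create a response", e.EventID)
		case types.ResponseCancelEvent:
			fmt.Println("cancel response", e.ResponseID)
		default:
			fmt.Printf("unhandled event %T\n", ev)
		}
	}
}
```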
+func UnmarshalClientEvent(data []byte) (ClientEvent, error) { + var eventType struct { + Type ClientEventType `json:"type"` + } + err := json.Unmarshal(data, &eventType) + if err != nil { + return nil, err + } + + switch eventType.Type { + case ClientEventTypeSessionUpdate: + return unmarshalClientEvent[SessionUpdateEvent](data) + case ClientEventTypeInputAudioBufferAppend: + return unmarshalClientEvent[InputAudioBufferAppendEvent](data) + case ClientEventTypeInputAudioBufferCommit: + return unmarshalClientEvent[InputAudioBufferCommitEvent](data) + case ClientEventTypeInputAudioBufferClear: + return unmarshalClientEvent[InputAudioBufferClearEvent](data) + case ClientEventTypeOutputAudioBufferClear: + return unmarshalClientEvent[OutputAudioBufferClearEvent](data) + case ClientEventTypeConversationItemCreate: + return unmarshalClientEvent[ConversationItemCreateEvent](data) + case ClientEventTypeConversationItemRetrieve: + return unmarshalClientEvent[ConversationItemRetrieveEvent](data) + case ClientEventTypeConversationItemTruncate: + return unmarshalClientEvent[ConversationItemTruncateEvent](data) + case ClientEventTypeConversationItemDelete: + return unmarshalClientEvent[ConversationItemDeleteEvent](data) + case ClientEventTypeResponseCreate: + return unmarshalClientEvent[ResponseCreateEvent](data) + case ClientEventTypeResponseCancel: + return unmarshalClientEvent[ResponseCancelEvent](data) + default: + // We should probably return a generic event or error here, but for now just nil. + // Or maybe a "UnknownEvent" struct? + // For now matching the existing pattern + return nil, nil + } +} diff --git a/core/http/endpoints/openai/types/int_or_inf.go b/core/http/endpoints/openai/types/int_or_inf.go new file mode 100644 index 000000000..c2d57e255 --- /dev/null +++ b/core/http/endpoints/openai/types/int_or_inf.go @@ -0,0 +1,39 @@ +package types + +import ( + "encoding/json" + "math" +) + +const ( + // Inf is the maximum value for an IntOrInf. + Inf IntOrInf = math.MaxInt +) + +// IntOrInf is a type that can be either an int or "inf". +type IntOrInf int + +// IsInf returns true if the value is "inf". +func (m IntOrInf) IsInf() bool { + return m == Inf +} + +// MarshalJSON marshals the IntOrInf to JSON. +func (m IntOrInf) MarshalJSON() ([]byte, error) { + if m == Inf { + return []byte("\"inf\""), nil + } + return json.Marshal(int(m)) +} + +// UnmarshalJSON unmarshals the IntOrInf from JSON. 
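A small sketch of how the "inf" sentinel behaves with the marshal/unmarshal methods above and below, e.g. for a `max_response_output_tokens` style field:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
)

func main() {
	var maxTokens types.IntOrInf

	// Numeric values unmarshal as plain ints.
	_ = json.Unmarshal([]byte(`200`), &maxTokens)
	fmt.Println(int(maxTokens), maxTokens.IsInf()) // 200 false

	// The literal string "inf" maps to the sentinel Inf value.
	_ = json.Unmarshal([]byte(`"inf"`), &maxTokens)
	fmt.Println(maxTokens.IsInf()) // true

	// And it round-trips back to "inf" on output.
	out, _ := json.Marshal(maxTokens)
	fmt.Println(string(out)) // "inf"
}
```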
+func (m *IntOrInf) UnmarshalJSON(data []byte) error { + if string(data) == "\"inf\"" { + *m = Inf + return nil + } + if len(data) == 0 { + return nil + } + return json.Unmarshal(data, (*int)(m)) +} diff --git a/core/http/endpoints/openai/types/message_item.go b/core/http/endpoints/openai/types/message_item.go new file mode 100644 index 000000000..2b4f0c95f --- /dev/null +++ b/core/http/endpoints/openai/types/message_item.go @@ -0,0 +1,628 @@ +package types + +import ( + "encoding/json" + "errors" + "fmt" +) + +type MessageItemType string + +const ( + MessageItemTypeMessage MessageItemType = "message" + MessageItemTypeFunctionCall MessageItemType = "function_call" + MessageItemTypeFunctionCallOutput MessageItemType = "function_call_output" + MessageItemTypeMCPApprovalResponse MessageItemType = "mcp_approval_response" + MessageItemTypeMCPListTools MessageItemType = "mcp_list_tools" + MessageItemTypeMCPCall MessageItemType = "mcp_call" + MessageItemTypeMCPApprovalRequest MessageItemType = "mcp_approval_request" +) + +type MessageContentType string + +const ( + MessageContentTypeText MessageContentType = "text" + MessageContentTypeAudio MessageContentType = "audio" + MessageContentTypeTranscript MessageContentType = "transcript" + MessageContentTypeInputText MessageContentType = "input_text" + MessageContentTypeInputAudio MessageContentType = "input_audio" + MessageContentTypeOutputText MessageContentType = "output_text" + MessageContentTypeOutputAudio MessageContentType = "output_audio" +) + +type MessageContentText struct { + Text string `json:"text,omitempty"` +} + +type MessageContentAudio struct { + Type MessageContentType `json:"type,omitempty"` + Audio string `json:"audio,omitempty"` +} + +type MessageContentTranscript struct { + Type MessageContentType `json:"type,omitempty"` + Transcript string `json:"transcript,omitempty"` +} + +type MessageContentImage struct { + Type MessageContentType `json:"type,omitempty"` + ImageURL string `json:"image_url,omitempty"` + Detail ImageDetail `json:"detail,omitempty"` +} + +type MessageContentSystem MessageContentText + +type MessageItemSystem struct { + // The unique ID of the item. This may be provided by the client or generated by the server. + ID string `json:"id,omitempty"` + + // The content of the message. + Content []MessageContentSystem `json:"content,omitempty"` + + // Identifier for the API object being returned - always realtime.item. Optional when creating a new item. + Object string `json:"object,omitempty"` + + // The status of the item. Has no effect on the conversation. + Status ItemStatus `json:"status,omitempty"` +} + +func (m MessageItemSystem) MessageItemType() MessageItemType { + return MessageItemTypeMessage +} + +func (m MessageItemSystem) Role() MessageRole { + return MessageRoleSystem +} + +func (m MessageItemSystem) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemSystem + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + Role MessageRole `json:"role"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + Role: m.Role(), + } + return json.Marshal(shadow) +} + +type MessageItemUser struct { + // The unique ID of the item. This may be provided by the client or generated by the server. + ID string `json:"id,omitempty"` + + // The content of the message. + Content []MessageContentInput `json:"content,omitempty"` + + // Identifier for the API object being returned - always realtime.item. Optional when creating a new item. 
+ Object string `json:"object,omitempty"` + + // The status of the item. Has no effect on the conversation. + Status ItemStatus `json:"status,omitempty"` +} + +func (m MessageItemUser) MessageItemType() MessageItemType { + return MessageItemTypeMessage +} + +func (m MessageItemUser) Role() MessageRole { + return MessageRoleUser +} + +func (m MessageItemUser) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemUser + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + Role MessageRole `json:"role"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + Role: m.Role(), + } + return json.Marshal(shadow) +} + +type MessageItemAssistant struct { + // The unique ID of the item. This may be provided by the client or generated by the server. + ID string `json:"id,omitempty"` + + // The content of the message. + Content []MessageContentOutput `json:"content,omitempty"` + + // Identifier for the API object being returned - always realtime.item. Optional when creating a new item. + Object string `json:"object,omitempty"` + + // The status of the item. Has no effect on the conversation. + Status ItemStatus `json:"status,omitempty"` +} + +func (m MessageItemAssistant) MessageItemType() MessageItemType { + return MessageItemTypeMessage +} + +func (m MessageItemAssistant) Role() MessageRole { + return MessageRoleAssistant +} + +func (m MessageItemAssistant) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemAssistant + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + Role MessageRole `json:"role"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + Role: m.Role(), + } + return json.Marshal(shadow) +} + +type MessageContentInput struct { + // The content type (input_text, input_audio, or input_image). + Type MessageContentType `json:"type"` + + // Base64-encoded audio bytes (for input_audio), these will be parsed as the format specified in the session input audio type configuration. This defaults to PCM 16-bit 24kHz mono if not specified. + Audio string `json:"audio,omitempty"` + + // The detail level of the image (for input_image). auto will default to high. + Detail ImageDetail `json:"detail,omitempty"` + + // Base64-encoded image bytes (for input_image) as a data URI. For example data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA.... Supported formats are PNG and JPEG. + ImageURL string `json:"image_url,omitempty"` + + // The text content (for input_text). + Text string `json:"text,omitempty"` + + // Transcript of the audio (for input_audio). This is not sent to the model, but will be attached to the message item for reference. + Transcript string `json:"transcript,omitempty"` +} + +type MessageContentOutput struct { + // The content type (input_text, input_audio, or input_image). + Type MessageContentType `json:"type,omitempty"` + + // Base64-encoded audio bytes (for input_audio), these will be parsed as the format specified in the session input audio type configuration. This defaults to PCM 16-bit 24kHz mono if not specified. + Audio string `json:"audio,omitempty"` + + // The text content (for input_text). + Text string `json:"text,omitempty"` + + // Transcript of the audio (for input_audio). This is not sent to the model, but will be attached to the message item for reference. + Transcript string `json:"transcript,omitempty"` +} + +type MessageItemFunctionCall struct { + // The unique ID of the item. This may be provided by the client or generated by the server. 
+ ID string `json:"id,omitempty"` + + // The ID of the function call. + CallID string `json:"call_id,omitempty"` + + // The arguments of the function call. This is a JSON-encoded string representing the arguments passed to the function, for example {"arg1": "value1", "arg2": 42}. + Arguments string `json:"arguments,omitempty"` + + // The name of the function being called. + Name string `json:"name,omitempty"` + + // Identifier for the API object being returned - always realtime.item. Optional when creating a new item. + Object string `json:"object,omitempty"` + + // The status of the item. Has no effect on the conversation. + Status ItemStatus `json:"status,omitempty"` +} + +func (m MessageItemFunctionCall) MessageItemType() MessageItemType { + return MessageItemTypeFunctionCall +} + +func (m MessageItemFunctionCall) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemFunctionCall + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + } + return json.Marshal(shadow) +} + +type MessageItemFunctionCallOutput struct { + // The unique ID of the item. This may be provided by the client or generated by the server. + ID string `json:"id,omitempty"` + + // The ID of the function call this output is for. + CallID string `json:"call_id,omitempty"` + + // The output of the function call, this is free text and can contain any information or simply be empty. + Output string `json:"output,omitempty"` + + // Identifier for the API object being returned - always realtime.item. Optional when creating a new item. + Object string `json:"object,omitempty"` + + // The status of the item. Has no effect on the conversation. + Status ItemStatus `json:"status,omitempty"` +} + +func (m MessageItemFunctionCallOutput) MessageItemType() MessageItemType { + return MessageItemTypeFunctionCallOutput +} + +func (m MessageItemFunctionCallOutput) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemFunctionCallOutput + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + } + return json.Marshal(shadow) +} + +type MessageItemMCPApprovalResponse struct { + // The unique ID of the approval response. + ID string `json:"id,omitempty"` + + // The ID of the approval request being answered. + ApprovalRequestID string `json:"approval_request_id,omitempty"` + + // Whether the request was approved. + Approve bool `json:"approve,omitempty"` + + // Optional reason for the decision. + Reason string `json:"reason,omitempty"` +} + +func (m MessageItemMCPApprovalResponse) MessageItemType() MessageItemType { + return MessageItemTypeMCPApprovalResponse +} + +func (m MessageItemMCPApprovalResponse) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemMCPApprovalResponse + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + } + return json.Marshal(shadow) +} + +type MCPTool struct { + // JSON schema describing the tool's expected input shape. + InputSchema string `json:"input_schema,omitempty"` + + // The name of the MCP tool. + Name string `json:"name,omitempty"` + + // A human-readable description of what the tool does. + Description string `json:"description,omitempty"` + + // Additional metadata or annotations supplied by the server. 
+ Annotations any `json:"annotations,omitempty"` +} + +type MessageItemMCPListTools struct { + // The unique ID of the list. + ID string `json:"id,omitempty"` + + // The label of the MCP server. + ServerLabel string `json:"server_label,omitempty"` + + // The tools available on the server. + Tools []MCPTool `json:"tools,omitempty"` +} + +func (m MessageItemMCPListTools) MessageItemType() MessageItemType { + return MessageItemTypeMCPListTools +} + +func (m MessageItemMCPListTools) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemMCPListTools + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + } + return json.Marshal(shadow) +} + +type MCPErrorType string + +const ( + MCPErrorTypeProtocolError MCPErrorType = "protocol_error" + MCPErrorTypeToolExecution MCPErrorType = "tool_execution_error" + MCPErrorTypeHTTPError MCPErrorType = "http_error" +) + +type MCPProtocolError struct { + // Numeric error code (protocol-specific). + Code int `json:"code,omitempty"` + + // Human-readable error message. + Message string `json:"message,omitempty"` +} + +func (m MCPProtocolError) ErrorType() MCPErrorType { + return MCPErrorTypeProtocolError +} + +func (m MCPProtocolError) MarshalJSON() ([]byte, error) { + type typeAlias MCPProtocolError + type typeWrapper struct { + typeAlias + Type MCPErrorType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ErrorType(), + } + return json.Marshal(shadow) +} + +type MCPToolExecutionError struct { + // Human-readable error message from tool execution. + Message string `json:"message,omitempty"` +} + +func (m MCPToolExecutionError) ErrorType() MCPErrorType { + return MCPErrorTypeToolExecution +} + +func (m MCPToolExecutionError) MarshalJSON() ([]byte, error) { + type typeAlias MCPToolExecutionError + type typeWrapper struct { + typeAlias + Type MCPErrorType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ErrorType(), + } + return json.Marshal(shadow) +} + +type MCPHTTPError struct { + // HTTP status code returned by the upstream call. + Code int `json:"code,omitempty"` + + // Human-readable HTTP error message. + Message string `json:"message,omitempty"` +} + +func (m MCPHTTPError) ErrorType() MCPErrorType { + return MCPErrorTypeHTTPError +} + +func (m MCPHTTPError) MarshalJSON() ([]byte, error) { + type typeAlias MCPHTTPError + type typeWrapper struct { + typeAlias + Type MCPErrorType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ErrorType(), + } + return json.Marshal(shadow) +} + +type MCPError struct { + // Details when type is protocol_error. + Protocol *MCPProtocolError `json:",omitempty"` + + // Details when type is tool_execution_error. + ToolExecution *MCPToolExecutionError `json:",omitempty"` + + // Details when type is http_error. 
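A sketch of how the MCP error union behaves, assuming the `typeStruct` and `isNull` helpers referenced by the methods that follow are defined elsewhere in this change; the `type` field selects which pointer of the union is populated:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
)

func main() {
	raw := []byte(`{"type":"http_error","code":502,"message":"upstream MCP server unreachable"}`)

	var mcpErr types.MCPError
	if err := json.Unmarshal(raw, &mcpErr); err != nil {
		panic(err)
	}
	// Only the http_error variant is set.
	fmt.Println(mcpErr.HTTP != nil, mcpErr.Protocol == nil, mcpErr.ToolExecution == nil) // true true true

	// Marshalling emits whichever variant is populated, discriminator included.
	out, _ := json.Marshal(mcpErr)
	fmt.Println(string(out))
	// Prints something like:
	// {"code":502,"message":"upstream MCP server unreachable","type":"http_error"}
}
```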
+ HTTP *MCPHTTPError `json:",omitempty"` +} + +func (m MCPError) MarshalJSON() ([]byte, error) { + if m.Protocol != nil { + return json.Marshal(m.Protocol) + } + if m.ToolExecution != nil { + return json.Marshal(m.ToolExecution) + } + return json.Marshal(m.HTTP) +} + +func (m *MCPError) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + var u typeStruct + if err := json.Unmarshal(data, &u); err != nil { + return err + } + switch MCPErrorType(u.Type) { + case MCPErrorTypeProtocolError: + return json.Unmarshal(data, &m.Protocol) + case MCPErrorTypeToolExecution: + return json.Unmarshal(data, &m.ToolExecution) + case MCPErrorTypeHTTPError: + return json.Unmarshal(data, &m.HTTP) + default: + return errors.New("unknown error type: " + u.Type) + } +} + +type MessageItemMCPToolCall struct { + // The unique ID of the tool call. + ID string `json:"id,omitempty"` + + // The label of the MCP server running the tool. + ServerLabel string `json:"server_label,omitempty"` + + // A JSON string of the arguments passed to the tool. + Arguments string `json:"arguments,omitempty"` + + // The name of the tool that was run. + Name string `json:"name,omitempty"` + + // The ID of an associated approval request, if any. + ApprovalRequestID string `json:"approval_request_id,omitempty"` + + // The error from the tool call, if any. + Error *MCPProtocolError `json:"error,omitempty"` + + // The output from the tool call. + Output string `json:"output,omitempty"` +} + +func (m MessageItemMCPToolCall) MessageItemType() MessageItemType { + return MessageItemTypeMCPCall +} + +func (m MessageItemMCPToolCall) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemMCPToolCall + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + } + return json.Marshal(shadow) +} + +type MessageItemMCPApprovalRequest struct { + // The unique ID of the approval request. + ID string `json:"id,omitempty"` + + // The name of the tool to run. + Name string `json:"name,omitempty"` + + // A JSON string of arguments for the tool. + Arguments string `json:"arguments,omitempty"` + + // The label of the MCP server making the request. + ServerLabel string `json:"server_label,omitempty"` +} + +func (m MessageItemMCPApprovalRequest) MessageItemType() MessageItemType { + return MessageItemTypeMCPApprovalRequest +} + +func (m MessageItemMCPApprovalRequest) MarshalJSON() ([]byte, error) { + type typeAlias MessageItemMCPApprovalRequest + type typeWrapper struct { + typeAlias + Type MessageItemType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.MessageItemType(), + } + return json.Marshal(shadow) +} + +type MessageItemUnion struct { + // A system message in a Realtime conversation can be used to provide additional context or instructions to the model. This is similar but distinct from the instruction prompt provided at the start of a conversation, as system messages can be added at any point in the conversation. For major changes to the conversation's behavior, use instructions, but for smaller updates (e.g. "the user is now asking about a different topic"), use system messages. + System *MessageItemSystem `json:",omitempty"` + + // A user message item in a Realtime conversation. + User *MessageItemUser `json:",omitempty"` + + // An assistant message item in a Realtime conversation. + Assistant *MessageItemAssistant `json:",omitempty"` + + // A function call item in a Realtime conversation. 
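A minimal sketch of how this union round-trips a user message, assuming the exact key ordering shown in the output comment is only indicative; the variant's own marshaller adds the `type` and `role` discriminators, and UnmarshalJSON (below) routes on them to repopulate the right pointer:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
)

func main() {
	item := types.MessageItemUnion{
		User: &types.MessageItemUser{
			ID: "item_001", // hypothetical item ID
			Content: []types.MessageContentInput{
				{Type: types.MessageContentTypeInputText, Text: "What is the weather like?"},
			},
		},
	}

	wire, _ := json.Marshal(item)
	fmt.Println(string(wire))
	// Prints something like:
	// {"id":"item_001","content":[{"type":"input_text","text":"What is the weather like?"}],"type":"message","role":"user"}

	var round types.MessageItemUnion
	if err := json.Unmarshal(wire, &round); err != nil {
		panic(err)
	}
	fmt.Println(round.User != nil, round.Assistant == nil) // true true
}
```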
+ FunctionCall *MessageItemFunctionCall `json:",omitempty"` + + // A function call output item in a Realtime conversation. + FunctionCallOutput *MessageItemFunctionCallOutput `json:",omitempty"` + + // A Realtime item responding to an MCP approval request. + MCPApprovalResponse *MessageItemMCPApprovalResponse `json:",omitempty"` + + // A Realtime item listing tools available on an MCP server. + MCPListTools *MessageItemMCPListTools `json:",omitempty"` + + // A Realtime item representing an invocation of a tool on an MCP server. + MCPToolCall *MessageItemMCPToolCall `json:",omitempty"` + + // A Realtime item requesting human approval of a tool invocation. + MCPApprovalRequest *MessageItemMCPApprovalRequest `json:",omitempty"` +} + +func (m MessageItemUnion) MarshalJSON() ([]byte, error) { + switch { + case m.System != nil: + return json.Marshal(m.System) + case m.User != nil: + return json.Marshal(m.User) + case m.Assistant != nil: + return json.Marshal(m.Assistant) + case m.FunctionCall != nil: + return json.Marshal(m.FunctionCall) + case m.FunctionCallOutput != nil: + return json.Marshal(m.FunctionCallOutput) + case m.MCPApprovalResponse != nil: + return json.Marshal(m.MCPApprovalResponse) + case m.MCPListTools != nil: + return json.Marshal(m.MCPListTools) + case m.MCPToolCall != nil: + return json.Marshal(m.MCPToolCall) + case m.MCPApprovalRequest != nil: + return json.Marshal(m.MCPApprovalRequest) + default: + return nil, errors.New("unknown message item type") + } +} + +func (m *MessageItemUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + var t struct { + Type string `json:"type"` + Role string `json:"role"` + } + if err := json.Unmarshal(data, &t); err != nil { + return err + } + switch MessageItemType(t.Type) { + case MessageItemTypeMessage: + switch MessageRole(t.Role) { + case MessageRoleUser: + return json.Unmarshal(data, &m.User) + case MessageRoleAssistant: + return json.Unmarshal(data, &m.Assistant) + case MessageRoleSystem: + return json.Unmarshal(data, &m.System) + default: + return fmt.Errorf("unknown message role: %s", t.Role) + } + case MessageItemTypeFunctionCall: + return json.Unmarshal(data, &m.FunctionCall) + case MessageItemTypeFunctionCallOutput: + return json.Unmarshal(data, &m.FunctionCallOutput) + case MessageItemTypeMCPApprovalResponse: + return json.Unmarshal(data, &m.MCPApprovalResponse) + case MessageItemTypeMCPListTools: + return json.Unmarshal(data, &m.MCPListTools) + case MessageItemTypeMCPCall: + return json.Unmarshal(data, &m.MCPToolCall) + case MessageItemTypeMCPApprovalRequest: + return json.Unmarshal(data, &m.MCPApprovalRequest) + default: + return fmt.Errorf("unknown message item type: %s", t.Type) + } +} diff --git a/core/http/endpoints/openai/types/realtime.go b/core/http/endpoints/openai/types/realtime.go deleted file mode 100644 index a79d05d9c..000000000 --- a/core/http/endpoints/openai/types/realtime.go +++ /dev/null @@ -1,1188 +0,0 @@ -package types - -// Most of this file was coppied from https://github.com/WqyJh/go-openai-realtime -// Copyright (c) 2024 Qiying Wang MIT License - -import ( - "encoding/json" - "fmt" - "math" -) - -const ( - // Inf is the maximum value for an IntOrInf. - Inf IntOrInf = math.MaxInt -) - -// IntOrInf is a type that can be either an int or "inf". -type IntOrInf int - -// IsInf returns true if the value is "inf". -func (m IntOrInf) IsInf() bool { - return m == Inf -} - -// MarshalJSON marshals the IntOrInf to JSON. 
-func (m IntOrInf) MarshalJSON() ([]byte, error) { - if m == Inf { - return []byte("\"inf\""), nil - } - return json.Marshal(int(m)) -} - -// UnmarshalJSON unmarshals the IntOrInf from JSON. -func (m *IntOrInf) UnmarshalJSON(data []byte) error { - if string(data) == "\"inf\"" { - *m = Inf - return nil - } - if len(data) == 0 { - return nil - } - return json.Unmarshal(data, (*int)(m)) -} - -type AudioFormat string - -const ( - AudioFormatPcm16 AudioFormat = "pcm16" - AudioFormatG711Ulaw AudioFormat = "g711_ulaw" - AudioFormatG711Alaw AudioFormat = "g711_alaw" -) - -type Modality string - -const ( - ModalityText Modality = "text" - ModalityAudio Modality = "audio" -) - -type ClientTurnDetectionType string - -const ( - ClientTurnDetectionTypeServerVad ClientTurnDetectionType = "server_vad" -) - -type ServerTurnDetectionType string - -const ( - ServerTurnDetectionTypeNone ServerTurnDetectionType = "none" - ServerTurnDetectionTypeServerVad ServerTurnDetectionType = "server_vad" -) - -type TurnDetectionType string - -const ( - // TurnDetectionTypeNone means turn detection is disabled. - // This can only be used in ServerSession, not in ClientSession. - // If you want to disable turn detection, you should send SessionUpdateEvent with TurnDetection set to nil. - TurnDetectionTypeNone TurnDetectionType = "none" - // TurnDetectionTypeServerVad use server-side VAD to detect turn. - // This is default value for newly created session. - TurnDetectionTypeServerVad TurnDetectionType = "server_vad" -) - -type TurnDetectionParams struct { - // Activation threshold for VAD. - Threshold float64 `json:"threshold,omitempty"` - // Audio included before speech starts (in milliseconds). - PrefixPaddingMs int `json:"prefix_padding_ms,omitempty"` - // Duration of silence to detect speech stop (in milliseconds). - SilenceDurationMs int `json:"silence_duration_ms,omitempty"` - // Whether or not to automatically generate a response when VAD is enabled. true by default. - CreateResponse *bool `json:"create_response,omitempty"` -} - -type ClientTurnDetection struct { - // Type of turn detection, only "server_vad" is currently supported. - Type ClientTurnDetectionType `json:"type"` - - TurnDetectionParams -} - -type ServerTurnDetection struct { - // The type of turn detection ("server_vad" or "none"). - Type ServerTurnDetectionType `json:"type"` - - TurnDetectionParams -} - -type ToolType string - -const ( - ToolTypeFunction ToolType = "function" -) - -type ToolChoiceInterface interface { - ToolChoice() -} - -type ToolChoiceString string - -func (ToolChoiceString) ToolChoice() {} - -const ( - ToolChoiceAuto ToolChoiceString = "auto" - ToolChoiceNone ToolChoiceString = "none" - ToolChoiceRequired ToolChoiceString = "required" -) - -type ToolChoice struct { - Type ToolType `json:"type"` - Function ToolFunction `json:"function,omitempty"` -} - -func (t ToolChoice) ToolChoice() {} - -type ToolFunction struct { - Name string `json:"name"` -} - -type MessageRole string - -const ( - MessageRoleSystem MessageRole = "system" - MessageRoleAssistant MessageRole = "assistant" - MessageRoleUser MessageRole = "user" -) - -type InputAudioTranscription struct { - // The model used for transcription. 
- Model string `json:"model"` - Language string `json:"language,omitempty"` - Prompt string `json:"prompt,omitempty"` -} - -type Tool struct { - Type ToolType `json:"type"` - Name string `json:"name"` - Description string `json:"description"` - Parameters any `json:"parameters"` -} - -type MessageItemType string - -const ( - MessageItemTypeMessage MessageItemType = "message" - MessageItemTypeFunctionCall MessageItemType = "function_call" - MessageItemTypeFunctionCallOutput MessageItemType = "function_call_output" -) - -type MessageContentType string - -const ( - MessageContentTypeText MessageContentType = "text" - MessageContentTypeAudio MessageContentType = "audio" - MessageContentTypeTranscript MessageContentType = "transcript" - MessageContentTypeInputText MessageContentType = "input_text" - MessageContentTypeInputAudio MessageContentType = "input_audio" -) - -type MessageContentPart struct { - // The content type. - Type MessageContentType `json:"type"` - // The text content. Validated if type is text. - Text string `json:"text,omitempty"` - // Base64-encoded audio data. Validated if type is audio. - Audio string `json:"audio,omitempty"` - // The transcript of the audio. Validated if type is transcript. - Transcript string `json:"transcript,omitempty"` -} - -type MessageItem struct { - // The unique ID of the item. - ID string `json:"id,omitempty"` - // The type of the item ("message", "function_call", "function_call_output"). - Type MessageItemType `json:"type"` - // The final status of the item. - Status ItemStatus `json:"status,omitempty"` - // The role associated with the item. - Role MessageRole `json:"role,omitempty"` - // The content of the item. - Content []MessageContentPart `json:"content,omitempty"` - // The ID of the function call, if the item is a function call. - CallID string `json:"call_id,omitempty"` - // The name of the function, if the item is a function call. - Name string `json:"name,omitempty"` - // The arguments of the function, if the item is a function call. - Arguments string `json:"arguments,omitempty"` - // The output of the function, if the item is a function call output. - Output string `json:"output,omitempty"` -} - -type ResponseMessageItem struct { - MessageItem - // The object type, must be "realtime.item". - Object string `json:"object,omitempty"` -} - -type Error struct { - // The type of error (e.g., "invalid_request_error", "server_error"). - Message string `json:"message,omitempty"` - // Error code, if any. - Type string `json:"type,omitempty"` - // A human-readable error message. - Code string `json:"code,omitempty"` - // Parameter related to the error, if any. - Param string `json:"param,omitempty"` - // The event_id of the client event that caused the error, if applicable. - EventID string `json:"event_id,omitempty"` -} - -// ServerToolChoice is a type that can be used to choose a tool response from the server. -type ServerToolChoice struct { - String ToolChoiceString - Function ToolChoice -} - -// UnmarshalJSON is a custom unmarshaler for ServerToolChoice. -func (m *ServerToolChoice) UnmarshalJSON(data []byte) error { - err := json.Unmarshal(data, &m.Function) - if err != nil { - if data[0] == '"' { - data = data[1:] - } - if data[len(data)-1] == '"' { - data = data[:len(data)-1] - } - m.String = ToolChoiceString(data) - m.Function = ToolChoice{} - return nil - } - return nil -} - -// IsFunction returns true if the tool choice is a function call. 
-func (m *ServerToolChoice) IsFunction() bool { - return m.Function.Type == ToolTypeFunction -} - -// Get returns the ToolChoiceInterface based on the type of tool choice. -func (m ServerToolChoice) Get() ToolChoiceInterface { - if m.IsFunction() { - return m.Function - } - return m.String -} - -type ServerSession struct { - // The unique ID of the session. - ID string `json:"id"` - // The object type, must be "realtime.session". - Object string `json:"object"` - // The default model used for this session. - Model string `json:"model"` - // The set of modalities the model can respond with. - Modalities []Modality `json:"modalities,omitempty"` - // The default system instructions. - Instructions string `json:"instructions,omitempty"` - // The voice the model uses to respond - one of alloy, echo, or shimmer. - Voice string `json:"voice,omitempty"` - // The format of input audio. - InputAudioFormat AudioFormat `json:"input_audio_format,omitempty"` - // The format of output audio. - OutputAudioFormat AudioFormat `json:"output_audio_format,omitempty"` - // Configuration for input audio transcription. - InputAudioTranscription *InputAudioTranscription `json:"input_audio_transcription,omitempty"` - // Configuration for turn detection. - TurnDetection *ServerTurnDetection `json:"turn_detection,omitempty"` - // Tools (functions) available to the model. - Tools []Tool `json:"tools,omitempty"` - // How the model chooses tools. - ToolChoice ServerToolChoice `json:"tool_choice,omitempty"` - // Sampling temperature. - Temperature *float32 `json:"temperature,omitempty"` - // Maximum number of output tokens. - MaxOutputTokens IntOrInf `json:"max_response_output_tokens,omitempty"` -} - -type ItemStatus string - -const ( - ItemStatusInProgress ItemStatus = "in_progress" - ItemStatusCompleted ItemStatus = "completed" - ItemStatusIncomplete ItemStatus = "incomplete" -) - -type Conversation struct { - // The unique ID of the conversation. - ID string `json:"id"` - // The object type, must be "realtime.conversation". - Object string `json:"object"` -} - -type ResponseStatus string - -const ( - ResponseStatusInProgress ResponseStatus = "in_progress" - ResponseStatusCompleted ResponseStatus = "completed" - ResponseStatusCancelled ResponseStatus = "cancelled" - ResponseStatusIncomplete ResponseStatus = "incomplete" - ResponseStatusFailed ResponseStatus = "failed" -) - -type CachedTokensDetails struct { - TextTokens int `json:"text_tokens"` - AudioTokens int `json:"audio_tokens"` -} - -type InputTokenDetails struct { - CachedTokens int `json:"cached_tokens"` - TextTokens int `json:"text_tokens"` - AudioTokens int `json:"audio_tokens"` - CachedTokensDetails CachedTokensDetails `json:"cached_tokens_details,omitempty"` -} - -type OutputTokenDetails struct { - TextTokens int `json:"text_tokens"` - AudioTokens int `json:"audio_tokens"` -} - -type Usage struct { - TotalTokens int `json:"total_tokens"` - InputTokens int `json:"input_tokens"` - OutputTokens int `json:"output_tokens"` - // Input token details. - InputTokenDetails InputTokenDetails `json:"input_token_details,omitempty"` - // Output token details. - OutputTokenDetails OutputTokenDetails `json:"output_token_details,omitempty"` -} - -type Response struct { - // The unique ID of the response. - ID string `json:"id"` - // The object type, must be "realtime.response". - Object string `json:"object"` - // The status of the response. - Status ResponseStatus `json:"status"` - // Additional details about the status. 
- StatusDetails any `json:"status_details,omitempty"` - // The list of output items generated by the response. - Output []ResponseMessageItem `json:"output"` - // Usage statistics for the response. - Usage *Usage `json:"usage,omitempty"` -} - -type RateLimit struct { - // The name of the rate limit ("requests", "tokens", "input_tokens", "output_tokens"). - Name string `json:"name"` - // The maximum allowed value for the rate limit. - Limit int `json:"limit"` - // The remaining value before the limit is reached. - Remaining int `json:"remaining"` - // Seconds until the rate limit resets. - ResetSeconds float64 `json:"reset_seconds"` -} - -// ClientEventType is the type of client event. See https://platform.openai.com/docs/guides/realtime/client-events -type ClientEventType string - -const ( - ClientEventTypeSessionUpdate ClientEventType = "session.update" - ClientEventTypeTranscriptionSessionUpdate ClientEventType = "transcription_session.update" - ClientEventTypeInputAudioBufferAppend ClientEventType = "input_audio_buffer.append" - ClientEventTypeInputAudioBufferCommit ClientEventType = "input_audio_buffer.commit" - ClientEventTypeInputAudioBufferClear ClientEventType = "input_audio_buffer.clear" - ClientEventTypeConversationItemCreate ClientEventType = "conversation.item.create" - ClientEventTypeConversationItemTruncate ClientEventType = "conversation.item.truncate" - ClientEventTypeConversationItemDelete ClientEventType = "conversation.item.delete" - ClientEventTypeResponseCreate ClientEventType = "response.create" - ClientEventTypeResponseCancel ClientEventType = "response.cancel" -) - -// ClientEvent is the interface for client event. -type ClientEvent interface { - ClientEventType() ClientEventType -} - -// EventBase is the base struct for all client events. -type EventBase struct { - // Optional client-generated ID used to identify this event. - EventID string `json:"event_id,omitempty"` -} - -type ClientSession struct { - Model string `json:"model,omitempty"` - // The set of modalities the model can respond with. To disable audio, set this to ["text"]. - Modalities []Modality `json:"modalities,omitempty"` - // The default system instructions prepended to model calls. - Instructions string `json:"instructions,omitempty"` - // The voice the model uses to respond - one of alloy, echo, or shimmer. Cannot be changed once the model has responded with audio at least once. - Voice string `json:"voice,omitempty"` - // The format of input audio. Options are "pcm16", "g711_ulaw", or "g711_alaw". - InputAudioFormat AudioFormat `json:"input_audio_format,omitempty"` - // The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw". - OutputAudioFormat AudioFormat `json:"output_audio_format,omitempty"` - // Configuration for input audio transcription. Can be set to `nil` to turn off. - InputAudioTranscription *InputAudioTranscription `json:"input_audio_transcription,omitempty"` - // Configuration for turn detection. Can be set to `nil` to turn off. - TurnDetection *ClientTurnDetection `json:"turn_detection"` - // Tools (functions) available to the model. - Tools []Tool `json:"tools,omitempty"` - // How the model chooses tools. Options are "auto", "none", "required", or specify a function. - ToolChoice ToolChoiceInterface `json:"tool_choice,omitempty"` - // Sampling temperature for the model. - Temperature *float32 `json:"temperature,omitempty"` - // Maximum number of output tokens for a single assistant response, inclusive of tool calls. 
Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model. Defaults to "inf". - MaxOutputTokens IntOrInf `json:"max_response_output_tokens,omitempty"` -} - -type CreateSessionRequest struct { - ClientSession - - // The Realtime model used for this session. - Model string `json:"model,omitempty"` -} - -type ClientSecret struct { - // Ephemeral key usable in client environments to authenticate connections to the Realtime API. Use this in client-side environments rather than a standard API token, which should only be used server-side. - Value string `json:"value"` - // Timestamp for when the token expires. Currently, all tokens expire after one minute. - ExpiresAt int64 `json:"expires_at"` -} - -type CreateSessionResponse struct { - ServerSession - - // Ephemeral key returned by the API. - ClientSecret ClientSecret `json:"client_secret"` -} - -// SessionUpdateEvent is the event for session update. -// Send this event to update the session’s default configuration. -// See https://platform.openai.com/docs/api-reference/realtime-client-events/session/update -type SessionUpdateEvent struct { - EventBase - // Session configuration to update. - Session ClientSession `json:"session"` -} - -func (m SessionUpdateEvent) ClientEventType() ClientEventType { - return ClientEventTypeSessionUpdate -} - -func (m SessionUpdateEvent) MarshalJSON() ([]byte, error) { - type sessionUpdateEvent SessionUpdateEvent - v := struct { - *sessionUpdateEvent - Type ClientEventType `json:"type"` - }{ - sessionUpdateEvent: (*sessionUpdateEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -// InputAudioBufferAppendEvent is the event for input audio buffer append. -// Send this event to append audio bytes to the input audio buffer. -// See https://platform.openai.com/docs/api-reference/realtime-client-events/input_audio_buffer/append -type InputAudioBufferAppendEvent struct { - EventBase - Audio string `json:"audio"` // Base64-encoded audio bytes. -} - -func (m InputAudioBufferAppendEvent) ClientEventType() ClientEventType { - return ClientEventTypeInputAudioBufferAppend -} - -func (m InputAudioBufferAppendEvent) MarshalJSON() ([]byte, error) { - type inputAudioBufferAppendEvent InputAudioBufferAppendEvent - v := struct { - *inputAudioBufferAppendEvent - Type ClientEventType `json:"type"` - }{ - inputAudioBufferAppendEvent: (*inputAudioBufferAppendEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -// InputAudioBufferCommitEvent is the event for input audio buffer commit. -// Send this event to commit audio bytes to a user message. -// See https://platform.openai.com/docs/api-reference/realtime-client-events/input_audio_buffer/commit -type InputAudioBufferCommitEvent struct { - EventBase -} - -func (m InputAudioBufferCommitEvent) ClientEventType() ClientEventType { - return ClientEventTypeInputAudioBufferCommit -} - -func (m InputAudioBufferCommitEvent) MarshalJSON() ([]byte, error) { - type inputAudioBufferCommitEvent InputAudioBufferCommitEvent - v := struct { - *inputAudioBufferCommitEvent - Type ClientEventType `json:"type"` - }{ - inputAudioBufferCommitEvent: (*inputAudioBufferCommitEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -// InputAudioBufferClearEvent is the event for input audio buffer clear. -// Send this event to clear the audio bytes in the buffer. 
-// See https://platform.openai.com/docs/api-reference/realtime-client-events/input_audio_buffer/clear -type InputAudioBufferClearEvent struct { - EventBase -} - -func (m InputAudioBufferClearEvent) ClientEventType() ClientEventType { - return ClientEventTypeInputAudioBufferClear -} - -func (m InputAudioBufferClearEvent) MarshalJSON() ([]byte, error) { - type inputAudioBufferClearEvent InputAudioBufferClearEvent - v := struct { - *inputAudioBufferClearEvent - Type ClientEventType `json:"type"` - }{ - inputAudioBufferClearEvent: (*inputAudioBufferClearEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -// ConversationItemCreateEvent is the event for conversation item create. -// Send this event when adding an item to the conversation. -// See https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/create -type ConversationItemCreateEvent struct { - EventBase - // The ID of the preceding item after which the new item will be inserted. - PreviousItemID string `json:"previous_item_id,omitempty"` - // The item to add to the conversation. - Item MessageItem `json:"item"` -} - -func (m ConversationItemCreateEvent) ClientEventType() ClientEventType { - return ClientEventTypeConversationItemCreate -} - -func (m ConversationItemCreateEvent) MarshalJSON() ([]byte, error) { - type conversationItemCreateEvent ConversationItemCreateEvent - v := struct { - *conversationItemCreateEvent - Type ClientEventType `json:"type"` - }{ - conversationItemCreateEvent: (*conversationItemCreateEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -// ConversationItemTruncateEvent is the event for conversation item truncate. -// Send this event when you want to truncate a previous assistant message’s audio. -// See https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/truncate -type ConversationItemTruncateEvent struct { - EventBase - // The ID of the assistant message item to truncate. - ItemID string `json:"item_id"` - // The index of the content part to truncate. - ContentIndex int `json:"content_index"` - // Inclusive duration up to which audio is truncated, in milliseconds. - AudioEndMs int `json:"audio_end_ms"` -} - -func (m ConversationItemTruncateEvent) ClientEventType() ClientEventType { - return ClientEventTypeConversationItemTruncate -} - -func (m ConversationItemTruncateEvent) MarshalJSON() ([]byte, error) { - type conversationItemTruncateEvent ConversationItemTruncateEvent - v := struct { - *conversationItemTruncateEvent - Type ClientEventType `json:"type"` - }{ - conversationItemTruncateEvent: (*conversationItemTruncateEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -// ConversationItemDeleteEvent is the event for conversation item delete. -// Send this event when you want to remove any item from the conversation history. -// See https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/delete -type ConversationItemDeleteEvent struct { - EventBase - // The ID of the item to delete. 
- ItemID string `json:"item_id"` -} - -func (m ConversationItemDeleteEvent) ClientEventType() ClientEventType { - return ClientEventTypeConversationItemDelete -} - -func (m ConversationItemDeleteEvent) MarshalJSON() ([]byte, error) { - type conversationItemDeleteEvent ConversationItemDeleteEvent - v := struct { - *conversationItemDeleteEvent - Type ClientEventType `json:"type"` - }{ - conversationItemDeleteEvent: (*conversationItemDeleteEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -type ResponseCreateParams struct { - // The modalities for the response. - Modalities []Modality `json:"modalities,omitempty"` - // Instructions for the model. - Instructions string `json:"instructions,omitempty"` - // The voice the model uses to respond - one of alloy, echo, or shimmer. - Voice string `json:"voice,omitempty"` - // The format of output audio. - OutputAudioFormat AudioFormat `json:"output_audio_format,omitempty"` - // Tools (functions) available to the model. - Tools []Tool `json:"tools,omitempty"` - // How the model chooses tools. - ToolChoice ToolChoiceInterface `json:"tool_choice,omitempty"` - // Sampling temperature. - Temperature *float32 `json:"temperature,omitempty"` - // Maximum number of output tokens for a single assistant response, inclusive of tool calls. Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model. Defaults to "inf". - MaxOutputTokens IntOrInf `json:"max_output_tokens,omitempty"` -} - -// ResponseCreateEvent is the event for response create. -// Send this event to trigger a response generation. -// See https://platform.openai.com/docs/api-reference/realtime-client-events/response/create -type ResponseCreateEvent struct { - EventBase - // Configuration for the response. - Response ResponseCreateParams `json:"response"` -} - -func (m ResponseCreateEvent) ClientEventType() ClientEventType { - return ClientEventTypeResponseCreate -} - -func (m ResponseCreateEvent) MarshalJSON() ([]byte, error) { - type responseCreateEvent ResponseCreateEvent - v := struct { - *responseCreateEvent - Type ClientEventType `json:"type"` - }{ - responseCreateEvent: (*responseCreateEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -// ResponseCancelEvent is the event for response cancel. -// Send this event to cancel an in-progress response. -// See https://platform.openai.com/docs/api-reference/realtime-client-events/response/cancel -type ResponseCancelEvent struct { - EventBase - // A specific response ID to cancel - if not provided, will cancel an in-progress response in the default conversation. - ResponseID string `json:"response_id,omitempty"` -} - -func (m ResponseCancelEvent) ClientEventType() ClientEventType { - return ClientEventTypeResponseCancel -} - -func (m ResponseCancelEvent) MarshalJSON() ([]byte, error) { - type responseCancelEvent ResponseCancelEvent - v := struct { - *responseCancelEvent - Type ClientEventType `json:"type"` - }{ - responseCancelEvent: (*responseCancelEvent)(&m), - Type: m.ClientEventType(), - } - return json.Marshal(v) -} - -// MarshalClientEvent marshals the client event to JSON. 
-func MarshalClientEvent(event ClientEvent) ([]byte, error) { - return json.Marshal(event) -} - -type ServerEventType string - -const ( - ServerEventTypeError ServerEventType = "error" - ServerEventTypeSessionCreated ServerEventType = "session.created" - ServerEventTypeSessionUpdated ServerEventType = "session.updated" - ServerEventTypeTranscriptionSessionCreated ServerEventType = "transcription_session.created" - ServerEventTypeTranscriptionSessionUpdated ServerEventType = "transcription_session.updated" - ServerEventTypeConversationCreated ServerEventType = "conversation.created" - ServerEventTypeInputAudioBufferCommitted ServerEventType = "input_audio_buffer.committed" - ServerEventTypeInputAudioBufferCleared ServerEventType = "input_audio_buffer.cleared" - ServerEventTypeInputAudioBufferSpeechStarted ServerEventType = "input_audio_buffer.speech_started" - ServerEventTypeInputAudioBufferSpeechStopped ServerEventType = "input_audio_buffer.speech_stopped" - ServerEventTypeConversationItemCreated ServerEventType = "conversation.item.created" - ServerEventTypeConversationItemInputAudioTranscriptionCompleted ServerEventType = "conversation.item.input_audio_transcription.completed" - ServerEventTypeConversationItemInputAudioTranscriptionFailed ServerEventType = "conversation.item.input_audio_transcription.failed" - ServerEventTypeConversationItemTruncated ServerEventType = "conversation.item.truncated" - ServerEventTypeConversationItemDeleted ServerEventType = "conversation.item.deleted" - ServerEventTypeResponseCreated ServerEventType = "response.created" - ServerEventTypeResponseDone ServerEventType = "response.done" - ServerEventTypeResponseOutputItemAdded ServerEventType = "response.output_item.added" - ServerEventTypeResponseOutputItemDone ServerEventType = "response.output_item.done" - ServerEventTypeResponseContentPartAdded ServerEventType = "response.content_part.added" - ServerEventTypeResponseContentPartDone ServerEventType = "response.content_part.done" - ServerEventTypeResponseTextDelta ServerEventType = "response.text.delta" - ServerEventTypeResponseTextDone ServerEventType = "response.text.done" - ServerEventTypeResponseAudioTranscriptDelta ServerEventType = "response.audio_transcript.delta" - ServerEventTypeResponseAudioTranscriptDone ServerEventType = "response.audio_transcript.done" - ServerEventTypeResponseAudioDelta ServerEventType = "response.audio.delta" - ServerEventTypeResponseAudioDone ServerEventType = "response.audio.done" - ServerEventTypeResponseFunctionCallArgumentsDelta ServerEventType = "response.function_call_arguments.delta" - ServerEventTypeResponseFunctionCallArgumentsDone ServerEventType = "response.function_call_arguments.done" - ServerEventTypeRateLimitsUpdated ServerEventType = "rate_limits.updated" -) - -// ServerEvent is the interface for server events. -type ServerEvent interface { - ServerEventType() ServerEventType -} - -// ServerEventBase is the base struct for all server events. -type ServerEventBase struct { - // The unique ID of the server event. - EventID string `json:"event_id,omitempty"` - // The type of the server event. - Type ServerEventType `json:"type"` -} - -func (m ServerEventBase) ServerEventType() ServerEventType { - return m.Type -} - -// ErrorEvent is the event for error. -// Returned when an error occurs. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/error -type ErrorEvent struct { - ServerEventBase - // Details of the error. 
- Error Error `json:"error"` -} - -// SessionCreatedEvent is the event for session created. -// Returned when a session is created. Emitted automatically when a new connection is established. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/session/created -type SessionCreatedEvent struct { - ServerEventBase - // The session resource. - Session ServerSession `json:"session"` -} - -// TranscriptionSessionCreatedEvent is the event for session created. -// Returned when a transcription session is created. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/session/created -type TranscriptionSessionCreatedEvent struct { - ServerEventBase - // The transcription session resource. - Session ServerSession `json:"session"` -} - -// SessionUpdatedEvent is the event for session updated. -// Returned when a session is updated. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/session/updated -type SessionUpdatedEvent struct { - ServerEventBase - // The updated session resource. - Session ServerSession `json:"session"` -} - -// ConversationCreatedEvent is the event for conversation created. -// Returned when a conversation is created. Emitted right after session creation. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/created -type ConversationCreatedEvent struct { - ServerEventBase - // The conversation resource. - Conversation Conversation `json:"conversation"` -} - -// InputAudioBufferCommittedEvent is the event for input audio buffer committed. -// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/committed -type InputAudioBufferCommittedEvent struct { - ServerEventBase - // The ID of the preceding item after which the new item will be inserted. - PreviousItemID string `json:"previous_item_id,omitempty"` - // The ID of the user message item that will be created. - ItemID string `json:"item_id"` -} - -// InputAudioBufferClearedEvent is the event for input audio buffer cleared. -// Returned when the input audio buffer is cleared by the client. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/cleared -type InputAudioBufferClearedEvent struct { - ServerEventBase -} - -// InputAudioBufferSpeechStartedEvent is the event for input audio buffer speech started. -// Returned in server turn detection mode when speech is detected. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/speech_started -type InputAudioBufferSpeechStartedEvent struct { - ServerEventBase - // Milliseconds since the session started when speech was detected. - AudioStartMs int64 `json:"audio_start_ms"` - // The ID of the user message item that will be created when speech stops. - ItemID string `json:"item_id"` -} - -// InputAudioBufferSpeechStoppedEvent is the event for input audio buffer speech stopped. -// Returned in server turn detection mode when speech stops. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/speech_stopped -type InputAudioBufferSpeechStoppedEvent struct { - ServerEventBase - // Milliseconds since the session started when speech stopped. - AudioEndMs int64 `json:"audio_end_ms"` - // The ID of the user message item that will be created. 
- ItemID string `json:"item_id"` -} - -type ConversationItemCreatedEvent struct { - ServerEventBase - PreviousItemID string `json:"previous_item_id,omitempty"` - Item ResponseMessageItem `json:"item"` -} - -type ConversationItemInputAudioTranscriptionCompletedEvent struct { - ServerEventBase - ItemID string `json:"item_id"` - ContentIndex int `json:"content_index"` - Transcript string `json:"transcript"` -} - -type ConversationItemInputAudioTranscriptionFailedEvent struct { - ServerEventBase - ItemID string `json:"item_id"` - ContentIndex int `json:"content_index"` - Error Error `json:"error"` -} - -type ConversationItemTruncatedEvent struct { - ServerEventBase - ItemID string `json:"item_id"` // The ID of the assistant message item that was truncated. - ContentIndex int `json:"content_index"` // The index of the content part that was truncated. - AudioEndMs int `json:"audio_end_ms"` // The duration up to which the audio was truncated, in milliseconds. -} - -type ConversationItemDeletedEvent struct { - ServerEventBase - ItemID string `json:"item_id"` // The ID of the item that was deleted. -} - -// ResponseCreatedEvent is the event for response created. -// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress". -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/created -type ResponseCreatedEvent struct { - ServerEventBase - // The response resource. - Response Response `json:"response"` -} - -// ResponseDoneEvent is the event for response done. -// Returned when a Response is done streaming. Always emitted, no matter the final state. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/done -type ResponseDoneEvent struct { - ServerEventBase - // The response resource. - Response Response `json:"response"` -} - -// ResponseOutputItemAddedEvent is the event for response output item added. -// Returned when a new Item is created during response generation. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_item/added -type ResponseOutputItemAddedEvent struct { - ServerEventBase - // The ID of the response to which the item belongs. - ResponseID string `json:"response_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The item that was added. - Item ResponseMessageItem `json:"item"` -} - -// ResponseOutputItemDoneEvent is the event for response output item done. -// Returned when an Item is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_item/done -type ResponseOutputItemDoneEvent struct { - ServerEventBase - // The ID of the response to which the item belongs. - ResponseID string `json:"response_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The completed item. - Item ResponseMessageItem `json:"item"` -} - -// ResponseContentPartAddedEvent is the event for response content part added. -// Returned when a new content part is added to an assistant message item during response generation. 
-// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/content_part/added -type ResponseContentPartAddedEvent struct { - ServerEventBase - ResponseID string `json:"response_id"` - ItemID string `json:"item_id"` - OutputIndex int `json:"output_index"` - ContentIndex int `json:"content_index"` - Part MessageContentPart `json:"part"` -} - -// ResponseContentPartDoneEvent is the event for response content part done. -// Returned when a content part is done streaming in an assistant message item. Also emitted when a Response is interrupted, incomplete, or cancelled. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/content_part/done -type ResponseContentPartDoneEvent struct { - ServerEventBase - // The ID of the response. - ResponseID string `json:"response_id"` - // The ID of the item to which the content part was added. - ItemID string `json:"item_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The index of the content part in the item's content array. - ContentIndex int `json:"content_index"` - // The content part that was added. - Part MessageContentPart `json:"part"` -} - -// ResponseTextDeltaEvent is the event for response text delta. -// Returned when the text value of a "text" content part is updated. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/text/delta -type ResponseTextDeltaEvent struct { - ServerEventBase - ResponseID string `json:"response_id"` - ItemID string `json:"item_id"` - OutputIndex int `json:"output_index"` - ContentIndex int `json:"content_index"` - Delta string `json:"delta"` -} - -// ResponseTextDoneEvent is the event for response text done. -// Returned when the text value of a "text" content part is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/text/done -type ResponseTextDoneEvent struct { - ServerEventBase - ResponseID string `json:"response_id"` - ItemID string `json:"item_id"` - OutputIndex int `json:"output_index"` - ContentIndex int `json:"content_index"` - Text string `json:"text"` -} - -// ResponseAudioTranscriptDeltaEvent is the event for response audio transcript delta. -// Returned when the model-generated transcription of audio output is updated. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/audio_transcript/delta -type ResponseAudioTranscriptDeltaEvent struct { - ServerEventBase - // The ID of the response. - ResponseID string `json:"response_id"` - // The ID of the item. - ItemID string `json:"item_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The index of the content part in the item's content array. - ContentIndex int `json:"content_index"` - // The transcript delta. - Delta string `json:"delta"` -} - -// ResponseAudioTranscriptDoneEvent is the event for response audio transcript done. -// Returned when the model-generated transcription of audio output is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/audio_transcript/done -type ResponseAudioTranscriptDoneEvent struct { - ServerEventBase - // The ID of the response. - ResponseID string `json:"response_id"` - // The ID of the item. 
- ItemID string `json:"item_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The index of the content part in the item's content array. - ContentIndex int `json:"content_index"` - // The final transcript of the audio. - Transcript string `json:"transcript"` -} - -// ResponseAudioDeltaEvent is the event for response audio delta. -// Returned when the model-generated audio is updated. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/audio/delta -type ResponseAudioDeltaEvent struct { - ServerEventBase - // The ID of the response. - ResponseID string `json:"response_id"` - // The ID of the item. - ItemID string `json:"item_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The index of the content part in the item's content array. - ContentIndex int `json:"content_index"` - // Base64-encoded audio data delta. - Delta string `json:"delta"` -} - -// ResponseAudioDoneEvent is the event for response audio done. -// Returned when the model-generated audio is done. Also emitted when a Response is interrupted, incomplete, or cancelled. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/audio/done -type ResponseAudioDoneEvent struct { - ServerEventBase - // The ID of the response. - ResponseID string `json:"response_id"` - // The ID of the item. - ItemID string `json:"item_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The index of the content part in the item's content array. - ContentIndex int `json:"content_index"` -} - -// ResponseFunctionCallArgumentsDeltaEvent is the event for response function call arguments delta. -// Returned when the model-generated function call arguments are updated. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/function_call_arguments/delta -type ResponseFunctionCallArgumentsDeltaEvent struct { - ServerEventBase - // The ID of the response. - ResponseID string `json:"response_id"` - // The ID of the item. - ItemID string `json:"item_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The ID of the function call. - CallID string `json:"call_id"` - // The arguments delta as a JSON string. - Delta string `json:"delta"` -} - -// ResponseFunctionCallArgumentsDoneEvent is the event for response function call arguments done. -// Returned when the model-generated function call arguments are done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. -// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/function_call_arguments/done -type ResponseFunctionCallArgumentsDoneEvent struct { - ServerEventBase - // The ID of the response. - ResponseID string `json:"response_id"` - // The ID of the item. - ItemID string `json:"item_id"` - // The index of the output item in the response. - OutputIndex int `json:"output_index"` - // The ID of the function call. - CallID string `json:"call_id"` - // The final arguments as a JSON string. - Arguments string `json:"arguments"` - // The name of the function. Not shown in API reference but present in the actual event. - Name string `json:"name"` -} - -// RateLimitsUpdatedEvent is the event for rate limits updated. -// Emitted after every "response.done" event to indicate the updated rate limits. 
-// See https://platform.openai.com/docs/api-reference/realtime-server-events/rate_limits/updated -type RateLimitsUpdatedEvent struct { - ServerEventBase - // List of rate limit information. - RateLimits []RateLimit `json:"rate_limits"` -} - -type ServerEventInterface interface { - ErrorEvent | - SessionCreatedEvent | - SessionUpdatedEvent | - ConversationCreatedEvent | - InputAudioBufferCommittedEvent | - InputAudioBufferClearedEvent | - InputAudioBufferSpeechStartedEvent | - InputAudioBufferSpeechStoppedEvent | - ConversationItemCreatedEvent | - ConversationItemInputAudioTranscriptionCompletedEvent | - ConversationItemInputAudioTranscriptionFailedEvent | - ConversationItemTruncatedEvent | - ConversationItemDeletedEvent | - ResponseCreatedEvent | - ResponseDoneEvent | - ResponseOutputItemAddedEvent | - ResponseOutputItemDoneEvent | - ResponseContentPartAddedEvent | - ResponseContentPartDoneEvent | - ResponseTextDeltaEvent | - ResponseTextDoneEvent | - ResponseAudioTranscriptDeltaEvent | - ResponseAudioTranscriptDoneEvent | - ResponseAudioDeltaEvent | - ResponseAudioDoneEvent | - ResponseFunctionCallArgumentsDeltaEvent | - ResponseFunctionCallArgumentsDoneEvent | - RateLimitsUpdatedEvent -} - -func unmarshalServerEvent[T ServerEventInterface](data []byte) (T, error) { - var t T - err := json.Unmarshal(data, &t) - if err != nil { - return t, err - } - return t, nil -} - -// UnmarshalServerEvent unmarshals the server event from the given JSON data. -func UnmarshalServerEvent(data []byte) (ServerEvent, error) { //nolint:funlen,cyclop // TODO: optimize - var eventType struct { - Type ServerEventType `json:"type"` - } - err := json.Unmarshal(data, &eventType) - if err != nil { - return nil, err - } - switch eventType.Type { - case ServerEventTypeError: - return unmarshalServerEvent[ErrorEvent](data) - case ServerEventTypeSessionCreated: - return unmarshalServerEvent[SessionCreatedEvent](data) - case ServerEventTypeSessionUpdated: - return unmarshalServerEvent[SessionUpdatedEvent](data) - case ServerEventTypeConversationCreated: - return unmarshalServerEvent[ConversationCreatedEvent](data) - case ServerEventTypeInputAudioBufferCommitted: - return unmarshalServerEvent[InputAudioBufferCommittedEvent](data) - case ServerEventTypeInputAudioBufferCleared: - return unmarshalServerEvent[InputAudioBufferClearedEvent](data) - case ServerEventTypeInputAudioBufferSpeechStarted: - return unmarshalServerEvent[InputAudioBufferSpeechStartedEvent](data) - case ServerEventTypeInputAudioBufferSpeechStopped: - return unmarshalServerEvent[InputAudioBufferSpeechStoppedEvent](data) - case ServerEventTypeConversationItemCreated: - return unmarshalServerEvent[ConversationItemCreatedEvent](data) - case ServerEventTypeConversationItemInputAudioTranscriptionCompleted: - return unmarshalServerEvent[ConversationItemInputAudioTranscriptionCompletedEvent](data) - case ServerEventTypeConversationItemInputAudioTranscriptionFailed: - return unmarshalServerEvent[ConversationItemInputAudioTranscriptionFailedEvent](data) - case ServerEventTypeConversationItemTruncated: - return unmarshalServerEvent[ConversationItemTruncatedEvent](data) - case ServerEventTypeConversationItemDeleted: - return unmarshalServerEvent[ConversationItemDeletedEvent](data) - case ServerEventTypeResponseCreated: - return unmarshalServerEvent[ResponseCreatedEvent](data) - case ServerEventTypeResponseDone: - return unmarshalServerEvent[ResponseDoneEvent](data) - case ServerEventTypeResponseOutputItemAdded: - return 
unmarshalServerEvent[ResponseOutputItemAddedEvent](data)
-    case ServerEventTypeResponseOutputItemDone:
-        return unmarshalServerEvent[ResponseOutputItemDoneEvent](data)
-    case ServerEventTypeResponseContentPartAdded:
-        return unmarshalServerEvent[ResponseContentPartAddedEvent](data)
-    case ServerEventTypeResponseContentPartDone:
-        return unmarshalServerEvent[ResponseContentPartDoneEvent](data)
-    case ServerEventTypeResponseTextDelta:
-        return unmarshalServerEvent[ResponseTextDeltaEvent](data)
-    case ServerEventTypeResponseTextDone:
-        return unmarshalServerEvent[ResponseTextDoneEvent](data)
-    case ServerEventTypeResponseAudioTranscriptDelta:
-        return unmarshalServerEvent[ResponseAudioTranscriptDeltaEvent](data)
-    case ServerEventTypeResponseAudioTranscriptDone:
-        return unmarshalServerEvent[ResponseAudioTranscriptDoneEvent](data)
-    case ServerEventTypeResponseAudioDelta:
-        return unmarshalServerEvent[ResponseAudioDeltaEvent](data)
-    case ServerEventTypeResponseAudioDone:
-        return unmarshalServerEvent[ResponseAudioDoneEvent](data)
-    case ServerEventTypeResponseFunctionCallArgumentsDelta:
-        return unmarshalServerEvent[ResponseFunctionCallArgumentsDeltaEvent](data)
-    case ServerEventTypeResponseFunctionCallArgumentsDone:
-        return unmarshalServerEvent[ResponseFunctionCallArgumentsDoneEvent](data)
-    case ServerEventTypeRateLimitsUpdated:
-        return unmarshalServerEvent[RateLimitsUpdatedEvent](data)
-    default:
-        // This should never happen.
-        return nil, fmt.Errorf("unknown server event type: %s", eventType.Type)
-    }
-}
diff --git a/core/http/endpoints/openai/types/server_events.go b/core/http/endpoints/openai/types/server_events.go
new file mode 100644
index 000000000..bae680fd5
--- /dev/null
+++ b/core/http/endpoints/openai/types/server_events.go
@@ -0,0 +1,1500 @@
+package types
+
+import (
+    "encoding/json"
+    "fmt"
+)
+
+type ServerEventType string
+
+const (
+    ServerEventTypeError ServerEventType = "error"
+    ServerEventTypeSessionCreated ServerEventType = "session.created"
+    ServerEventTypeSessionUpdated ServerEventType = "session.updated"
+    ServerEventTypeConversationItemAdded ServerEventType = "conversation.item.added"
+    ServerEventTypeConversationItemDone ServerEventType = "conversation.item.done"
+    ServerEventTypeConversationItemRetrieved ServerEventType = "conversation.item.retrieved"
+    ServerEventTypeConversationItemInputAudioTranscriptionCompleted ServerEventType = "conversation.item.input_audio_transcription.completed"
+    ServerEventTypeConversationItemInputAudioTranscriptionDelta ServerEventType = "conversation.item.input_audio_transcription.delta"
+    ServerEventTypeConversationItemInputAudioTranscriptionSegment ServerEventType = "conversation.item.input_audio_transcription.segment"
+    ServerEventTypeConversationItemInputAudioTranscriptionFailed ServerEventType = "conversation.item.input_audio_transcription.failed"
+    ServerEventTypeConversationItemTruncated ServerEventType = "conversation.item.truncated"
+    ServerEventTypeConversationItemDeleted ServerEventType = "conversation.item.deleted"
+    ServerEventTypeInputAudioBufferCommitted ServerEventType = "input_audio_buffer.committed"
+    ServerEventTypeInputAudioBufferCleared ServerEventType = "input_audio_buffer.cleared"
+    ServerEventTypeInputAudioBufferSpeechStarted ServerEventType = "input_audio_buffer.speech_started"
+    ServerEventTypeInputAudioBufferSpeechStopped ServerEventType = "input_audio_buffer.speech_stopped"
+    ServerEventTypeInputAudioBufferTimeoutTriggered ServerEventType = "input_audio_buffer.timeout_triggered"
+    ServerEventTypeResponseCreated ServerEventType = "response.created"
+    ServerEventTypeResponseDone ServerEventType = "response.done"
+    ServerEventTypeResponseOutputItemAdded ServerEventType = "response.output_item.added"
+    ServerEventTypeResponseOutputItemDone ServerEventType = "response.output_item.done"
+    ServerEventTypeResponseContentPartAdded ServerEventType = "response.content_part.added"
+    ServerEventTypeResponseContentPartDone ServerEventType = "response.content_part.done"
+    ServerEventTypeResponseOutputTextDelta ServerEventType = "response.output_text.delta"
+    ServerEventTypeResponseOutputTextDone ServerEventType = "response.output_text.done"
+    ServerEventTypeResponseOutputAudioTranscriptDelta ServerEventType = "response.output_audio_transcript.delta"
+    ServerEventTypeResponseOutputAudioTranscriptDone ServerEventType = "response.output_audio_transcript.done"
+    ServerEventTypeResponseOutputAudioDelta ServerEventType = "response.output_audio.delta"
+    ServerEventTypeResponseOutputAudioDone ServerEventType = "response.output_audio.done"
+    ServerEventTypeResponseFunctionCallArgumentsDelta ServerEventType = "response.function_call_arguments.delta"
+    ServerEventTypeResponseFunctionCallArgumentsDone ServerEventType = "response.function_call_arguments.done"
+    ServerEventTypeResponseMcpCallArgumentsDelta ServerEventType = "response.mcp_call_arguments.delta"
+    ServerEventTypeResponseMcpCallArgumentsDone ServerEventType = "response.mcp_call_arguments.done"
+    ServerEventTypeResponseMcpCallInProgress ServerEventType = "response.mcp_call.in_progress"
+    ServerEventTypeResponseMcpCallCompleted ServerEventType = "response.mcp_call.completed"
+    ServerEventTypeResponseMcpCallFailed ServerEventType = "response.mcp_call.failed"
+    ServerEventTypeMcpListToolsInProgress ServerEventType = "mcp_list_tools.in_progress"
+    ServerEventTypeMcpListToolsCompleted ServerEventType = "mcp_list_tools.completed"
+    ServerEventTypeMcpListToolsFailed ServerEventType = "mcp_list_tools.failed"
+    ServerEventTypeRateLimitsUpdated ServerEventType = "rate_limits.updated"
+)
+
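+// A minimal, illustrative sketch of how a consumer of these events (for
+// example a client SDK or a test) might use the constants above: peek at the
+// wire-level "type" discriminator before choosing a concrete struct to decode
+// into. `data` is assumed to be a raw JSON payload read from the websocket:
+//
+//	var probe struct {
+//		Type ServerEventType `json:"type"`
+//	}
+//	if err := json.Unmarshal(data, &probe); err == nil && probe.Type == ServerEventTypeError {
+//		var ev ErrorEvent
+//		err = json.Unmarshal(data, &ev)
+//	}
+//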
+// ServerEvent is the interface for server events.
+type ServerEvent interface {
+    ServerEventType() ServerEventType
+}
+
+// ServerEventBase is the base struct for all server events.
+type ServerEventBase struct {
+    EventID string `json:"event_id,omitempty"`
+}
+
+// Returned when an error occurs, which could be a client problem or a server problem. Most errors are recoverable and the session will stay open; we recommend that implementors monitor and log error messages by default.
+//
+// See https://platform.openai.com/docs/api-reference/realtime-server-events/error
+type ErrorEvent struct {
+    ServerEventBase
+    // Details of the error.
+    Error Error `json:"error"`
+}
+
+func (m ErrorEvent) ServerEventType() ServerEventType {
+    return ServerEventTypeError
+}
+
+func (m ErrorEvent) MarshalJSON() ([]byte, error) {
+    type typeAlias ErrorEvent
+    type typeWrapper struct {
+        typeAlias
+        Type ServerEventType `json:"type"`
+    }
+    shadow := typeWrapper{
+        typeAlias: typeAlias(m),
+        Type: m.ServerEventType(),
+    }
+    return json.Marshal(shadow)
+}
+
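+// The event types in this file all marshal through the same typeAlias/typeWrapper
+// pattern used by ErrorEvent above: the wrapper embeds the event's own fields and
+// injects the "type" discriminator. As an illustration (the exact JSON shape of
+// the nested Error object is elided here), a value such as
+//
+//	ErrorEvent{ServerEventBase: ServerEventBase{EventID: "event_123"}, Error: Error{ /* ... */ }}
+//
+// is encoded by json.Marshal as
+//
+//	{"event_id":"event_123","error":{...},"type":"error"}
+//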
+// Returned when a Session is created. Emitted automatically when a new connection is established as the first server event. This event will contain the default Session configuration.
+//
+// See https://platform.openai.com/docs/api-reference/realtime-server-events/session/created
+type SessionCreatedEvent struct {
+    ServerEventBase
+    // The session resource.
+    Session SessionUnion `json:"session"`
+}
+
+func (m SessionCreatedEvent) ServerEventType() ServerEventType {
+    return ServerEventTypeSessionCreated
+}
+
+func (m SessionCreatedEvent) MarshalJSON() ([]byte, error) {
+    type typeAlias SessionCreatedEvent
+    type typeWrapper struct {
+        typeAlias
+        Type ServerEventType `json:"type"`
+    }
+    shadow := typeWrapper{
+        typeAlias: typeAlias(m),
+        Type: m.ServerEventType(),
+    }
+    return json.Marshal(shadow)
+}
+
+// Returned when a session is updated with a `session.update` event, unless there is an error.
+//
+// See https://platform.openai.com/docs/api-reference/realtime-server-events/session/updated
+type SessionUpdatedEvent struct {
+    ServerEventBase
+    // The updated session resource.
+    Session SessionUnion `json:"session"`
+}
+
+func (m SessionUpdatedEvent) ServerEventType() ServerEventType {
+    return ServerEventTypeSessionUpdated
+}
+
+func (m SessionUpdatedEvent) MarshalJSON() ([]byte, error) {
+    type typeAlias SessionUpdatedEvent
+    type typeWrapper struct {
+        typeAlias
+        Type ServerEventType `json:"type"`
+    }
+    shadow := typeWrapper{
+        typeAlias: typeAlias(m),
+        Type: m.ServerEventType(),
+    }
+    return json.Marshal(shadow)
+}
+
+// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode.
+//
+// The `item_id` property is the ID of the user message item that will be created, thus a `conversation.item.created` event will also be sent to the client.
+//
+// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/committed
+type InputAudioBufferCommittedEvent struct {
+    ServerEventBase
+    // The ID of the preceding item after which the new item will be inserted.
+    PreviousItemID string `json:"previous_item_id,omitempty"`
+    // The ID of the user message item that will be created.
+    ItemID string `json:"item_id"`
+}
+
+func (m InputAudioBufferCommittedEvent) ServerEventType() ServerEventType {
+    return ServerEventTypeInputAudioBufferCommitted
+}
+
+func (m InputAudioBufferCommittedEvent) MarshalJSON() ([]byte, error) {
+    type typeAlias InputAudioBufferCommittedEvent
+    type typeWrapper struct {
+        typeAlias
+        Type ServerEventType `json:"type"`
+    }
+    shadow := typeWrapper{
+        typeAlias: typeAlias(m),
+        Type: m.ServerEventType(),
+    }
+    return json.Marshal(shadow)
+}
+
+// Returned when the input audio buffer is cleared by the client with an `input_audio_buffer.clear` event.
+//
+// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/cleared
+type InputAudioBufferClearedEvent struct {
+    ServerEventBase
+}
+
+func (m InputAudioBufferClearedEvent) ServerEventType() ServerEventType {
+    return ServerEventTypeInputAudioBufferCleared
+}
+
+func (m InputAudioBufferClearedEvent) MarshalJSON() ([]byte, error) {
+    type typeAlias InputAudioBufferClearedEvent
+    type typeWrapper struct {
+        typeAlias
+        Type ServerEventType `json:"type"`
+    }
+    shadow := typeWrapper{
+        typeAlias: typeAlias(m),
+        Type: m.ServerEventType(),
+    }
+    return json.Marshal(shadow)
+}
+
+// Sent by the server when in `server_vad` mode to indicate that speech has been detected in the audio buffer.
+//
+// This can happen any time audio is added to the buffer (unless speech is already detected). The client may want to use this event to interrupt audio playback or provide visual feedback to the user.
+//
+// The client should expect to receive an `input_audio_buffer.speech_stopped` event when speech stops.
+// +// The `item_id` property is the ID of the user message item that will be created when speech stops and will also be included in the `input_audio_buffer.speech_stopped` event (unless the client manually commits the audio buffer during VAD activation). +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/speech_started +type InputAudioBufferSpeechStartedEvent struct { + ServerEventBase + // Milliseconds since the session started when speech was detected. + AudioStartMs int64 `json:"audio_start_ms"` + // The ID of the user message item that will be created when speech stops. + ItemID string `json:"item_id"` +} + +func (m InputAudioBufferSpeechStartedEvent) ServerEventType() ServerEventType { + return ServerEventTypeInputAudioBufferSpeechStarted +} + +func (m InputAudioBufferSpeechStartedEvent) MarshalJSON() ([]byte, error) { + type typeAlias InputAudioBufferSpeechStartedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned in `server_vad` mode when the server detects the end of speech in the audio buffer. +// +// The server will also send an `conversation.item.created` event with the user message item that is created from the audio buffer. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/speech_stopped +type InputAudioBufferSpeechStoppedEvent struct { + ServerEventBase + // Milliseconds since the session started when speech stopped. + AudioEndMs int64 `json:"audio_end_ms"` + // The ID of the user message item that will be created. + ItemID string `json:"item_id"` +} + +func (m InputAudioBufferSpeechStoppedEvent) ServerEventType() ServerEventType { + return ServerEventTypeInputAudioBufferSpeechStopped +} + +func (m InputAudioBufferSpeechStoppedEvent) MarshalJSON() ([]byte, error) { + type typeAlias InputAudioBufferSpeechStoppedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the Server VAD timeout is triggered for the input audio buffer. +// +// This is configured with `idle_timeout_ms` in the `turn_detection` settings of the session, and it indicates that there hasn't been any speech detected for the configured duration. +// +// The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio after the last model response up to the triggering time, as an offset from the beginning of audio written to the input audio buffer. +// +// This means it demarcates the segment of audio that was silent and the difference between the start and end values will roughly match the configured timeout. +// +// The empty audio will be committed to the conversation as an `input_audio` item (there will be a `input_audio_buffer.committed` event) and a model response will be generated. +// +// There may be speech that didn't trigger VAD but is still detected by the model, so the model may respond with something relevant to the conversation or a prompt to continue speaking. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/input_audio_buffer/timeout_triggered +type InputAudioBufferTimeoutTriggeredEvent struct { + ServerEventBase + // Milliseconds since the session started when speech started. 
+ AudioStartMs int64 `json:"audio_start_ms"` + // Milliseconds since the session started when speech stopped. + AudioEndMs int64 `json:"audio_end_ms"` + // The ID of the user message item that will be created. + ItemID string `json:"item_id"` +} + +func (m InputAudioBufferTimeoutTriggeredEvent) ServerEventType() ServerEventType { + return ServerEventTypeInputAudioBufferTimeoutTriggered +} + +func (m InputAudioBufferTimeoutTriggeredEvent) MarshalJSON() ([]byte, error) { + type typeAlias InputAudioBufferTimeoutTriggeredEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Sent by the server when an Item is added to the default Conversation. +// +// This can happen in several cases: +// +// - When the client sends a `conversation.item.create` event. +// +// - When the input audio buffer is committed. In this case the item will be a user message containing the audio from the buffer. +// +// - When the model is generating a Response. In this case the `conversation.item.added` event will be sent when the model starts generating a specific Item, and thus it will not yet have any content (and `status` will be `in_progress`). +// +// The event will include the full content of the Item (except when model is generating a Response) except for audio data, which can be retrieved separately with a `conversation.item.retrieve` event if necessary. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/added +type ConversationItemAddedEvent struct { + ServerEventBase + // The ID of the preceding item after which the new item will be inserted. + PreviousItemID string `json:"previous_item_id,omitempty"` + + // The item that was added. + Item MessageItemUnion `json:"item"` +} + +func (m ConversationItemAddedEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemAdded +} + +func (m ConversationItemAddedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemAddedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when a conversation item is finalized. +// +// The event will include the full content of the Item except for audio data, which can be retrieved separately with a `conversation.item.retrieve` event if needed. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/done +type ConversationItemDoneEvent struct { + ServerEventBase + // The ID of the preceding item after which the item appears. + PreviousItemID string `json:"previous_item_id,omitempty"` + + // The completed item. + Item MessageItemUnion `json:"item"` +} + +func (m ConversationItemDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemDone +} + +func (m ConversationItemDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when a conversation item is retrieved with `conversation.item.retrieve`. 
This is provided as a way to fetch the server's representation of an item, for example to get access to the post-processed audio data after noise cancellation and VAD. It includes the full content of the Item, including audio data. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/retrieved +type ConversationItemRetrievedEvent struct { + ServerEventBase + // The item that was retrieved. + Item MessageItemUnion `json:"item"` +} + +func (m ConversationItemRetrievedEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemRetrieved +} + +func (m ConversationItemRetrievedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemRetrievedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +type Logprobs struct { + // Raw byte sequence corresponding to the token (if applicable). + Bytes []byte `json:"bytes,omitempty"` + + // Log probability of the token or segment. + Logprob float64 `json:"logprob,omitempty"` + + // The decoded token text. + Token string `json:"token,omitempty"` +} + +// This event is the output of audio transcription for user audio written to the user audio buffer. Transcription begins when the input audio buffer is committed by the client or server (in `server_vad` mode). Transcription runs asynchronously with Response creation, so this event may come before or after the Response events. + +// Realtime API models accept audio natively, and thus input transcription is a separate process run on a separate ASR (Automatic Speech Recognition) model. The transcript may diverge somewhat from the model's interpretation, and should be treated as a rough guide. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/input_audio_transcription/completed +type ConversationItemInputAudioTranscriptionCompletedEvent struct { + ServerEventBase + // The ID of the item. + ItemID string `json:"item_id"` + + // The index of the content part in the item's content array. + ContentIndex int `json:"content_index"` + + // The final transcript of the audio. + Transcript string `json:"transcript"` + + // Log probability information for the transcription, if available. + Logprobs []Logprobs `json:"logprobs,omitempty"` + + // Usage information for the transcription, if available. + Usage *UsageUnion `json:"usage,omitempty"` +} + +func (m ConversationItemInputAudioTranscriptionCompletedEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemInputAudioTranscriptionCompleted +} + +func (m ConversationItemInputAudioTranscriptionCompletedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemInputAudioTranscriptionCompletedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the text value of an input audio transcription content part is updated with incremental transcription results. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/input_audio_transcription/delta +type ConversationItemInputAudioTranscriptionDeltaEvent struct { + ServerEventBase + // The ID of the item. + ItemID string `json:"item_id"` + + // The index of the content part in the item's content array. 
+ ContentIndex int `json:"content_index"` + + // The transcript delta. + Delta string `json:"delta"` + + // Log probability updates for the delta, if available. + Logprobs []Logprobs `json:"logprobs,omitempty"` +} + +func (m ConversationItemInputAudioTranscriptionDeltaEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemInputAudioTranscriptionDelta +} + +func (m ConversationItemInputAudioTranscriptionDeltaEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemInputAudioTranscriptionDeltaEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when an input audio transcription segment is identified for an item. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/input_audio_transcription/segment +type ConversationItemInputAudioTranscriptionSegmentEvent struct { + ServerEventBase + // The ID of the item. + ItemID string `json:"item_id"` + + // The index of the content part in the item's content array. + ContentIndex int `json:"content_index"` + + // Log probability information for the segment, if available. + Logprobs []Logprobs `json:"logprobs,omitempty"` + + // The unique ID of the transcript segment. + ID string `json:"id,omitempty"` + + // The speaker label for the segment, if available. + Speaker string `json:"speaker,omitempty"` + + // The start time of the segment in seconds. + Start float64 `json:"start,omitempty"` + + // The end time of the segment in seconds. + End float64 `json:"end,omitempty"` + + // The text content of the segment. + Text string `json:"text,omitempty"` +} + +func (m ConversationItemInputAudioTranscriptionSegmentEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemInputAudioTranscriptionSegment +} + +func (m ConversationItemInputAudioTranscriptionSegmentEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemInputAudioTranscriptionSegmentEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when input audio transcription is configured, and a transcription request for a user message failed. These events are separate from other error events so that the client can identify the related Item. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/input_audio_transcription/failed +type ConversationItemInputAudioTranscriptionFailedEvent struct { + ServerEventBase + // The ID of the item. + ItemID string `json:"item_id"` + + // The index of the content part in the item's content array. + ContentIndex int `json:"content_index"` + + // Details of the failure. 
+ Error Error `json:"error"` +} + +func (m ConversationItemInputAudioTranscriptionFailedEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemInputAudioTranscriptionFailed +} + +func (m ConversationItemInputAudioTranscriptionFailedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemInputAudioTranscriptionFailedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when an earlier assistant audio message item is truncated by the client with a `conversation.item.truncate` event. This event is used to synchronize the server's understanding of the audio with the client's playback. +// +// This action will truncate the audio and remove the server-side text transcript to ensure there is no text in the context that hasn't been heard by the user. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/truncated +type ConversationItemTruncatedEvent struct { + ServerEventBase + // The ID of the assistant message item that was truncated. + ItemID string `json:"item_id"` + + // The index of the content part that was truncated. + ContentIndex int `json:"content_index"` + + // The duration up to which the audio was truncated, in milliseconds. + AudioEndMs int `json:"audio_end_ms"` +} + +func (m ConversationItemTruncatedEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemTruncated +} + +func (m ConversationItemTruncatedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemTruncatedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when an item in the conversation is deleted by the client with a `conversation.item.delete` event. This event is used to synchronize the server's understanding of the conversation history with the client's view. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/deleted +type ConversationItemDeletedEvent struct { + ServerEventBase + // The ID of the item that was deleted. + ItemID string `json:"item_id"` +} + +func (m ConversationItemDeletedEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemDeleted +} + +func (m ConversationItemDeletedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemDeletedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of in_progress. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/created +type ResponseCreatedEvent struct { + ServerEventBase + // The response resource. 
+ Response Response `json:"response"` +} + +func (m ResponseCreatedEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseCreated +} + +func (m ResponseCreatedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseCreatedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when a Response is done streaming. Always emitted, no matter the final state. The Response object included in the response.done event will include all output Items in the Response but will omit the raw audio data. +// +// Clients should check the status field of the Response to determine if it was successful (completed) or if there was another outcome: cancelled, failed, or incomplete. +// +// A response will contain all output items that were generated during the response, excluding any audio content. +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/done +type ResponseDoneEvent struct { + ServerEventBase + // The response resource. + Response Response `json:"response"` +} + +func (m ResponseDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseDone +} + +func (m ResponseDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when a new Item is created during Response generation. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_item/added +type ResponseOutputItemAddedEvent struct { + ServerEventBase + // The ID of the response to which the item belongs. + ResponseID string `json:"response_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The item that was added. + Item MessageItemUnion `json:"item"` +} + +func (m ResponseOutputItemAddedEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseOutputItemAdded +} + +func (m ResponseOutputItemAddedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseOutputItemAddedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when an Item is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_item/done +type ResponseOutputItemDoneEvent struct { + ServerEventBase + // The ID of the response to which the item belongs. + ResponseID string `json:"response_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The completed item. 
+ Item MessageItemUnion `json:"item"` +} + +func (m ResponseOutputItemDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseOutputItemDone +} + +func (m ResponseOutputItemDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseOutputItemDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when a new content part is added to an assistant message item during response generation. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/content_part/added +type ResponseContentPartAddedEvent struct { + ServerEventBase + ResponseID string `json:"response_id"` + ItemID string `json:"item_id"` + OutputIndex int `json:"output_index"` + ContentIndex int `json:"content_index"` + Part MessageContentOutput `json:"part"` +} + +func (m ResponseContentPartAddedEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseContentPartAdded +} + +func (m ResponseContentPartAddedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseContentPartAddedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when a content part is done streaming in an assistant message item. Also emitted when a Response is interrupted, incomplete, or cancelled. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/content_part/done +type ResponseContentPartDoneEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item to which the content part was added. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The index of the content part in the item's content array. + ContentIndex int `json:"content_index"` + // The content part that was added. + Part MessageContentOutput `json:"part"` +} + +func (m ResponseContentPartDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseContentPartDone +} + +func (m ResponseContentPartDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseContentPartDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the text value of an "output_text" content part is updated. 
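+// Deltas for the same ItemID and ContentIndex should be concatenated in order; the complete text
+// is also delivered in the corresponding `response.output_text.done` event.
+//
+// A minimal client-side accumulation sketch (illustrative only; `ev` is a received event of this type):
+//
+//	text := map[string]string{} // keyed by item ID
+//	text[ev.ItemID] += ev.Delta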
+// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_text/delta +type ResponseOutputTextDeltaEvent struct { + ServerEventBase + ResponseID string `json:"response_id"` + ItemID string `json:"item_id"` + OutputIndex int `json:"output_index"` + ContentIndex int `json:"content_index"` + Delta string `json:"delta"` +} + +func (m ResponseOutputTextDeltaEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseOutputTextDelta +} + +func (m ResponseOutputTextDeltaEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseOutputTextDeltaEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the text value of an "output_text" content part is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_text/done +type ResponseOutputTextDoneEvent struct { + ServerEventBase + ResponseID string `json:"response_id"` + ItemID string `json:"item_id"` + OutputIndex int `json:"output_index"` + ContentIndex int `json:"content_index"` + Text string `json:"text"` +} + +func (m ResponseOutputTextDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseOutputTextDone +} + +func (m ResponseOutputTextDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseOutputTextDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the model-generated transcription of audio output is updated. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_audio_transcript/delta +type ResponseOutputAudioTranscriptDeltaEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The index of the content part in the item's content array. + ContentIndex int `json:"content_index"` + // The transcript delta. + Delta string `json:"delta"` +} + +func (m ResponseOutputAudioTranscriptDeltaEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseOutputAudioTranscriptDelta +} + +func (m ResponseOutputAudioTranscriptDeltaEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseOutputAudioTranscriptDeltaEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the model-generated transcription of audio output is done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_audio_transcript/done +type ResponseOutputAudioTranscriptDoneEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The index of the content part in the item's content array. 
+ ContentIndex int `json:"content_index"` + // The final transcript of the audio. + Transcript string `json:"transcript"` +} + +func (m ResponseOutputAudioTranscriptDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseOutputAudioTranscriptDone +} + +func (m ResponseOutputAudioTranscriptDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseOutputAudioTranscriptDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the model-generated audio is updated. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_audio/delta +type ResponseOutputAudioDeltaEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The index of the content part in the item's content array. + ContentIndex int `json:"content_index"` + // Base64-encoded audio data delta. + Delta string `json:"delta"` +} + +func (m ResponseOutputAudioDeltaEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseOutputAudioDelta +} + +func (m ResponseOutputAudioDeltaEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseOutputAudioDeltaEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the model-generated audio is done. Also emitted when a Response is interrupted, incomplete, or cancelled. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/output_audio/done +type ResponseOutputAudioDoneEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The index of the content part in the item's content array. + ContentIndex int `json:"content_index"` +} + +func (m ResponseOutputAudioDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseOutputAudioDone +} + +func (m ResponseOutputAudioDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseOutputAudioDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the model-generated function call arguments are updated. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/function_call_arguments/delta +type ResponseFunctionCallArgumentsDeltaEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The ID of the function call. + CallID string `json:"call_id"` + // The arguments delta as a JSON string. 
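+	// Deltas for a given CallID concatenate, in order, into the final JSON string reported by the
+	// corresponding `response.function_call_arguments.done` event.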
+ Delta string `json:"delta"` +} + +func (m ResponseFunctionCallArgumentsDeltaEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseFunctionCallArgumentsDelta +} + +func (m ResponseFunctionCallArgumentsDeltaEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseFunctionCallArgumentsDeltaEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when the model-generated function call arguments are done streaming. Also emitted when a Response is interrupted, incomplete, or cancelled. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/function_call_arguments/done +type ResponseFunctionCallArgumentsDoneEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The ID of the function call. + CallID string `json:"call_id"` + // The final arguments as a JSON string. + Arguments string `json:"arguments"` + // The name of the function. Not shown in API reference but present in the actual event. + Name string `json:"name"` +} + +func (m ResponseFunctionCallArgumentsDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseFunctionCallArgumentsDone +} + +func (m ResponseFunctionCallArgumentsDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseFunctionCallArgumentsDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when MCP tool call arguments are updated during response generation. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/mcp_call_arguments/delta +type ResponseMcpCallArgumentsDeltaEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The arguments delta as a JSON string. + Delta string `json:"delta"` + // Obfuscation + Obfuscation string `json:"obfuscation"` +} + +func (m ResponseMcpCallArgumentsDeltaEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseMcpCallArgumentsDelta +} + +func (m ResponseMcpCallArgumentsDeltaEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseMcpCallArgumentsDeltaEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when MCP tool call arguments are finalized during response generation. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/mcp_call_arguments/done +type ResponseMcpCallArgumentsDoneEvent struct { + ServerEventBase + // The ID of the response. + ResponseID string `json:"response_id"` + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` + // The final arguments as a JSON string. 
+ Arguments string `json:"arguments"` +} + +func (m ResponseMcpCallArgumentsDoneEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseMcpCallArgumentsDone +} + +func (m ResponseMcpCallArgumentsDoneEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseMcpCallArgumentsDoneEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when an MCP tool call has started and is in progress. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/mcp_call/in_progress +type ResponseMcpCallInProgressEvent struct { + ServerEventBase + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` +} + +func (m ResponseMcpCallInProgressEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseMcpCallInProgress +} + +func (m ResponseMcpCallInProgressEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseMcpCallInProgressEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when an MCP tool call has completed successfully. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/mcp_call/completed +type ResponseMcpCallCompletedEvent struct { + ServerEventBase + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` +} + +func (m ResponseMcpCallCompletedEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseMcpCallCompleted +} + +func (m ResponseMcpCallCompletedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseMcpCallCompletedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when an MCP tool call has failed. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/response/mcp_call/failed +type ResponseMcpCallFailedEvent struct { + ServerEventBase + // The ID of the item. + ItemID string `json:"item_id"` + // The index of the output item in the response. + OutputIndex int `json:"output_index"` +} + +func (m ResponseMcpCallFailedEvent) ServerEventType() ServerEventType { + return ServerEventTypeResponseMcpCallFailed +} + +func (m ResponseMcpCallFailedEvent) MarshalJSON() ([]byte, error) { + type typeAlias ResponseMcpCallFailedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when listing MCP tools is in progress for an item. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/mcp_list_tools/in_progress +type McpListToolsInProgressEvent struct { + ServerEventBase + // The ID of the MCP list tools item. 
+ ItemID string `json:"item_id"` +} + +func (m McpListToolsInProgressEvent) ServerEventType() ServerEventType { + return ServerEventTypeMcpListToolsInProgress +} + +func (m McpListToolsInProgressEvent) MarshalJSON() ([]byte, error) { + type typeAlias McpListToolsInProgressEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when listing MCP tools has completed for an item. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/mcp_list_tools/completed +type McpListToolsCompletedEvent struct { + ServerEventBase + // The ID of the MCP list tools item. + ItemID string `json:"item_id"` +} + +func (m McpListToolsCompletedEvent) ServerEventType() ServerEventType { + return ServerEventTypeMcpListToolsCompleted +} + +func (m McpListToolsCompletedEvent) MarshalJSON() ([]byte, error) { + type typeAlias McpListToolsCompletedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Returned when listing MCP tools has failed for an item. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/mcp_list_tools/failed +type McpListToolsFailedEvent struct { + ServerEventBase + // The ID of the MCP list tools item. + ItemID string `json:"item_id"` +} + +func (m McpListToolsFailedEvent) ServerEventType() ServerEventType { + return ServerEventTypeMcpListToolsFailed +} + +func (m McpListToolsFailedEvent) MarshalJSON() ([]byte, error) { + type typeAlias McpListToolsFailedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +// Emitted at the beginning of a Response to indicate the updated rate limits. When a Response is created some tokens will be "reserved" for the output tokens, the rate limits shown here reflect that reservation, which is then adjusted accordingly once the Response is completed. +// +// See https://platform.openai.com/docs/api-reference/realtime-server-events/rate_limits/updated +type RateLimitsUpdatedEvent struct { + ServerEventBase + // List of rate limit information. 
+ RateLimits []RateLimit `json:"rate_limits"` +} + +func (m RateLimitsUpdatedEvent) ServerEventType() ServerEventType { + return ServerEventTypeRateLimitsUpdated +} + +func (m RateLimitsUpdatedEvent) MarshalJSON() ([]byte, error) { + type typeAlias RateLimitsUpdatedEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + +type ServerEventInterface interface { + ErrorEvent | + SessionCreatedEvent | + SessionUpdatedEvent | + ConversationItemAddedEvent | + ConversationItemDoneEvent | + ConversationItemRetrievedEvent | + ConversationItemInputAudioTranscriptionCompletedEvent | + ConversationItemInputAudioTranscriptionDeltaEvent | + ConversationItemInputAudioTranscriptionSegmentEvent | + ConversationItemInputAudioTranscriptionFailedEvent | + ConversationItemTruncatedEvent | + ConversationItemDeletedEvent | + InputAudioBufferCommittedEvent | + InputAudioBufferClearedEvent | + InputAudioBufferSpeechStartedEvent | + InputAudioBufferSpeechStoppedEvent | + InputAudioBufferTimeoutTriggeredEvent | + ResponseCreatedEvent | + ResponseDoneEvent | + ResponseOutputItemAddedEvent | + ResponseOutputItemDoneEvent | + ResponseContentPartAddedEvent | + ResponseContentPartDoneEvent | + ResponseOutputTextDeltaEvent | + ResponseOutputTextDoneEvent | + ResponseOutputAudioTranscriptDeltaEvent | + ResponseOutputAudioTranscriptDoneEvent | + ResponseOutputAudioDeltaEvent | + ResponseOutputAudioDoneEvent | + ResponseFunctionCallArgumentsDeltaEvent | + ResponseFunctionCallArgumentsDoneEvent | + ResponseMcpCallArgumentsDeltaEvent | + ResponseMcpCallArgumentsDoneEvent | + ResponseMcpCallInProgressEvent | + ResponseMcpCallCompletedEvent | + ResponseMcpCallFailedEvent | + McpListToolsInProgressEvent | + McpListToolsCompletedEvent | + McpListToolsFailedEvent | + RateLimitsUpdatedEvent +} + +func unmarshalServerEvent[T ServerEventInterface](data []byte) (T, error) { + var t T + err := json.Unmarshal(data, &t) + if err != nil { + return t, err + } + return t, nil +} + +// UnmarshalServerEvent unmarshals the server event from the given JSON data. 
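+//
+// A minimal dispatch sketch (illustrative only; assumes `data` holds a single JSON-encoded event
+// read from the realtime WebSocket):
+//
+//	event, err := UnmarshalServerEvent(data)
+//	if err != nil {
+//		return err
+//	}
+//	switch ev := event.(type) {
+//	case ResponseOutputTextDeltaEvent:
+//		fmt.Print(ev.Delta) // stream assistant text as it arrives
+//	case ResponseDoneEvent:
+//		// the response has finished; all output items are in ev.Response
+//	}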
+func UnmarshalServerEvent(data []byte) (ServerEvent, error) { //nolint:funlen,cyclop,gocyclo // TODO: optimize + var eventType struct { + Type ServerEventType `json:"type"` + } + err := json.Unmarshal(data, &eventType) + if err != nil { + return nil, err + } + switch eventType.Type { + case ServerEventTypeError: + return unmarshalServerEvent[ErrorEvent](data) + + case ServerEventTypeSessionCreated: + return unmarshalServerEvent[SessionCreatedEvent](data) + + case ServerEventTypeSessionUpdated: + return unmarshalServerEvent[SessionUpdatedEvent](data) + + case ServerEventTypeConversationItemAdded: + return unmarshalServerEvent[ConversationItemAddedEvent](data) + + case ServerEventTypeConversationItemDone: + return unmarshalServerEvent[ConversationItemDoneEvent](data) + + case ServerEventTypeConversationItemRetrieved: + return unmarshalServerEvent[ConversationItemRetrievedEvent](data) + + case ServerEventTypeConversationItemInputAudioTranscriptionCompleted: + return unmarshalServerEvent[ConversationItemInputAudioTranscriptionCompletedEvent](data) + + case ServerEventTypeConversationItemInputAudioTranscriptionDelta: + return unmarshalServerEvent[ConversationItemInputAudioTranscriptionDeltaEvent](data) + + case ServerEventTypeConversationItemInputAudioTranscriptionSegment: + return unmarshalServerEvent[ConversationItemInputAudioTranscriptionSegmentEvent](data) + + case ServerEventTypeConversationItemInputAudioTranscriptionFailed: + return unmarshalServerEvent[ConversationItemInputAudioTranscriptionFailedEvent](data) + + case ServerEventTypeConversationItemTruncated: + return unmarshalServerEvent[ConversationItemTruncatedEvent](data) + + case ServerEventTypeConversationItemDeleted: + return unmarshalServerEvent[ConversationItemDeletedEvent](data) + + case ServerEventTypeInputAudioBufferCommitted: + return unmarshalServerEvent[InputAudioBufferCommittedEvent](data) + + case ServerEventTypeInputAudioBufferCleared: + return unmarshalServerEvent[InputAudioBufferClearedEvent](data) + + case ServerEventTypeInputAudioBufferSpeechStarted: + return unmarshalServerEvent[InputAudioBufferSpeechStartedEvent](data) + + case ServerEventTypeInputAudioBufferSpeechStopped: + return unmarshalServerEvent[InputAudioBufferSpeechStoppedEvent](data) + + case ServerEventTypeInputAudioBufferTimeoutTriggered: + return unmarshalServerEvent[InputAudioBufferTimeoutTriggeredEvent](data) + + case ServerEventTypeResponseCreated: + return unmarshalServerEvent[ResponseCreatedEvent](data) + + case ServerEventTypeResponseDone: + return unmarshalServerEvent[ResponseDoneEvent](data) + + case ServerEventTypeResponseOutputItemAdded: + return unmarshalServerEvent[ResponseOutputItemAddedEvent](data) + + case ServerEventTypeResponseOutputItemDone: + return unmarshalServerEvent[ResponseOutputItemDoneEvent](data) + + case ServerEventTypeResponseContentPartAdded: + return unmarshalServerEvent[ResponseContentPartAddedEvent](data) + + case ServerEventTypeResponseContentPartDone: + return unmarshalServerEvent[ResponseContentPartDoneEvent](data) + + case ServerEventTypeResponseOutputTextDelta: + return unmarshalServerEvent[ResponseOutputTextDeltaEvent](data) + + case ServerEventTypeResponseOutputTextDone: + return unmarshalServerEvent[ResponseOutputTextDoneEvent](data) + + case ServerEventTypeResponseOutputAudioTranscriptDelta: + return unmarshalServerEvent[ResponseOutputAudioTranscriptDeltaEvent](data) + + case ServerEventTypeResponseOutputAudioTranscriptDone: + return unmarshalServerEvent[ResponseOutputAudioTranscriptDoneEvent](data) + + case 
ServerEventTypeResponseOutputAudioDelta: + return unmarshalServerEvent[ResponseOutputAudioDeltaEvent](data) + + case ServerEventTypeResponseOutputAudioDone: + return unmarshalServerEvent[ResponseOutputAudioDoneEvent](data) + + case ServerEventTypeResponseFunctionCallArgumentsDelta: + return unmarshalServerEvent[ResponseFunctionCallArgumentsDeltaEvent](data) + + case ServerEventTypeResponseFunctionCallArgumentsDone: + return unmarshalServerEvent[ResponseFunctionCallArgumentsDoneEvent](data) + + case ServerEventTypeResponseMcpCallArgumentsDelta: + return unmarshalServerEvent[ResponseMcpCallArgumentsDeltaEvent](data) + + case ServerEventTypeResponseMcpCallArgumentsDone: + return unmarshalServerEvent[ResponseMcpCallArgumentsDoneEvent](data) + + case ServerEventTypeResponseMcpCallInProgress: + return unmarshalServerEvent[ResponseMcpCallInProgressEvent](data) + + case ServerEventTypeResponseMcpCallCompleted: + return unmarshalServerEvent[ResponseMcpCallCompletedEvent](data) + + case ServerEventTypeResponseMcpCallFailed: + return unmarshalServerEvent[ResponseMcpCallFailedEvent](data) + + case ServerEventTypeMcpListToolsInProgress: + return unmarshalServerEvent[McpListToolsInProgressEvent](data) + + case ServerEventTypeMcpListToolsCompleted: + return unmarshalServerEvent[McpListToolsCompletedEvent](data) + + case ServerEventTypeMcpListToolsFailed: + return unmarshalServerEvent[McpListToolsFailedEvent](data) + + case ServerEventTypeRateLimitsUpdated: + return unmarshalServerEvent[RateLimitsUpdatedEvent](data) + + default: + // This should never happen. + return nil, fmt.Errorf("unknown server event type: %s", eventType.Type) + } +} diff --git a/core/http/endpoints/openai/types/types.go b/core/http/endpoints/openai/types/types.go new file mode 100644 index 000000000..ee2e35e66 --- /dev/null +++ b/core/http/endpoints/openai/types/types.go @@ -0,0 +1,1196 @@ +package types + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" +) + +// The voice the model uses to respond. Voice cannot be changed during the session once the model has responded with audio at least once. Current voice options are alloy, ash, ballad, coral, echo, sage, shimmer, verse, marin, and cedar. We recommend marin and cedar for best quality. 
+type Voice string + +const ( + VoiceAlloy Voice = "alloy" + VoiceAsh Voice = "ash" + VoiceBallad Voice = "ballad" + VoiceCoral Voice = "coral" + VoiceEcho Voice = "echo" + VoiceSage Voice = "sage" + VoiceShimmer Voice = "shimmer" + VoiceVerse Voice = "verse" + VoiceMarin Voice = "marin" + VoiceCedar Voice = "cedar" + VoiceFable Voice = "fable" + VoiceOnyx Voice = "onyx" + VoiceNova Voice = "nova" +) + +type AudioFormat string + +const ( + AudioFormatPcm16 AudioFormat = "pcm16" + AudioFormatG711Ulaw AudioFormat = "g711_ulaw" + AudioFormatG711Alaw AudioFormat = "g711_alaw" +) + +type Modality string + +const ( + ModalityText Modality = "text" + ModalityAudio Modality = "audio" +) + +type TurnDetectionType string + +const ( + TurnDetectionTypeServerVad TurnDetectionType = "server_vad" + TurnDetectionTypeSemanticVad TurnDetectionType = "semantic_vad" +) + +type ToolChoiceMode string + +const ( + ToolChoiceModeNone ToolChoiceMode = "none" + ToolChoiceModeAuto ToolChoiceMode = "auto" + ToolChoiceModeRequired ToolChoiceMode = "required" +) + +func (t ToolChoiceMode) ToolChoiceType() string { + return string(t) +} + +type ToolChoiceType string + +const ( + ToolChoiceTypeFunction ToolChoiceType = "function" + ToolChoiceTypeMCP ToolChoiceType = "mcp" +) + +type ToolChoiceFunction struct { + // The name of the function to call. + Name string `json:"name,omitempty"` +} + +func (t ToolChoiceFunction) ToolChoiceType() string { + return string(ToolChoiceTypeFunction) +} + +func (t ToolChoiceFunction) MarshalJSON() ([]byte, error) { + type typeAlias ToolChoiceFunction + type typeWrapper struct { + typeAlias + Type string `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(t), + Type: t.ToolChoiceType(), + } + return json.Marshal(shadow) +} + +type ToolChoiceMCP struct { + // The label of the MCP server to use. + ServerLabel string `json:"server_label,omitempty"` + + // The name of the tool to call on the server. + Name string `json:"name,omitempty"` +} + +func (t ToolChoiceMCP) ToolChoiceType() string { + return string(ToolChoiceTypeMCP) +} + +func (t ToolChoiceMCP) MarshalJSON() ([]byte, error) { + type typeAlias ToolChoiceMCP + type typeWrapper struct { + typeAlias + Type string `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(t), + Type: t.ToolChoiceType(), + } + return json.Marshal(shadow) +} + +type ToolChoiceUnion struct { + // Controls which (if any) tool is called by the model. + // + // none means the model will not call any tool and instead generates a message. + // + // auto means the model can pick between generating a message or calling one or more tools. + // + // required means the model must call one or more tools. + Mode ToolChoiceMode `json:",omitempty"` + + // Use this option to force the model to call a specific function. + Function *ToolChoiceFunction `json:",omitempty"` + + // Use this option to force the model to call a specific tool on a remote MCP server. 
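+	// Wire form (hypothetical values): {"type": "mcp", "server_label": "my_server", "name": "my_tool"}.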
+ MCP *ToolChoiceMCP `json:",omitempty"` +} + +func (t ToolChoiceUnion) MarshalJSON() ([]byte, error) { + if t.Function != nil { + return json.Marshal(t.Function) + } + if t.MCP != nil { + return json.Marshal(t.MCP) + } + return json.Marshal(t.Mode) +} + +func (t *ToolChoiceUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + var u typeStruct + if err := json.Unmarshal(data, &u); err != nil { + t.Mode = ToolChoiceMode(bytes.Trim(data, "\"")) + return nil //nolint: nilerr // data is string instead of object + } + switch ToolChoiceType(u.Type) { + case ToolChoiceTypeFunction: + return json.Unmarshal(data, &t.Function) + case ToolChoiceTypeMCP: + return json.Unmarshal(data, &t.MCP) + default: + t.Mode = ToolChoiceMode(u.Type) + } + return nil +} + +type ToolType string + +const ( + ToolTypeFunction ToolType = "function" + ToolTypeMCP ToolType = "mcp" +) + +type ToolFunction struct { + // The name of the function. + Name string `json:"name"` + + // The description of the function, including guidance on when and how to call it, and guidance about what to tell the user when calling (if anything). + Description string `json:"description"` + + // The type of the tool, i.e. function. + Parameters any `json:"parameters"` +} + +func (t ToolFunction) ToolType() ToolType { + return ToolTypeFunction +} + +func (t ToolFunction) MarshalJSON() ([]byte, error) { + type typeAlias ToolFunction + type toolFunction struct { + typeAlias + Type ToolType `json:"type"` + } + shadow := toolFunction{ + typeAlias: typeAlias(t), + Type: t.ToolType(), + } + return json.Marshal(shadow) +} + +type MCPToolFilter struct { + // Indicates whether or not a tool modifies data or is read-only. If an MCP server is annotated with readOnlyHint, it will match this filter. + ReadOnly bool `json:"read_only,omitempty"` + + // List of allowed tool names. + ToolNames []string `json:"tool_names,omitempty"` +} + +type MCPAllowedToolsUnion struct { + // A string array of allowed tool names + ToolNames []string `json:",omitempty"` + + // A filter object to specify which tools are allowed. + Filter *MCPToolFilter `json:",omitempty"` +} + +func (t MCPAllowedToolsUnion) MarshalJSON() ([]byte, error) { + if len(t.ToolNames) > 0 { + return json.Marshal(t.ToolNames) + } + return json.Marshal(t.Filter) +} + +func (t *MCPAllowedToolsUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + if err := json.Unmarshal(data, &t.Filter); err == nil { + return nil + } + return json.Unmarshal(data, &t.ToolNames) +} + +type MCPRequireApprovalFilter struct { + // A filter object to specify which tools are allowed. + Always *MCPToolFilter `json:",omitempty"` + + // A filter object to specify which tools are allowed. + Never *MCPToolFilter `json:",omitempty"` +} + +type MCPToolRequireApprovalUnion struct { + // Specify which of the MCP server's tools require approval. Can be always, never, or a filter object associated with tools that require approval. + Filter *MCPRequireApprovalFilter `json:",omitempty"` + + // Specify a single approval policy for all tools. One of always or never. When set to always, all tools will require approval. When set to never, all tools will not require approval. 
+ Setting string `json:",omitempty"` +} + +func (t MCPToolRequireApprovalUnion) MarshalJSON() ([]byte, error) { + if t.Filter != nil { + return json.Marshal(t.Filter) + } + return json.Marshal(t.Setting) +} + +func (t *MCPToolRequireApprovalUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + if err := json.Unmarshal(data, &t.Filter); err == nil { + return nil + } + return json.Unmarshal(data, &t.Setting) +} + +type ToolMCP struct { + // A label for this MCP server, used to identify it in tool calls. + ServerLabel string `json:"server_label,omitempty"` + + // An OAuth access token that can be used with a remote MCP server, either with a custom MCP server URL or a service connector. Your application must handle the OAuth authorization flow and provide the token here. + Authorization string `json:"authorization,omitempty"` + + // Optional description of the MCP server, used to provide more context. + ServerDescription string `json:"server_description,omitempty"` + + // The URL for the MCP server. One of server_url or connector_id must be provided. + ServerURL string `json:"server_url,omitempty"` + + // List of allowed tool names or a filter object. + AllowedTools *MCPAllowedToolsUnion `json:"allowed_tools,omitempty"` + + // Optional HTTP headers to send to the MCP server. Use for authentication or other purposes. + Headers map[string]string `json:"headers,omitempty"` + + // Specify which of the MCP server's tools require approval. + RequireApproval *MCPToolRequireApprovalUnion `json:"require_approval,omitempty"` + + // Identifier for service connectors, like those available in ChatGPT. One of server_url or connector_id must be provided. Learn more about service connectors here. + // + // Currently supported connector_id values are: + // + // Dropbox: connector_dropbox + // Gmail: connector_gmail + // Google Calendar: connector_googlecalendar + // Google Drive: connector_googledrive + // Microsoft Teams: connector_microsoftteams + // Outlook Calendar: connector_outlookcalendar + // Outlook Email: connector_outlookemail + // SharePoint: connector_sharepoint + ConnectorID string `json:"connector_id,omitempty"` +} + +func (t ToolMCP) ToolType() ToolType { + return ToolTypeMCP +} + +func (t ToolMCP) MarshalJSON() ([]byte, error) { + type typeAlias ToolMCP + type toolMCP struct { + typeAlias + Type ToolType `json:"type"` + } + shadow := toolMCP{ + typeAlias: typeAlias(t), + Type: t.ToolType(), + } + return json.Marshal(shadow) +} + +type TracingConfiguration struct { + GroupID string `json:"group_id,omitempty"` + Metadata any `json:"metadata,omitempty"` + WorkflowName string `json:"workflow_name,omitempty"` +} + +type ToolUnion struct { + Function *ToolFunction `json:",omitempty"` + + // Give the model access to additional tools via remote Model Context Protocol (MCP) servers. Learn more about MCP. 
+ MCP *ToolMCP `json:",omitempty"` +} + +func (t ToolUnion) MarshalJSON() ([]byte, error) { + if t.Function != nil { + return json.Marshal(t.Function) + } + if t.MCP != nil { + return json.Marshal(t.MCP) + } + return nil, errors.New("no tool") +} + +func (t *ToolUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + var u typeStruct + if err := json.Unmarshal(data, &u); err != nil { + return err + } + switch ToolType(u.Type) { + case ToolTypeFunction: + return json.Unmarshal(data, &t.Function) + case ToolTypeMCP: + return json.Unmarshal(data, &t.MCP) + default: + return fmt.Errorf("unknown tool type: %s", u.Type) + } +} + +type TracingMode string + +const ( + TracingModeAuto = "auto" +) + +type TracingUnion struct { + Mode TracingMode `json:",omitempty"` + Configuration *TracingConfiguration `json:",omitempty"` +} + +type TruncationStrategy string + +const ( + TruncationStrategyAuto TruncationStrategy = "auto" + TruncationStrategyDisabled TruncationStrategy = "disabled" + TruncationStrategyRetentionRatio TruncationStrategy = "retention_ratio" +) + +func (t TruncationStrategy) TruncationStrategy() string { + return string(t) +} + +type RetentionRatioTruncation struct { + Ratio float32 `json:"retention_ratio,omitempty"` +} + +func (t RetentionRatioTruncation) TruncationStrategy() string { + return string(TruncationStrategyRetentionRatio) +} + +type TruncationUnion struct { + Strategy TruncationStrategy `json:",omitempty"` + RetentionRatioTruncation *RetentionRatioTruncation `json:",omitempty"` +} + +const nullString = "null" + +func isNull(data []byte) bool { + return len(data) == len(nullString) && string(data) == nullString +} + +func (t *TruncationUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + var u typeStruct + if err := json.Unmarshal(data, &u); err != nil { + t.Strategy = TruncationStrategy(bytes.Trim(data, "\"")) + return nil //nolint: nilerr // data is string instead of object + } + switch TruncationStrategy(u.Type) { + case TruncationStrategyRetentionRatio: + return json.Unmarshal(data, &t.RetentionRatioTruncation) + case TruncationStrategyDisabled, TruncationStrategyAuto: + t.Strategy = TruncationStrategy(data) + default: + return fmt.Errorf("unknown truncation strategy: %s", u.Type) + } + return nil +} + +type ResponseAudioOutput struct { + // The format of the output audio. + Format *AudioFormatUnion `json:"format,omitempty"` + + // The voice the model uses to respond. Voice cannot be changed during the session once the model has responded with audio at least once. Current voice options are alloy, ash, ballad, coral, echo, sage, shimmer, verse, marin, and cedar. We recommend marin and cedar for best quality. + Voice Voice `json:"voice,omitempty"` +} + +type ResponseAudio struct { + Output *ResponseAudioOutput `json:"output,omitempty"` +} + +type MessageRole string + +const ( + MessageRoleSystem MessageRole = "system" + MessageRoleAssistant MessageRole = "assistant" + MessageRoleUser MessageRole = "user" +) + +type Tool struct { + Type ToolType `json:"type"` + Name string `json:"name"` + Description string `json:"description"` + Parameters any `json:"parameters"` +} + +type ResponseMessageItem struct { + MessageItemUnion + // The object type, must be "realtime.item". + Object string `json:"object,omitempty"` +} + +type Error struct { + // The type of error (e.g., "invalid_request_error", "server_error"). + Message string `json:"message,omitempty"` + // Error code, if any. 
+ Type string `json:"type,omitempty"` + // A human-readable error message. + Code string `json:"code,omitempty"` + // Parameter related to the error, if any. + Param string `json:"param,omitempty"` + // The event_id of the client event that caused the error, if applicable. + EventID string `json:"event_id,omitempty"` +} + +type AudioFormatType string + +const ( + AudioFormatTypePCM AudioFormatType = "audio/pcm" + AudioFormatTypePCMU AudioFormatType = "audio/pcmu" + AudioFormatTypePCMA AudioFormatType = "audio/pcma" +) + +// The PCM audio format. Only a 24kHz sample rate is supported. +type AudioFormatPCM struct { + // The sample rate of the audio. Always 24000. + Rate int `json:"rate,omitempty"` +} + +func (p AudioFormatPCM) AudioFormat() string { + return string(AudioFormatTypePCM) +} + +func (p AudioFormatPCM) MarshalJSON() ([]byte, error) { + type typeAlias AudioFormatPCM + type typeWrapper struct { + typeAlias + Type string `json:"type,omitempty"` + } + return json.Marshal(typeWrapper{ + typeAlias: typeAlias(p), + Type: p.AudioFormat(), + }) +} + +// The G.711 μ-law format. +type AudioFormatPCMU struct { +} + +func (p AudioFormatPCMU) AudioFormat() string { + return string(AudioFormatTypePCMU) +} + +func (p AudioFormatPCMU) MarshalJSON() ([]byte, error) { + type typeAlias AudioFormatPCMU + type typeWrapper struct { + typeAlias + Type string `json:"type,omitempty"` + } + return json.Marshal(typeWrapper{ + typeAlias: typeAlias(p), + Type: p.AudioFormat(), + }) +} + +// The G.711 A-law format. +type AudioFormatPCMA struct { +} + +func (p AudioFormatPCMA) AudioFormat() string { + return string(AudioFormatTypePCMA) +} + +func (p AudioFormatPCMA) MarshalJSON() ([]byte, error) { + type typeAlias AudioFormatPCMA + type typeWrapper struct { + typeAlias + Type string `json:"type,omitempty"` + } + return json.Marshal(typeWrapper{ + typeAlias: typeAlias(p), + Type: p.AudioFormat(), + }) +} + +type AudioFormatUnion struct { + // The PCM audio format. Only a 24kHz sample rate is supported. + PCM *AudioFormatPCM `json:",omitempty"` + + // The G.711 μ-law format. + PCMU *AudioFormatPCMU `json:",omitempty"` + + // The G.711 A-law format. + PCMA *AudioFormatPCMA `json:",omitempty"` +} + +func (r AudioFormatUnion) MarshalJSON() ([]byte, error) { + if r.PCM != nil { + return json.Marshal(r.PCM) + } + if r.PCMU != nil { + return json.Marshal(r.PCMU) + } + if r.PCMA != nil { + return json.Marshal(r.PCMA) + } + return nil, errors.New("no audio format") +} + +func (r *AudioFormatUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + type typeStruct struct { + Type string `json:"type"` + } + var t typeStruct + if err := json.Unmarshal(data, &t); err != nil { + return err + } + switch AudioFormatType(t.Type) { + case AudioFormatTypePCM: + r.PCM = &AudioFormatPCM{} + return json.Unmarshal(data, r.PCM) + case AudioFormatTypePCMU: + r.PCMU = &AudioFormatPCMU{} + return json.Unmarshal(data, r.PCMU) + case AudioFormatTypePCMA: + r.PCMA = &AudioFormatPCMA{} + return json.Unmarshal(data, r.PCMA) + default: + return fmt.Errorf("unknown audio format: %s", t.Type) + } +} + +type AudioNoiseReduction struct { + // Type of noise reduction. near_field is for close-talking microphones such as headphones, far_field is for far-field microphones such as laptop or conference room microphones. + Type NoiseReductionType `json:"type,omitempty"` +} + +type ServerVad struct { + // Optional timeout after which a model response will be triggered automatically. 
This is useful for situations in which a long pause from the user is unexpected, such as a phone call. The model will effectively prompt the user to continue the conversation based on the current context. + // + // The timeout value will be applied after the last model response's audio has finished playing, i.e. it's set to the response.done time plus audio playback duration. + // + // An input_audio_buffer.timeout_triggered event (plus events associated with the Response) will be emitted when the timeout is reached. Idle timeout is currently only supported for server_vad mode. + IdleTimeoutMs int64 `json:"idle_timeout_ms,omitempty"` + + // Whether or not to automatically generate a response when a VAD stop event occurs. + CreateResponse bool `json:"create_response,omitempty"` + + // Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. conversation of auto) when a VAD start event occurs. + InterruptResponse bool `json:"interrupt_response,omitempty"` + + // Used only for server_vad mode. Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. + PrefixPaddingMs int64 `json:"prefix_padding_ms,omitempty"` + + // Used only for server_vad mode. Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. With shorter values the model will respond more quickly, but may jump in on short pauses from the user. + SilenceDurationMs int64 `json:"silence_duration_ms,omitempty"` + + // Used only for server_vad mode. Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments. + Threshold float64 `json:"threshold,omitempty"` +} + +func (r ServerVad) VadType() TurnDetectionType { + return TurnDetectionTypeServerVad +} + +func (r ServerVad) MarshalJSON() ([]byte, error) { + type typeAlias ServerVad + type typeWrapper struct { + typeAlias + Type TurnDetectionType `json:"type,omitempty"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(r), + Type: TurnDetectionTypeServerVad, + } + return json.Marshal(shadow) +} + +type RealtimeSessionSemanticVad struct { + // Whether or not to automatically generate a response when a VAD stop event occurs. + CreateResponse bool `json:"create_response,omitempty"` + + // Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. conversation of auto) when a VAD start event occurs. + InterruptResponse bool `json:"interrupt_response,omitempty"` + + // Used only for semantic_vad mode. The eagerness of the model to respond. low will wait longer for the user to continue speaking, high will respond more quickly. auto is the default and is equivalent to medium. low, medium, and high have max timeouts of 8s, 4s, and 2s respectively. + Eagerness string `json:"eagerness,omitempty"` +} + +func (r RealtimeSessionSemanticVad) VadType() TurnDetectionType { + return TurnDetectionTypeSemanticVad +} + +func (r RealtimeSessionSemanticVad) MarshalJSON() ([]byte, error) { + type typeAlias RealtimeSessionSemanticVad + type typeWrapper struct { + typeAlias + Type TurnDetectionType `json:"type,omitempty"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(r), + Type: TurnDetectionTypeSemanticVad, + } + return json.Marshal(shadow) +} + +type TurnDetectionUnion struct { + // Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence. 
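+	// Wire form example (values shown are the documented defaults):
+	// {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 500}.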
+	ServerVad *ServerVad `json:",omitempty"`
+
+	// Server-side semantic turn detection which uses a model to determine when the user has finished speaking.
+	SemanticVad *RealtimeSessionSemanticVad `json:",omitempty"`
+}
+
+func (r TurnDetectionUnion) MarshalJSON() ([]byte, error) {
+	if r.ServerVad != nil {
+		return json.Marshal(r.ServerVad)
+	}
+	if r.SemanticVad != nil {
+		return json.Marshal(r.SemanticVad)
+	}
+	return nil, errors.New("no turn detection")
+}
+
+func (r *TurnDetectionUnion) UnmarshalJSON(data []byte) error {
+	if isNull(data) {
+		return nil
+	}
+	var t typeStruct
+	if err := json.Unmarshal(data, &t); err != nil {
+		return err
+	}
+	switch TurnDetectionType(t.Type) {
+	case TurnDetectionTypeServerVad:
+		return json.Unmarshal(data, &r.ServerVad)
+	case TurnDetectionTypeSemanticVad:
+		return json.Unmarshal(data, &r.SemanticVad)
+	default:
+		return fmt.Errorf("unknown turn detection type: %s", t.Type)
+	}
+}
+
+type AudioTranscription struct {
+	// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
+	Language string `json:"language,omitempty"`
+
+	// An optional text to guide the model's style or continue a previous audio segment. For whisper-1, the prompt is a list of keywords. For gpt-4o-transcribe models (excluding gpt-4o-transcribe-diarize), the prompt is a free text string, for example "expect words related to technology".
+	Prompt string `json:"prompt,omitempty"`
+
+	// The model to use for transcription. Current options are whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe, and gpt-4o-transcribe-diarize. Use gpt-4o-transcribe-diarize when you need diarization with speaker labels.
+	Model string `json:"model,omitempty"`
+}
+
+type SessionAudioInput struct {
+	Format *AudioFormatUnion `json:"format,omitempty"`
+
+	// Configuration for input audio noise reduction. This can be set to null to turn off. Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio.
+	NoiseReduction *AudioNoiseReduction `json:"noise_reduction,omitempty"`
+
+	// Configuration for input audio transcription, defaults to off and can be set to null to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. Transcription runs asynchronously through the /audio/transcriptions endpoint and should be treated as guidance of input audio content rather than precisely what the model heard. The client can optionally set the language and prompt for transcription; these offer additional guidance to the transcription service.
+	Transcription *AudioTranscription `json:"transcription,omitempty"`
+
+	// Configuration for turn detection, either Server VAD or Semantic VAD. This can be set to null to turn off, in which case the client must manually trigger model response.
+	//
+	// Server VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.
+	//
+	// Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency.
+	TurnDetection *TurnDetectionUnion `json:"turn_detection,omitempty"`
+}
+
+type SessionAudioOutput struct {
+	Format *AudioFormatUnion `json:"format,omitempty"`
+	Speed  float32           `json:"speed,omitempty"`
+	Voice  Voice             `json:"voice,omitempty"`
+}
+
+type RealtimeSessionAudio struct {
+	Input  *SessionAudioInput  `json:"input,omitempty"`
+	Output *SessionAudioOutput `json:"output,omitempty"`
+}
+
+type TranscriptionSessionAudio struct {
+	Input *SessionAudioInput `json:"input,omitempty"`
+}
+
+type PromptInputType string
+
+const (
+	PromptInputTypeText  PromptInputType = "input_text"
+	PromptInputTypeImage PromptInputType = "input_image"
+	PromptInputTypeFile  PromptInputType = "input_file"
+)
+
+// The detail level of the image to be sent to the model. One of `high`, `low`, or
+// `auto`. Defaults to `auto`.
+type ImageDetail string
+
+const (
+	ImageDetailLow  ImageDetail = "low"
+	ImageDetailHigh ImageDetail = "high"
+	ImageDetailAuto ImageDetail = "auto"
+)
+
+type PromptInputText struct {
+	Text string `json:"text"`
+}
+
+func (r PromptInputText) PromptInputType() PromptInputType {
+	return PromptInputTypeText
+}
+
+func (r PromptInputText) MarshalJSON() ([]byte, error) {
+	type typeAlias PromptInputText
+	type typeWrapper struct {
+		typeAlias
+		Type PromptInputType `json:"type,omitempty"`
+	}
+	shadow := typeWrapper{
+		typeAlias: typeAlias(r),
+		Type:      r.PromptInputType(),
+	}
+	return json.Marshal(shadow)
+}
+
+type PromptInputImage struct {
+	Detail   ImageDetail `json:"detail,omitempty"`
+	FileID   string      `json:"file_id,omitempty"`
+	ImageURL string      `json:"image_url,omitempty"`
+}
+
+func (r PromptInputImage) PromptInputType() PromptInputType {
+	return PromptInputTypeImage
+}
+
+func (r PromptInputImage) MarshalJSON() ([]byte, error) {
+	type typeAlias PromptInputImage
+	type typeWrapper struct {
+		typeAlias
+		Type PromptInputType `json:"type,omitempty"`
+	}
+	shadow := typeWrapper{
+		typeAlias: typeAlias(r),
+		Type:      r.PromptInputType(),
+	}
+	return json.Marshal(shadow)
+}
+
+type PromptInputFile struct {
+	FileID   string `json:"file_id,omitempty"`
+	FileData string `json:"file_data,omitempty"`
+	FileURL  string `json:"file_url,omitempty"`
+	Filename string `json:"filename,omitempty"`
+}
+
+func (r PromptInputFile) PromptInputType() PromptInputType {
+	return PromptInputTypeFile
+}
+
+func (r PromptInputFile) MarshalJSON() ([]byte, error) {
+	type typeAlias PromptInputFile
+	type typeWrapper struct {
+		typeAlias
+		Type PromptInputType `json:"type,omitempty"`
+	}
+	shadow := typeWrapper{
+		typeAlias: typeAlias(r),
+		Type:      r.PromptInputType(),
+	}
+	return json.Marshal(shadow)
+}
+
+type PromptVariableUnion struct {
+	String     string            `json:",omitempty"`
+	InputText  *PromptInputText  `json:",omitempty"`
+	InputImage *PromptInputImage `json:",omitempty"`
+	InputFile  *PromptInputFile  `json:",omitempty"`
+}
+
+type typeStruct struct {
+	Type string `json:"type"`
+}
+
+func (u *PromptVariableUnion) UnmarshalJSON(data []byte) error {
+	if isNull(data) {
+		return nil
+	}
+	// Prompt variables may be plain JSON strings as well as typed input objects.
+	if len(data) > 0 && data[0] == '"' {
+		return json.Unmarshal(data, &u.String)
+	}
+	var t typeStruct
+	if err := json.Unmarshal(data, &t); err != nil {
+		return err
+	}
+	switch PromptInputType(t.Type) {
+	case PromptInputTypeText:
+		u.InputText = &PromptInputText{}
+		return json.Unmarshal(data, u.InputText)
+	case PromptInputTypeImage:
+		u.InputImage = &PromptInputImage{}
+		return json.Unmarshal(data, u.InputImage)
+	case PromptInputTypeFile:
+		u.InputFile = &PromptInputFile{}
+		return json.Unmarshal(data, u.InputFile)
+	default:
+
return fmt.Errorf("unknown input type: %s", t.Type) + } +} + +type PromptReference struct { + // The unique identifier of the prompt template to use. + ID string `json:"id,omitempty"` + + // Optional version of the prompt template. + Version string `json:"version,omitempty"` + + // Optional map of values to substitute in for variables in your prompt. The substitution values can either be strings, or other Response input types like images or files. + Variables map[string]PromptVariableUnion `json:"variables,omitempty"` +} + +type SessionType string + +const ( + SessionTypeRealtime SessionType = "realtime" + SessionTypeTranscription SessionType = "transcription" +) + +type RealtimeSession struct { + // Unique identifier for the session that looks like sess_1234567890abcdef. + ID string `json:"id,omitempty"` + + // Expiration timestamp for the session, in seconds since epoch. + ExpiresAt int64 `json:"expires_at,omitempty"` + + // The object type. Always realtime.session. + Object string `json:"object,omitempty"` + + // Configuration for input and output audio. + Audio *RealtimeSessionAudio `json:"audio,omitempty"` + + // Additional fields to include in server outputs. + // + // `item.input_audio_transcription.logprobs`: Include logprobs for input audio + // transcription. + // + // Any of "item.input_audio_transcription.logprobs". + Include []string `json:"include,omitempty"` + + // The default system instructions (i.e. system message) prepended to model calls. This field allows the client to guide the model on desired responses. The model can be instructed on response content and format, (e.g. "be extremely succinct", "act friendly", "here are examples of good responses") and on audio behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The instructions are not guaranteed to be followed by the model, but they provide guidance to the model on the desired behavior. + // + // Note that the server sets default instructions which will be used if this field is not set and are visible in the session.created event at the start of the session. + Instructions string `json:"instructions,omitempty"` + + // Maximum number of output tokens for a single assistant response, inclusive of tool calls. Provide an integer between 1 and 4096 to limit output tokens, or inf for the maximum available tokens for a given model. Defaults to inf. + MaxOutputTokens IntOrInf `json:"max_output_tokens,omitempty"` + + // The Realtime model used for this session. + Model string `json:"model,omitempty"` + + // The set of modalities the model can respond with. It defaults to ["audio"], indicating that the model will respond with audio plus a transcript. ["text"] can be used to make the model respond with text only. It is not possible to request both text and audio at the same time. + OutputModalities []Modality `json:"output_modalities,omitempty"` + + // Reference to a prompt template and its variables. + Prompt *PromptReference `json:"prompt,omitempty"` + + // How the model chooses tools. Provide one of the string modes or force a specific function/MCP tool. + ToolChoice *ToolChoiceUnion `json:"tool_choice,omitempty"` + + // Tools available to the model. + Tools []ToolUnion `json:"tools,omitempty"` + + // Realtime API can write session traces to the Traces Dashboard. Set to null to disable tracing. Once tracing is enabled for a session, the configuration cannot be modified. + // + // auto will create a trace for the session with default values for the workflow name, group id, and metadata. 
+ Tracing *TracingUnion `json:"tracing,omitempty"` + + // Controls how the realtime conversation is truncated prior to model inference. The default is auto. + Truncation *TruncationUnion `json:"truncation,omitempty"` +} + +func (r RealtimeSession) Type() SessionType { + return SessionTypeRealtime +} + +func (r RealtimeSession) MarshalJSON() ([]byte, error) { + type typeAlias RealtimeSession + type typeWrapper struct { + typeAlias + Type SessionType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(r), + Type: r.Type(), + } + return json.Marshal(shadow) +} + +type TranscriptionSession struct { + // Unique identifier for the session that looks like sess_1234567890abcdef. + ID string `json:"id,omitempty"` + + // Expiration timestamp for the session, in seconds since epoch. + ExpiresAt int64 `json:"expires_at,omitempty"` + + // The object type. Always realtime.transcription_session. + Object string `json:"object,omitempty"` + + // Configuration for input audio. + Audio *TranscriptionSessionAudio `json:"audio,omitempty"` + + // Additional fields to include in server outputs. + // + // `item.input_audio_transcription.logprobs`: Include logprobs for input audio + // transcription. + // + // Any of "item.input_audio_transcription.logprobs". + Include []string `json:"include,omitempty"` +} + +func (r TranscriptionSession) Type() SessionType { + return SessionTypeTranscription +} + +func (r TranscriptionSession) MarshalJSON() ([]byte, error) { + type typeAlias TranscriptionSession + type typeWrapper struct { + typeAlias + Type SessionType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(r), + Type: r.Type(), + } + return json.Marshal(shadow) +} + +type SessionUnion struct { + // Realtime session object configuration. + Realtime *RealtimeSession `json:"realtime,omitempty"` + + // Realtime transcription session object configuration. + Transcription *TranscriptionSession `json:"transcription,omitempty"` +} + +func (r SessionUnion) MarshalJSON() ([]byte, error) { + if r.Realtime != nil { + return json.Marshal(r.Realtime) + } + if r.Transcription != nil { + return json.Marshal(r.Transcription) + } + return nil, errors.New("no session type") +} + +func (r *SessionUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + var t typeStruct + if err := json.Unmarshal(data, &t); err != nil { + return err + } + switch SessionType(t.Type) { + case SessionTypeRealtime: + return json.Unmarshal(data, &r.Realtime) + case SessionTypeTranscription: + return json.Unmarshal(data, &r.Transcription) + default: + return fmt.Errorf("unknown session type: %s", t.Type) + } +} + +type ItemStatus string + +const ( + ItemStatusInProgress ItemStatus = "in_progress" + ItemStatusCompleted ItemStatus = "completed" + ItemStatusIncomplete ItemStatus = "incomplete" +) + +type Conversation struct { + // The unique ID of the conversation. + ID string `json:"id"` + // The object type, must be "realtime.conversation". 
+ Object string `json:"object"` +} + +type ResponseStatus string + +const ( + ResponseStatusInProgress ResponseStatus = "in_progress" + ResponseStatusCompleted ResponseStatus = "completed" + ResponseStatusCancelled ResponseStatus = "cancelled" + ResponseStatusIncomplete ResponseStatus = "incomplete" + ResponseStatusFailed ResponseStatus = "failed" +) + +type UsageType string + +const ( + UsageTypeTokens UsageType = "tokens" + UsageTypeDuration UsageType = "duration" +) + +type CachedTokensDetails struct { + TextTokens int `json:"text_tokens"` + AudioTokens int `json:"audio_tokens"` +} + +type InputTokenDetails struct { + CachedTokens int `json:"cached_tokens"` + TextTokens int `json:"text_tokens"` + AudioTokens int `json:"audio_tokens"` + CachedTokensDetails *CachedTokensDetails `json:"cached_tokens_details,omitempty"` +} + +type OutputTokenDetails struct { + TextTokens int `json:"text_tokens"` + AudioTokens int `json:"audio_tokens"` +} + +type TokenUsage struct { + TotalTokens int `json:"total_tokens"` + InputTokens int `json:"input_tokens"` + OutputTokens int `json:"output_tokens"` + // Input token details. + InputTokenDetails *InputTokenDetails `json:"input_token_details,omitempty"` + // Output token details. + OutputTokenDetails *OutputTokenDetails `json:"output_token_details,omitempty"` +} + +func (u TokenUsage) UsageType() UsageType { + return UsageTypeTokens +} + +type DurationUsage struct { + Seconds float64 `json:"seconds"` +} + +func (u DurationUsage) UsageType() UsageType { + return UsageTypeDuration +} + +type UsageUnion struct { + Tokens *TokenUsage `json:",omitempty"` + Duration *DurationUsage `json:",omitempty"` +} + +func (u *UsageUnion) UnmarshalJSON(data []byte) error { + if isNull(data) { + return nil + } + var t typeStruct + if err := json.Unmarshal(data, &t); err != nil { + return err + } + switch UsageType(t.Type) { + case UsageTypeTokens: + return json.Unmarshal(data, &u.Tokens) + case UsageTypeDuration: + return json.Unmarshal(data, &u.Duration) + default: + return fmt.Errorf("unknown usage type: %s", t.Type) + } +} + +type StatusDetail struct { + Error *Error `json:"error,omitempty"` + Reason string `json:"reason,omitempty"` + Type string `json:"type,omitempty"` +} + +type ResponseCreateParams struct { + // Configuration for audio input and output. + Audio *ResponseAudio `json:"audio,omitempty"` + + // Controls which conversation the response is added to. Currently supports auto and none, with auto as the default value. The auto value means that the contents of the response will be added to the default conversation. Set this to none to create an out-of-band response which will not add items to default conversation. + Conversation string `json:"conversation,omitempty"` + + // Input items to include in the prompt for the model. Using this field creates a new context for this Response instead of using the default conversation. An empty array [] will clear the context for this Response. Note that this can include references to items that previously appeared in the session using their id. + Input []MessageItemUnion `json:"input,omitempty"` + + // The default system instructions (i.e. system message) prepended to model calls. This field allows the client to guide the model on desired responses. The model can be instructed on response content and format, (e.g. "be extremely succinct", "act friendly", "here are examples of good responses") and on audio behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). 
The instructions are not guaranteed to be followed by the model, but they provide guidance to the model on the desired behavior. Note that the server sets default instructions which will be used if this field is not set and are visible in the session.created event at the start of the session. + Instructions string `json:"instructions,omitempty"` + + // Maximum number of output tokens for a single assistant response, inclusive of tool calls. Provide an integer between 1 and 4096 to limit output tokens, or inf for the maximum available tokens for a given model. Defaults to inf. + MaxOutputTokens IntOrInf `json:"max_output_tokens,omitempty"` + + // Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. + // + // Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters. + Metadata map[string]string `json:"metadata,omitempty"` + + // The set of modalities the model used to respond, currently the only possible values are [\"audio\"], [\"text\"]. Audio output always include a text transcript. Setting the output to mode text will disable audio output from the model. + OutputModalities []Modality `json:"output_modalities,omitempty"` + + // Reference to a prompt template and its variables. + // + // See https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts. + Prompt *PromptReference `json:"prompt,omitempty"` + + // How the model chooses tools. Provide one of the string modes or force a specific function/MCP tool. + ToolChoice *ToolChoiceUnion `json:"tool_choice,omitempty"` + + // Tools available to the model. + Tools []ToolUnion `json:"tools,omitempty"` +} + +type Response struct { + Audio *ResponseAudio `json:"audio,omitempty"` + + ConversationID string `json:"conversation_id,omitempty"` + + // The unique ID of the response. + ID string `json:"id"` + + MaxOutputTokens IntOrInf `json:"max_output_tokens,omitempty"` + + Metadata map[string]string `json:"metadata,omitempty"` + + // The object type, must be "realtime.response". + Object string `json:"object,omitempty"` + + Output []MessageItemUnion `json:"output,omitempty"` + + OutputModalities []Modality `json:"output_modalities,omitempty"` + + // The status of the response. + Status ResponseStatus `json:"status,omitempty"` + // Additional details about the status. + StatusDetails *StatusDetail `json:"status_details,omitempty"` + + Usage *TokenUsage `json:"usage,omitempty"` +} + +type RateLimit struct { + // The name of the rate limit ("requests", "tokens", "input_tokens", "output_tokens"). + Name string `json:"name,omitempty"` + // The maximum allowed value for the rate limit. + Limit int `json:"limit,omitempty"` + // The remaining value before the limit is reached. + Remaining int `json:"remaining,omitempty"` + // Seconds until the rate limit resets. 
+	ResetSeconds float64 `json:"reset_seconds,omitempty"`
+}
diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md
index 6796a354c..4dbf70d88 100644
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -476,7 +476,7 @@ reasoning:
 
 ## Pipeline Configuration
 
-Define pipelines for audio-to-audio processing:
+Define pipelines for audio-to-audio processing and the [Realtime API]({{%relref "features/openai-realtime" %}}):
 
 | Field | Type | Description |
 |-------|------|-------------|
diff --git a/docs/content/features/_index.en.md b/docs/content/features/_index.en.md
index 1e93d2182..942bbeeae 100644
--- a/docs/content/features/_index.en.md
+++ b/docs/content/features/_index.en.md
@@ -20,6 +20,7 @@ LocalAI provides a comprehensive set of features for running AI models locally.
 ## Advanced Features
 
 - **[OpenAI Functions](openai-functions/)** - Use function calling and tools API with local models
+- **[Realtime API](openai-realtime/)** - Low-latency multi-modal conversations (voice+text) over WebSocket
 - **[Constrained Grammars](constrained_grammars/)** - Control model output format with BNF grammars
 - **[GPU Acceleration](GPU-acceleration/)** - Optimize performance with GPU support
 - **[Distributed Inference](distributed_inferencing/)** - Scale inference across multiple nodes
diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md
new file mode 100644
index 000000000..6c71626a9
--- /dev/null
+++ b/docs/content/features/openai-realtime.md
@@ -0,0 +1,42 @@
+---
+title: "Realtime API"
+weight: 60
+---
+
+# Realtime API
+
+LocalAI supports the [OpenAI Realtime API](https://platform.openai.com/docs/guides/realtime), which enables low-latency, multi-modal conversations (voice and text) over WebSocket.
+
+To use the Realtime API, you need to configure a pipeline model that defines the components for Voice Activity Detection (VAD), Transcription (STT), Language Model (LLM), and Text-to-Speech (TTS).
+
+## Configuration
+
+Create a model configuration file (e.g., `gpt-realtime.yaml`) in your models directory. For a complete reference of configuration options, see [Model Configuration]({{%relref "advanced/model-configuration" %}}).
+
+```yaml
+name: gpt-realtime
+pipeline:
+  vad: silero-vad-ggml
+  transcription: whisper-large-turbo
+  llm: qwen3-4b
+  tts: tts-1
+```
+
+This configuration links the following components:
+- **vad**: The Voice Activity Detection model (e.g., `silero-vad-ggml`) to detect when the user is speaking.
+- **transcription**: The Speech-to-Text model (e.g., `whisper-large-turbo`) to transcribe user audio.
+- **llm**: The Large Language Model (e.g., `qwen3-4b`) to generate responses.
+- **tts**: The Text-to-Speech model (e.g., `tts-1`) to synthesize the audio response.
+
+Make sure all referenced models (`silero-vad-ggml`, `whisper-large-turbo`, `qwen3-4b`, `tts-1`) are also installed or defined in your LocalAI instance.
+
+## Usage
+
+Once configured, you can connect to the Realtime API endpoint via WebSocket:
+
+```
+ws://localhost:8080/v1/realtime?model=gpt-realtime
+```
+
+The API follows the OpenAI Realtime API protocol for handling sessions, audio buffers, and conversation items.
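+
+As a rough sketch of a client, the snippet below connects to the endpoint above, updates the session, requests a response, and prints incoming server events. It assumes the standard OpenAI Realtime event names (`session.update`, `response.create`, and so on), the `gpt-realtime` model from the configuration example, and the `gorilla/websocket` package; adjust the URL and payloads to your setup.
+
+```go
+package main
+
+import (
+	"encoding/json"
+	"log"
+
+	"github.com/gorilla/websocket"
+)
+
+func main() {
+	// Connect to the LocalAI Realtime endpoint (URL and model name from the configuration above).
+	conn, _, err := websocket.DefaultDialer.Dial("ws://localhost:8080/v1/realtime?model=gpt-realtime", nil)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer conn.Close()
+
+	// Configure the session: short instructions and server-side VAD for turn detection.
+	if err := conn.WriteJSON(map[string]any{
+		"type": "session.update",
+		"session": map[string]any{
+			"type":         "realtime",
+			"instructions": "You are a helpful assistant. Keep answers short.",
+			"audio": map[string]any{
+				"input": map[string]any{
+					"turn_detection": map[string]any{"type": "server_vad"},
+				},
+			},
+		},
+	}); err != nil {
+		log.Fatal(err)
+	}
+
+	// Ask the model to generate a response for the current conversation.
+	if err := conn.WriteJSON(map[string]any{"type": "response.create"}); err != nil {
+		log.Fatal(err)
+	}
+
+	// Print the type of each server event (session.created, response deltas, ...).
+	for {
+		_, msg, err := conn.ReadMessage()
+		if err != nil {
+			log.Fatal(err)
+		}
+		var event map[string]any
+		if err := json.Unmarshal(msg, &event); err != nil {
+			log.Fatal(err)
+		}
+		log.Printf("event: %v", event["type"])
+	}
+}
+```
+
+From there, input audio is typically streamed as `input_audio_buffer.append` events carrying base64-encoded PCM16 audio, and transcripts and synthesized audio arrive back as server events, as described in the OpenAI Realtime API documentation.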