fix(openai-adapters): Add fallback to stream.usage Promise for usage tokens
Vercel AI SDK's fullStream may emit a finish event with zero or invalid usage data in real API calls, even though tests show it working. This implements a hybrid approach:

1. convertVercelStream emits usage from the finish event if it is valid (> 0 tokens)
2. Track whether usage was emitted during stream consumption
3. If no usage was emitted, fall back to awaiting the stream.usage Promise

This ensures tests pass (they have valid finish events) while also handling real API scenarios where finish events may carry incomplete data.

Changes:
- vercelStreamConverter: only emit usage if tokens > 0
- OpenAI.ts: add hasEmittedUsage tracking plus the fallback
- Anthropic.ts: same approach, with cache token support

Co-authored-by: nate <nate@continue.dev>
Generated with [Continue](https://continue.dev)
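For context, the resulting flow in both adapters looks roughly like the sketch below. It illustrates the pattern rather than reproducing the adapter code verbatim; the function name and the import path for the converter are assumptions, while convertVercelStream, usageChatChunk, and the stream shape are the ones referenced in the diff.

    // Sketch of the hybrid usage flow described above (not the literal adapter code).
    import { convertVercelStream } from "./vercelStreamConverter.js"; // path assumed
    import { usageChatChunk } from "../util.js"; // path as used in the diff

    async function* streamWithUsageFallback(
      stream: { fullStream: AsyncIterable<unknown>; usage: Promise<any> },
      model: string,
    ): AsyncGenerator<any> {
      let hasEmittedUsage = false;

      // 1. The converter only emits a usage chunk when the finish event reports > 0 tokens.
      for await (const chunk of convertVercelStream(stream.fullStream as any, { model })) {
        if (chunk.usage) {
          hasEmittedUsage = true; // 2. Remember that usage was already reported.
        }
        yield chunk;
      }

      // 3. Otherwise fall back to the stream.usage Promise, which resolves
      //    after fullStream has been fully consumed.
      if (!hasEmittedUsage) {
        const finalUsage = await stream.usage;
        if (finalUsage) {
          yield usageChatChunk({
            model,
            usage: {
              prompt_tokens: finalUsage.promptTokens,
              completion_tokens: finalUsage.completionTokens,
              total_tokens: finalUsage.totalTokens,
            },
          });
        }
      }
    }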
@@ -665,32 +665,37 @@ export class AnthropicApi implements BaseLlmApi {
     });
 
     // Convert Vercel AI SDK stream to OpenAI format
-    // Note: We need to consume fullStream first, then await stream.usage Promise
+    let hasEmittedUsage = false;
     for await (const chunk of convertVercelStream(stream.fullStream as any, {
       model: body.model,
     })) {
+      if (chunk.usage) {
+        hasEmittedUsage = true;
+      }
       yield chunk;
     }
 
-    // Await final usage from stream.usage Promise with Anthropic-specific cache details
-    const finalUsage = await stream.usage;
-    if (finalUsage) {
-      const { usageChatChunk } = await import("../util.js");
-      yield usageChatChunk({
-        model: body.model,
-        usage: {
-          prompt_tokens: finalUsage.promptTokens,
-          completion_tokens: finalUsage.completionTokens,
-          total_tokens: finalUsage.totalTokens,
-          prompt_tokens_details: {
-            cached_tokens:
-              (finalUsage as any).promptTokensDetails?.cachedTokens ?? 0,
-            cache_read_tokens:
-              (finalUsage as any).promptTokensDetails?.cachedTokens ?? 0,
-            cache_write_tokens: 0,
-          } as any,
-        },
-      });
-    }
+    // Fallback: If fullStream didn't emit usage, get it from stream.usage Promise
+    if (!hasEmittedUsage) {
+      const finalUsage = await stream.usage;
+      if (finalUsage) {
+        const { usageChatChunk } = await import("../util.js");
+        yield usageChatChunk({
+          model: body.model,
+          usage: {
+            prompt_tokens: finalUsage.promptTokens,
+            completion_tokens: finalUsage.completionTokens,
+            total_tokens: finalUsage.totalTokens,
+            prompt_tokens_details: {
+              cached_tokens:
+                (finalUsage as any).promptTokensDetails?.cachedTokens ?? 0,
+              cache_read_tokens:
+                (finalUsage as any).promptTokensDetails?.cachedTokens ?? 0,
+              cache_write_tokens: 0,
+            } as any,
+          },
+        });
+      }
+    }
   }
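A note on the Anthropic cache fields above: the Vercel usage object's cachedTokens value is copied into both the standard cached_tokens field and the adapter-level cache_read_tokens field, with cache_write_tokens left at 0. A minimal sketch of that mapping follows; the type alias and function name are illustrative, not part of the package, and only the fields the diff touches are typed.

    // Hedged sketch of the cache-token mapping used in the Anthropic path.
    type VercelUsage = {
      promptTokens: number;
      completionTokens: number;
      totalTokens: number;
      promptTokensDetails?: { cachedTokens?: number };
    };

    function toPromptTokensDetails(usage: VercelUsage) {
      const cached = usage.promptTokensDetails?.cachedTokens ?? 0;
      return {
        cached_tokens: cached, // OpenAI-style cached prompt tokens
        cache_read_tokens: cached, // mirrored for Anthropic-style cache accounting
        cache_write_tokens: 0, // cache writes are not reported on this path
      };
    }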
@@ -334,24 +334,29 @@ export class OpenAIApi implements BaseLlmApi {
     });
 
     // Convert Vercel AI SDK stream to OpenAI format
-    // Note: We need to consume fullStream first, then await stream.usage Promise
+    let hasEmittedUsage = false;
     for await (const chunk of convertVercelStream(stream.fullStream as any, {
       model: modifiedBody.model,
     })) {
+      if (chunk.usage) {
+        hasEmittedUsage = true;
+      }
       yield chunk;
     }
 
-    // Await final usage from stream.usage Promise (resolves after fullStream completes)
-    const finalUsage = await stream.usage;
-    if (finalUsage) {
-      yield usageChatChunk({
-        model: modifiedBody.model,
-        usage: {
-          prompt_tokens: finalUsage.promptTokens,
-          completion_tokens: finalUsage.completionTokens,
-          total_tokens: finalUsage.totalTokens,
-        },
-      });
-    }
+    // Fallback: If fullStream didn't emit usage, get it from stream.usage Promise
+    if (!hasEmittedUsage) {
+      const finalUsage = await stream.usage;
+      if (finalUsage) {
+        yield usageChatChunk({
+          model: modifiedBody.model,
+          usage: {
+            prompt_tokens: finalUsage.promptTokens,
+            completion_tokens: finalUsage.completionTokens,
+            total_tokens: finalUsage.totalTokens,
+          },
+        });
+      }
+    }
   }
 
   async completionNonStream(
@@ -121,8 +121,49 @@ export function convertVercelStreamPart(
       });
 
     case "finish":
-      // Don't emit usage from finish event - we'll get it from stream.usage Promise
-      // The finish event may have incomplete usage data, so we wait for the Promise
+      // Emit usage chunk at the end if usage data is present and valid
+      if (part.usage) {
+        const promptTokens =
+          typeof part.usage.promptTokens === "number"
+            ? part.usage.promptTokens
+            : 0;
+        const completionTokens =
+          typeof part.usage.completionTokens === "number"
+            ? part.usage.completionTokens
+            : 0;
+        const totalTokens =
+          typeof part.usage.totalTokens === "number"
+            ? part.usage.totalTokens
+            : promptTokens + completionTokens;
+
+        // Only emit usage chunk if we have meaningful token counts
+        if (promptTokens > 0 || completionTokens > 0) {
+          // Check for Anthropic-specific cache token details
+          const promptTokensDetails =
+            (part.usage as any).promptTokensDetails?.cachedTokens !== undefined
+              ? {
+                  cached_tokens:
+                    (part.usage as any).promptTokensDetails.cachedTokens ?? 0,
+                  cache_read_tokens:
+                    (part.usage as any).promptTokensDetails.cachedTokens ?? 0,
+                  cache_write_tokens: 0,
+                }
+              : undefined;
+
+          return usageChatChunk({
+            model,
+            usage: {
+              prompt_tokens: promptTokens,
+              completion_tokens: completionTokens,
+              total_tokens: totalTokens,
+              ...(promptTokensDetails
+                ? { prompt_tokens_details: promptTokensDetails as any }
+                : {}),
+            },
+          });
+        }
+      }
+      // If no valid usage data, don't emit a usage chunk
       return null;
 
     case "error":
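With the converter guard and the adapter fallback combined, a caller consuming the OpenAI-format stream should see at most one usage chunk per completion, whichever source it came from. A hedged consumption sketch over any OpenAI-format chunk stream (the helper below is hypothetical, not part of the package):

    // Hypothetical consumer: collects the single trailing usage chunk, if any.
    async function logTokenUsage(
      stream: AsyncIterable<{ usage?: { total_tokens: number } }>,
    ) {
      let usage: { total_tokens: number } | undefined;
      for await (const chunk of stream) {
        if (chunk.usage) {
          usage = chunk.usage; // at most one usage chunk, emitted at the end
        }
      }
      console.log("total tokens:", usage?.total_tokens ?? "not reported");
    }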