fix(openai-adapters): Add fallback to stream.usage Promise for usage tokens

In real API calls, Vercel AI SDK's fullStream may emit a finish event with
zero or otherwise invalid usage data, even though the existing tests do not
reproduce the problem. This commit takes a hybrid approach:

1. convertVercelStream emits usage from finish event if valid (>0 tokens)
2. Track whether usage was emitted during stream consumption
3. If no usage was emitted, fall back to awaiting the stream.usage Promise

This keeps the tests passing (their finish events carry valid usage) while
also handling real API responses whose finish events report incomplete or
zero usage data.
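
In condensed form, the adapter-side change amounts to the sketch below.
`streamWithUsageFallback` is a hypothetical name used only for illustration;
`convertVercelStream` and `usageChatChunk` are the existing helpers shown in
the diffs, and each adapter inlines this logic rather than calling a shared
function:

    // Sketch only: condensed from the adapter diffs in this commit.
    async function* streamWithUsageFallback(
      stream: { fullStream: AsyncIterable<unknown>; usage: Promise<any> },
      model: string,
    ): AsyncGenerator<any> {
      let hasEmittedUsage = false;
      for await (const chunk of convertVercelStream(stream.fullStream as any, { model })) {
        if (chunk.usage) {
          hasEmittedUsage = true; // finish event carried valid (>0) token counts
        }
        yield chunk;
      }
      // Fallback: the finish event had zero/invalid usage, so await the Promise instead
      if (!hasEmittedUsage) {
        const finalUsage = await stream.usage;
        if (finalUsage) {
          yield usageChatChunk({
            model,
            usage: {
              prompt_tokens: finalUsage.promptTokens,
              completion_tokens: finalUsage.completionTokens,
              total_tokens: finalUsage.totalTokens,
            },
          });
        }
      }
    }

With a valid finish event the converter's usage chunk wins and the Promise is
never awaited; with a zero-token finish event the loop yields no usage chunk
and the final usage comes from awaiting stream.usage.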

Changes:
- vercelStreamConverter: Only emit usage if tokens > 0 (sketched after this list)
- OpenAI.ts: Add hasEmittedUsage tracking + fallback
- Anthropic.ts: Same approach with cache token support
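
The converter-side check (first item above) reduces to roughly the following.
`usageFromFinishPart` is a hypothetical name for illustration; the real change
lives inside convertVercelStreamPart's `finish` case, `usageChatChunk` is the
existing helper, and the cache-token details are omitted here:

    // Sketch only: condensed from the vercelStreamConverter diff in this commit.
    function usageFromFinishPart(part: { usage?: any }, model: string) {
      const promptTokens =
        typeof part.usage?.promptTokens === "number" ? part.usage.promptTokens : 0;
      const completionTokens =
        typeof part.usage?.completionTokens === "number" ? part.usage.completionTokens : 0;
      // Emit a usage chunk only when the finish event carries meaningful counts;
      // otherwise return null so the adapters fall back to stream.usage.
      if (promptTokens > 0 || completionTokens > 0) {
        return usageChatChunk({
          model,
          usage: {
            prompt_tokens: promptTokens,
            completion_tokens: completionTokens,
            total_tokens:
              typeof part.usage.totalTokens === "number"
                ? part.usage.totalTokens
                : promptTokens + completionTokens,
          },
        });
      }
      return null;
    }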

Co-authored-by: nate <nate@continue.dev>
Generated with [Continue](https://continue.dev)
Author: continue[bot]
Date:   2025-12-10 19:44:41 +00:00
Parent: a89187b409
Commit: bbeec4b1bf

3 changed files with 85 additions and 34 deletions

File: Anthropic.ts

@@ -665,32 +665,37 @@ export class AnthropicApi implements BaseLlmApi {
     });
     // Convert Vercel AI SDK stream to OpenAI format
     // Note: We need to consume fullStream first, then await stream.usage Promise
+    let hasEmittedUsage = false;
     for await (const chunk of convertVercelStream(stream.fullStream as any, {
       model: body.model,
     })) {
+      if (chunk.usage) {
+        hasEmittedUsage = true;
+      }
       yield chunk;
     }
-    // Await final usage from stream.usage Promise with Anthropic-specific cache details
-    const finalUsage = await stream.usage;
-    if (finalUsage) {
-      const { usageChatChunk } = await import("../util.js");
-      yield usageChatChunk({
-        model: body.model,
-        usage: {
-          prompt_tokens: finalUsage.promptTokens,
-          completion_tokens: finalUsage.completionTokens,
-          total_tokens: finalUsage.totalTokens,
-          prompt_tokens_details: {
-            cached_tokens:
-              (finalUsage as any).promptTokensDetails?.cachedTokens ?? 0,
-            cache_read_tokens:
-              (finalUsage as any).promptTokensDetails?.cachedTokens ?? 0,
-            cache_write_tokens: 0,
-          } as any,
-        },
-      });
+    // Fallback: If fullStream didn't emit usage, get it from stream.usage Promise
+    if (!hasEmittedUsage) {
+      const finalUsage = await stream.usage;
+      if (finalUsage) {
+        const { usageChatChunk } = await import("../util.js");
+        yield usageChatChunk({
+          model: body.model,
+          usage: {
+            prompt_tokens: finalUsage.promptTokens,
+            completion_tokens: finalUsage.completionTokens,
+            total_tokens: finalUsage.totalTokens,
+            prompt_tokens_details: {
+              cached_tokens:
+                (finalUsage as any).promptTokensDetails?.cachedTokens ?? 0,
+              cache_read_tokens:
+                (finalUsage as any).promptTokensDetails?.cachedTokens ?? 0,
+              cache_write_tokens: 0,
+            } as any,
+          },
+        });
+      }
     }
   }

File: OpenAI.ts

@@ -334,24 +334,29 @@ export class OpenAIApi implements BaseLlmApi {
     });
     // Convert Vercel AI SDK stream to OpenAI format
     // Note: We need to consume fullStream first, then await stream.usage Promise
+    let hasEmittedUsage = false;
     for await (const chunk of convertVercelStream(stream.fullStream as any, {
       model: modifiedBody.model,
     })) {
+      if (chunk.usage) {
+        hasEmittedUsage = true;
+      }
       yield chunk;
     }
-    // Await final usage from stream.usage Promise (resolves after fullStream completes)
-    const finalUsage = await stream.usage;
-    if (finalUsage) {
-      yield usageChatChunk({
-        model: modifiedBody.model,
-        usage: {
-          prompt_tokens: finalUsage.promptTokens,
-          completion_tokens: finalUsage.completionTokens,
-          total_tokens: finalUsage.totalTokens,
-        },
-      });
+    // Fallback: If fullStream didn't emit usage, get it from stream.usage Promise
+    if (!hasEmittedUsage) {
+      const finalUsage = await stream.usage;
+      if (finalUsage) {
+        yield usageChatChunk({
+          model: modifiedBody.model,
+          usage: {
+            prompt_tokens: finalUsage.promptTokens,
+            completion_tokens: finalUsage.completionTokens,
+            total_tokens: finalUsage.totalTokens,
+          },
+        });
+      }
     }
   }
   async completionNonStream(

File: vercelStreamConverter

@@ -121,8 +121,49 @@ export function convertVercelStreamPart(
       });
     case "finish":
-      // Don't emit usage from finish event - we'll get it from stream.usage Promise
-      // The finish event may have incomplete usage data, so we wait for the Promise
+      // Emit usage chunk at the end if usage data is present and valid
+      if (part.usage) {
+        const promptTokens =
+          typeof part.usage.promptTokens === "number"
+            ? part.usage.promptTokens
+            : 0;
+        const completionTokens =
+          typeof part.usage.completionTokens === "number"
+            ? part.usage.completionTokens
+            : 0;
+        const totalTokens =
+          typeof part.usage.totalTokens === "number"
+            ? part.usage.totalTokens
+            : promptTokens + completionTokens;
+        // Only emit usage chunk if we have meaningful token counts
+        if (promptTokens > 0 || completionTokens > 0) {
+          // Check for Anthropic-specific cache token details
+          const promptTokensDetails =
+            (part.usage as any).promptTokensDetails?.cachedTokens !== undefined
+              ? {
+                  cached_tokens:
+                    (part.usage as any).promptTokensDetails.cachedTokens ?? 0,
+                  cache_read_tokens:
+                    (part.usage as any).promptTokensDetails.cachedTokens ?? 0,
+                  cache_write_tokens: 0,
+                }
+              : undefined;
+          return usageChatChunk({
+            model,
+            usage: {
+              prompt_tokens: promptTokens,
+              completion_tokens: completionTokens,
+              total_tokens: totalTokens,
+              ...(promptTokensDetails
+                ? { prompt_tokens_details: promptTokensDetails as any }
+                : {}),
+            },
+          });
+        }
+      }
+      // If no valid usage data, don't emit a usage chunk
       return null;
     case "error":