continue/core/llm/llms/Ollama.ts
Dallin Romney 6b7111cc6c Merge pull request #8433 from Psanyi89/psanyi89_n8n_response_parsing
bug: Missing response parsing for N8N Ai Agent Responses
2025-11-21 18:28:04 -08:00


import { Mutex } from "async-mutex";
import { JSONSchema7, JSONSchema7Object } from "json-schema";
import { v4 as uuidv4 } from "uuid";
import { streamResponse } from "@continuedev/fetch";
import {
ChatMessage,
ChatMessageRole,
CompletionOptions,
LLMOptions,
ModelInstaller,
ThinkingChatMessage,
} from "../../index.js";
import { renderChatMessage } from "../../util/messageContent.js";
import { getRemoteModelInfo } from "../../util/ollamaHelper.js";
import { extractBase64FromDataUrl } from "../../util/url.js";
import { BaseLLM } from "../index.js";
type OllamaChatMessage = {
role: ChatMessageRole;
content: string;
images?: string[] | null;
thinking?: string;
tool_calls?: {
function: {
name: string;
arguments: JSONSchema7Object;
};
}[];
};
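// Illustrative shapes only (all values hypothetical): a user message with an
// attached image would look roughly like
// { role: "user", content: "What is in this image?", images: ["iVBORw0KGgo..."] }
// and an assistant tool-call message like
// { role: "assistant", content: "", tool_calls: [{ function: { name: "get_weather", arguments: { city: "Paris" } } }] }.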
// See https://github.com/ollama/ollama/blob/main/docs/modelfile.md for details on each parameter
interface OllamaModelFileParams {
mirostat?: number;
mirostat_eta?: number;
mirostat_tau?: number;
num_ctx?: number;
repeat_last_n?: number;
repeat_penalty?: number;
temperature?: number;
seed?: number;
stop?: string | string[];
tfs_z?: number;
num_predict?: number;
top_k?: number;
top_p?: number;
min_p?: number;
num_gpu?: number;
// Deprecated or not directly supported here:
num_thread?: number;
use_mmap?: boolean;
num_gqa?: number;
num_keep?: number;
typical_p?: number;
presence_penalty?: number;
frequency_penalty?: number;
penalize_newline?: boolean;
numa?: boolean;
num_batch?: number;
main_gpu?: number;
low_vram?: boolean;
vocab_only?: boolean;
use_mlock?: boolean;
}
// See https://github.com/ollama/ollama/blob/main/docs/api.md
interface OllamaBaseOptions {
model: string; // the model name
options?: OllamaModelFileParams; // additional model parameters listed in the documentation for the Modelfile, such as temperature
format?: "json"; // the format to return a response in. Currently, the only accepted value is json
stream?: boolean; // if false the response will be returned as a single response object, rather than a stream of objects
keep_alive?: number; // controls how long the model will stay loaded into memory following the request (default: 5m)
}
interface OllamaRawOptions extends OllamaBaseOptions {
prompt: string; // the prompt to generate a response for
suffix?: string; // the text after the model response
images?: string[]; // a list of base64-encoded images (for multimodal models such as llava)
system?: string; // system message (overrides what is defined in the Modelfile)
template?: string; // the prompt template to use (overrides what is defined in the Modelfile)
context?: string; // the context parameter returned from a previous request to /generate, this can be used to keep a short conversational memory
raw?: boolean; // if true no formatting will be applied to the prompt. You may choose to use the raw parameter if you are specifying a full templated prompt in your request to the API
}
interface OllamaChatOptions extends OllamaBaseOptions {
messages: OllamaChatMessage[]; // the messages of the chat, this can be used to keep a chat memory
tools?: OllamaTool[]; // tools the model may call, if the model supports tool use (see tool_calls in OllamaChatMessage)
think?: boolean; // if true the model will be prompted to think about the response before generating it
}
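// A sketch of the request body these types produce for a streaming chat call
// (model name and values are hypothetical):
// {
//   "model": "llama3.1:8b",
//   "messages": [{ "role": "user", "content": "Hello" }],
//   "options": { "temperature": 0.2, "num_ctx": 8192 },
//   "stream": true,
//   "keep_alive": 1800
// }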
type OllamaBaseResponse = {
model: string;
created_at: string;
} & (
| {
done: false;
}
| {
done: true;
done_reason: string;
total_duration: number; // Time spent generating the response in nanoseconds
load_duration: number; // Time spent loading the model in nanoseconds
prompt_eval_count: number; // Number of tokens in the prompt
prompt_eval_duration: number; // Time spent evaluating the prompt in nanoseconds
eval_count: number; // Number of tokens in the response
eval_duration: number; // Time spent generating the response in nanoseconds
context: number[]; // An encoding of the conversation used in this response; can be sent in the next request to keep conversational memory
}
);
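// The final object in a stream carries done: true plus the stats above, e.g.
// (illustrative values):
// { "model": "llama3.1:8b", "created_at": "2024-01-01T00:00:00Z", "done": true,
//   "done_reason": "stop", "total_duration": 2100000000, "prompt_eval_count": 12,
//   "eval_count": 42, "eval_duration": 1500000000, ... }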
type OllamaErrorResponse = {
error: string;
};
type N8nChatReponse = {
type: string;
content?: string;
metadata: {
nodeId: string;
nodeName: string;
itemIndex: number;
runIndex: number;
timestamps: number;
};
};
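// This shape covers chunks coming from an n8n AI Agent (the case the
// response-parsing fallback below handles) rather than from Ollama itself.
// A sketch with hypothetical values:
// { "type": "item", "content": "<think>", "metadata": { "nodeId": "a1b2", "nodeName": "AI Agent", "itemIndex": 0, "runIndex": 0, "timestamps": 1732000000 } }
// convertChatMessage() in _streamChat uses the "<think>"/"</think>" markers in
// `content` to separate thinking output from the final answer.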
type OllamaRawResponse =
| OllamaErrorResponse
| (OllamaBaseResponse & {
response: string; // the generated response
});
type OllamaChatResponse =
| OllamaErrorResponse
| (OllamaBaseResponse & {
message: OllamaChatMessage;
})
| N8nChatReponse;
interface OllamaTool {
type: "function";
function: {
name: string;
description?: string;
parameters?: JSONSchema7;
};
}
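// A minimal sketch of a tool definition in this shape (name and schema are
// hypothetical):
// {
//   type: "function",
//   function: {
//     name: "read_file",
//     description: "Read a file from the workspace",
//     parameters: { type: "object", properties: { path: { type: "string" } }, required: ["path"] },
//   },
// }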
class Ollama extends BaseLLM implements ModelInstaller {
static providerName = "ollama";
static defaultOptions: Partial<LLMOptions> = {
apiBase: "http://localhost:11434/",
model: "codellama-7b",
maxEmbeddingBatchSize: 64,
};
private static modelsBeingInstalled: Set<string> = new Set();
private static modelsBeingInstalledMutex = new Mutex();
private fimSupported: boolean = false;
constructor(options: LLMOptions) {
super(options);
if (options.model === "AUTODETECT") {
return;
}
const headers: Record<string, string> = {
"Content-Type": "application/json",
};
if (this.apiKey) {
headers.Authorization = `Bearer ${this.apiKey}`;
}
this.fetch(this.getEndpoint("api/show"), {
method: "POST",
headers: headers,
body: JSON.stringify({ name: this._getModel() }),
})
.then(async (response) => {
if (response?.status !== 200) {
// console.warn(
// "Error calling Ollama /api/show endpoint: ",
// await response.text(),
// );
return;
}
const body = await response.json();
if (body.parameters) {
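// `parameters` from /api/show is the Modelfile PARAMETER block rendered as
// plain text, one "key value" pair per line with string values quoted, e.g.
// (illustrative):
//   num_ctx 4096
//   stop "<|im_start|>"
//   stop "<|im_end|>"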
const params = [];
for (const line of body.parameters.split("\n")) {
let parts = line.match(/^(\S+)\s+((?:".*")|\S+)$/);
if (!parts) {
continue;
}
let key = parts[1];
let value = parts[2];
switch (key) {
case "num_ctx":
this._contextLength =
options.contextLength ?? Number.parseInt(value);
break;
case "stop":
if (!this.completionOptions.stop) {
this.completionOptions.stop = [];
}
try {
this.completionOptions.stop.push(JSON.parse(value));
} catch (e) {
console.warn(
`Error parsing stop parameter value "${value}": ${e}`,
);
}
break;
default:
break;
}
}
}
/**
* There is no API to get the model's FIM capabilities, so we have to
* make an educated guess. If a ".Suffix" variable appears in the template,
* it's a good indication that the model supports FIM.
*/
this.fimSupported = !!body?.template?.includes(".Suffix");
})
.catch((e) => {
// console.warn("Error calling the Ollama /api/show endpoint: ", e);
});
}
// Map of "continue model name" to Ollama actual model name
private modelMap: Record<string, string> = {
"mistral-7b": "mistral:7b",
"mixtral-8x7b": "mixtral:8x7b",
"llama2-7b": "llama2:7b",
"llama2-13b": "llama2:13b",
"codellama-7b": "codellama:7b",
"codellama-13b": "codellama:13b",
"codellama-34b": "codellama:34b",
"codellama-70b": "codellama:70b",
"llama3-8b": "llama3:8b",
"llama3-70b": "llama3:70b",
"llama3.1-8b": "llama3.1:8b",
"llama3.1-70b": "llama3.1:70b",
"llama3.1-405b": "llama3.1:405b",
"llama3.2-1b": "llama3.2:1b",
"llama3.2-3b": "llama3.2:3b",
"llama3.2-11b": "llama3.2:11b",
"llama3.2-90b": "llama3.2:90b",
"phi-2": "phi:2.7b",
"phind-codellama-34b": "phind-codellama:34b-v2",
"qwen2.5-coder-0.5b": "qwen2.5-coder:0.5b",
"qwen2.5-coder-1.5b": "qwen2.5-coder:1.5b",
"qwen2.5-coder-3b": "qwen2.5-coder:3b",
"qwen2.5-coder-7b": "qwen2.5-coder:7b",
"qwen2.5-coder-14b": "qwen2.5-coder:14b",
"qwen2.5-coder-32b": "qwen2.5-coder:32b",
"wizardcoder-7b": "wizardcoder:7b-python",
"wizardcoder-13b": "wizardcoder:13b-python",
"wizardcoder-34b": "wizardcoder:34b-python",
"zephyr-7b": "zephyr:7b",
"codeup-13b": "codeup:13b",
"deepseek-1b": "deepseek-coder:1.3b",
"deepseek-7b": "deepseek-coder:6.7b",
"deepseek-33b": "deepseek-coder:33b",
"neural-chat-7b": "neural-chat:7b-v3.3",
"starcoder-1b": "starcoder:1b",
"starcoder-3b": "starcoder:3b",
"starcoder2-3b": "starcoder2:3b",
"stable-code-3b": "stable-code:3b",
"granite-code-3b": "granite-code:3b",
"granite-code-8b": "granite-code:8b",
"granite-code-20b": "granite-code:20b",
"granite-code-34b": "granite-code:34b",
};
private _getModel() {
return this.modelMap[this.model] ?? this.model;
}
get contextLength() {
const DEFAULT_OLLAMA_CONTEXT_LENGTH = 8192; // twice the default set in https://github.com/ollama/ollama/blob/29ddfc2cab7f5a83a96c3133094f67b22e4f27d1/envconfig/config.go#L185
return this._contextLength ?? DEFAULT_OLLAMA_CONTEXT_LENGTH;
}
private _getModelFileParams(
options: CompletionOptions,
): OllamaModelFileParams {
return {
temperature: options.temperature,
top_p: options.topP,
top_k: options.topK,
num_predict: options.maxTokens,
stop: options.stop,
num_ctx: this.contextLength,
mirostat: options.mirostat,
num_thread: options.numThreads,
use_mmap: options.useMmap,
min_p: options.minP,
num_gpu: options.numGpu,
};
}
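// Sketch of the conversion done by _convertToOllamaMessage below, assuming
// renderChatMessage() flattens the text parts into a single string
// (values hypothetical):
//   in:  { role: "user", content: [{ type: "text", text: "Describe this" },
//          { type: "imageUrl", imageUrl: { url: "data:image/png;base64,iVBORw0K..." } }] }
//   out: { role: "user", content: "Describe this", images: ["iVBORw0K..."] }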
private _convertToOllamaMessage(message: ChatMessage): OllamaChatMessage {
const ollamaMessage: OllamaChatMessage = {
role: message.role,
content: "",
};
ollamaMessage.content = renderChatMessage(message);
if (Array.isArray(message.content)) {
const images: string[] = [];
message.content.forEach((part) => {
if (part.type === "imageUrl" && part.imageUrl) {
const image = part.imageUrl?.url
? extractBase64FromDataUrl(part.imageUrl.url)
: undefined;
if (image) {
images.push(image);
} else if (part.imageUrl?.url) {
console.warn(
"Ollama: skipping image with invalid data URL format",
part.imageUrl.url,
);
}
}
});
if (images.length > 0) {
ollamaMessage.images = images;
}
}
return ollamaMessage;
}
private _getGenerateOptions(
options: CompletionOptions,
prompt: string,
suffix?: string,
): OllamaRawOptions {
return {
model: this._getModel(),
prompt,
suffix,
raw: options.raw,
options: this._getModelFileParams(options),
keep_alive: options.keepAlive ?? 60 * 30, // 30 minutes
stream: options.stream,
// Not supported yet: context, images, system, template, format
};
}
private getEndpoint(endpoint: string): URL {
let base = this.apiBase;
if (process.env.IS_BINARY) {
base = base?.replace("localhost", "127.0.0.1");
}
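// WHATWG URL resolution: with the default apiBase this yields e.g.
// "http://localhost:11434/api/chat". Note that if apiBase carries a path
// prefix (e.g. behind a reverse proxy), it must end with "/" and `endpoint`
// must not start with "/", or the prefix is dropped during resolution.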
return new URL(endpoint, base);
}
protected async *_streamComplete(
prompt: string,
signal: AbortSignal,
options: CompletionOptions,
): AsyncGenerator<string> {
const headers: Record<string, string> = {
"Content-Type": "application/json",
};
if (this.apiKey) {
headers.Authorization = `Bearer ${this.apiKey}`;
}
const response = await this.fetch(this.getEndpoint("api/generate"), {
method: "POST",
headers: headers,
body: JSON.stringify(this._getGenerateOptions(options, prompt)),
signal,
});
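// /api/generate streams newline-delimited JSON, one object per line, e.g.
// (illustrative): {"model":"codellama:7b","created_at":"...","response":"de","done":false}
// A network chunk may end mid-line, so raw text is buffered and re-split on "\n".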
let buffer = "";
for await (const value of streamResponse(response)) {
// Append the received chunk to the buffer
buffer += value;
// Split the buffer into individual JSON chunks
const chunks = buffer.split("\n");
buffer = chunks.pop() ?? "";
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
if (chunk.trim() !== "") {
try {
const j = JSON.parse(chunk) as OllamaRawResponse;
if ("error" in j) {
throw new Error(j.error);
}
j.response ??= "";
yield j.response;
} catch (e) {
throw new Error(`Error parsing Ollama response: ${e} ${chunk}`);
}
}
}
}
}
protected async *_streamChat(
messages: ChatMessage[],
signal: AbortSignal,
options: CompletionOptions,
): AsyncGenerator<ChatMessage> {
const ollamaMessages = messages.map(this._convertToOllamaMessage);
const chatOptions: OllamaChatOptions = {
model: this._getModel(),
messages: ollamaMessages,
options: this._getModelFileParams(options),
think: options.reasoning,
keep_alive: options.keepAlive ?? 60 * 30, // 30 minutes
stream: options.stream,
// format: options.format, // Not currently in base completion options
};
// Ollama only accepts tools when the last message in the request is a user message
if (options.tools?.length && ollamaMessages.at(-1)?.role === "user") {
chatOptions.tools = options.tools.map((tool) => ({
type: "function",
function: {
name: tool.function.name,
description: tool.function.description,
parameters: tool.function.parameters,
},
}));
}
const headers: Record<string, string> = {
"Content-Type": "application/json",
};
if (this.apiKey) {
headers.Authorization = `Bearer ${this.apiKey}`;
}
const response = await this.fetch(this.getEndpoint("api/chat"), {
method: "POST",
headers: headers,
body: JSON.stringify(chatOptions),
signal,
});
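// /api/chat streams newline-delimited JSON as well, e.g. (illustrative):
// {"model":"llama3.1:8b","created_at":"...","message":{"role":"assistant","content":"Hi"},"done":false}
// n8n AI Agent responses instead arrive as N8nChatReponse chunks with a "type"
// field; convertChatMessage below handles both.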
let isThinking: boolean = false;
function convertChatMessage(res: OllamaChatResponse): ChatMessage[] {
if ("error" in res) {
throw new Error(res.error);
}
if ("type" in res) {
const { content } = res;
if (content === "<think>") {
isThinking = true;
}
if (isThinking && content) {
// TODO better support for streaming thinking chunks, or remove this and depend on redux <think/> parsing logic
const thinkingMessage: ThinkingChatMessage = {
role: "thinking",
content: content,
};
if (thinkingMessage) {
// could cause issues with termination if chunk doesn't match this exactly
if (content === "</think>") {
isThinking = false;
}
// When streaming, you can't have both thinking and content
return [thinkingMessage];
}
}
if (content) {
const chatMessage: ChatMessage = {
role: "assistant",
content: content,
};
return [chatMessage];
}
return [];
}
const { role, content, thinking, tool_calls: toolCalls } = res.message;
if (role === "tool") {
throw new Error(
"Unexpected message received from Ollama with role = tool",
);
}
if (role === "assistant") {
const thinkingMessage: ThinkingChatMessage | null = thinking
? { role: "thinking", content: thinking }
: null;
if (thinkingMessage && !content) {
// When streaming, you can't have both thinking and content
return [thinkingMessage];
}
// Either not thinking, or not streaming
const chatMessage: ChatMessage = { role: "assistant", content };
if (toolCalls?.length) {
// Continue handles the response as a tool call delta,
// but Ollama returns the full object in one response with no streaming
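// e.g. (illustrative) a tool call arrives complete in one chunk:
// {"message":{"role":"assistant","content":"","tool_calls":[{"function":{"name":"get_weather","arguments":{"city":"Paris"}}}]},"done":false}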
chatMessage.toolCalls = toolCalls.map((tc) => ({
type: "function",
id: `tc_${uuidv4()}`, // Generate a proper UUID with a prefix
function: {
name: tc.function.name,
arguments: JSON.stringify(tc.function.arguments),
},
}));
}
// Return both thinking and chat messages if applicable
return thinkingMessage ? [thinkingMessage, chatMessage] : [chatMessage];
}
// Fallback for all other roles
return [{ role, content }];
}
if (chatOptions.stream === false) {
if (response.status === 499) {
return; // Aborted by user
}
const json = (await response.json()) as OllamaChatResponse;
for (const msg of convertChatMessage(json)) {
yield msg;
}
} else {
let buffer = "";
for await (const value of streamResponse(response)) {
// Append the received chunk to the buffer
buffer += value;
// Split the buffer into individual JSON chunks
const chunks = buffer.split("\n");
buffer = chunks.pop() ?? "";
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
if (chunk.trim() !== "") {
try {
const j = JSON.parse(chunk) as OllamaChatResponse;
for (const msg of convertChatMessage(j)) {
yield msg;
}
} catch (e) {
throw new Error(`Error parsing Ollama response: ${e} ${chunk}`);
}
}
}
}
}
}
supportsFim(): boolean {
return this.fimSupported;
}
protected async *_streamFim(
prefix: string,
suffix: string,
signal: AbortSignal,
options: CompletionOptions,
): AsyncGenerator<string> {
const headers: Record<string, string> = {
"Content-Type": "application/json",
};
if (this.apiKey) {
headers.Authorization = `Bearer ${this.apiKey}`;
}
const response = await this.fetch(this.getEndpoint("api/generate"), {
method: "POST",
headers: headers,
body: JSON.stringify(this._getGenerateOptions(options, prefix, suffix)),
signal,
});
let buffer = "";
for await (const value of streamResponse(response)) {
// Append the received chunk to the buffer
buffer += value;
// Split the buffer into individual JSON chunks
const chunks = buffer.split("\n");
buffer = chunks.pop() ?? "";
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
if (chunk.trim() !== "") {
try {
const j = JSON.parse(chunk);
if ("response" in j) {
yield j.response;
} else if ("error" in j) {
throw new Error(j.error);
}
} catch (e) {
throw new Error(`Error parsing Ollama response: ${e} ${chunk}`);
}
}
}
}
}
async listModels(): Promise<string[]> {
const headers: Record<string, string> = {
"Content-Type": "application/json",
};
if (this.apiKey) {
headers.Authorization = `Bearer ${this.apiKey}`;
}
const response = await this.fetch(
// localhost was causing "fetch failed" in the pkg binary, but only for this Ollama endpoint
this.getEndpoint("api/tags"),
{
method: "GET",
headers: headers,
},
);
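// /api/tags lists the locally installed models, e.g. (illustrative):
// {"models":[{"name":"llama3.1:8b", ...},{"name":"nomic-embed-text:latest", ...}]}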
const data = await response.json();
if (response.ok) {
return data.models.map((model: any) => model.name);
} else {
throw new Error(
"Failed to list Ollama models. Make sure Ollama is running.",
);
}
}
protected async _embed(chunks: string[]): Promise<number[][]> {
const headers: Record<string, string> = {
"Content-Type": "application/json",
};
if (this.apiKey) {
headers.Authorization = `Bearer ${this.apiKey}`;
}
const resp = await this.fetch(new URL("api/embed", this.apiBase), {
method: "POST",
body: JSON.stringify({
model: this.model,
input: chunks,
}),
headers: headers,
});
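// /api/embed takes a batch of inputs and returns one vector per input, e.g.
// (illustrative): {"model":"nomic-embed-text","embeddings":[[0.01,-0.12, ...],[0.07, ...]]}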
if (!resp.ok) {
throw new Error(`Failed to embed chunk: ${await resp.text()}`);
}
const data = await resp.json();
const embedding: number[][] = data.embeddings;
if (!embedding || embedding.length === 0) {
throw new Error("Ollama generated empty embedding");
}
return embedding;
}
public async installModel(
modelName: string,
signal: AbortSignal,
progressReporter?: (task: string, increment: number, total: number) => void,
): Promise<any> {
const modelInfo = await getRemoteModelInfo(modelName, signal);
if (!modelInfo) {
throw new Error(`'${modelName}' not found in the Ollama registry!`);
}
const release = await Ollama.modelsBeingInstalledMutex.acquire();
try {
if (Ollama.modelsBeingInstalled.has(modelName)) {
throw new Error(`Model '${modelName}' is already being installed.`);
}
Ollama.modelsBeingInstalled.add(modelName);
} finally {
release();
}
try {
const response = await fetch(this.getEndpoint("api/pull"), {
method: "POST",
headers: {
"Content-Type": "application/json",
...(this.apiKey ? { Authorization: `Bearer ${this.apiKey}` } : {}),
},
body: JSON.stringify({ name: modelName }),
signal,
});
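// /api/pull streams NDJSON progress objects, e.g. (illustrative):
// {"status":"pulling 6a0746a1ec1a","total":4661224676,"completed":1048576}
// followed by a final {"status":"success"} line.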
const reader = response.body?.getReader();
//TODO: generate proper progress based on modelInfo size
while (true) {
const { done, value } = (await reader?.read()) || {
done: true,
value: undefined,
};
if (done) {
break;
}
const chunk = new TextDecoder().decode(value);
const lines = chunk.split("\n").filter(Boolean);
for (const line of lines) {
const data = JSON.parse(line);
progressReporter?.(data.status, data.completed, data.total);
}
}
} finally {
const release = await Ollama.modelsBeingInstalledMutex.acquire();
try {
Ollama.modelsBeingInstalled.delete(modelName);
} finally {
release();
}
}
}
public async isInstallingModel(modelName: string): Promise<boolean> {
const release = await Ollama.modelsBeingInstalledMutex.acquire();
try {
return Ollama.modelsBeingInstalled.has(modelName);
} finally {
release();
}
}
}
export default Ollama;